In [1]:
import polars as pl

from youtube import IngestYtHistory

In [2]:
# Show up to this many characters of each string value when frames are displayed.
MAX_STR_DISPLAY_LEN = 50
pl.Config.set_fmt_str_lengths(MAX_STR_DISPLAY_LEN)

polars.config.Config

## Create dataframe for model training

In [8]:
def create_df_for_ctt(*objs: str | pl.DataFrame) -> pl.DataFrame:
    """Combine several sources into one de-duplicated frame for model training.

    Parameters
    ----------
    *objs
        Each item is either a path to a JSON file (read with ``pl.read_json``)
        or an already-loaded ``pl.DataFrame``. Every source must provide the
        ``title``, ``channelTitle`` and ``channelId`` columns.

    Returns
    -------
    pl.DataFrame
        The three columns above, concatenated across all sources, with
        duplicate rows and rows containing any null removed.

    Raises
    ------
    ValueError
        If no sources are given.
    TypeError
        If any item is neither a ``str`` nor a ``pl.DataFrame``.
    """
    # Fail early with a clear message instead of letting `pl.concat` reject an
    # empty list further down.
    if not objs:
        raise ValueError("at least one path or 'polars.DataFrame' is required")
    if not all(isinstance(obj, (str, pl.DataFrame)) for obj in objs):
        raise TypeError("`objs` params must be either 'str' or 'polars.DataFrame'")

    imp_cols = ["title", "channelTitle", "channelId"]
    frames = [
        pl.read_json(obj)[imp_cols] if isinstance(obj, str) else obj[imp_cols]
        for obj in objs
    ]
    return pl.concat(frames).unique().drop_nulls()

In [10]:
# Best final shape is (15129, 3)

# Watch-history exports to ingest, in the order they are concatenated.
# "data/allVideoDetails.json" is currently left out; `create_df_for_ctt`
# also accepts a JSON path directly if it should be added back.
history_paths = [
    "data/raw/arv-anshul.json",
    "data/raw/itsarv.json",
    "data/zip/Takeout/YouTube and YouTube Music/history/watch-history.json",
]

df_for_ctt = create_df_for_ctt(
    *(IngestYtHistory(path).initiate() for path in history_paths)
)
print(df_for_ctt.shape)
df_for_ctt.head()

(14071, 3)


title,channelTitle,channelId
str,str,str
"""Google is payi…","""Tech Junkies""","""UCn4fhtzOgtoEK…"
"""Is Monaspace Y…","""Syntax""","""UCyU5wkjgQYGRB…"
"""Graphic Design…","""KibaKibi""","""UCfEO65bSgEQjS…"
"""Speed Up Data …","""NeuralNine""","""UC8wZnXYK_CGKl…"
"""Bihar Houses I…","""Backstage with…","""UCnpekFV93kB1O…"


## Instantiate the `TaggingWithTitle` class

In [5]:
# Wrap the training frame in the tagger and preview the frame it exposes as `.df`.
# NOTE(review): `TaggingWithTitle` is not imported in any cell visible in this
# notebook; on a fresh kernel this line would raise NameError unless the name is
# brought into scope elsewhere — add the import to the first cell (module to confirm).
ctt = TaggingWithTitle(df_for_ctt)
print(ctt.df.shape)
ctt.df.head()

(10578, 3)


title,channelTitle,contentType
str,str,str
"""Make Fractal Art With Python!""","""Tech With Tim""","""Programming"""
"""The EU Will Break Apple.""","""TechLinked""","""Tech"""
"""Makefiles in Python For Professional Automation""","""NeuralNine""","""Programming"""
"""BAMBOO is the real GOLD of INDIA ! 🇮🇳""","""Aevy TV""","""Pseudo Education"""
"""JSON Schema Validation in Python: Bring Structure…","""NeuralNine""","""Programming"""


### Build and calculate score

In [6]:
# Build ContentTypeTagging model
# NOTE(review): `force=True` presumably rebuilds even when a previous build
# exists — confirm against the class implementation.
ctt.build(force=True)

# Calculate the model accuracy
# NOTE(review): which rows the score is computed on (train vs. held-out) is not
# visible from this notebook — confirm before quoting the number.
ctt.model_acc_score()

0.7776517300056721

### Predict with model

In [7]:
# Run the built model and keep the result for the comparisons below; later
# cells read its `contentTypePred` column alongside `contentType`.
pred_df = ctt.predict()
print(pred_df.shape)
pred_df.head()

(10578, 4)


title,channelTitle,contentType,contentTypePred
str,str,str,str
"""Make Fractal Art With Python!""","""Tech With Tim""","""Programming""","""Programming"""
"""The EU Will Break Apple.""","""TechLinked""","""Tech""","""Tech"""
"""Makefiles in Python For Professional Automation""","""NeuralNine""","""Programming""","""Programming"""
"""BAMBOO is the real GOLD of INDIA ! 🇮🇳""","""Aevy TV""","""Pseudo Education""","""Reaction"""
"""JSON Schema Validation in Python: Bring Structure…","""NeuralNine""","""Programming""","""Programming"""


### Value counts of actual and predicted `contentType`

In [8]:
# Per-label row counts for the actual labels and for the model's predictions.
actual_counts = ctt.df["contentType"].value_counts().sort("contentType")
predicted_counts = pred_df["contentTypePred"].value_counts()

# Put both counts side by side; the predicted count column becomes `countPred`.
# NOTE(review): `join` defaults to an inner join, so a label that appears on
# only one side (actual-only or predicted-only) is silently left out of this table.
actual_counts.join(
    predicted_counts,
    left_on="contentType",
    right_on="contentTypePred",
    suffix="Pred",
).sort("countPred", descending=True)

contentType,count,countPred
str,u32,u32
"""Programming""",2337,1958
"""Movies & Reviews""",2112,1924
"""News""",2407,1868
"""Pseudo Education""",1291,1363
"""Tech""",333,817
"""Education""",298,590
"""Entertainment""",711,554
"""Reaction""",428,510
"""Music""",202,410
"""Vlogs""",266,319


In [9]:
# Get corresponding prediction with channelTitle
pred_df.group_by("channelTitle", "contentTypePred").count().filter(
    pl.col("channelTitle").str.contains(r"(?i)comic"),
).sort("count", descending=True)

channelTitle,contentTypePred,count
str,str,u32
"""ComicVerse""","""Movies & Reviews""",109
"""ComicVerse""","""Entertainment""",8
"""ComicVerse""","""Tech""",7
"""ComicVerse""","""Pseudo Education""",5
"""ComicVerse""","""Vlogs""",3
"""ComicVerse""","""News""",3
"""ComicVerse""","""Education""",3
"""ComicVerse""","""Shorts""",2
"""ComicVerse""","""Music""",1
"""ComicVerse""","""Reaction""",1


In [10]:
# Rows from "comic" channels that the model labelled with a "tech" content type
# (both matches are case-insensitive); a row must satisfy both predicates.
channel_is_comic = pl.col("channelTitle").str.contains(r"(?i)comic")
predicted_as_tech = pl.col("contentTypePred").str.contains(r"(?i)tech")
pred_df.filter(channel_is_comic, predicted_as_tech)

title,channelTitle,contentType,contentTypePred
str,str,str,str
"""Sabse Best Batman Jo Apne Nahi Dekha!""","""ComicVerse""","""Movies & Reviews""","""Tech"""
"""Black Adam Was Hiding This From us!""","""ComicVerse""","""Movies & Reviews""","""Tech"""
"""Ekdum Tatti! - Best & Worst of 2022""","""ComicVerse""","""Movies & Reviews""","""Tech"""
"""Please! Ise Miss Mat Karna - Watch This Before Th…","""ComicVerse""","""Movies & Reviews""","""Tech"""
"""Dharam Ya Karam Kise Chunoge? - Under The Banner …","""ComicVerse""","""Movies & Reviews""","""Tech"""
"""This is The Saddest Part of Revenge😔""","""ComicVerse""","""Movies & Reviews""","""Tech"""
"""Loki Season 1 Recap, Watch This Before Season 2!""","""ComicVerse""","""Movies & Reviews""","""Tech"""


## Calculate the `mode` of the predicted **contentType** for each channel

In [11]:
# Most frequent predicted contentType for each channel, one row per channel.
# `mode()` can return several values when counts tie, and their order is not
# guaranteed; sorting before taking the first makes the tie-break reproducible
# across runs (the alphabetically first of the tied labels wins).
mode_ctt = pred_df.group_by("channelTitle").agg(
    pl.col("contentTypePred").mode().sort().first().alias("contentTypePredMode"),
)
mode_ctt.head()

channelTitle,contentTypePredMode
str,str
"""Krish Naik""","""Programming"""
"""TechLinked""","""Tech"""
"""PJ Explained""","""Movies & Reviews"""
"""Colors""","""Entertainment"""
"""Dhruv Rathee""","""Pseudo Education"""


In [12]:
print(f"Total rows in pred_df = {pred_df.height:,}")

# Attach each channel's modal prediction to every row, then keep the rows whose
# actual label equals that channel-level mode.
# Alternative predicates to swap into the filter:
#   pl.col("contentTypePred") != pl.col("contentTypePredMode")   # Pred != PredMode
#   pl.col("contentType") != pl.col("contentTypePredMode")       # Actual != PredMode
pred_with_channel_mode = pred_df.join(mode_ctt, on="channelTitle")
pred_with_channel_mode.filter(
    pl.col("contentType") == pl.col("contentTypePredMode"),  # Actual == PredMode
)

Total rows in pred_df = 10,578


title,channelTitle,contentType,contentTypePred,contentTypePredMode
str,str,str,str,str
"""Make Fractal Art With Python!""","""Tech With Tim""","""Programming""","""Programming""","""Programming"""
"""The EU Will Break Apple.""","""TechLinked""","""Tech""","""Tech""","""Tech"""
"""Makefiles in Python For Professional Automation""","""NeuralNine""","""Programming""","""Programming""","""Programming"""
"""BAMBOO is the real GOLD of INDIA ! 🇮🇳""","""Aevy TV""","""Pseudo Education""","""Reaction""","""Pseudo Education"""
"""JSON Schema Validation in Python: Bring Structure…","""NeuralNine""","""Programming""","""Programming""","""Programming"""
"""Why I Switched To Linux! And How It's Going...""","""NeuralNine""","""Programming""","""Tech""","""Programming"""
"""Ye nahi bolna tha!! | Sunday Show!""","""Sarthak Goswami""","""News""","""News""","""News"""
"""Text Classification | NLP Lecture 6 | End to End …","""CampusX""","""Programming""","""Programming""","""Programming"""
"""How To Structure A Programming Project…""","""Tech With Tim""","""Programming""","""Education""","""Programming"""
"""DeepFakes: When Fake Is Real & REAL Is FAKE!""","""Sarthak Goswami""","""News""","""Pseudo Education""","""News"""


### Channels whose predicted mode differs from the actual `contentType`

In [13]:
# Distinct channels having at least one row whose actual label differs from the
# channel's modal prediction.
rows_with_channel_mode = pred_df.join(mode_ctt, on="channelTitle")
mode_disagrees = pl.col("contentType") != pl.col("contentTypePredMode")
rows_with_channel_mode.filter(mode_disagrees).select(
    pl.col("channelTitle").unique()
)

channelTitle
str
"""100x Engineers"""
"""The Quint"""
"""Naveensingh_05"""
"""Labour Law Advisor"""
"""YOGI BABA PRODUCTIONS"""
"""Ankit Inspires India"""
"""Baaten Bazar Ki (BBK)"""
"""Finance Wallah"""
"""Ishan Sharma"""
"""Goldmines Bollywood"""
