In [1]:
import re
import string

import emoji
import numpy as np
import polars as pl
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

pl.Config.set_fmt_str_lengths(40)

polars.config.Config

## Recommender System

I am Creating a ML model which will recommend "Youtube Channels" to users and for that we are **training more than one model** which are trained on different datasets:
  1. First will train on **channel's videos titles** only.
  3. Second will train on **channel's videos tags** only.
  2. Other will train on other data like **video duration, categoryId** and more.

In [2]:
df = pl.read_json("data/ctt/titles_data.json")
print(df.shape)
df.head()

(2275, 9)


categoryId,channelId,channelTitle,description,duration,id,publishedAt,tags,title
str,str,str,str,str,str,str,list[str],str
"""28""","""UCs5Y5_7XK8HLDX0SLNwkd3w""","""Visual Studio Code""","""#shorts""","""PT38S""","""4dgSwnKtVLo""","""2023-09-22T05:12:53""","[""vscode"", ""visual studio code"", ""vs code""]","""Arrow Function: Think Backwards"""
"""28""","""UC46xhU1EH7aywEgvA9syS3w""","""anthonywritescode""","""it's here! with ugly new typing syntax…","""PT41M42S""","""IV8OZY4194U""","""2023-08-02T15:00:25""",,"""python 3.12 release highlights (beginne…"
"""27""","""UCsDTy8jvHcwMvSZf_JGi-FA""","""Abhi and Niyu""","""To support our work you can consider bu…","""PT14M39S""","""4uM7RIfMLK4""","""2023-10-07T13:00:12""","[""abhi and niyu"", ""abhi and niyu latest"", … ""abi n niu""]","""Australia's economy is WEIRD | Australi…"
"""28""","""UCQV-7R4qhAJXxdkrWNOglDQ""","""Dekho Isko""","""Join me on Social media:- 📷 Instagram •…","""PT4M41S""","""uWLooUi8in0""","""2023-09-06T16:52:42""","[""india rename"", ""india renamed as bharat"", … ""rename india as bharat""]","""Will India Change its Name to Bharat Pe…"
"""24""","""UCgbzclo4Mfy_D68w_Bm_xHg""","""Tried&Refused Productions.""","""There is always so much to watch but I …","""PT10M33S""","""diajsqMk-mo""","""2023-09-02T06:45:02""","[""maaveeran"", ""movie"", … ""yogi babu""]","""3 Amazing Indian Movies In 2023 That De…"


In [3]:
# Get all unique tags used by each channel
# And the video titles of each channel
title_tags_df = (
    df.explode("tags")
    .group_by("channelId", "channelTitle")
    .agg(pl.col("tags", "title").unique())
    .with_columns(pl.col("tags", "title").list.join(" "))
    .filter(
        # Remove those channels which have no tags
        pl.col("tags").ne("null"),
    )
)
print(f"Total data points: {title_tags_df.height}")
title_tags_df.head()

Total data points: 75


channelId,channelTitle,tags,title
str,str,str,str
"""UCXgGY0wkgOzynnHvSEVmE3A""","""Hitesh Choudhary""","""pulumi best coding monitor zoho vite br…","""The Real Engineering Mindset | Degree V…"
"""UCLpbTDb3EKUltVAF-Ko_NXg""","""Loginion""","""garou vs saitama reaction Garou Charact…","""MAKIMA - S*XY but Most EVIL Character o…"
"""UCNCl2n5YZfUXaxSVL3zqlLA""","""Nomad Shubham""","""Flying For The First Time travel experi…","""Scammed By Wizz air in Iceland I took M…"
"""UCR4u702mibx-S0bjqBToQsw""","""Dee & Vee""","""Foreigners reacts to india india reacti…","""African Friends Reacts To G20 Summit: A…"
"""UC4JX40jDee_tINbkjycV4Sg""","""Tech With Tim""","""how to use a debugger django crash cour…","""How to Use a Debugger - Debugger Tutori…"


### Split dataset

In [4]:
x_train: pl.DataFrame  # For type hinting
x_test: pl.DataFrame

x_train, x_test = train_test_split(title_tags_df, test_size=0.2, random_state=42)

print(f"{x_train.shape = }\n {x_test.shape = }")

x_train.shape = (60, 4)
 x_test.shape = (15, 4)


In [5]:
x_test.head()

channelId,channelTitle,tags,title
str,str,str,str
"""UC4JX40jDee_tINbkjycV4Sg""","""Tech With Tim""","""how to use a debugger django crash cour…","""How to Use a Debugger - Debugger Tutori…"
"""UCu4X846OSea5YU6S8fIpy1A""","""BigDawsTv""","""Photoshopping Wanted Posters of Strange…","""TIPPING DELIVERY DRIVERS $10,000 I Gave…"
"""UCnU9c8lf6Cvfz8VkD4fxbVQ""","""Avanti Nagral""","""where is your family from Avanti nagral…","""Meet @JoshuaOtusanya 🤍 Happy Raksha Ban…"
"""UCXgGY0wkgOzynnHvSEVmE3A""","""Hitesh Choudhary""","""pulumi best coding monitor zoho vite br…","""The Real Engineering Mindset | Degree V…"
"""UCw7xjxzbMwgBSmbeYwqYRMg""","""Sony PAL""","""Jetha and Popatlal taarak Mehta Ka Oolt…","""Gada Electronics में हुई Problem | Taar…"


## Build Model

In [6]:
def preprocess_title(s: str) -> str:
    """Preprocessor for vectorizer to preprocess titles data."""
    s = re.sub(r"\b\w{1,3}\b", " ", s)
    s = s.translate(str.maketrans("", "", string.punctuation + string.digits))
    s = emoji.replace_emoji(s, "")
    s = re.sub(r"\s+", " ", s)
    s = s.lower()
    return s


def preprocess_tags(s: str) -> str:
    """Preprocessor for vectorizer to preprocess titles data."""
    s = re.sub(r"\b\w{1,2}\b", " ", s)
    s = s.translate(str.maketrans("", "", string.punctuation + string.digits))
    s = emoji.replace_emoji(s, "")
    s = re.sub(r"\s+", " ", s)
    s = s.lower()
    return s

In [7]:
title_transformer = TfidfVectorizer(
    max_features=7000,
    ngram_range=(1, 2),
    preprocessor=preprocess_title,
    stop_words="english",
)
tags_transformer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
    preprocessor=preprocess_tags,
    stop_words="english",
)

In [8]:
transformer = ColumnTransformer(
    [
        ("title_trf", title_transformer, "title"),
        ("tags_trf", tags_transformer, "tags"),
    ]
)

### Calculate Similarity

**How to approach?**

1. Transform the data with both the **Transformers**.
2. Multiply both the transformed data with the weights\*.
3. Then add both the similarity values to get the final similarity value.
4. Sort the value and return the similar channels.

In [9]:
# Fit and tranform the x_train data
title_tags_vector = transformer.fit_transform(
    x_train.select("title", "tags").to_pandas()
)

In [10]:
# Combine title_tags_vector, channelId, channelTitle as DataFrame
title_tags_trf_df = x_train.select("channelId", "channelTitle").with_columns(
    pl.lit(title_tags_vector.toarray()).alias("transformed_data")  # type: ignore
)

In [14]:
def get_similar_channels(data: pl.DataFrame) -> pl.DataFrame:
    transformed_data = transformer.transform(data.to_pandas())
    similarity = cosine_similarity(
        np.array(title_tags_trf_df["transformed_data"].to_list()),
        transformed_data.toarray(),
    )
    return x_train.with_columns(
        pl.lit(np.ravel(similarity)).mul(100).round(2).alias("similarity")
    ).sort("similarity", descending=True)

In [12]:
x_test["channelTitle"].to_pandas()

0           Tech With Tim
1               BigDawsTv
2           Avanti Nagral
3        Hitesh Choudhary
4                Sony PAL
5               Dr. Swole
6             Coding Tech
7            Tech Junkies
8     Unfold Data Science
9             The S2 Life
10     Passenger Paramvir
11              Kishanell
12           Dhruv Rathee
13     Elvish Yadav Vlogs
14            anime freak
Name: channelTitle, dtype: object

In [15]:
sl_data = x_train[2]
print(f"Selected Channel: {sl_data['channelTitle'].item(0)!r}")
get_similar_channels(sl_data).head(10)

Selected Channel: 'Sony LIV'


channelId,channelTitle,tags,title,similarity
str,str,str,str,f64
"""UCOQNJjhXwvAScuELTT_i7cQ""","""Sony LIV""","""mcis8 Asian Games 2018 countries MCI In…","""Lakshya Sen vs. Lee Y. | Badminton | Me…",100.0
"""UCBIT1FSJW6yTlzqK-31FDWg""","""LIV Comedy""","""Sodhi ka hungama TMKOC tmkoc husband te…","""Weekly Reliv - Taarak Mehta Ka Ooltah C…",20.5
"""UC6-F5tO8uklgE9Zy8IvbdFw""","""Sony SAB""","""tapu sena new year episode saree sony t…","""Taarak Mehta Ka Ooltah Chashmah | Throw…",18.18
"""UCnSFZ-olBoLGLRUS_3RI2Aw""","""Taarak Mehta Ka Ooltah Chashmah""","""tmkoc Taarak Mehta Ka Ooltah Chashmah t…","""FULL EPISODE! 3847 - Taarak Mehta Ka Oo…",15.09
"""UCF10AG_t1AYW3mlmX7g1VJA""","""FanCode""","""chris gayle Highlights fastest fifty ll…","""Highlights: India win Asian Champions T…",11.16
"""UCCJsQKOKArvDksacfT2ryQw""","""World Affairs by Unacademy""","""Justin Trudeau fatf grey list world aff…","""Canada Apologises to the World | Video …",9.07
"""UCpEhnqL0y41EpW2TvWAHD7Q""","""SET India""","""new Superstar Singer Season 2 promo kri…","""This Musicianship Has A ""Swag Of Rajast…",5.02
"""UC5fcjujOsqD-126Chn_BAuA""","""Sarthak Goswami""","""mumbaikarnikhil education neuzboy oppen…","""Ye nahi bolna tha!! | Sunday Show! Why …",4.51
"""UCjAds6NXzUMkhpSI3_s90WA""","""Zuzana Reacts""","""history of india chandryan foreigners v…","""Cricket WorldCup 2023 in India | Reacti…",3.27
"""UC4eIbi6N7KJJLqRQsJdH0CQ""","""News,views, & updates.""","""dlf new projects in gurugram mega proje…","""Pak Reacts PAV BHAJI | Mumbai Special S…",3.22
