In [1]:
import re
import string
from typing import Any

import emoji
import polars as pl
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

pl.Config.set_fmt_str_lengths(40)

polars.config.Config

## Recommender System

I am Creating a ML model which will recommend "Youtube Channels" to users and for that we are **training more than one model** which are trained on different datasets:
  1. First will train on **channel's videos titles** only.
  3. Second will train on **channel's videos tags** only.
  2. Other will train on other data like **video duration, categoryId** and more.

In [2]:
df = pl.read_json("data.arv/ctt/titles_data.json")
print(df.shape)
df.head()

(2275, 9)


categoryId,channelId,channelTitle,description,duration,id,publishedAt,tags,title
str,str,str,str,str,str,str,list[str],str
"""28""","""UCs5Y5_7XK8HLDX0SLNwkd3w""","""Visual Studio Code""","""#shorts""","""PT38S""","""4dgSwnKtVLo""","""2023-09-22T05:12:53""","[""vscode"", ""visual studio code"", ""vs code""]","""Arrow Function: Think Backwards"""
"""28""","""UC46xhU1EH7aywEgvA9syS3w""","""anthonywritescode""","""it's here! with ugly new typing syntax…","""PT41M42S""","""IV8OZY4194U""","""2023-08-02T15:00:25""",,"""python 3.12 release highlights (beginne…"
"""27""","""UCsDTy8jvHcwMvSZf_JGi-FA""","""Abhi and Niyu""","""To support our work you can consider bu…","""PT14M39S""","""4uM7RIfMLK4""","""2023-10-07T13:00:12""","[""abhi and niyu"", ""abhi and niyu latest"", … ""abi n niu""]","""Australia's economy is WEIRD | Australi…"
"""28""","""UCQV-7R4qhAJXxdkrWNOglDQ""","""Dekho Isko""","""Join me on Social media:- 📷 Instagram •…","""PT4M41S""","""uWLooUi8in0""","""2023-09-06T16:52:42""","[""india rename"", ""india renamed as bharat"", … ""rename india as bharat""]","""Will India Change its Name to Bharat Pe…"
"""24""","""UCgbzclo4Mfy_D68w_Bm_xHg""","""Tried&Refused Productions.""","""There is always so much to watch but I …","""PT10M33S""","""diajsqMk-mo""","""2023-09-02T06:45:02""","[""maaveeran"", ""movie"", … ""yogi babu""]","""3 Amazing Indian Movies In 2023 That De…"


In [3]:
# Get all unique tags used by each channel
tags_grp = (
    df.explode("tags")
    .group_by("channelTitle")
    .agg(pl.col("tags").unique())
    .with_columns(pl.col("tags").list.join(" "))
)
tags_grp.head()

channelTitle,tags
str,str
"""Soch by Mohak Mangal""","""girlfriend mukesh ambani launch Chandra…"
"""Foodie Saand""","""Haridwar Kadhi Chawal Special Grilled S…"
"""Aaj Tak""","""chandrayaan 2 crash video chandrayaan 3…"
"""Mr Techpedia""","""best smartphone 2023 psychology phone 2…"
"""CineDesi""","""achara kirk Achara Kirk kristen stephen…"


In [4]:
# Get all the video titles of each channel
title_grp = (
    df.group_by("channelTitle")
    .agg(pl.col("title").unique())
    .with_columns(pl.col("title").list.join(" "))
)
title_grp.head()

channelTitle,title
str,str
"""The S2 Life""","""Celebrities in Big Boss | Harsh Beniwal…"
"""BnfTV""","""Kismat hi kharab hai 😂 (Number 1 Hindi …"
"""Foodie Saand""","""Sonu OVERLOADED Chur Chur Naan, Shree R…"
"""Pakistani Reacts""","""MEMES THAT ARE REALLY WILD ❤️ | MEME RE…"
"""Backstage with Millionaires""","""Ola Electric Did WHAT?! Gaming Companie…"


In [5]:
# Merge both titles_grp and tags_grp to create a training set
title_tags_df = title_grp.join(tags_grp, on="channelTitle")
print(title_tags_df.shape)
title_tags_df.head()

(85, 3)


channelTitle,title,tags
str,str,str
"""Soch by Mohak Mangal""","""Will Reservation REALLY help Women? How…","""girlfriend mukesh ambani launch Chandra…"
"""Foodie Saand""","""Sonu OVERLOADED Chur Chur Naan, Shree R…","""Haridwar Kadhi Chawal Special Grilled S…"
"""Aaj Tak""","""Elvish Yadav EXCLUSIVE Interview: Bigg …","""chandrayaan 2 crash video chandrayaan 3…"
"""Mr Techpedia""","""The trap behind expensive smartphones i…","""best smartphone 2023 psychology phone 2…"
"""CineDesi""","""HOUSEFULL Movie Reaction Part 1/3! | Ak…","""achara kirk Achara Kirk kristen stephen…"


In [6]:
title_tags_df = title_tags_df.filter(pl.col("tags").ne("null"))
print(f"Total data points: {title_tags_df.height}")

Total data points: 75


### Split dataset

In [7]:
x_train: pl.DataFrame  # For type hinting
x_test: pl.DataFrame

x_train, x_test = train_test_split(
    title_tags_df,
    test_size=0.2,
    random_state=42,
)

print(f"{x_train.shape = }\n {x_test.shape = }")

x_train.shape = (60, 3)
 x_test.shape = (15, 3)


In [8]:
x_test.head()

channelTitle,title,tags
str,str,str
"""CineDesi""","""HOUSEFULL Movie Reaction Part 1/3! | Ak…","""achara kirk Achara Kirk kristen stephen…"
"""Thugesh Unfiltered""","""MC STAN & BIG BOSS IS FUNNY! Nora Fateh…","""worst street food thugesh wierd foods a…"
"""LIV Comedy""","""नकली Nawaz की Mimicry सुनकर हंस पड़े अस…","""daya comedy gokuldham society ki proble…"
"""Soch by Mohak Mangal""","""Will Reservation REALLY help Women? How…","""girlfriend mukesh ambani launch Chandra…"
"""Abhi and Niyu""","""Why does INDIA flood so easily? | India…","""khalistan in canada israel war hindi Na…"


## Build Model

In [9]:
def preprocess_title(s: str) -> str:
    """Preprocessor for vectorizer to preprocess titles data."""
    s = re.sub(r"\b\w{1,3}\b", " ", s)
    s = s.translate(str.maketrans("", "", string.punctuation + string.digits))
    s = emoji.replace_emoji(s, "")
    s = re.sub(r"\s+", " ", s)
    s = s.lower()
    return s


def preprocess_tags(s: str) -> str:
    """Preprocessor for vectorizer to preprocess titles data."""
    s = re.sub(r"\b\w{1,2}\b", " ", s)
    s = s.translate(str.maketrans("", "", string.punctuation + string.digits))
    s = emoji.replace_emoji(s, "")
    s = re.sub(r"\s+", " ", s)
    s = s.lower()
    return s

In [10]:
titles_transformer = TfidfVectorizer(
    max_features=7000,
    ngram_range=(1, 2),
    preprocessor=preprocess_title,
    stop_words="english",
)
tags_transformer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
    preprocessor=preprocess_tags,
    stop_words="english",
)

### Calculate Similarity

**How to approach?**

1. Transform the data with both the **Transformers**.
2. Multiply both the transformed data with the weights\*.
3. Then add both the similarity values to get the final similarity value.
4. Sort the value and return the similar channels.

In [11]:
# Fit and tranform the x_train data
x_train_titles_trf = titles_transformer.fit_transform(x_train["title"])
x_train_tags_trf = tags_transformer.fit_transform(x_train["tags"])

In [12]:
def transform_data(data: pl.DataFrame):
    title_trf = titles_transformer.transform(data["title"])
    tags_trf = tags_transformer.transform(data["tags"])
    return title_trf, tags_trf


def calc_similarity(title, tags, *, weights: tuple[float, float]) -> dict[str, Any]:
    title_simi = cosine_similarity(x_train_titles_trf, title)
    tags_simi = cosine_similarity(x_train_tags_trf, tags)
    # Multiply with weights and add the similarity
    return {
        "titles_similarity": title_simi,
        "tags_similarity": tags_simi,
        "weights_similarity": (title_simi * weights[0]) + (tags_simi * weights[1]),
    }


def get_similar_channels(
    data: pl.DataFrame,
    weights: tuple[float, float] = (0.5, 0.5),
) -> pl.DataFrame:
    title_trf, tags_trf = transform_data(data)
    similarity = calc_similarity(title_trf, tags_trf, weights=weights)
    return x_train.with_columns(
        [pl.lit(v.ravel()).alias(k) for k, v in similarity.items()]
    ).sort("weights_similarity", descending=True)

In [15]:
x_test["channelTitle"].to_pandas()

0                 CineDesi
1       Thugesh Unfiltered
2               LIV Comedy
3     Soch by Mohak Mangal
4            Abhi and Niyu
5             News18 India
6       Elvish Yadav Vlogs
7                 KibaKibi
8             Dhruv Rathee
9                Kishanell
10             anime freak
11             The S2 Life
12      Visual Studio Code
13              ArjanCodes
14                Indently
Name: channelTitle, dtype: object

In [17]:
sl_data = x_test[5]
print(f"Selected Channel: {sl_data['channelTitle'].item()!r}")
get_similar_channels(sl_data).head()

Selected Channel: 'News18 India'


channelTitle,title,tags,titles_similarity,tags_similarity,weights_similarity
str,str,str,f64,f64,f64
"""TV9 Bharatvarsh""","""Israel-Hamas Conflict News Live: तबाही …","""israel war updates today Israel-Palesti…",0.227587,0.471182,0.349385
"""World Affairs by Unacademy""","""USA is Sending 2000 Soldiers to Israel …","""Trump's Vice President india and china …",0.18722,0.265762,0.226491
"""Zee News""","""Ujjain Rape Case Encounter: आरोपी के पि…","""Shoaib Akhtar On Indian Team swastik ri…",0.131865,0.224428,0.178146
"""The Deshbhakt""","""Beyond the Karan Sangwan Controversy & …","""isro live indian student in canada indi…",0.029471,0.229258,0.129364
"""Aaj Tak""","""Elvish Yadav EXCLUSIVE Interview: Bigg …","""chandrayaan 2 crash video chandrayaan 3…",0.079508,0.102036,0.090772


## Conclusion

The system gives promising result because it suggests very similar channels based on the content of passed channel.

**`Problem`:** I am able to create a Pipeline here due to that this seems difficult to deal with.

So, in the next iteration I am going to tackle the Pipeline building part and make this system easy to calcualte the results.