In [None]:
from ..src import utils

from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
import random

In [None]:
# Initial dataset as returned by the project helper
raw_data = utils.initialize_data()

# Calculate the sentiment score; the result is what every later cell calls `data`
data = utils.calculate_sentiment(raw_data)

# Vectorized form of the data (presumably TF-based, judging by the name — confirm in utils)
data_df_tf = utils.vectorize_df(data)

# Shared by the clustering call below and the silhouette cell
num_queries = 5

features, labels = utils.clustering_with_neighbors(
    data_df_tf,
    data,
    user_sentiment="positive",
    num_queries=num_queries,
)

In [None]:
# Silhouette score of the clustering produced above, as returned by utils.sil_score.
# NOTE(review): the original note says the helper also plots the score — confirm in utils.
silhouette_score = utils.sil_score(features, labels, num_queries)

In [None]:
# TODO: the plot for visualizing the clusters is still missing here

In [None]:
def _unir_tags(tags):
    """Join the non-null tag entries of one channel into a single space-separated string."""
    return ' '.join(str(tag) for tag in tags.dropna())


# One text "document" per channel: the tags of all of its videos, concatenated
tags_por_canal = data.groupby("channel_name")["video_tags"].apply(_unir_tags)

# TF-IDF + KMeans
# TF-IDF turns each channel's tag text into a numeric vector so distances can be
# computed, giving more weight to the terms that are distinctive for a channel.
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(tags_por_canal)
km = KMeans(n_clusters=5, random_state=42)
km.fit(X)

# Assign each channel the cluster label found by KMeans
canales_cluster = pd.DataFrame(
    {"channel_name": tags_por_canal.index, "cluster": km.labels_}
)

# Bar chart with the number of channels that landed in each cluster
fig, ax = plt.subplots()
sns.countplot(data=canales_cluster, x="cluster", ax=ax)
ax.set_title("Number of channels per cluster")
plt.show()


In [None]:
# Execute plots.py and load the names it defines into the notebook
# (presumably including graph_clusters, which the next cell calls — confirm).
# NOTE(review): without the -i flag, %run executes the script in a fresh namespace,
# so functions defined there cannot see notebook variables as globals. If
# graph_clusters reads notebook state instead of taking arguments, use `%run -i`.
%run plots.py 

In [None]:
# Draw the cluster visualization. graph_clusters is not defined in this notebook;
# presumably it comes from plots.py, executed with %run in the previous cell — confirm.
graph_clusters()

In [None]:
# Show the most frequent tags of each cluster.
# The number of clusters is read from the fitted KMeans model instead of repeating
# the literal 5, so this cell stays in sync if n_clusters is changed where km is built.
for cluster_num in range(km.n_clusters):
    print(f"\nCluster {cluster_num}:")
    # Names of the channels assigned to this cluster
    cluster_channels = canales_cluster.loc[
        canales_cluster["cluster"] == cluster_num, "channel_name"
    ]
    # Explicit label-based lookup of the concatenated tag string of each of those channels
    cluster_tags = tags_por_canal.loc[cluster_channels].tolist()
    # Merge everything and split on whitespace to get the individual tag tokens
    all_tags = ' '.join(cluster_tags).split()
    # The 10 most frequent tokens in this cluster (dtype given explicitly so an
    # empty token list still yields a well-defined, empty result)
    tags_freq = pd.Series(all_tags, dtype="object").value_counts().head(10)
    print(tags_freq)

Cluster 0: Sports-related videos with international creators like IShowSpeed and Squeezie, featuring some Arabic content.

Cluster 1: Viral short videos focused on humor, TikTok trends, and general entertainment.

Cluster 2: Spanish-language football content, mainly about Real Madrid and player Fede Valverde.

Cluster 3: Minecraft and animation-themed gaming content, with multilingual influence including Arabic.

Cluster 4: International movies and TV dramas, especially Nigerian films and popular "natok" series.

In [None]:
# Target cluster. The value is hard-coded; the original note describes it as the
# cluster with the most channels.
# NOTE(review): confirm against the count plot — KMeans label numbers are arbitrary.
cluster_objetivo = 0

# Channels that belong to the target cluster
canales_en_cluster = canales_cluster[canales_cluster["cluster"] == cluster_objetivo]

# Pick one channel of that cluster with a dedicated, seeded generator so that
# Restart & Run All reproduces the same recommendation. Set SEED_CANAL to None to
# get a different channel on every run. The names are converted to a plain list so
# the generator only ever sees a standard Python sequence.
SEED_CANAL = 42
canal_elegido = random.Random(SEED_CANAL).choice(canales_en_cluster["channel_name"].tolist())
print(f"Channel automatically chosen from the cluster {cluster_objetivo}: '{canal_elegido}'\n")

# TF-IDF fitted only on the tag text of the channels in the target cluster
tags_cluster = tags_por_canal.loc[canales_en_cluster["channel_name"]]
tfidf = TfidfVectorizer()
X_cluster = tfidf.fit_transform(tags_cluster)

# Row position of the chosen channel inside X_cluster
index_canal = tags_cluster.index.get_loc(canal_elegido)

# Cosine similarity between the chosen channel and every channel of the cluster
similitudes = cosine_similarity(X_cluster[index_canal], X_cluster).flatten()

# Keep the TOPK most similar channels, best first. The chosen channel is removed
# by its index rather than by assuming it is ranked first: with ties (another
# channel with identical tags) or an all-zero TF-IDF row, it is not guaranteed to
# be at position 0, and slicing [1:] would then drop a different channel and
# recommend the chosen channel to itself.
TOPK = 10
orden = similitudes.argsort()[::-1]
similares_idx = orden[orden != index_canal][:TOPK]
canales_similares = tags_cluster.index[similares_idx]


print(f"Channels similars to '{canal_elegido}' in the cluster {cluster_objetivo}:\n")
for i, canal in enumerate(canales_similares):
    print(f"{i+1}. {canal} (similarity: {similitudes[similares_idx[i]]:.2f})")

The system recommends channels that use similar tags in their videos.
It's a content-based approach, using video tags as the channel description.

This compares how similar two channels are based on their tags, using TF-IDF, which stands for "Term Frequency - Inverse Document Frequency."

It is used to weight the most important words in each channel.

- If two channels use similar tags with similar frequencies, they will have a similarity close to 1.0.

- If they share almost no tags, the similarity will be close to 0.0.