In [43]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Read the trends CSV into `df`; the path is relative to the notebook's working directory.
DATA_PATH = "../data/youtube_shorts_tiktok_trends_2025.csv_ML.csv"
df = pd.read_csv(DATA_PATH)


In [44]:
import sys

# Make the helper modules under ../src importable from this notebook.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate "../src" entry to sys.path each time.
SRC_DIR = "../src"
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

from preprocess import map_labels, clean_features, get_feature_matrix
from model_utils import rate_video


In [45]:
# Run the project's label-mapping helper over the frame, then show how many rows
# fall into each class of the `trend_bucket` column.
# NOTE(review): `trend_bucket` is presumably created by map_labels — confirm in ../src/preprocess.py.
df = map_labels(df)
df["trend_bucket"].value_counts()


trend_bucket
likely      32402
trending    12500
low          5098
Name: count, dtype: int64

In [46]:
# Column names passed to clean_features / get_feature_matrix below.
# Grouped by naming pattern for readability; the order of the names is unchanged.
features = [
    # title / text columns
    "title_len",
    "text_richness",
    # *_rate columns
    "like_rate",
    "comment_rate",
    "share_rate",
    # *_per_day columns
    "views_per_day",
    "likes_per_day",
    # rel_* columns
    "rel_like",
    "rel_share",
    "rel_combo",
    # hashtag interaction columns
    "like_hashtag_interaction",
    "share_hashtag_interaction",
    # *_cat columns
    "platform_cat",
    "region_cat",
    "language_cat",
    "category_cat",
    "traffic_source_cat",
    "device_brand_cat",
    "creator_tier_cat",
    # remaining interaction / boost columns
    "richness_traffic_interaction",
    "weekend_hashtag_boost",
]
# Run the project's cleaning helper on the selected columns, then build the
# model inputs X (features) and y (target).
# NOTE(review): the actual cleaning / target-extraction rules live in
# ../src/preprocess.py and are not visible from this notebook.
df_clean = clean_features(df, features)
X, y = get_feature_matrix(df_clean, features)

# Hold out 20% of the rows for evaluation. stratify=y keeps the class
# proportions the same in both splits; random_state fixes the shuffle so the
# split is reproducible. train_test_split is already imported in the notebook's
# first cell, so the duplicate mid-cell import was dropped.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [47]:
# Baseline model: logistic regression fit on the training split.
# LogisticRegression and the metric helpers are imported in the notebook's first
# cell, so the repeated imports were removed from this cell.
#
# `multi_class` is deprecated since scikit-learn 1.5 and scheduled for removal.
# With the default solver, a target with more than two classes is already fit
# with the multinomial loss, so omitting the argument keeps the same model while
# avoiding the FutureWarning (and a TypeError once the parameter is gone).
log_reg = LogisticRegression(max_iter=3000)
log_reg.fit(X_train, y_train)

pred_lr = log_reg.predict(X_test)

# Accuracy alone is dominated by the largest class, so macro F1 (the unweighted
# mean of the per-class F1 scores) is reported next to it.
print("Accuracy:", accuracy_score(y_test, pred_lr))
print("Macro F1:", f1_score(y_test, pred_lr, average='macro'))
# Confusion matrix: rows are true classes, columns are predicted classes,
# both in sorted label order.
print(confusion_matrix(y_test, pred_lr))
print(classification_report(y_test, pred_lr))




Accuracy: 0.8266
Macro F1: 0.7595895921078731
[[5952  157  371]
 [ 435  571   14]
 [ 752    5 1743]]
              precision    recall  f1-score   support

      likely       0.83      0.92      0.87      6480
         low       0.78      0.56      0.65      1020
    trending       0.82      0.70      0.75      2500

    accuracy                           0.83     10000
   macro avg       0.81      0.73      0.76     10000
weighted avg       0.82      0.83      0.82     10000



In [48]:
# Random forest fit on the same training split as the logistic regression.
# RandomForestClassifier is imported in the notebook's first cell, so the
# repeated import was removed from this cell.
#
# n_jobs=-1 fits (and predicts with) the 400 trees in parallel on all available
# cores; with random_state fixed the resulting forest is the same as a
# single-threaded fit, it just trains faster.
rf = RandomForestClassifier(n_estimators=400, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)

pred_rf = rf.predict(X_test)

# Same metrics as for the logistic regression so the two models compare directly.
print("Accuracy:", accuracy_score(y_test, pred_rf))
print("Macro F1:", f1_score(y_test, pred_rf, average='macro'))
# Confusion matrix: rows are true classes, columns are predicted classes,
# both in sorted label order.
print(confusion_matrix(y_test, pred_rf))
print(classification_report(y_test, pred_rf))


Accuracy: 0.8379
Macro F1: 0.7985283451796108
[[5883  203  394]
 [ 229  776   15]
 [ 768   12 1720]]
              precision    recall  f1-score   support

      likely       0.86      0.91      0.88      6480
         low       0.78      0.76      0.77      1020
    trending       0.81      0.69      0.74      2500

    accuracy                           0.84     10000
   macro avg       0.82      0.79      0.80     10000
weighted avg       0.84      0.84      0.84     10000



In [49]:
# Three hand-written feature dicts used to try out rate_video below.
# Every dict carries the same 21 keys, in the same order as the `features` list.
test_videos = [
    dict(  # low-ish
        title_len=10,
        text_richness=0.50,
        like_rate=0.04,
        comment_rate=0.01,
        share_rate=0.01,
        views_per_day=2000,
        likes_per_day=200,
        rel_like=0.6,
        rel_share=0.3,
        rel_combo=0.5,
        like_hashtag_interaction=0.2,
        share_hashtag_interaction=0.1,
        platform_cat=1,
        region_cat=7,
        language_cat=0,
        category_cat=4,
        traffic_source_cat=2,
        device_brand_cat=1,
        creator_tier_cat=1,
        richness_traffic_interaction=3.0,
        weekend_hashtag_boost=0,
    ),
    dict(  # trending-like
        title_len=20,
        text_richness=0.8,
        like_rate=0.15,
        comment_rate=0.03,
        share_rate=0.04,
        views_per_day=15000,
        likes_per_day=2000,
        rel_like=2.1,
        rel_share=1.5,
        rel_combo=2.2,
        like_hashtag_interaction=1.0,
        share_hashtag_interaction=0.8,
        platform_cat=0,
        region_cat=10,
        language_cat=0,
        category_cat=2,
        traffic_source_cat=4,
        device_brand_cat=3,
        creator_tier_cat=2,
        richness_traffic_interaction=8.0,
        weekend_hashtag_boost=1,
    ),
    dict(  # weak
        title_len=5,
        text_richness=0.2,
        like_rate=0.02,
        comment_rate=0.005,
        share_rate=0.007,
        views_per_day=500,
        likes_per_day=30,
        rel_like=0.2,
        rel_share=0.1,
        rel_combo=0.1,
        like_hashtag_interaction=0.05,
        share_hashtag_interaction=0.03,
        platform_cat=1,
        region_cat=3,
        language_cat=0,
        category_cat=3,
        traffic_source_cat=1,
        device_brand_cat=0,
        creator_tier_cat=0,
        richness_traffic_interaction=1.0,
        weekend_hashtag_boost=0,
    ),
]
# Rate every hand-written example with the fitted random forest and print one
# line per video, numbered from 1.
# NOTE(review): in the recorded output the first example (annotated "low-ish")
# is rated "trending" — worth confirming that rate_video passes the features to
# the model in the same column order used for training.
for position, video in enumerate(test_videos, start=1):
    pred, score = rate_video(video, rf)
    print(f"Video {position}: Pred = {pred}, Score = {float(score):.2f}")







Video 1: Pred = trending, Score = 70.00
Video 2: Pred = trending, Score = 74.00
Video 3: Pred = likely, Score = 42.50
