In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier  # or GradientBoostingClassifier if you prefer sklearn-only
import shap, seaborn as sns, matplotlib.pyplot as plt
from nltk.sentiment import SentimentIntensityAnalyzer

# Load the "_ML" variant of the trends CSV; this is the base frame that the
# cells below enrich, label and split.
df = pd.read_csv("../data/youtube_shorts_tiktok_trends_2025.csv_ML.csv")


In [None]:
# Sentiment from raw titles. The raw CSV is assumed to align row-for-row with
# the _ML file; check the row counts so a mismatch fails loudly instead of
# silently attaching sentiment scores to the wrong videos.
sia = SentimentIntensityAnalyzer()
df_raw = pd.read_csv("../data/youtube_shorts_tiktok_trends_2025.csv", usecols=["title"])
assert len(df_raw) == len(df), (
    f"raw file has {len(df_raw)} rows but the _ML frame has {len(df)}; "
    "title sentiment would be misaligned"
)

# Missing titles are scored as empty strings; astype(str) guards against
# titles that the CSV parser read as numbers.
df["title_sentiment"] = (
    df_raw["title"]
    .fillna("")
    .astype(str)
    .map(lambda title: sia.polarity_scores(title)["compound"])
)


In [None]:
import sys
sys.path.append("../src")

from preprocess import map_labels, clean_features, get_feature_matrix
from model_utils import rate_video


In [None]:
# Country-level summary; its 'country' column is renamed so that it shares the
# 'region' join key used by the main frame.
df_country = (
    pd.read_csv("../data/country_platform_summary_2025.csv")
      .rename(columns={'country': 'region'})
)

# NOTE(review): this df_enriched is replaced by the merge with
# region_platform_strength in a later cell before anything reads it, so the
# country summary columns never reach the feature matrix -- confirm intent.
country_cols = ['region', 'platform', 'median_er', 'avg_velocity', 'avg_engagement_per_1k']
df_enriched = pd.merge(
    df,
    df_country[country_cols],
    how='left',
    on=['region', 'platform'],
)

# Mean engagement metrics per (region, platform), computed from the ML data
# itself. Named aggregation produces the final column names directly.
region_platform_strength = (
    df.groupby(['region', 'platform'])
      .agg(
          region_platform_avg_views_per_day=('views_per_day', 'mean'),
          region_platform_avg_like_rate=('like_rate', 'mean'),
          region_platform_avg_share_rate=('share_rate', 'mean'),
          region_platform_avg_rel_like=('rel_like', 'mean'),
          region_platform_avg_rel_share=('rel_share', 'mean'),
      )
      .reset_index()
)

region_platform_strength.head()




In [None]:
# Run the project's label-mapping helper, then show the class balance of the
# 'trend_bucket' column (presumably added by map_labels -- verify in preprocess.py).
# NOTE(review): this rebinds `df`, so re-running the cell feeds already-mapped
# data back into map_labels; confirm the helper is idempotent.
df = map_labels(df)
df['trend_bucket'].value_counts()




In [None]:
# Attach the per-(region, platform) averages to every row; the left join keeps
# all rows of df.
df_enriched = pd.merge(
    df,
    region_platform_strength,
    how='left',
    on=['region', 'platform'],
)

# Preview two of the joined columns next to their join keys.
preview_cols = [
    'region',
    'platform',
    'region_platform_avg_views_per_day',
    'region_platform_avg_like_rate',
]
df_enriched[preview_cols].head()

In [None]:
from sklearn.preprocessing import LabelEncoder
import re, collections
import re, json,os

def extract_hashtags(text):
    """Return every ``#tag`` token found in ``text``, lower-cased.

    Anything that is not a string (e.g. NaN or None) yields an empty list.
    """
    if not isinstance(text, str):
        return []
    lowered = text.lower()
    return re.findall(r"#\w+", lowered)

# Load the top tags saved earlier; fall back to an empty set when the file is
# absent. The context manager closes the file handle deterministically.
top_tags_path = "../models/top_trending_hashtags.json"
if os.path.exists(top_tags_path):
    with open(top_tags_path, encoding="utf-8") as tags_file:
        top_tags = set(json.load(tags_file))
else:
    top_tags = set()

try:
    # Use the raw file (it has hashtags); assumes same row order as df_enriched/df
    df_raw_tags = pd.read_csv("../data/youtube_shorts_tiktok_trends_2025.csv", usecols=["hashtag"])
    # Assignment below aligns on the index, so a row-count mismatch would
    # silently attach hashtag counts to the wrong videos (or leave NaNs).
    if len(df_raw_tags) != len(df_enriched):
        raise ValueError(
            f"raw file has {len(df_raw_tags)} rows but df_enriched has {len(df_enriched)}"
        )
    hashtag_lists = df_raw_tags["hashtag"].fillna("").apply(extract_hashtags)
    hits = hashtag_lists.apply(lambda hs: sum(h in top_tags for h in hs))
    counts = hashtag_lists.apply(len)
    df_enriched["trending_hashtag_hits"] = hits
    # The small epsilon avoids division by zero for rows without hashtags.
    df_enriched["trending_hashtag_ratio"] = hits / (counts + 1e-6)
except Exception as e:
    # Deliberately best-effort: any failure (missing file/column, misaligned
    # rows) degrades to all-zero hashtag features rather than stopping the run.
    print("Hashtag features unavailable; defaulting to zeros:", e)
    df_enriched["trending_hashtag_hits"] = 0
    df_enriched["trending_hashtag_ratio"] = 0




# Model inputs. The order matters: it becomes the column order of X and is
# saved alongside the model.
features = [
    # title / text
    'title_len',
    'text_richness',
    # engagement rates and velocity
    'like_rate',
    'comment_rate',
    'share_rate',
    'views_per_day',
    'likes_per_day',
    # relative engagement
    'rel_like',
    'rel_share',
    'rel_combo',
    'like_hashtag_interaction',
    'share_hashtag_interaction',
    # encoded categoricals
    'platform_cat',
    'region_cat',
    'language_cat',
    'category_cat',
    'traffic_source_cat',
    'device_brand_cat',
    'creator_tier_cat',
    # interaction terms
    'richness_traffic_interaction',
    'weekend_hashtag_boost',
    # region+platform averages
    'region_platform_avg_views_per_day',
    'region_platform_avg_like_rate',
    'region_platform_avg_share_rate',
    'region_platform_avg_rel_like',
    'region_platform_avg_rel_share',
    # sentiment and trending-hashtag signals
    "title_sentiment",
    'trending_hashtag_hits',
    'trending_hashtag_ratio',
]
# Drop accidental duplicates while keeping first-seen order.
features = list(dict.fromkeys(features))

df_clean = clean_features(df_enriched, features)
X, y = get_feature_matrix(df_clean, features)

# Encode the labels as integers; `le` is kept to decode predictions later.
le = LabelEncoder().fit(y)
y_enc = le.transform(y)

# Stratified 80/20 holdout split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_enc,
    stratify=y_enc,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Up to three random rows per trend bucket. The groups are iterated and
# concatenated instead of using groupby.apply: applying a function to the
# grouping columns is deprecated in pandas, and once they are excluded the
# 'trend_bucket' column read below would no longer be in the result.
SAMPLES_PER_BUCKET = 3
balanced_samples = pd.concat([
    bucket_rows.sample(min(SAMPLES_PER_BUCKET, len(bucket_rows)), random_state=42)
    for _, bucket_rows in df_clean.groupby('trend_bucket')
])

balanced_samples[['trend_bucket']].head(10)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

# Gradient-boosted trees producing per-class probabilities.
xgb_params = dict(
    objective="multi:softprob",
    eval_metric="mlogloss",
    n_estimators=400,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=42,
)
xgb = XGBClassifier(**xgb_params)

# SMOTE oversampling followed by the classifier.
pipeline_steps = [
    ("smote", SMOTE(random_state=42)),
    ("model", xgb),
]
pipeline = Pipeline(pipeline_steps)

# 5-fold stratified CV over the full feature matrix, scored by macro F1.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(pipeline, X, y_enc, scoring="f1_macro", cv=cv)
print("CV macro F1 (mean ± std):", cv_scores.mean(), cv_scores.std())

# Final fit on the training split only.
pipeline.fit(X_train, y_train)

# Stash the label encoder and class names on the pipeline object so they
# travel with it.
pipeline.label_encoder_ = le
pipeline.class_labels_ = le.classes_

# Holdout predictions, decoded from integer ids back to label strings.
preds_int = pipeline.predict(X_test)
preds = le.inverse_transform(preds_int)
y_test_labels = le.inverse_transform(y_test)

print("Holdout Accuracy:", accuracy_score(y_test_labels, preds))
print("Holdout Macro F1:", f1_score(y_test_labels, preds, average="macro"))
print(classification_report(y_test_labels, preds, target_names=le.classes_))

# Alias used by the explanation, rating and saving cells.
model = pipeline






In [None]:
# Background and explanation samples. The sizes are capped at the number of
# available rows because DataFrame.sample raises when asked for more rows than
# exist (without replacement).
X_bg = X_train.sample(min(200, len(X_train)), random_state=42)
X_explain = X_test.sample(min(100, len(X_test)), random_state=42)

# Explain the pipeline's class probabilities through a plain callable.
explainer = shap.Explainer(lambda data: model.predict_proba(data), X_bg, feature_names=features)
shap_values = explainer(X_explain)

# Plot the "trending" class when it exists; otherwise fall back to class 0.
classes = list(getattr(model, "class_labels_", model.named_steps["model"].classes_))
cls_idx = classes.index("trending") if "trending" in classes else 0
shap.plots.beeswarm(shap_values[:, :, cls_idx], max_display=10)






In [None]:

# Baseline: random forest fitted on the same training split (no resampling).
rf = RandomForestClassifier(random_state=42, n_estimators=400).fit(X_train, y_train)

# Holdout predictions are integer class ids, matching y_test.
pred_rf = rf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, pred_rf))
print("Macro F1:", f1_score(y_test, pred_rf, average='macro'))
print(confusion_matrix(y_test, pred_rf))
print(classification_report(y_test, pred_rf))


In [None]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

# CV for RandomForest. cross_val_score clones the estimator, so the fitted rf
# is not modified. The encoded labels (y_enc) are used so this matches the
# XGB+SMOTE CV and the holdout split, which are both in encoded space.
cv_rf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
cv_scores_rf = cross_val_score(rf, X, y_enc, cv=cv_rf, scoring="f1_macro", n_jobs=1)  # <- force single process
print("RF CV macro F1 (mean ± std):", cv_scores_rf.mean(), cv_scores_rf.std())

# Integer class ids in encoder order; passed with target_names so the report
# shows readable class names, like the XGB+SMOTE report below.
class_ids = list(range(len(le.classes_)))

# Holdout metrics for RandomForest
pred_rf_holdout = rf.predict(X_test)
print("\nRandom Forest Holdout Performance:")
print("Accuracy:", accuracy_score(y_test, pred_rf_holdout))
print("Macro F1:", f1_score(y_test, pred_rf_holdout, average='macro'))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, pred_rf_holdout))
print("\nClassification Report:")
print(classification_report(y_test, pred_rf_holdout, labels=class_ids, target_names=le.classes_))

# CV for XGB+SMOTE: reuse the scores computed when the pipeline was built.
# To recompute instead:
# cv_scores_xgb = cross_val_score(pipeline, X, y_enc, cv=cv, scoring="f1_macro", n_jobs=-1)
cv_scores_xgb = cv_scores
print("\nXGB+SMOTE CV macro F1 (mean ± std):", cv_scores_xgb.mean(), cv_scores_xgb.std())

# Holdout metrics for XGB+SMOTE (computed in the XGB cell above; repeated here
# for side-by-side comparison).
print("\nXGB+SMOTE Holdout Performance:")
print("Accuracy:", accuracy_score(y_test_labels, preds))
print("Macro F1:", f1_score(y_test_labels, preds, average="macro"))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test_labels, preds))
print("\nClassification Report:")
print(classification_report(y_test_labels, preds, target_names=le.classes_))


In [None]:
from sklearn.linear_model import LogisticRegression
# Multinomial logistic regression baseline. The explicit `multi_class`
# argument is deprecated in recent scikit-learn releases and is omitted here:
# with the default lbfgs solver a multi-class target is already fitted as
# multinomial.
# NOTE(review): if the target ever becomes binary, the default falls back to a
# plain binary logistic model -- confirm that is acceptable.
log_reg = LogisticRegression(max_iter=3000)
log_reg.fit(X_train, y_train)

# CV for logistic regression, on the encoded labels so it matches the holdout
# split and the other models' CV runs.
cv_log = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
cv_scores_log = cross_val_score(log_reg, X, y_enc, cv=cv_log, scoring="f1_macro", n_jobs=1)
print("LogReg CV macro F1 (mean ± std):", cv_scores_log.mean(), cv_scores_log.std())

# Integer class ids in encoder order; passed with target_names so the report
# shows class names instead of bare integers.
class_ids = list(range(len(le.classes_)))

# Holdout metrics for log_reg
pred_log = log_reg.predict(X_test)
print("\nLogistic Regression Holdout Performance:")
print("Accuracy:", accuracy_score(y_test, pred_log))
print("Macro F1:", f1_score(y_test, pred_log, average="macro"))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, pred_log))
print("\nClassification Report:")
print(classification_report(y_test, pred_log, labels=class_ids, target_names=le.classes_))


In [None]:
from sklearn.neural_network import MLPClassifier
# Imported under an alias on purpose: a bare `Pipeline` import here would
# rebind the imblearn Pipeline imported at the top of the notebook, and
# re-running the XGB+SMOTE cell afterwards would then build an sklearn
# pipeline that does not accept a sampler step.
from sklearn.pipeline import Pipeline as SkPipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, cross_val_score

# Build a small MLP with scaling
mlp = SkPipeline([
    ("scaler", StandardScaler()),
    ("clf", MLPClassifier(
        hidden_layer_sizes=(64, 32),
        activation="relu",
        alpha=1e-4,
        max_iter=500,
        random_state=42
    )),
])

# Use encoded labels if you're encoding (y_enc); otherwise y
y_for_fit = y_enc if "y_enc" in globals() else y
X_for_fit = X

# CV (keep n_jobs=1 if you've had joblib issues)
cv_mlp = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
cv_scores_mlp = cross_val_score(mlp, X_for_fit, y_for_fit, cv=cv_mlp, scoring="f1_macro", n_jobs=1)
print("MLP CV macro F1 (mean ± std):", cv_scores_mlp.mean(), cv_scores_mlp.std())

# Fit and holdout metrics. The original conditional chose y_train in both
# branches, so it is passed directly.
mlp.fit(X_train, y_train)
pred_mlp = mlp.predict(X_test)
print("\nMLP (Neural Network) Holdout Performance:")
print("Accuracy:", accuracy_score(y_test, pred_mlp))
print("Macro F1:", f1_score(y_test, pred_mlp, average="macro"))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, pred_mlp))
print("\nClassification Report:")
print(classification_report(y_test, pred_mlp))

In [None]:
# Five extra random rows, shown in addition to the per-bucket samples.
sample_rows = df_clean.sample(5, random_state=42)  # or remove if not needed

# One feature dict per row: balanced samples first, then the random rows.
test_videos = []
for source_frame in (balanced_samples, sample_rows):
    for _, row in source_frame.iterrows():
        test_videos.append(row[features].to_dict())

# Show each test video as a one-row table.
for i, video in enumerate(test_videos):
    print(f"\n--- Test Video {i+1} ---")
    display(pd.DataFrame(video, index=[0]))


In [None]:


# Reverse lookups from the encoded category value back to a readable name.
platform_map = df[['platform', 'platform_cat']].drop_duplicates().set_index('platform_cat')['platform'].to_dict()

region_map = df[['region', 'region_cat']].drop_duplicates().set_index('region_cat')['region'].to_dict()

for i, video in enumerate(test_videos):
    pred, score = rate_video(video, model)
    score_text = f"{score:.2f}/100" if score is not None else "n/a"
    print(f"Video {i+1}:")
    print(f"  Predicted Category: {pred}")
    print(f"  Trending Score: {score_text}")

    # .get avoids a KeyError when a code has no entry in the lookup tables.
    print(f"Platform: {platform_map.get(video['platform_cat'], 'unknown')}")
    print(f"Region: {region_map.get(video['region_cat'], 'unknown')}")

    # The score may be None (handled for score_text above); comparing None
    # with a number would raise TypeError, so it gets its own branch.
    if score is None:
        print("  Interpretation: No score available for this video.")
    elif score > 60:
        print("  Interpretation: This video is highly likely to trend.")
    elif score > 20:
        print("  Interpretation: Decent performance but not strong enough to trend.")
    else:
        print("  Interpretation: Weak performance relative to region/platform norms.")

    print()






In [None]:
import json, os
from sklearn.metrics import accuracy_score, f1_score

def eval_model(clf, X_test, y_test, le=None):
    """Score a fitted classifier on the holdout set.

    Args:
        clf: fitted estimator exposing ``predict``.
        X_test: holdout features passed to ``clf.predict``.
        y_test: holdout labels, in the same space as the predictions.
        le: optional label encoder; when given, both the predictions and
            ``y_test`` are decoded with ``inverse_transform`` before scoring.

    Returns:
        dict with ``"acc"`` (accuracy) and ``"f1_macro"`` (macro-averaged F1).
    """
    predicted = clf.predict(X_test)
    if le is None:
        actual = y_test
    else:
        predicted = le.inverse_transform(predicted)
        actual = le.inverse_transform(y_test)

    scores = {}
    scores["acc"] = accuracy_score(actual, predicted)
    scores["f1_macro"] = f1_score(actual, predicted, average="macro")
    return scores

# Collect CV and holdout metrics for every model that exists in this kernel.
results = {}

# Decoder handed to eval_model for the baseline models (None when absent).
baseline_encoder = le if "le" in globals() else None

if "log_reg" in globals():
    results["log_reg"] = dict(
        cv_f1_mean=cv_scores_log.mean(),
        cv_f1_std=cv_scores_log.std(),
        **eval_model(log_reg, X_test, y_test, baseline_encoder),
    )

if "rf" in globals():
    results["random_forest"] = dict(
        cv_f1_mean=cv_scores_rf.mean(),
        cv_f1_std=cv_scores_rf.std(),
        **eval_model(rf, X_test, y_test, baseline_encoder),
    )

if "mlp" in globals():
    results["mlp"] = dict(
        cv_f1_mean=cv_scores_mlp.mean(),
        cv_f1_std=cv_scores_mlp.std(),
        **eval_model(mlp, X_test, y_test, baseline_encoder),
    )

if "pipeline" in globals():  # XGB+SMOTE
    results["xgb_smote"] = dict(
        cv_f1_mean=cv_scores.mean(),   # XGB CV scores
        cv_f1_std=cv_scores.std(),
        **eval_model(pipeline, X_test, y_test, le),
    )
    # Record which model is the one saved for the app.
    results["selected"] = "xgb_smote"

# Persist the metrics as JSON next to the model artifacts.
metrics_path = "../models/metrics.json"
os.makedirs(os.path.dirname(metrics_path), exist_ok=True)
with open(metrics_path, "w") as metrics_file:
    json.dump(results, metrics_file, indent=2)

print("Saved metrics to", metrics_path)


In [None]:
# Save feature list and model for app explanations
import joblib, os
feature_list_path = "../models/feature_list.pkl"
model_path = "../models/trend_model.pkl"  # reuse existing name
os.makedirs("../models", exist_ok=True)

# Dump each artifact to its target, in the same order as before: feature list,
# fitted pipeline, label encoder.
artifacts = (
    (features, feature_list_path),
    (pipeline, model_path),
    (le, "../models/label_encoder.pkl"),
)
for artifact, target_path in artifacts:
    joblib.dump(artifact, target_path)

print("Saved feature list to", feature_list_path)
print("Re-saved model to", model_path)
print("Feature count:", len(features))