In [None]:
# %% [markdown]
# **Import libraries and dataset**

# %%
import pandas as pd

# Read the combined comment-level dataset (the CSV that carries the spam column)
df = pd.read_csv("combined_dataset_with_spam.csv")

# Sanity report: confirmation, dimensions, column names, and the first 3 rows
loaded_columns = df.columns.tolist()
loaded_preview = df.head(3)

print("✅ Dataset loaded")
print("📊 Shape:", df.shape)
print("📝 Features:", loaded_columns)
print(loaded_preview)


# %% [markdown]
# **Convert Dates**

# %%
# Parse the publish timestamps; values that cannot be parsed become NaT
# (errors="coerce") instead of raising.
df["publishedAt"] = pd.to_datetime(df["publishedAt"], errors="coerce")

# Bucket every row into an ISO-8601 year-week key ("%G-%V").
# The previous "%Y-%U" key numbers the days before the first Sunday of a year
# as week "00", so the week spanning New Year was split into two partial
# buckets ("YYYY-52"/"YYYY-53" and "YYYY+1-00"). That distorts the weekly
# totals and the next-week target built from them. ISO weeks always contain
# seven days (Monday-Sunday), and "%G" is the ISO year that matches "%V", so
# the key still sorts chronologically as a plain string.
df["year_week"] = df["publishedAt"].dt.strftime("%G-%V")

print(df[["publishedAt", "year_week"]].head())


# %%
# Inspect the video identifier column: dtype (expected int64 or object),
# number of distinct values, and the first ten entries.
video_id_column = df["videoId"]
print(video_id_column.dtype)
print(video_id_column.nunique())
print(video_id_column.head(10))

# %% [markdown]
# **Aggregate Engagement**

# %% [markdown]
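# Engagement per comment = likeCount + 1 (the comment's likes plus one for the comment itself); weekly engagement is the sum over all of a video's comments in that week.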

# %%
# Per-row engagement: the row's like count (missing treated as 0) plus one
# for the comment itself.
df["engagement"] = df["likeCount"].fillna(0).add(1)

# Named aggregations computed for every (videoId, year_week) bucket.
weekly_aggregations = {
    "total_comments": ("commentId", "count"),      # non-null comment ids
    "total_likes": ("likeCount", "sum"),
    "total_engagement": ("engagement", "sum"),
    # mean of the spam column; a ratio if `spam` is a 0/1 flag (TODO confirm)
    "spam_ratio": ("spam", "mean"),
}

weekly_groups = df.groupby(["videoId", "year_week"], as_index=False)
weekly = weekly_groups.agg(**weekly_aggregations)

print("✅ Weekly engagement dataset created")
print("📊 Shape:", weekly.shape)
print("\n🔎 Preview:")
print(weekly.head(10))
print("\nUnique videoIds in weekly:", weekly["videoId"].nunique())


# %% [markdown]
# **Create Features and Target (Shifted Next-Week SoE)**

# %%
# Order rows by video and then by week key so that a within-video shift
# walks forward along the sorted weeks.
weekly = weekly.sort_values(by=["videoId", "year_week"])

# Target (SoE): total engagement of the following row of the same video.
# Note that this is the next week that has a row for the video, which is not
# necessarily the consecutive calendar week.
engagement_by_video = weekly.groupby("videoId")["total_engagement"]
weekly["next_week_engagement"] = engagement_by_video.shift(-1)

# The final row of each video has no successor, so its target is NaN; drop it.
target_column = "next_week_engagement"
weekly = weekly.dropna(subset=[target_column])

print("✅ Target created")
print("📊 Shape:", weekly.shape)
print(weekly.head(10))


# %% [markdown]
# **Feature Engineering**

# %%
# Name of the column to predict and the weekly aggregates used as predictors
target = "next_week_engagement"
features = [
    "total_comments",
    "total_likes",
    "total_engagement",
    "spam_ratio",
]

# Design matrix and response taken from the weekly frame
X, y = weekly[features], weekly[target]

print("✅ Features and target prepared")
print("📝 Features:", features)
print("🎯 Target:", target)


# %% [markdown]
# **Train/Test Split**

# %%
from sklearn.model_selection import train_test_split

# Random 80/20 split of the video-week rows, seeded for reproducibility
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print("✅ Data split complete")
print("📊 Train shape:", X_train.shape)
print("📊 Test shape:", X_test.shape)

# %% [markdown]
# **Exploratory Plots**

# %%
import matplotlib.pyplot as plt

# 1️⃣ / 2️⃣ Histograms of weekly engagement and weekly spam ratio.
# Each entry: (column, extra hist kwargs, x-axis label, title)
histogram_specs = [
    (
        "total_engagement",
        {},
        "Total Engagement (per video-week)",
        "Distribution of Weekly Engagement",
    ),
    (
        "spam_ratio",
        {"color": "orange"},
        "Spam Ratio",
        "Distribution of Spam Ratios per Week",
    ),
]

for hist_column, hist_kwargs, hist_xlabel, hist_title in histogram_specs:
    fig, ax = plt.subplots(figsize=(8, 5))
    weekly[hist_column].hist(bins=50, ax=ax, **hist_kwargs)
    ax.set_xlabel(hist_xlabel)
    ax.set_ylabel("Frequency")
    ax.set_title(hist_title)
    plt.show()

# 3️⃣ Engagement over time for one example video (the first videoId in `weekly`)
example_video = weekly["videoId"].iloc[0]
is_example_video = weekly["videoId"] == example_video
video_trend = weekly[is_example_video]

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(video_trend["year_week"], video_trend["total_engagement"], marker="o")
plt.xticks(rotation=45)
ax.set_xlabel("Year-Week")
ax.set_ylabel("Engagement")
ax.set_title(f"Engagement Trend for Video {example_video}")
fig.tight_layout()
plt.show()


# %% [markdown]
# **Train LightGBM Model**

# %%
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

# -----------------------------
# 1️⃣ Create target = next week's engagement
# -----------------------------
weekly = weekly.sort_values(["videoId", "year_week"])

if "next_week_engagement" in weekly.columns:
    # The earlier target cell already shifted the engagement and removed the
    # last week of every video. Shifting and dropping a second time would
    # yield the same target values for the surviving rows but silently throw
    # away one more week per video (and yet another on every re-run of this
    # cell), so the existing target is reused under the name used below.
    weekly["next_engagement"] = weekly["next_week_engagement"]
else:
    # Fallback when the earlier target cell has not been run: build the
    # target here and drop each video's last week, which has no successor.
    weekly["next_engagement"] = weekly.groupby("videoId")["total_engagement"].shift(-1)
    weekly = weekly.dropna(subset=["next_engagement"])

print("✅ Target column created")
print("📊 Shape after shift:", weekly.shape)

# -----------------------------
# 2️⃣ Select features & target
# -----------------------------
features = ["total_comments", "total_likes", "total_engagement", "spam_ratio"]
target = "next_engagement"

X = weekly[features]
y = weekly[target]

# -----------------------------
# 3️⃣ Train-test split
# -----------------------------
# Random 80/20 split of video-week rows, seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# -----------------------------
# 4️⃣ Convert to LightGBM Dataset
# -----------------------------
# Early stopping needs a monitoring set. Using the test set for that would
# let it choose the number of boosting rounds, making the RMSE/MAE later
# reported on the same rows optimistic. A validation split is therefore
# carved out of the training rows, and X_test / y_test stay untouched.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

lgb_train = lgb.Dataset(X_fit, label=y_fit)
lgb_val   = lgb.Dataset(X_val, label=y_val)

# -----------------------------
# 5️⃣ Parameters
# -----------------------------
params = {
    "objective": "regression",
    "metric": "rmse",
    "learning_rate": 0.05,
    "num_leaves": 31,
    "seed": 42,
    "verbosity": -1
}

# -----------------------------
# 6️⃣ Train model
# -----------------------------
# Up to 1000 rounds; stop once the validation RMSE has not improved for 50
# consecutive rounds.
model = lgb.train(
    params,
    lgb_train,
    valid_sets=[lgb_train, lgb_val],
    num_boost_round=1000,
    callbacks=[lgb.early_stopping(stopping_rounds=50)]
)

print("✅ LightGBM training complete")

# -----------------------------
# 7️⃣ Evaluate model
# -----------------------------
# Predict on the held-out rows with the iteration chosen by early stopping
best_round = model.best_iteration
y_pred = model.predict(X_test, num_iteration=best_round)

# Root-mean-squared error and mean absolute error against the true targets
test_mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(test_mse)
mae = mean_absolute_error(y_test, y_pred)

print(f"📉 RMSE: {rmse:.2f}")
print(f"📉 MAE: {mae:.2f}")


# %% [markdown]
# **SHAP Explainability**

# %%
import shap
import matplotlib.pyplot as plt

# -----------------------------
# 1️⃣ Initialize SHAP explainer
# -----------------------------
# Tree-specific explainer built from the trained booster; SHAP values are
# computed for the test rows.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

print("✅ SHAP values computed")

# -----------------------------
# 2️⃣ Feature importance summary plot
# 3️⃣ Feature importance bar chart
# -----------------------------
# Same call twice: first with the default summary layout, then as a bar chart.
for summary_kwargs in ({}, {"plot_type": "bar"}):
    plt.figure()
    shap.summary_plot(shap_values, X_test, feature_names=features, **summary_kwargs)


# %% [markdown]
# **Save Model & SHAP Results**

# %%
import joblib
import shap
import pandas as pd

# -----------------------------
# 1️⃣ Save trained model
# -----------------------------
model_file = "lgbm_soe_model.pkl"
joblib.dump(model, model_file)
print("✅ Model saved as lgbm_soe_model.pkl")

# -----------------------------
# 2️⃣ Save SHAP values (optional, can be heavy)
# -----------------------------
# One column per feature, one row per explained test sample
shap_values_df = pd.DataFrame(shap_values, columns=features)
shap_values_df.to_csv("shap_values.csv", index=False)
print("✅ SHAP values saved as shap_values.csv")

# -----------------------------
# 3️⃣ Save feature importance
# -----------------------------
# Importance as reported by the booster's default feature_importance(),
# listed from most to least important.
importance_table = pd.DataFrame(
    {"feature": features, "importance": model.feature_importance()}
)
importance_df = importance_table.sort_values(by="importance", ascending=False)

importance_df.to_csv("feature_importance.csv", index=False)
print("✅ Feature importance saved as feature_importance.csv")

# -----------------------------
# 4️⃣ Reload model later
# -----------------------------
# NOTE: joblib files are pickle-based; only load files you trust.
loaded_model = joblib.load(model_file)
print("✅ Model reloaded, ready for new predictions")


# %% [markdown]
# **Prediction Pipeline**

# %%
import pandas as pd
import joblib

# -----------------------------
# 1️⃣ Load model & feature list
# -----------------------------
# NOTE: joblib files are pickle-based; only load files you trust.
features = ["total_comments", "total_likes", "total_engagement", "spam_ratio"]
model = joblib.load("lgbm_soe_model.pkl")

print("✅ Model loaded, ready for predictions")

# -----------------------------
# 2️⃣ Example: New incoming video engagement data
# -----------------------------
# Two hand-written example video-weeks, given column by column.
# In both, total_engagement equals likes + comments.
new_data = pd.DataFrame(
    {
        "total_comments": [120, 300],
        "total_likes": [450, 1200],
        "total_engagement": [570, 1500],
        "spam_ratio": [0.25, 0.10],
    }
)

print("🔎 New data to predict:")
print(new_data)

# -----------------------------
# 3️⃣ Predict next-week SoE
# -----------------------------
# Feed the columns in the training order and attach the result as a new column
model_input = new_data[features]
predictions = model.predict(model_input)

new_data["predicted_next_week_SoE"] = predictions

print("\n✅ Predictions complete")
print(new_data)


# %%
# Show which columns the weekly frame carries at this point in the notebook
print(list(weekly.columns))


# %%
from sklearn.ensemble import IsolationForest

# SoE here = weekly engagement scaled by the largest weekly engagement observed
peak_engagement = weekly['total_engagement'].max()
weekly['SoE'] = weekly['total_engagement'] / peak_engagement

# Model inputs: scaled engagement and spam ratio, with missing values set to 0
auth_features = weekly[['SoE', 'spam_ratio']].fillna(0)

# Isolation Forest with 200 trees, configured to flag 5% of rows as outliers
iso = IsolationForest(n_estimators=200, contamination=0.05, random_state=42)
iso.fit(auth_features)

# predict() yields 1 / -1; the map below turns that into a readable label
flag_to_label = {1: 'Authentic', -1: 'Suspicious'}
weekly['authenticity_flag'] = iso.predict(auth_features)
weekly['authenticity_score'] = weekly['authenticity_flag'].map(flag_to_label)

# Preview results
preview_columns = ['videoId', 'SoE', 'spam_ratio', 'authenticity_score']
print(weekly[preview_columns].head())


# %%
# Rows of `weekly` that the Isolation Forest labelled as suspicious
suspicious_videos = weekly[weekly['authenticity_score'] == 'Suspicious']

# Each row of `weekly` is one (videoId, year_week) bucket, so len() counts
# flagged video-weeks and the same video may appear several times. Report the
# number of distinct videos under the "videos" label and the row count
# separately.
n_suspicious_videos = suspicious_videos['videoId'].nunique()
print(f"⚠️ Number of suspicious videos: {n_suspicious_videos}")
print(f"⚠️ Number of suspicious video-weeks: {len(suspicious_videos)}")
suspicious_videos[['videoId', 'SoE', 'spam_ratio', 'authenticity_score']].head()


# %%
import matplotlib.pyplot as plt
import seaborn as sns

# Scatter of scaled engagement against spam ratio, coloured by the
# Isolation Forest label (green = Authentic, red = Suspicious).
label_colours = {'Authentic':'green', 'Suspicious':'red'}

fig, ax = plt.subplots(figsize=(10,6))
sns.scatterplot(
    data=weekly,
    x='SoE',
    y='spam_ratio',
    hue='authenticity_score',
    palette=label_colours,
    alpha=0.7,
    ax=ax,
)
ax.set_title('Fake Engagement Insurance: SoE vs Spam Ratio')
ax.set_xlabel('Share of Engagement (SoE)')
ax.set_ylabel('Spam Ratio')
ax.legend(title='Authenticity')
plt.show()


# %%
# Binary label derived from the forest's flag: 1 = suspicious, 0 = authentic
weekly['suspicion_score'] = 1 - (weekly['authenticity_flag'] + 1)/2

# The binary label ties every flagged row, so sorting on it alone returns an
# arbitrary ten of them. Rank by the forest's continuous decision score
# instead: decision_function() is lower for more abnormal samples (negative
# for outliers), so it is negated to make "higher = more suspicious".
# The inputs are rebuilt from `weekly` so the scores align with its rows.
weekly['anomaly_score'] = -iso.decision_function(
    weekly[['SoE', 'spam_ratio']].fillna(0)
)

top_suspicious = weekly.sort_values(by='anomaly_score', ascending=False).head(10)
print(top_suspicious[['videoId', 'SoE', 'spam_ratio', 'authenticity_score', 'anomaly_score']])


# %%
import joblib
# Persist the fitted Isolation Forest so it can be reloaded without refitting
iso_model_file = "iso_model.pkl"
joblib.dump(iso, iso_model_file)
print("✅ Isolation Forest model saved as iso_model.pkl")


