# In [None]:
# LARM Regression - Early Features Only
# --------------------------------------
# Goal: Predict final video views using ONLY features available in the first 24h.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from sklearn.preprocessing import StandardScaler

# ## Load dataset
file_path = "../data/video_timeseries_data.csv"
df = pd.read_csv(file_path)

# ## Drop known leakage columns (post-24h and duplicates of target)
# Every metric snapshot taken after the 24h mark is removed. The list is
# generated horizon-major: views, impressions, likes, comments for 48h,
# then the same four metrics for 72h, and so on up to 672h.
columns_to_drop = [
    f'{metric}_{horizon}'
    for horizon in ('48h', '72h', '96h', '120h', '144h', '168h', '672h')
    for metric in ('views', 'impressions', 'likes', 'comments')
]
# NOTE: with pandas' default errors='raise', drop() fails with KeyError if
# any of the listed columns is missing from the CSV.
df = df.drop(columns=columns_to_drop)

# ## Select early features (up to 24h) + metadata
# Column order: video_length first, then views / impressions / likes /
# comments for each snapshot from 1h through 24h.
snapshot_hours = ['1h', '3h', '6h', '12h', '24h']
snapshot_metrics = ['views', 'impressions', 'likes', 'comments']
early_features = ['video_length'] + [
    f'{metric}_{hour}'
    for hour in snapshot_hours
    for metric in snapshot_metrics
]

# Target variable (same as views_672h in original):
y = df['views_final']
# Work on a copy so the engineered columns below are not written back to df.
X = df[early_features].copy()

# ## Optional: Feature Engineering (engagement rate)
# Interactions (likes + comments) per view at each snapshot. The +1 keeps the
# denominator non-zero for snapshots that recorded zero views.
for h in snapshot_hours:
    interactions = df[f'likes_{h}'] + df[f'comments_{h}']
    X[f'engagement_rate_{h}'] = interactions / (df[f'views_{h}'] + 1)

# ## Train/test split
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ## Scale features
# The scaler's statistics are learned from the training rows only and then
# applied unchanged to the test rows.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ## Train model
lr = LinearRegression().fit(X_train_scaled, y_train)
y_pred = lr.predict(X_test_scaled)

# Score the held-out predictions.
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)

print(f"MAE: {mae:.2f}", f"MAPE: {mape:.4f}", sep="\n")

# Output:
# MAE: 460910.24
# MAPE: 18.8887  (scikit-learn reports MAPE as a fraction, so this is roughly 1889%)
