# Social Media Dataset — EDA & Modeling (Senior DS Workflow)

This notebook analyzes the synthetic `social_media_sample.csv` generated earlier.
It follows a production-style pipeline:
- **EDA**: data quality, distributions, correlations, seasonal trends
- **Feature engineering**: time, text, ratios, interactions
- **Preprocessing**: `ColumnTransformer` with scaling + one-hot
- **Modeling**: compare 6+ models with cross-validation (MAE, RMSE, R²); auto-densify for tree models
- **Evaluation**: test metrics (MAE, RMSE, R², MAPE), residuals, actual-vs-predicted plot
- **Interpretability**: feature importances for tree models
- **Business insights**: posting hour/platform/type patterns

> Data path expected at: `data/social_media_sample.csv` (relative to the notebook's working directory)

In [None]:
# Setup
import warnings, math, re, os, sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from datetime import datetime

from sklearn.model_selection import train_test_split, KFold, cross_validate
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Linear (sparse-friendly) + Tree models
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor

# Notebook-wide configuration.
# Suppress all Python warnings for the rest of the session.
warnings.simplefilter('ignore')

# Let wide frames render fully instead of being truncated.
pd.options.display.max_columns = 200
pd.options.display.width = 160

# Single seed reused by the train/test split, CV folds and every estimator.
RANDOM_STATE = 42


In [None]:
# Load the raw posts table from disk.
PATH = 'data/social_media_sample.csv'
df = pd.read_csv(
    PATH,
    parse_dates=['post_datetime'],  # parsed up front so the .dt accessors work later
)
print(df.shape)
df.head()

In [None]:
# Data quality overview: column dtypes, share of missing values, numeric summary.
display(df.dtypes)

# Fraction of null cells per column, worst columns first.
missing = (
    df.isna()
      .mean()
      .sort_values(ascending=False)
      .to_frame('missing_rate')
)
display(missing.head(20))

# describe() on numeric columns only, transposed so each feature is a row.
desc_num = df.select_dtypes(include=np.number).describe().transpose()
display(desc_num)


In [None]:
# EDA: posting volume over time
# Count posts per calendar month. The 'MS' (month-start) alias is used because the
# month-end alias 'M' was deprecated for resampling in pandas 2.2, while 'MS' is
# accepted by both older and newer pandas. The counts per month are the same; only
# the bin label moves from the last to the first day of the month.
by_month = df.set_index('post_datetime').resample('MS').size()

fig, ax = plt.subplots()
by_month.plot(ax=ax, title='Number of Posts per Month')
ax.set_xlabel('Month')
ax.set_ylabel('Posts')
plt.show()


In [None]:
# EDA: categorical distributions (platform, post_type, language)
# One bar chart per column, limited to its 10 most frequent values.
for name in ['platform','post_type','language']:
    fig, ax = plt.subplots()
    top_values = df[name].value_counts().head(10)
    top_values.plot(kind='bar', title=f'Distribution: {name}', ax=ax)
    ax.set_xlabel(name)
    ax.set_ylabel('Count')
    fig.tight_layout()
    plt.show()


In [None]:
# EDA: target and relationships
target = 'engagement_rate'

# Histogram of the regression target.
fig, ax = plt.subplots()
df[target].hist(bins=50, ax=ax)
ax.set_title('Distribution of engagement_rate')
ax.set_xlabel('engagement_rate')
ax.set_ylabel('Frequency')
plt.show()

# DataFrame.plot.scatter opens its own figure when no `ax` is given, so a preceding
# plt.figure() would be left behind as an empty figure. Create the axes explicitly
# and hand them to pandas so each plot lands in exactly one figure.
fig, ax = plt.subplots()
df.plot.scatter(x='followers', y='impressions', alpha=0.25, ax=ax)
ax.set_title('Followers vs Impressions')
plt.show()

fig, ax = plt.subplots()
df.plot.scatter(x='impressions', y='likes', alpha=0.25, ax=ax)
ax.set_title('Impressions vs Likes')
plt.show()


In [None]:
# Feature engineering
# Work on a copy so the raw frame `df` stays untouched for the EDA cells.
df_fe = df.copy()

# Time: calendar parts of the posting timestamp
posted_at = df_fe['post_datetime'].dt
df_fe['hour'] = posted_at.hour
df_fe['dow'] = posted_at.dayofweek
df_fe['month'] = posted_at.month
df_fe['year'] = posted_at.year

# Text: missing text is treated as an empty string so every count below is 0 for it
content = df_fe['content'].fillna('')
hashtags = df_fe['hashtags'].fillna('')
df_fe['content_len'] = content.str.len()
df_fe['word_count'] = content.str.split().str.len()
df_fe['hashtag_count'] = hashtags.map(lambda tags: tags.count('#'))
df_fe['has_link'] = content.str.contains('http').astype(int)

# Ratios & interaction rates; +1 on both sides keeps the ratios finite at zero counts
impressions_plus_1 = df_fe['impressions'] + 1
df_fe['follow_ratio'] = (df_fe['followers'] + 1) / (df_fe['following'] + 1)
df_fe['likes_per_impression'] = (df_fe['likes'] + 1) / impressions_plus_1
df_fe['comments_per_impression'] = (df_fe['comments'] + 1) / impressions_plus_1
df_fe['shares_per_impression'] = (df_fe['shares'] + 1) / impressions_plus_1

target = 'engagement_rate'
y = df_fe[target]

# NOTE(review): likes/comments/shares/impressions and the per-impression rates may be
# direct components of engagement_rate — confirm how the target was generated before
# trusting the model scores (possible target leakage).
num_cols = [
    'followers', 'following', 'impressions', 'likes', 'comments', 'shares',
    'hour', 'dow', 'month', 'year',
    'content_len', 'word_count', 'hashtag_count', 'has_link',
    'follow_ratio', 'likes_per_impression', 'comments_per_impression', 'shares_per_impression',
]
cat_cols = [
    'platform', 'post_type', 'language', 'device', 'country',
    'city', 'sentiment', 'topic', 'is_verified',
]

# Random 80/20 hold-out split, seeded for reproducibility.
X = df_fe[num_cols + cat_cols]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)
X_train.shape, X_test.shape

In [None]:
# Preprocessing
# Numeric columns: scale to unit variance without centering (with_mean=False), which
# keeps the transform usable on sparse input.
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler(with_mean=False))])

# Categorical columns: one-hot encode; categories unseen during fit become all-zero rows.
# No sparse flag is passed on purpose: sparse output is already the default, and the
# old `sparse=` keyword was renamed to `sparse_output=` in scikit-learn 1.2 and removed
# in 1.4, so relying on the default works on both older and newer versions.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor_sparse = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols)
    ]
)


def _densify(x):
    """Return `x` as a dense array.

    ColumnTransformer emits a sparse matrix only when the stacked output's density is
    below its `sparse_threshold`; otherwise it already returns a dense ndarray. Both
    cases are handled here so the densify step never fails on `.toarray()`.
    """
    return x.toarray() if hasattr(x, 'toarray') else np.asarray(x)


to_dense = FunctionTransformer(_densify, accept_sparse=True)  # for dense-only models


In [None]:
# Models
# Every spec is a (display name, unfitted estimator, needs_dense) triple; needs_dense
# tells the pipeline builder whether to insert the densify step before the estimator.

# Linear models trained with SGD consume the sparse design matrix directly.
linear_specs = [
    (
        'SGD-L2 (Ridge-like)',
        SGDRegressor(loss='squared_error', penalty='l2', alpha=1e-4,
                     max_iter=2000, random_state=RANDOM_STATE),
        False,
    ),
    (
        'SGD-ElasticNet',
        SGDRegressor(loss='squared_error', penalty='elasticnet', l1_ratio=0.3, alpha=1e-4,
                     max_iter=3000, random_state=RANDOM_STATE),
        False,
    ),
]

# Tree-based models are fed a dense matrix.
tree_specs = [
    ('RandomForest', RandomForestRegressor(n_estimators=200, random_state=RANDOM_STATE, n_jobs=-1), True),
    ('GradientBoosting', GradientBoostingRegressor(random_state=RANDOM_STATE), True),
    ('HistGradientBoosting', HistGradientBoostingRegressor(random_state=RANDOM_STATE), True),
    ('DecisionTree', DecisionTreeRegressor(random_state=RANDOM_STATE), True),
]

model_specs = linear_specs + tree_specs


In [None]:
# Cross-validation
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, cross_validate
import pandas as pd

# sklearn scorers are "higher is better", so the error metrics come back negated.
scoring = {
    'MAE': 'neg_mean_absolute_error',
    'RMSE': 'neg_root_mean_squared_error',
    'R2': 'r2',
}
cv = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# CV runs on the full training split. To speed it up, X_cv / y_cv can instead be set to
# a seeded random subsample of the training rows (e.g. up to 4000) with use_subset = True.
use_subset = False
X_cv, y_cv = X_train, y_train

rows = []
for name, est, needs_dense in model_specs:
    # Assemble the pipeline step by step; the densify step is only added on request.
    steps = [('prep', preprocessor_sparse)]
    if needs_dense:
        steps.append(('to_dense', to_dense))
    steps.append(('mdl', est))
    pipe = Pipeline(steps)

    scores = cross_validate(
        pipe, X_cv, y_cv,
        cv=cv, scoring=scoring, n_jobs=-1, return_estimator=False,
    )
    fold_mae = scores['test_MAE']
    fold_rmse = scores['test_RMSE']
    fold_r2 = scores['test_R2']

    # Flip the sign of the negated error means; a std is unaffected by the sign.
    rows.append({
        'model': name,
        'MAE_mean': -fold_mae.mean(),
        'MAE_std': fold_mae.std(),
        'RMSE_mean': -fold_rmse.mean(),
        'RMSE_std': fold_rmse.std(),
        'R2_mean': fold_r2.mean(),
        'R2_std': fold_r2.std(),
    })

# Leaderboard ordered by mean CV RMSE, best (lowest) first.
cv_df = pd.DataFrame(rows).sort_values('RMSE_mean').reset_index(drop=True)
cv_df

In [None]:
# Fit best model on full training set; evaluate on test
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

def mape(y_true, y_pred):
    """Mean absolute percentage error, in percent.

    Both inputs are converted to flat float arrays and compared position by position.
    This stops pandas from aligning two Series on their index labels (which yields NaN
    when the indexes differ) and lets plain lists be passed as well.

    Parameters
    ----------
    y_true : array-like
        Ground-truth values.
    y_pred : array-like
        Predicted values, in the same order and of the same length as `y_true`.

    Returns
    -------
    float
        100 * mean(|y_true - y_pred| / max(|y_true|, 1e-9)). The 1e-9 floor on the
        denominator avoids division by zero for zero-valued targets.
    """
    actual = np.asarray(y_true, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()
    denom = np.clip(np.abs(actual), 1e-9, None)
    return np.mean(np.abs((actual - predicted) / denom)) * 100

# cv_df is sorted by mean CV RMSE ascending, so row 0 is the winner.
best_name = cv_df.iloc[0]['model']
best_row = cv_df.iloc[0].to_dict()
print('Best by CV:', best_name)
print(best_row)

# Pull estimator and whether it needs dense
spec_map = {name: (est, dense) for name, est, dense in model_specs}
best_est, needs_dense = spec_map[best_name]

from sklearn.pipeline import Pipeline
if needs_dense:
    best_pipe = Pipeline([('prep', preprocessor_sparse), ('to_dense', to_dense), ('mdl', best_est)])
else:
    best_pipe = Pipeline([('prep', preprocessor_sparse), ('mdl', best_est)])

# Refit on the whole training split, then score once on the untouched hold-out set.
best_pipe.fit(X_train, y_train)
preds = best_pipe.predict(X_test)

mae = mean_absolute_error(y_test, preds)
# RMSE is taken as sqrt(MSE) rather than mean_squared_error(..., squared=False):
# the `squared` argument was deprecated in scikit-learn 1.4 and removed in 1.6,
# while the square root of the MSE works on every version.
rmse = np.sqrt(mean_squared_error(y_test, preds))
r2 = r2_score(y_test, preds)
mape_val = mape(y_test, preds)

print('Test MAE:', round(mae, 6))
print('Test RMSE:', round(rmse, 6))
print('Test R2:', round(r2, 6))
print('Test MAPE (%):', round(mape_val, 4))


In [None]:
# Residuals & actual-vs-predicted
# Residual = actual - predicted; points should scatter evenly around the zero line.
residuals = y_test - preds

fig, ax = plt.subplots()
ax.scatter(preds, residuals, alpha=0.3)
ax.axhline(0)
ax.set_title('Residuals vs Predictions')
ax.set_xlabel('Predicted engagement_rate')
ax.set_ylabel('Residual')
plt.show()

fig, ax = plt.subplots()
ax.scatter(y_test, preds, alpha=0.3)
ax.set_title('Actual vs Predicted engagement_rate')
ax.set_xlabel('Actual')
ax.set_ylabel('Predicted')
plt.show()


In [None]:
# Feature importance (tree models only)
from sklearn.inspection import permutation_importance

# best_pipe was fitted in the evaluation cell, so no refit is needed here: the
# estimator only exposes feature_importances_ once it has been fitted.
fitted_model = best_pipe.named_steps['mdl']
importances = getattr(fitted_model, 'feature_importances_', None)
is_tree = importances is not None

if is_tree:
    # Rebuild the transformed column order: numeric columns pass through under their
    # own names, followed by the one-hot columns in the encoder's output order.
    ohe = best_pipe.named_steps['prep'].named_transformers_['cat']
    num_features = list(num_cols)
    cat_features = list(ohe.get_feature_names_out(cat_cols))
    feature_names = num_features + cat_features

    importances = np.asarray(importances)
    if len(feature_names) != len(importances):
        # Guard against labelling importances with the wrong feature names.
        raise ValueError(
            f'Feature name count ({len(feature_names)}) does not match '
            f'importance count ({len(importances)}).'
        )

    # Indices of the 25 largest importances, largest first.
    idx = np.argsort(importances)[-25:][::-1]
    top = [(feature_names[i], importances[i]) for i in idx]
    imp_df = pd.DataFrame(top, columns=['feature','importance'])
    display(imp_df)

    # Reverse the order so the most important feature is drawn at the top of the chart.
    plt.figure()
    plt.barh(imp_df['feature'][::-1], imp_df['importance'][::-1])
    plt.title('Top 25 Feature Importances'); plt.tight_layout(); plt.show()
else:
    # Some tree ensembles in model_specs (HistGradientBoostingRegressor) do not expose
    # feature_importances_, so the message refers to the attribute, not the model family.
    print('Best model does not expose feature_importances_; skipping built-in feature importances.')


In [None]:
# Business insights
# Mean engagement rate for each posting hour.
post_hour = df['post_datetime'].dt.hour
hourly = df.groupby(post_hour)['engagement_rate'].mean()

fig, ax = plt.subplots()
hourly.plot(ax=ax, title='Average Engagement Rate by Hour')
ax.set_xlabel('Hour')
ax.set_ylabel('Avg Engagement Rate')
plt.show()

# Top five platforms and post types ranked by mean engagement rate.
top_platform = (
    df.groupby('platform')['engagement_rate']
      .mean()
      .sort_values(ascending=False)
      .head(5)
)
top_post_type = (
    df.groupby('post_type')['engagement_rate']
      .mean()
      .sort_values(ascending=False)
      .head(5)
)
display(top_platform.to_frame('avg_engagement_rate'))
display(top_post_type.to_frame('avg_engagement_rate'))


## Summary & Next Steps

- We engineered strong behavioral and content features and compared diverse models under cross-validation.
- The best model (see leaderboard) is evaluated on a hold-out test set with MAE/RMSE/R²/MAPE.
- Use importances and hourly/platform/post-type views to inform posting strategy.

**Next steps**
- Hyperparameter tuning (RandomizedSearchCV/Bayesian).
- Richer text features (TF–IDF, transformer embeddings).
- Time-based CV (e.g. `TimeSeriesSplit`) to respect temporal ordering, avoid leakage from future posts, and evaluate robustness to drift.
