<a href="https://colab.research.google.com/github/azam123/SocialMediaAnalytics/blob/main/social_media_analytics_prediction_engine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Social Media Analytics — Prediction Engine Notebook

This notebook demonstrates how to generate a synthetic dataset of social media posts, train gradient-boosting models that predict a post's reach and likes (the predicted engagement rate is then derived as likes ÷ reach), and build a simple rule-based assistant that suggests hashtags and a short description from the post text.

In [1]:
# Cell 1: Title & imports
import math
import os
import re
from collections import Counter
from datetime import datetime

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_extraction import _stop_words
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder

# Notebook-wide configuration: a single seed reused by data generation and
# model training, and the directory the training cell saves fitted models to.
RANDOM_SEED = 42
os.makedirs('models', exist_ok=True)
np.random.seed(RANDOM_SEED)
print("Imports ready.")

Imports ready.


In [3]:
# Cell 2: generate synthetic dataset (fixed for pandas >= 2.0)
def generate_synthetic_data(n=6000, seed=RANDOM_SEED):
    """Build a reproducible synthetic table of social-media posts.

    Parameters
    ----------
    n : int
        Number of rows (posts) to generate.
    seed : int
        Value passed to ``np.random.seed`` before any draw. This reseeds
        NumPy's global generator, so it also affects later ``np.random`` calls.

    Returns
    -------
    pandas.DataFrame
        One row per post: the raw attribute columns followed by the three
        synthetic targets ``reach``, ``engagement_rate`` and ``likes``.
    """
    np.random.seed(seed)

    # Every entry consumes the global random stream, so the order of this
    # literal is part of the contract: reordering it changes the dataset.
    raw_columns = {
        'user_followers': np.random.randint(50, 500000, size=n),
        'user_avg_engagement_rate': np.random.beta(2, 50, size=n),
        'media_type': np.random.choice(['none', 'image', 'video'], size=n, p=[0.3, 0.5, 0.2]),
        'num_media_items': np.random.poisson(1, size=n),
        'post_text_len': np.random.randint(5, 800, size=n),
        'num_hashtags': np.random.poisson(2, size=n),
        'num_mentions': np.random.poisson(0.3, size=n),
        'num_emojis': np.random.poisson(1, size=n),
        'has_link': np.random.binomial(1, 0.15, size=n),
        'hour': np.random.randint(0, 24, size=n),
        'day_of_week': np.random.randint(0, 7, size=n),
    }
    posts = pd.DataFrame(raw_columns)

    # Multipliers that encode the synthetic relationships.
    media_multiplier = posts['media_type'].map({'none': 0.6, 'image': 1.0, 'video': 1.6})
    in_evening_window = posts['hour'].between(18, 22)  # inclusive on both ends
    time_multiplier = np.where(in_evening_window, 1.2, 1.0)
    follower_reach = posts['user_followers'] * (0.02 + 0.2 * posts['user_avg_engagement_rate'])
    hashtag_multiplier = 1 + 0.05 * posts['num_hashtags']

    # Noise-free reach, floored at 5, then multiplicative Gaussian noise and
    # a second floor so the noisy value can never drop below 5 either.
    expected_reach = (
        follower_reach * media_multiplier * time_multiplier * hashtag_multiplier
    ).clip(lower=5)
    reach_noise = 1 + 0.5 * np.random.randn(n)
    posts['reach'] = (expected_reach * reach_noise).clip(lower=5).round().astype(int)

    # Per-post engagement rate: the user's average scaled by a factor in [0.8, 1.4).
    rate_scale = 0.8 + 0.6 * np.random.rand(n)
    posts['engagement_rate'] = (posts['user_avg_engagement_rate'] * rate_scale).round(4)

    # Likes: a random share (between 20% and 100%) of reach * engagement_rate.
    like_share = 0.2 + 0.8 * np.random.rand(n)
    raw_likes = posts['reach'] * posts['engagement_rate'] * like_share
    posts['likes'] = raw_likes.clip(lower=0).round().astype(int)

    return posts

# Materialise the synthetic dataset used by every later cell and preview it.
df = generate_synthetic_data(n=6000)
df.head(5)


Unnamed: 0,user_followers,user_avg_engagement_rate,media_type,num_media_items,post_text_len,num_hashtags,num_mentions,num_emojis,has_link,hour,day_of_week,reach,engagement_rate,likes
0,122008,0.06104,video,1,765,5,1,0,0,20,1,10921,0.0589,286
1,146917,0.045525,image,1,377,3,0,0,0,2,6,4638,0.0619,267
2,131982,0.037735,video,3,519,3,0,0,0,11,5,6436,0.05,279
3,365888,0.023567,none,3,752,2,0,3,0,19,2,4378,0.0253,89
4,259228,0.070953,none,0,55,3,1,1,0,15,6,6986,0.0905,623


In [4]:
# Cell 3: Feature engineering
# Derive a coarse `daypart` from the posting hour and make sure both
# categorical columns are plain strings. Hours 0-5 map to 'late_night', while
# hour 23 falls in the separate (22, 24] bin labelled 'late_night2'.
df_feat = df.assign(
    media_type=df['media_type'].astype(str),
    daypart=pd.cut(
        df['hour'],
        bins=[-1, 5, 11, 17, 22, 24],
        labels=['late_night', 'morning', 'afternoon', 'evening', 'late_night2'],
    ).astype(str),
)

# Model inputs, in the order the models are trained on.
FEATURE_COLS = [
    'user_followers',
    'user_avg_engagement_rate',
    'post_text_len',
    'num_hashtags',
    'num_mentions',
    'num_emojis',
    'has_link',
    'media_type',
    'daypart',
    'day_of_week',
]

# Feature matrix and the three candidate targets.
X = df_feat[FEATURE_COLS]
y_reach, y_likes, y_eng = (
    df_feat[target] for target in ('reach', 'likes', 'engagement_rate')
)
X.head()

Unnamed: 0,user_followers,user_avg_engagement_rate,post_text_len,num_hashtags,num_mentions,num_emojis,has_link,media_type,daypart,day_of_week
0,122008,0.06104,765,5,1,0,0,video,evening,1
1,146917,0.045525,377,3,0,0,0,image,late_night,6
2,131982,0.037735,519,3,0,0,0,video,morning,5
3,365888,0.023567,752,2,0,3,0,none,evening,2
4,259228,0.070953,55,3,1,1,0,none,afternoon,6


In [5]:
# Cell 4: Train models
numeric_features = ['user_followers','user_avg_engagement_rate','post_text_len','num_hashtags','num_mentions','num_emojis','has_link']
cat_features = ['media_type','daypart','day_of_week']


def _build_preprocessor():
    """Return a new, unfitted ColumnTransformer for the feature columns.

    Numeric columns pass through unchanged and categorical columns are one-hot
    encoded (``handle_unknown='ignore'`` so unseen categories do not raise at
    prediction time). A fresh instance is built per call because
    ``make_pipeline`` does not clone its steps: passing one shared transformer
    to two pipelines would make them share, and refit, the same object.
    """
    return ColumnTransformer(transformers=[
        ('num', 'passthrough', numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)
    ])


def _fit_evaluate_save(target, label, model_path):
    """Train one gradient-boosting pipeline, print its metrics and persist it.

    Parameters
    ----------
    target : pandas.Series
        Target column aligned with the module-level feature matrix ``X``.
    label : str
        Prefix used in the printed metric lines (e.g. ``'Reach'``).
    model_path : str
        Destination passed to ``joblib.dump``.

    Returns
    -------
    tuple
        ``(model, (X_train, X_test, y_train, y_test), predictions)`` where
        ``predictions`` are the model outputs on the held-out 20% split.
    """
    split = train_test_split(X, target, test_size=0.2, random_state=RANDOM_SEED)
    X_tr, X_te, y_tr, y_te = split
    model = make_pipeline(
        _build_preprocessor(),
        GradientBoostingRegressor(random_state=RANDOM_SEED, n_estimators=150, max_depth=4),
    )
    model.fit(X_tr, y_tr)
    predictions = model.predict(X_te)
    print(f'{label} RMSE:', math.sqrt(mean_squared_error(y_te, predictions)))
    print(f'{label} R2:', r2_score(y_te, predictions))
    joblib.dump(model, model_path)
    return model, split, predictions


# Kept as a module-level name for any cell that still refers to it; the
# pipelines below each get their own instance instead of sharing this one.
preprocessor = _build_preprocessor()

# Train reach model
model_reach, (X_train, X_test, y_train, y_test), pred = _fit_evaluate_save(
    y_reach, 'Reach', 'models/gbm_reach.joblib'
)

# Train likes model
model_likes, (X_train2, X_test2, y_train2, y_test2), pred_l = _fit_evaluate_save(
    y_likes, 'Likes', 'models/gbm_likes.joblib'
)

Reach RMSE: 5089.409172426924
Reach R2: 0.5824064251472376
Likes RMSE: 258.37359722124233
Likes R2: 0.5728030327621003


['models/gbm_likes.joblib']

In [6]:
# Cell 5: Hashtag and description suggester
# Built from scikit-learn's public `ENGLISH_STOP_WORDS` export rather than the
# private `_stop_words` module, whose location is an implementation detail.
STOPWORDS = set(ENGLISH_STOP_WORDS)

def suggest_hashtags_from_text(text, top_k=5):
    """Suggest up to ``top_k`` hashtags from the most frequent words in ``text``.

    The text is lower-cased, URLs are removed, and the remaining word tokens
    are kept only if they are longer than two characters, are not purely
    numeric and are not in ``STOPWORDS``. Tokens are ranked by frequency; ties
    keep the order in which the words first appear.

    Parameters
    ----------
    text : str
        Post text. Non-string or blank input yields an empty list.
    top_k : int
        Maximum number of hashtags to return.

    Returns
    -------
    list of str
        Hashtags such as ``'#summer'``, most frequent first.
    """
    if not isinstance(text, str) or not text.strip():
        return []

    # Remove links before tokenising; otherwise their pieces ('https',
    # 'example', 'com') would be offered as hashtags.
    without_urls = re.sub(r"(?:https?://|www\.)\S+", " ", text.lower())

    candidates = [
        token
        for token in re.findall(r"\w+", without_urls)
        if len(token) > 2 and not token.isdigit() and token not in STOPWORDS
    ]
    if not candidates:
        return []

    # Counter.most_common documents first-encountered order for equal counts,
    # so the ranking is deterministic.
    return ['#' + token for token, _count in Counter(candidates).most_common(top_k)]

def suggest_description_for_media(text, max_words=25):
    """Return the first ``max_words`` words of ``text`` as a short description.

    Runs of whitespace are collapsed to single spaces. When the text has more
    than ``max_words`` words, ``'...'`` is appended to mark the truncation.
    Non-string or blank input yields an empty string.
    """
    if not isinstance(text, str):
        return ''

    tokens = text.split()
    if not tokens:
        return ''

    if len(tokens) > max_words:
        ellipsis = '...'
    else:
        ellipsis = ''
    clipped = tokens[:max_words]
    return (' '.join(clipped) + ellipsis).strip()

# Quick demonstration of both suggesters on a sample caption.
example_text = (
    "Launching our new summer collection — lightweight jackets, breathable fabric, "
    "and bold colors! Shop now: https://example.com"
)
print('Hashtags:', suggest_hashtags_from_text(example_text, 6))
print('Description:', suggest_description_for_media(example_text, 10))

Hashtags: ['#launching', '#new', '#summer', '#collection', '#lightweight', '#jackets']
Description: Launching our new summer collection — lightweight jackets, breathable fabric,...


In [7]:
# Cell 6: Inference wrapper
# Reload the pipelines saved by the training cell. joblib.load unpickles the
# file, so it should only ever be pointed at artifacts this notebook produced.
model_reach, model_likes = map(
    joblib.load,
    ('models/gbm_reach.joblib', 'models/gbm_likes.joblib'),
)

def prepare_features_from_payload(payload):
    """Convert a request payload into the one-row feature frame the models expect.

    Parameters
    ----------
    payload : dict
        Recognised keys: ``post_text``, ``media_type``, ``scheduled_time``,
        ``user_followers``, ``user_avg_engagement_rate``, ``num_hashtags``,
        ``num_mentions``, ``num_emojis``. A missing key and an explicit
        ``None`` are treated the same way and fall back to the default.
        ``num_media_items`` is not a model feature and is ignored here.

    Returns
    -------
    pandas.DataFrame
        A single row with the same columns, in the same order, as the
        training feature matrix.

    Raises
    ------
    ValueError, TypeError
        If a numeric field cannot be converted with ``int``/``float`` or
        ``scheduled_time`` cannot be parsed by ``pd.to_datetime``.
    """
    def _value(key, default):
        # An explicit None must not reach int()/float() or the encoder.
        found = payload.get(key)
        return default if found is None else found

    post_text = payload.get('post_text') or ''

    # Without a schedule, assume the post goes out now (UTC). Timestamp.now
    # with an explicit tz replaces the deprecated datetime.utcnow().
    scheduled_time = payload.get('scheduled_time')
    when = pd.to_datetime(scheduled_time) if scheduled_time else pd.Timestamp.now(tz='UTC')
    hour = int(when.hour)

    # Same bins and labels as the training-time feature engineering.
    daypart = pd.cut(
        [hour],
        bins=[-1, 5, 11, 17, 22, 24],
        labels=['late_night', 'morning', 'afternoon', 'evening', 'late_night2'],
    )[0]

    features = {
        'user_followers': int(_value('user_followers', 100)),
        'user_avg_engagement_rate': float(_value('user_avg_engagement_rate', 0.01)),
        'post_text_len': len(post_text),
        'num_hashtags': int(_value('num_hashtags', 0)),
        'num_mentions': int(_value('num_mentions', 0)),
        'num_emojis': int(_value('num_emojis', 0)),
        # Link presence is inferred from the text, not read from the payload.
        'has_link': 1 if ('http' in post_text or 'www.' in post_text) else 0,
        'media_type': _value('media_type', 'none'),
        'daypart': daypart,
        'day_of_week': int(when.dayofweek),
    }
    return pd.DataFrame([features])

def inference(payload):
    """Suggest hashtags/description for a post and predict its performance.

    Parameters
    ----------
    payload : dict
        Post description; see ``prepare_features_from_payload`` for the keys
        used as model features. The dict itself is not modified.

    Returns
    -------
    dict
        ``suggested_hashtags`` (list of str), ``suggested_description``
        (str, or ``None`` unless ``media_type`` is ``'image'`` or ``'video'``),
        ``predicted_reach`` and ``predicted_likes`` (non-negative ints),
        ``predicted_engagement_rate`` (likes / reach rounded to 4 places, 0.0
        when reach is 0) and ``explanations`` (a fixed list of feature names,
        not computed from the fitted models).
    """
    post_text = payload.get('post_text', '')
    suggested_hashtags = suggest_hashtags_from_text(post_text, top_k=6)

    suggested_description = None
    if payload.get('media_type') in ('image', 'video'):
        suggested_description = suggest_description_for_media(post_text, max_words=25)

    feat_payload = payload.copy()
    # Respect a hashtag count supplied by the caller; only when none is given
    # assume the post will carry the suggested hashtags.
    if feat_payload.get('num_hashtags') is None:
        feat_payload['num_hashtags'] = len(suggested_hashtags)
    feat_df = prepare_features_from_payload(feat_payload)

    pred_reach = int(max(0, round(float(model_reach.predict(feat_df)[0]))))
    pred_likes = int(max(0, round(float(model_likes.predict(feat_df)[0]))))
    # The two models are fitted independently, so nothing stops the likes
    # estimate from exceeding reach; cap it to keep the outputs consistent.
    pred_likes = min(pred_likes, pred_reach)
    pred_eng_rate = round(pred_likes / pred_reach if pred_reach > 0 else 0.0, 4)

    explanations = {'top_features': ['user_followers','media_type','daypart','num_hashtags']}
    return {
        'suggested_hashtags': suggested_hashtags,
        'suggested_description': suggested_description,
        'predicted_reach': pred_reach,
        'predicted_likes': pred_likes,
        'predicted_engagement_rate': pred_eng_rate,
        'explanations': explanations
    }

# End-to-end demo: an image post scheduled for a weekday evening.
payload_example = dict(
    post_text="Announcing our biggest sale of the year! 50% off on selected items. Free shipping for orders over $50. Grab yours now!",
    media_type="image",
    num_media_items=1,
    scheduled_time="2025-07-15T19:30:00Z",
    user_followers=12500,
    user_avg_engagement_rate=0.012,
)
print(
    'Inference result:',
    inference(payload_example),
)

Inference result: {'suggested_hashtags': ['#announcing', '#biggest', '#sale', '#year', '#selected', '#items'], 'suggested_description': 'Announcing our biggest sale of the year! 50% off on selected items. Free shipping for orders over $50. Grab yours now!', 'predicted_reach': 1378, 'predicted_likes': 20, 'predicted_engagement_rate': 0.0145, 'explanations': {'top_features': ['user_followers', 'media_type', 'daypart', 'num_hashtags']}}
