In [10]:
import os
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split


In [2]:
# Resolve the location of the raw cohort workbook.
# Set the COHORT_DATA_PATH environment variable to run this notebook on another
# machine; when it is unset, the notebook falls back to the path it was
# originally authored against, so existing behaviour is unchanged.
DEFAULT_DATA_PATH = r"C:\Users\bhara\OneDrive\Desktop\INTERVIEW PREPS\ML PROJECT 1\Audience-Conversion-Propensity\notebooks\data\raw\cohort_conversion_dataset.xlsx"
DATA_PATH = Path(os.environ.get("COHORT_DATA_PATH", DEFAULT_DATA_PATH))

# Fail early with an actionable message instead of a bare reader error.
if not DATA_PATH.is_file():
    raise FileNotFoundError(
        f"Cohort dataset not found at {DATA_PATH}; "
        "set COHORT_DATA_PATH to the location of cohort_conversion_dataset.xlsx."
    )

df = pd.read_excel(DATA_PATH)
# Strip surrounding whitespace from header names so later lookups by column name match.
df.columns = df.columns.str.strip()


In [11]:
# Relative path: resolved against the kernel's current working directory.
MODEL_PATH = "models/logit_pipeline.joblib"

# NOTE(review): joblib artifacts are pickle-based, so loading one can execute
# arbitrary code -- only load model files that come from a trusted source.
best_model = joblib.load(MODEL_PATH)
print("Model loaded successfully")

Model loaded successfully


In [3]:
# Derived ratio features. Each denominator is offset by 1 so a zero count
# (no exposed users, zero recency hours) cannot cause a division by zero.
exposed_plus_one = df["users_exposed"] + 1
recency_plus_one = df["recency_hours"] + 1

df["clicks_per_user"] = df["clicks_7d"] / exposed_plus_one
df["add_to_cart_rate"] = df["add_to_cart_7d"] / exposed_plus_one
df["frequency_recency_ratio"] = df["avg_frequency_7d"] / recency_plus_one

# Built on add_to_cart_rate, so it has to be computed after that column exists.
df["seasonal_engagement"] = df["add_to_cart_rate"] * df["seasonality_index"]


In [4]:
# Binary target: 1 for rows whose 7-day conversion rate is at or above the
# 80th percentile of that column, 0 otherwise.
threshold = df["conversion_rate_7d"].quantile(0.80)
meets_threshold = df["conversion_rate_7d"].ge(threshold)
df["converted_7d"] = meets_threshold.astype(int)


In [5]:
# Columns read directly from the source workbook.
RAW_METRIC_COLUMNS = [
    "users_exposed",
    "impressions_7d",
    "avg_frequency_7d",
    "recency_hours",
    "clicks_7d",
    "ctr_7d",
    "site_visits_7d",
    "product_views_7d",
    "add_to_cart_7d",
    "avg_session_time_sec",
    "prev_conv_rate_28d",
    "seasonality_index",
]

# Ratio columns computed earlier in this notebook.
DERIVED_COLUMNS = [
    "clicks_per_user",
    "add_to_cart_rate",
    "frequency_recency_ratio",
    "seasonal_engagement",
]

# Cohort descriptor columns.
DESCRIPTOR_COLUMNS = [
    "geo",
    "device",
    "audience_segment",
    "product_category",
]

# Concatenation order is kept identical to the original flat list.
FEATURES = RAW_METRIC_COLUMNS + DERIVED_COLUMNS + DESCRIPTOR_COLUMNS

# Model inputs and the binary target.
X = df.loc[:, FEATURES]
y = df.loc[:, "converted_7d"]


In [7]:
# Unshuffled 80/20 split; only the test portions are kept for scoring.
# train_test_split returns [X_train, X_test, y_train, y_test].
# NOTE(review): presumably mirrors the split used when the model was trained -- confirm.
split_parts = train_test_split(X, y, test_size=0.2, shuffle=False)
X_test, y_test = split_parts[1], split_parts[3]


In [12]:
# Score the held-out rows and keep the second probability column.
# NOTE(review): assumes column 1 corresponds to the positive class (converted_7d == 1) -- confirm via best_model.classes_.
class_probabilities = best_model.predict_proba(X_test)
y_prob = class_probabilities[:, 1]


In [13]:
# Quick sanity check on the score vector: size, central tendency and range.
score_shape = y_prob.shape
score_mean = y_prob.mean()
score_min, score_max = y_prob.min(), y_prob.max()

print("Shape:", score_shape)
print("Mean propensity:", score_mean)
print("Min / Max:", score_min, score_max)

Shape: (1000,)
Mean propensity: 0.20063986351328458
Min / Max: 4.66967071412416e-05 0.961679649948535


In [14]:
# ROC-AUC of the predicted propensities against the actual labels on the test rows.
# roc_auc_score is imported in the notebook's top import cell rather than here,
# so all dependencies are declared in one place.
roc_auc = roc_auc_score(y_test, y_prob)
print("ROC-AUC:", roc_auc)


ROC-AUC: 0.9325843438746665


In [15]:
# Attach the actual label and the predicted propensity to a copy of the test
# features, then rank rows from highest to lowest propensity.
simulation_results = (
    X_test
    .assign(
        actual_converted=y_test.values,  # .values drops the index so assignment is positional
        propensity_score=y_prob,
    )
    .sort_values(by="propensity_score", ascending=False)
)

# Show the ten highest-propensity rows.
simulation_results.head(10)


Unnamed: 0,users_exposed,impressions_7d,avg_frequency_7d,recency_hours,clicks_7d,ctr_7d,site_visits_7d,product_views_7d,add_to_cart_7d,avg_session_time_sec,...,clicks_per_user,add_to_cart_rate,frequency_recency_ratio,seasonal_engagement,geo,device,audience_segment,product_category,actual_converted,propensity_score
4613,39996,154297,3.6,15,4606,0.0299,17943,8925,2829,146,...,0.115159,0.07073,0.225,0.076389,IN,Desktop,Book Lovers,Books,1,0.96168
4796,17357,26825,1.6,18,698,0.026,3468,2002,340,195,...,0.040212,0.019588,0.084211,0.022134,IN,Desktop,Tech Shoppers,Electronics,1,0.952446
4626,40192,230999,5.3,93,3600,0.0156,12285,6212,2073,132,...,0.089568,0.051576,0.056383,0.05725,US,Mobile,Fitness Enthusiasts,Fitness App,1,0.950138
4952,47923,109865,2.5,78,1212,0.011,4087,1976,350,125,...,0.02529,0.007303,0.031646,0.008107,UK,Desktop,Casual Browsers,Books,1,0.943357
4593,17574,32993,1.8,10,566,0.0172,1987,960,200,166,...,0.032205,0.01138,0.163636,0.012859,UK,Mobile,New Parents,Baby Care,1,0.940646
4298,28199,92172,3.5,14,1206,0.0131,5455,3062,735,171,...,0.042766,0.026064,0.233333,0.028931,UK,Mobile,New Parents,Baby Care,1,0.938925
4210,55327,217922,4.2,66,5078,0.0233,19577,12262,3481,221,...,0.09178,0.062916,0.062687,0.070466,IN,Mobile,Luxury Shoppers,Luxury,1,0.937971
4158,11407,59031,5.0,77,1607,0.0272,5411,2253,362,203,...,0.140866,0.031732,0.064103,0.03554,US,Desktop,Casual Browsers,Books,1,0.935016
4077,29779,67354,2.1,10,1472,0.0219,5455,3232,499,281,...,0.049429,0.016756,0.190909,0.017929,UK,Mobile,Book Lovers,Books,1,0.93415
4597,48899,265192,5.3,11,5671,0.0214,24633,10145,2858,132,...,0.115971,0.058446,0.441667,0.065459,US,Mobile,Beauty Buyers,Beauty,1,0.92941


In [16]:
# Per-segment comparison of mean predicted propensity, observed conversion
# rate, and row count, ordered by mean propensity (highest first).
by_segment = simulation_results.groupby("audience_segment")

segment_summary = by_segment.agg(
    avg_propensity=("propensity_score", "mean"),
    conversion_rate=("actual_converted", "mean"),
    count=("propensity_score", "size"),
)
segment_summary = segment_summary.sort_values(by="avg_propensity", ascending=False)

segment_summary


Unnamed: 0_level_0,avg_propensity,conversion_rate,count
audience_segment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Tech Shoppers,0.227771,0.19685,127
Luxury Shoppers,0.221862,0.194444,108
Fitness Enthusiasts,0.211167,0.20438,137
New Parents,0.210039,0.208054,149
Book Lovers,0.198185,0.189655,116
Beauty Buyers,0.197811,0.194915,118
Casual Browsers,0.171456,0.158333,120
Deal Seekers,0.164962,0.136,125
