In [16]:
import os

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import roc_auc_score, average_precision_score, precision_score, recall_score

from xgboost import XGBClassifier


In [17]:
# Load the dataset.
# The location can be overridden with the COHORT_DATA_PATH environment variable so the
# notebook can be run on another machine without editing this cell; when the variable is
# not set, the original absolute path is used, so existing behaviour is unchanged.
DATA_PATH = os.environ.get(
    "COHORT_DATA_PATH",
    r"C:\Users\bhara\OneDrive\Desktop\INTERVIEW PREPS\ML PROJECT 1\Audience-Conversion-Propensity\notebooks\data\raw\cohort_conversion_dataset.xlsx",
)
df = pd.read_excel(DATA_PATH)

# Strip leading/trailing whitespace from the header names (hidden spaces from Excel
# exports) so the column lookups in later cells match.
df.columns = df.columns.str.strip()


In [18]:
# Summary statistics of the 7-day conversion rate (shown as the cell output);
# used below to decide how to turn the continuous rate into a binary label.
df["conversion_rate_7d"].describe()


count    5000.000000
mean        0.127675
std         0.061288
min         0.021800
25%         0.076200
50%         0.123050
75%         0.172500
max         0.313500
Name: conversion_rate_7d, dtype: float64

Every row in the dataset has a non-zero conversion rate (the minimum is about 2.2%), so there is no natural converted / not-converted label and a direct binary classification is not possible.

New approach: flag the rows with a high conversion rate and predict that flag instead.
This creates:

high-conversion users → 1

low-conversion users → 0


Trying to find a good threshold

In [19]:
# First attempt at a binary label: 1 when the 7-day conversion rate is at least 0.05, else 0.
df["converted_7d"] = df["conversion_rate_7d"].ge(0.05).astype(int)

# Share of rows labelled 1 (shown as the cell output).
df["converted_7d"].mean()

np.float64(0.8916)

A conversion-rate threshold of 0.05 won't work: the label is still heavily skewed, with ~89% positive samples.

In [20]:
# Upper quantiles of the 7-day conversion rate (shown as the cell output),
# inspected to pick a label threshold that yields a less skewed positive class.
df["conversion_rate_7d"].quantile([0.5, 0.7, 0.8, 0.9, 0.95, 0.98])


0.50    0.123050
0.70    0.161400
0.80    0.183520
0.90    0.213510
0.95    0.235705
0.98    0.257504
Name: conversion_rate_7d, dtype: float64

 

Half the users have conversion rate ≤ 12.3%

Top 20% start at 18.4%

Top 10% start at 21.4%

In [21]:
# Label = 1 for rows at or above the 80th percentile of the 7-day conversion rate.
# NOTE(review): the percentile is computed over all rows, i.e. before the train/test
# split made later in the notebook -- confirm this is acceptable for the evaluation.
threshold = df["conversion_rate_7d"].quantile(0.80)
df["converted_7d"] = df["conversion_rate_7d"].ge(threshold).astype(int)

# Share of rows labelled 1 (shown as the cell output).
df["converted_7d"].mean()


np.float64(0.2)

In [22]:
# Engineered ratio features. Each denominator is shifted by 1 (presumably to avoid
# dividing by zero when the raw count is 0).
df["clicks_per_user"] = df["clicks_7d"].div(df["users_exposed"].add(1))
df["add_to_cart_rate"] = df["add_to_cart_7d"].div(df["users_exposed"].add(1))
df["frequency_recency_ratio"] = df["avg_frequency_7d"].div(df["recency_hours"].add(1))
# Built from add_to_cart_rate, so it must be assigned after that column exists.
df["seasonal_engagement"] = df["add_to_cart_rate"].mul(df["seasonality_index"])


In [23]:
# Name of the binary label column.
TARGET = "converted_7d"

# Columns that are one-hot encoded by the preprocessing pipeline.
categorical_features = [
    "geo",
    "device",
    "audience_segment",
    "product_category",
]

# Columns that are treated as numeric by the preprocessing pipeline.
numeric_features = [
    "users_exposed",
    "impressions_7d",
    "avg_frequency_7d",
    "recency_hours",
    "clicks_7d",
    "ctr_7d",
    "site_visits_7d",
    "product_views_7d",
    "add_to_cart_7d",
    "avg_session_time_sec",
    # history feature
    "prev_conv_rate_28d",
    "seasonality_index",
    # engineered
    "clicks_per_user",
    "add_to_cart_rate",
    "frequency_recency_ratio",
    "seasonal_engagement",
]

# Full model input: numeric columns first, then categorical ones.
FEATURES = [*numeric_features, *categorical_features]

# Same feature set with the history feature removed.
numeric_no_hist = [name for name in numeric_features if name != "prev_conv_rate_28d"]
FEATURES_NO_HIST = [*numeric_no_hist, *categorical_features]


In [24]:
# Feature matrix and label vector, copied out of df.
X = df.loc[:, FEATURES].copy()
y = df.loc[:, TARGET].copy()

# shuffle=False keeps the existing row order: the leading 80% of rows become the
# training set and the trailing 20% the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    shuffle=False,
    test_size=0.2,
)

# Compare the share of positives in each split.
print("Train positive rate:", y_train.mean())
print("Test positive rate :", y_test.mean())


Train positive rate: 0.2035
Test positive rate : 0.186


### Train–Test Class Distribution Check

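The split keeps the original row order (`shuffle=False`): the first 80% of rows are used for training and the last 20% for testing. This is a time-based split only if the rows are already sorted chronologically, which should be confirmed. The proportion of high-conversion rows ("converters") stays fairly consistent across both datasets: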

- **Training set:** ~20.35% converters  
- **Test set:** ~18.6% converters  

This suggests that the split did not introduce any major shift in class balance. Similar positive rates do not by themselves rule out leakage, but they do mean the test metrics are computed on a class mix comparable to the training data, so the evaluation on the test data should reasonably reflect how the model would perform on future, unseen data.


In [25]:
# Numeric columns: fill missing values with the column median (no scaling at this stage).
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
])

# Categorical columns: fill missing values with the most frequent category, then
# one-hot encode; categories not seen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])


In [27]:
def precision_at_k(y_true, y_prob, k):
    """Return the fraction of positives among the k highest-scored samples.

    Parameters
    ----------
    y_true : array-like
        Binary (0/1) labels. A pandas Series is used as-is; any other
        array-like (list, NumPy array, ...) is wrapped in a Series so it can
        be indexed by position.
    y_prob : array-like
        Scores aligned position-by-position with ``y_true``; higher means
        more likely positive.
    k : int
        Number of top-scored samples to inspect.

    Returns
    -------
    float
        Mean of the labels at the k highest-scored positions.
    """
    # Positional access is all that is needed, so accept any array-like rather
    # than only objects that expose ``.iloc``.
    labels = y_true if isinstance(y_true, pd.Series) else pd.Series(np.asarray(y_true))
    scores = np.asarray(y_prob)
    # argsort is ascending: reverse it, then keep the first k positions.
    topk = np.argsort(scores)[::-1][:k]
    return labels.iloc[topk].mean()

def recall_at_k(y_true, y_prob, k):
    """Return the share of all positives found among the k highest-scored samples.

    Parameters
    ----------
    y_true : array-like
        Binary (0/1) labels. A pandas Series is used as-is; any other
        array-like (list, NumPy array, ...) is wrapped in a Series so it can
        be indexed by position.
    y_prob : array-like
        Scores aligned position-by-position with ``y_true``; higher means
        more likely positive.
    k : int
        Number of top-scored samples to inspect.

    Returns
    -------
    float
        Positives in the top k divided by the total number of positives.
        The denominator is floored at 1, so the result is 0 when ``y_true``
        contains no positives.
    """
    # Positional access is all that is needed, so accept any array-like rather
    # than only objects that expose ``.iloc``.
    labels = y_true if isinstance(y_true, pd.Series) else pd.Series(np.asarray(y_true))
    scores = np.asarray(y_prob)
    # argsort is ascending: reverse it, then keep the first k positions.
    topk = np.argsort(scores)[::-1][:k]
    return labels.iloc[topk].sum() / max(1, labels.sum())

def evaluate_probs(y_true, y_prob, frac=0.10):
    """Summarise predicted scores with ranking metrics and top-k metrics.

    Parameters
    ----------
    y_true : true binary labels, forwarded unchanged to every metric.
    y_prob : predicted scores for the positive class, aligned with ``y_true``.
    frac : float, default 0.10
        Fraction of the samples that forms the top-k cut-off;
        ``k = max(1, int(frac * len(y_true)))``.

    Returns
    -------
    dict
        Keys ``"ROC_AUC"``, ``"PR_AUC"``, ``"k"``, ``"Precision@k"`` and
        ``"Recall@k"``.
    """
    # Threshold-free ranking quality.
    summary = {
        "ROC_AUC": roc_auc_score(y_true, y_prob),
        "PR_AUC": average_precision_score(y_true, y_prob),
    }

    # Size of the top slice; never smaller than one sample.
    top_n = max(1, int(frac * len(y_true)))
    summary["k"] = top_n
    summary["Precision@k"] = precision_at_k(y_true, y_prob, top_n)
    summary["Recall@k"] = recall_at_k(y_true, y_prob, top_n)
    return summary


Baseline logistic regression model

In [28]:
# Baseline: logistic regression on the preprocessed features.
# (LogisticRegression is already imported in the notebook's import cell, so it is not
# re-imported here.)
logit = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", LogisticRegression(max_iter=2000))
])

logit.fit(X_train, y_train)

# Keep the second predict_proba column, i.e. the score for label 1.
logit_probs = logit.predict_proba(X_test)[:,1]

# Metrics on the held-out rows (shown as the cell output).
baseline_metrics = evaluate_probs(y_test, logit_probs)
baseline_metrics


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=2000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'ROC_AUC': 0.5476935880161686,
 'PR_AUC': 0.2277118955442796,
 'k': 100,
 'Precision@k': np.float64(0.27),
 'Recall@k': np.float64(0.14516129032258066)}

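Since the numeric features are on very different scales, the solver failed to converge (see the warning above) and ROC-AUC is only ~0.55, barely better than random.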

Standardize numeric features

In [29]:
# Numeric pipeline with scaling: median imputation followed by StandardScaler.
# (StandardScaler is imported in the notebook's import cell rather than here, so all
# dependencies are visible in one place.)
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

In [31]:
# Rebuild the ColumnTransformer so it uses the current numeric_transformer definition.
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])


In [32]:
# Logistic regression refit on top of the rebuilt preprocessor.
logit = Pipeline([
    ("preprocessor", preprocessor),
    ("model", LogisticRegression(max_iter=3000)),
])
logit.fit(X_train, y_train)

# Keep the second predict_proba column, i.e. the score for label 1.
logit_probs = logit.predict_proba(X_test)[:, 1]

# Metrics on the held-out rows (shown as the cell output).
baseline_metrics = evaluate_probs(y_test, logit_probs)
baseline_metrics


{'ROC_AUC': 0.9325843438746665,
 'PR_AUC': 0.7436228890995693,
 'k': 100,
 'Precision@k': np.float64(0.76),
 'Recall@k': np.float64(0.40860215053763443)}


Precision@100 = 76%	

means 76 of the 100 highest-scored test cohorts are actually high-conversion (top 20% by conversion rate)


Recall@100 = 41%	

means the top 100 capture 41% of all high-conversion cohorts in the test set

ROC-AUC = 0.93: the model separates high-conversion from low-conversion cohorts very well

Random Forest

In [33]:
# Random forest on the same preprocessing pipeline.
# (RandomForestClassifier is imported in the notebook's import cell rather than here, so
# all dependencies are visible in one place.)
rf = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", RandomForestClassifier(
        n_estimators=300,
        max_depth=8,
        min_samples_leaf=50,
        n_jobs=-1,        # use all available cores
        random_state=42   # fixed seed so the fit is reproducible
    ))
])

rf.fit(X_train, y_train)
# Keep the second predict_proba column, i.e. the score for label 1.
rf_probs = rf.predict_proba(X_test)[:,1]

# Metrics on the held-out rows (shown as the cell output).
evaluate_probs(y_test, rf_probs)


{'ROC_AUC': 0.9200879765395894,
 'PR_AUC': 0.6341793912017228,
 'k': 100,
 'Precision@k': np.float64(0.7),
 'Recall@k': np.float64(0.3763440860215054)}

XGboost

In [34]:
# Gradient-boosted trees (XGBoost) on the same preprocessing pipeline.
# (XGBClassifier is imported in the notebook's import cell rather than here, so all
# dependencies are visible in one place.)
xgb = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", XGBClassifier(
        n_estimators=400,
        max_depth=5,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        eval_metric="logloss",
        random_state=42   # fixed seed so the fit is reproducible
    ))
])

xgb.fit(X_train, y_train)
# Keep the second predict_proba column, i.e. the score for label 1.
xgb_probs = xgb.predict_proba(X_test)[:,1]

# Metrics on the held-out rows (shown as the cell output).
evaluate_probs(y_test, xgb_probs)


{'ROC_AUC': 0.9224194869356159,
 'PR_AUC': 0.6785540998875638,
 'k': 100,
 'Precision@k': np.float64(0.71),
 'Recall@k': np.float64(0.3817204301075269)}

extracting feature importance

In [36]:
# Inspect the logistic-regression pipeline's coefficients.
best_model = logit

# Output column names of the fitted preprocessor, in the order the model sees them.
feature_names = best_model.named_steps["preprocessor"].get_feature_names_out()
# coef_ has one row for this binary problem; take it as a flat vector.
coefs = best_model.named_steps["model"].coef_[0]

# One row per transformed feature, sorted by coefficient magnitude (largest first).
imp = (
    pd.DataFrame({"feature": feature_names, "coef": coefs})
    .assign(abs_coef=lambda frame: frame["coef"].abs())
    .sort_values("abs_coef", ascending=False)
)

imp.head(20)


Unnamed: 0,feature,coef,abs_coef
10,num__prev_conv_rate_28d,3.199815,3.199815
11,num__seasonality_index,0.631607,0.631607
1,num__impressions_7d,0.589418,0.589418
12,num__clicks_per_user,0.422465,0.422465
2,num__avg_frequency_7d,-0.381492,0.381492
4,num__clicks_7d,-0.374308,0.374308
8,num__add_to_cart_7d,0.315265,0.315265
0,num__users_exposed,-0.243566,0.243566
15,num__seasonal_engagement,-0.21689,0.21689
22,cat__audience_segment_Book Lovers,0.213859,0.213859


Insights from feature importance output

num_prev_conv_rate_28d → coefficient = 3.20


Cohorts with a higher conversion rate over the previous 28 days are dramatically more likely to be high-conversion again. This suggests retargeting beats cold targeting.

seasonality_index → +0.63


When product demand is high (holiday, sale week, seasonal spike), everyone converts more easily.

clicks_per_user → +0.42  
impressions_7d → +0.59  
clicks_7d → -0.37  
add_to_cart_7d → +0.31


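Cohorts that interact more with ads and the site (clicks per user, add-to-cart) are more likely to be high-conversion. Note that raw `clicks_7d` has a negative coefficient; it overlaps with `clicks_per_user` (which is derived from it), so its sign should not be read in isolation.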


avg_frequency_7d → -0.38

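Higher ad frequency is associated with a lower chance of being high-conversion, which is consistent with showing too many ads hurting conversion (an association, not proof of causation).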


