# Ad Click Prediction - Exact Code with MLflow & PNG Plots

In [120]:
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import warnings
warnings.filterwarnings('ignore')

os.makedirs('plots', exist_ok=True)
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from sklearn.impute import KNNImputer
from sklearn.metrics import confusion_matrix
from scipy.stats import skew
from sklearn.model_selection import train_test_split, cross_val_score
from imblearn.over_sampling import SMOTE
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
import lightgbm as lgb

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import precision_recall_curve, auc, roc_curve, roc_auc_score
import mlflow

print('All imports successful')

All imports successful


## Load Data & Create Conversions

In [121]:
data = pd.read_csv('sample_dataset.csv')
np.random.seed(42)
conversion_rate = 0.08
data['conversion'] = ((data['click'] == 1) & (np.random.random(len(data)) < conversion_rate)).astype(int)
print(f'Data shape: {data.shape}')
print(data.head())

Data shape: (10000, 10)
     id full_name   age      gender device_type ad_position browsing_history  \
0   670   User670  22.0         NaN     Desktop         Top         Shopping   
1  3044  User3044   NaN        Male     Desktop         Top              NaN   
2  5912  User5912  41.0  Non-Binary         NaN        Side        Education   
3  5418  User5418  34.0        Male         NaN         NaN    Entertainment   
4  9452  User9452  39.0  Non-Binary         NaN         NaN     Social Media   

  time_of_day  click  conversion  
0   Afternoon      1           0  
1         NaN      1           0  
2       Night      1           0  
3     Evening      1           0  
4     Morning      0           0  


## EDA - Exploratory Data Analysis

In [122]:
data = data.drop(columns=['id','full_name'], axis=1)
print(f'Shape: {data.shape}')
data.info()

Shape: (10000, 8)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   age               5234 non-null   float64
 1   gender            5307 non-null   object 
 2   device_type       8000 non-null   object 
 3   ad_position       8000 non-null   object 
 4   browsing_history  5218 non-null   object 
 5   time_of_day       8000 non-null   object 
 6   click             10000 non-null  int64  
 7   conversion        10000 non-null  int64  
dtypes: float64(1), int64(2), object(5)
memory usage: 625.1+ KB


In [123]:
num_cols = data.select_dtypes(include=['float64', 'int64'])
cat_cols = data.select_dtypes(include=['object'])
print('Numeric Variables:')
print(num_cols.columns.tolist())
print('\\nCategorical Variables:')
print(cat_cols.columns.tolist())

Numeric Variables:
['age', 'click', 'conversion']
\nCategorical Variables:
['gender', 'device_type', 'ad_position', 'browsing_history', 'time_of_day']


In [124]:
for col in cat_cols:
    print(f'Column {col}: {len(data[col].unique())} unique values')

Column gender: 4 unique values
Column device_type: 4 unique values
Column ad_position: 4 unique values
Column browsing_history: 6 unique values
Column time_of_day: 5 unique values


In [125]:
print(data.describe().T)

              count       mean        std   min   25%   50%   75%   max
age          5234.0  40.197363  13.126420  18.0  29.0  39.5  52.0  64.0
click       10000.0   0.650000   0.476993   0.0   0.0   1.0   1.0   1.0
conversion  10000.0   0.053100   0.224244   0.0   0.0   0.0   0.0   1.0


In [126]:
for feature in num_cols:
    zero_values = (data[feature] == 0).sum()
    null_values = data[feature].isnull().sum()
    unique_values = len(data[feature].unique())
    print(f'Feature: {feature}, Zeros: {zero_values}, Nulls: {null_values}, Unique: {unique_values}')

Feature: age, Zeros: 0, Nulls: 4766, Unique: 48
Feature: click, Zeros: 3500, Nulls: 0, Unique: 2
Feature: conversion, Zeros: 9469, Nulls: 0, Unique: 2


In [127]:
print('Missing values:')
print(data.isnull().sum())
fig, ax = plt.subplots(figsize=(20,6))
sns.heatmap(data.isnull(), cmap='viridis', ax=ax)
ax.set_title('Missing Data Heatmap')
plt.tight_layout()
plt.savefig('plots/01_missing_data.png', dpi=300, bbox_inches='tight')
plt.close()
print('Saved: plots/01_missing_data.png')

Missing values:
age                 4766
gender              4693
device_type         2000
ad_position         2000
browsing_history    4782
time_of_day         2000
click                  0
conversion             0
dtype: int64
Saved: plots/01_missing_data.png


In [128]:
data['gender'] = data['gender'].fillna('Unknown')
data['device_type'] = data['device_type'].fillna('Unknown')
data['ad_position'] = data['ad_position'].fillna('Unknown')
data['browsing_history'] = data['browsing_history'].fillna('Unknown')
data['time_of_day'] = data['time_of_day'].fillna('Unknown')
print('Categorical nulls filled')

Categorical nulls filled


In [129]:
fig, ax = plt.subplots(figsize=(10,5))
ax.hist(data['age'], bins=20, edgecolor='black', color='skyblue')
ax.set_title('Age Distribution Before Imputation')
ax.set_xlabel('Age')
ax.set_ylabel('Frequency')
plt.tight_layout()
plt.savefig('plots/02_age_before.png', dpi=300, bbox_inches='tight')
plt.close()
print('Saved: plots/02_age_before.png')

Saved: plots/02_age_before.png


In [130]:
def knn_impute(data, n_neighbors=5):
    data_encoded = data.copy()
    category_mappings = {}
    for col in data_encoded.select_dtypes(include='object').columns:
        data_encoded[col] = data_encoded[col].astype('category').cat.codes
        category_mappings[col] = dict(enumerate(data[col].astype('category').cat.categories))
    knn_imputer = KNNImputer(n_neighbors=n_neighbors)
    data_imputed = pd.DataFrame(knn_imputer.fit_transform(data_encoded), columns=data_encoded.columns)
    for col in data.select_dtypes(include='object').columns:
        data_imputed[col] = data_imputed[col].round().astype(int).map(category_mappings[col])
    return data_imputed

data_imputed = knn_impute(data, n_neighbors=5)
data = data_imputed
print('KNN imputation complete')

KNN imputation complete


In [131]:
fig, ax = plt.subplots(figsize=(10,5))
ax.hist(data['age'], bins=20, edgecolor='black', color='lightgreen')
ax.set_title('Age Distribution After Imputation')
ax.set_xlabel('Age')
ax.set_ylabel('Frequency')
plt.tight_layout()
plt.savefig('plots/03_age_after.png', dpi=300, bbox_inches='tight')
plt.close()
print('Saved: plots/03_age_after.png')

Saved: plots/03_age_after.png


In [132]:
print('Missing after imputation:')
print(data.isnull().sum())

Missing after imputation:
age                 0
gender              0
device_type         0
ad_position         0
browsing_history    0
time_of_day         0
click               0
conversion          0
dtype: int64


In [133]:
num_cols = data.select_dtypes(include=['float64', 'int64'])
corr = num_cols.corr()
print('Top correlations with click:')
print(corr['click'].sort_values(ascending=False)[1:11])

Top correlations with click:
conversion    0.173769
age          -0.085882
Name: click, dtype: float64


In [134]:
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Correlation Matrix - Numeric')
plt.tight_layout()
plt.savefig('plots/04_corr_numeric.png', dpi=300, bbox_inches='tight')
plt.close()
print('Saved: plots/04_corr_numeric.png')

Saved: plots/04_corr_numeric.png


In [135]:
data_encoded = pd.get_dummies(data, drop_first=True)

In [136]:
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(data=data, x='age', color='skyblue', ax=ax)
ax.set_title('Age Boxplot')
ax.axvline(data['age'].median(), color='green', linestyle='--')
plt.tight_layout()
plt.savefig('plots/06_age_box.png', dpi=300, bbox_inches='tight')
plt.close()
print('Saved: plots/06_age_box.png')

Saved: plots/06_age_box.png


In [137]:
data['age'] = data['age'].astype(int)
data['click'] = data['click'].astype(int)
data['conversion'] = data['conversion'].astype(int)
print('Data types converted')

Data types converted


In [138]:
fig, ax = plt.subplots(figsize=(8, 5))
data['click'].value_counts().plot(kind='bar', color='darkblue', ax=ax)
ax.set_title('Click Distribution')
ax.set_xlabel('Click')
ax.set_ylabel('Count')
ax.set_xticklabels(['No', 'Yes'], rotation=0)
plt.tight_layout()
plt.savefig('plots/07_click_dist.png', dpi=300, bbox_inches='tight')
plt.close()
print('Saved: plots/07_click_dist.png')

Saved: plots/07_click_dist.png


In [139]:
bins = [17, 24, 34, 44, 54, 64, 100]
labels = ['18-24', '25-34', '35-44', '45-54', '55-64', '65+']
grouped = (data.assign(age_group=pd.cut(data['age'], bins=bins, labels=labels))
          .groupby(['age_group', 'click'], observed=False).size().unstack(fill_value=0))
fig, ax = plt.subplots(figsize=(12, 6))
grouped.plot(kind='bar', stacked=True, ax=ax)
ax.set_title('Clicks by Age Group')
ax.set_xlabel('Age Group')
ax.set_ylabel('Count')
ax.legend(['No Click', 'Click'])
plt.xticks(rotation=0)
plt.tight_layout()
plt.savefig('plots/08_click_by_age.png', dpi=300, bbox_inches='tight')
plt.close()
print('Saved: plots/08_click_by_age.png')

Saved: plots/08_click_by_age.png


In [140]:
click_counts = data[data['click'] == 1]['gender'].value_counts(normalize=True) * 100
fig, ax = plt.subplots(figsize=(10, 5))
click_counts.plot(kind='bar', color='darkblue', ax=ax)
ax.set_title('Gender - Click %')
ax.set_xlabel('Gender')
ax.set_ylabel('Percentage')
plt.tight_layout()
plt.savefig('plots/09_gender.png', dpi=300, bbox_inches='tight')
plt.close()
print('Saved: plots/09_gender.png')

Saved: plots/09_gender.png


In [141]:
click_counts = data[data['click'] == 1]['device_type'].value_counts(normalize=True) * 100
fig, ax = plt.subplots(figsize=(10, 5))
click_counts.plot(kind='bar', color='darkblue', ax=ax)
ax.set_title('Device Type - Click %')
ax.set_xlabel('Device')
ax.set_ylabel('Percentage')
plt.tight_layout()
plt.savefig('plots/10_device.png', dpi=300, bbox_inches='tight')
plt.close()
print('Saved: plots/10_device.png')

Saved: plots/10_device.png


In [142]:
click_counts = data[data['click'] == 1]['ad_position'].value_counts(normalize=True) * 100
fig, ax = plt.subplots(figsize=(10, 5))
click_counts.plot(kind='bar', color='darkblue', ax=ax)
ax.set_title('Ad Position - Click %')
ax.set_xlabel('Position')
ax.set_ylabel('Percentage')
plt.tight_layout()
plt.savefig('plots/11_position.png', dpi=300, bbox_inches='tight')
plt.close()
print('Saved: plots/11_position.png')

Saved: plots/11_position.png


In [143]:
click_counts = data[data['click'] == 1]['browsing_history'].value_counts(normalize=True) * 100
fig, ax = plt.subplots(figsize=(10, 5))
click_counts.plot(kind='bar', color='darkblue', ax=ax)
ax.set_title('Browsing History - Click %')
ax.set_xlabel('History')
ax.set_ylabel('Percentage')
plt.tight_layout()
plt.savefig('plots/12_history.png', dpi=300, bbox_inches='tight')
plt.close()
print('Saved: plots/12_history.png')

Saved: plots/12_history.png


In [144]:
click_counts = data[data['click'] == 1]['time_of_day'].value_counts(normalize=True) * 100
fig, ax = plt.subplots(figsize=(10, 5))
click_counts.plot(kind='bar', color='darkblue', ax=ax)
ax.set_title('Time of Day - Click %')
ax.set_xlabel('Time')
ax.set_ylabel('Percentage')
plt.tight_layout()
plt.savefig('plots/13_time.png', dpi=300, bbox_inches='tight')
plt.close()
print('Saved: plots/13_time.png')

Saved: plots/13_time.png


In [145]:
crosstab = pd.crosstab(data['device_type'], data['ad_position'])
fig, ax = plt.subplots(figsize=(10, 6))
crosstab.plot(kind='bar', ax=ax, color=sns.color_palette('pastel'))
ax.set_title('Device Type vs Ad Position')
ax.set_xlabel('Device')
ax.set_ylabel('Count')
ax.legend(title='Position')
plt.tight_layout()
plt.savefig('plots/14_device_pos.png', dpi=300, bbox_inches='tight')
plt.close()
print('Saved: plots/14_device_pos.png')

Saved: plots/14_device_pos.png


In [146]:
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(x='device_type', y='age', data=data, ax=ax)
ax.set_title('Age by Device Type')
plt.tight_layout()
plt.savefig('plots/15_age_device.png', dpi=300, bbox_inches='tight')
plt.close()
print('Saved: plots/15_age_device.png')

Saved: plots/15_age_device.png


In [147]:
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(x='time_of_day', y='age', data=data, ax=ax)
ax.set_title('Age by Time of Day')
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('plots/16_age_time.png', dpi=300, bbox_inches='tight')
plt.close()
print('Saved: plots/16_age_time.png')

Saved: plots/16_age_time.png


In [148]:
def create_funnel_plot(data, feature_name, output_filename):
    """
    Create optimized funnel plot for any categorical feature
    
    Parameters:
    -----------
    data : DataFrame
        Input data with 'click' and 'conversion' columns
    feature_name : str
        Column name of categorical feature (e.g., 'time_of_day', 'device_type', 'gender')
    output_filename : str
        Path to save PNG file (e.g., 'plots/04_funnel_time.png')
    """
    from plotly.subplots import make_subplots
    import plotly.graph_objects as go
    
    categories = data[feature_name].unique()
    
    # Determine grid layout
    num_cats = len(categories)
    if num_cats <= 3:
        rows, cols = 1, num_cats
    elif num_cats <= 6:
        rows, cols = 2, 3
    else:
        rows, cols = (num_cats + 2) // 3, 3
    
    fig = make_subplots(
        rows=rows, 
        cols=cols, 
        subplot_titles=categories,
        specs=[[{'type': 'funnel'} for _ in range(cols)] for _ in range(rows)]
    )
    
    colors = ['#667eea', '#764ba2', '#f093fb']
    labels = ['Impressions', 'Clicks', 'Conversions']
    
    for idx, category in enumerate(categories):
        cat_data = data[data[feature_name] == category]
        impressions = len(cat_data)
        clicks = cat_data['click'].sum()
        conversions = cat_data['conversion'].sum()
        
        row = idx // cols + 1
        col = idx % cols + 1
        
        show_legend = idx == 0
        
        fig.add_trace(
            go.Funnel(
                y=labels,  # Always show labels for structure
                x=[impressions, clicks, conversions],
                marker=dict(color=colors),
                textposition='inside',
                textinfo='value',
                textfont=dict(size=12, color='white'),
                name=str(category),
                showlegend=show_legend,
                hovertemplate='<b>%{y}</b><br>Value: %{x:,.0f}<extra></extra>'
            ),
            row=row, col=col
        )
    
    # Hide y-axis labels for all but first funnel
    for i in range(num_cats):
        if i > 0:
            fig.update_yaxes(visible=False, row=(i // cols + 1), col=(i % cols + 1))
    
    height = 600 if rows == 1 else (900 if rows == 2 else 1200)
    width = 1700 if cols >= 3 else (800 if cols == 1 else 1200)
    
    fig.update_layout(
        title_text=f'Conversion Funnels by {feature_name.replace("_", " ").title()}',
        height=height,
        width=width,
        showlegend=True,
        font=dict(size=11)
    )
    
    fig.write_image(output_filename, width=width, height=height)
    print(f'✓ Saved: {output_filename}')
    return fig


# USAGE - Call function for different features
create_funnel_plot(data, 'time_of_day', 'plots/04_funnel_time.png')
create_funnel_plot(data, 'device_type', 'plots/01_funnel_device.png')
create_funnel_plot(data, 'browsing_history', 'plots/02_funnel_history.png')
create_funnel_plot(data, 'gender', 'plots/05_funne_gender.png')
create_funnel_plot(data, 'ad_position', 'plots/06_funnel_position.png')

✓ Saved: plots/04_funnel_time.png
✓ Saved: plots/01_funnel_device.png
✓ Saved: plots/02_funnel_history.png
✓ Saved: plots/05_funne_gender.png
✓ Saved: plots/06_funnel_position.png


In [149]:
from itertools import product

def create_segment_combinations_csv(
    data,
    output_filename='segment_performance.csv',
    driver_features=('device_type', 'browsing_history'),  # controls the Segment_Combination label
    value_per_conversion=40,
    value_per_click=1,
    total_spend=20000
):
    """
    Build a segment_performance CSV where:
      - Segment_Combination string is built only from `driver_features` (e.g., device_type | browsing_history)
      - BUT the output CSV contains ALL categorical columns as separate fields
      - Metrics are computed per valid combination (existing rows only)
    """
    # Detect ALL categorical features (object or category)
    all_categorical = data.select_dtypes(include=['object', 'category']).columns.tolist()
    # Ensure driver features exist and are categorical
    for f in driver_features:
        if f not in all_categorical:
            raise ValueError(f"Driver feature '{f}' is not a categorical column in the dataset.")

    # Build value lists for DRIVER features only (for combinations)
    feature_values = {f: data[f].dropna().unique().tolist() for f in driver_features}
    all_combinations = product(*feature_values.values())

    rows = []

    for combo in all_combinations:
        # Filter rows for this combination using only driver features
        mask = pd.Series(True, index=data.index)
        seg_parts = []
        driver_values_map = {}
        for f, v in zip(driver_features, combo):
            mask &= (data[f] == v)
            seg_parts.append(str(v))
            driver_values_map[f] = v

        subset = data[mask]

        # Skip empty segments
        if subset.empty:
            continue

        impressions = len(subset)
        clicks = int(subset['click'].sum())
        conversions = int(subset['conversion'].sum())

        rev_conv = conversions * value_per_conversion
        rev_click = (clicks - conversions) * value_per_click
        revenue = rev_conv + rev_click

        ctr = (clicks / impressions * 100) if impressions > 0 else 0.0
        cvr = (conversions / clicks * 100) if clicks > 0 else 0.0

        # Proportional spend allocation by impression share
        segment_spend = (impressions / len(data)) * total_spend if len(data) > 0 else 0.0

        roas = (revenue / segment_spend) if segment_spend > 0 else 0.0
        cpa = (segment_spend / conversions) if conversions > 0 else 0.0
        ltv = (revenue / conversions) if conversions > 0 else 0.0
        profit = revenue - segment_spend
        roi_pct = (profit / segment_spend * 100) if segment_spend > 0 else 0.0

        # Start row with the requested segment label (only from driver features)
        row = {
            'Segment_Combination': ' | '.join(seg_parts),
            'Impressions': impressions,
            'Clicks': clicks,
            'Conversions': conversions,
            'Spend': round(segment_spend, 2),
            'Revenue': round(revenue, 2),
            'CTR_Percent': round(ctr, 2),
            'CVR_Percent': round(cvr, 2),
            'ROAS': round(roas, 3),
            'CPA': round(cpa, 2),
            'LTV': round(ltv, 2),
            'Profit': round(profit, 2),
            'ROI_Percent': round(roi_pct, 2),
        }

        # Add ALL categorical feature columns to the output:
        # - For driver features: use the exact value from the combination
        # - For other categorical features: fill with the mode for that subset (or 'Mixed' if tie/empty)
        for cat_col in all_categorical:
            if cat_col in driver_features:
                row[cat_col] = driver_values_map[cat_col]
            else:
                if subset[cat_col].notna().any():
                    mode_vals = subset[cat_col].mode(dropna=True)
                    if len(mode_vals) > 0:
                        row[cat_col] = str(mode_vals.iloc[0])
                    else:
                        row[cat_col] = 'Mixed'
                else:
                    row[cat_col] = 'Unknown'

        rows.append(row)

    # Build DataFrame, sort and save
    out_df = pd.DataFrame(rows)

    # Reorder columns: Segment_Combination, ALL categorical columns, then metrics
    cat_cols_order = [c for c in all_categorical]  # keep original cat order
    metric_cols = [
        'Impressions', 'Clicks', 'Conversions', 'Spend', 'Revenue',
        'CTR_Percent', 'CVR_Percent', 'ROAS', 'CPA', 'LTV', 'Profit', 'ROI_Percent'
    ]
    ordered_cols = ['Segment_Combination'] + cat_cols_order + metric_cols
    # Some categorical columns might duplicate names or not exist; ensure presence
    ordered_cols = [c for c in ordered_cols if c in out_df.columns]

    out_df = out_df[ordered_cols].sort_values('ROAS', ascending=False).reset_index(drop=True)
    out_df.to_csv(output_filename, index=False)

    print(f"\n✓ Saved: {output_filename}")
    print(f"Total Segments: {len(out_df)}")
    print("\nTop 10 Segments by ROAS:")
    print(out_df.head(10)[['Segment_Combination', 'ROAS', 'Profit', 'ROI_Percent']].to_string(index=False))

    return out_df


# Usage (keeps Segment_Combination built from device_type | browsing_history,
# but includes ALL categorical columns in the CSV):
segment_combinations = create_segment_combinations_csv(
    data,
    output_filename='segment_performance.csv',
    driver_features=('device_type', 'browsing_history', 'ad_position', 'time_of_day', 'gender'),
    value_per_conversion=40,
    value_per_click=1,
    total_spend=20000
)

print('\n=== BEST PERFORMING (TOP 5) ===')
print(segment_combinations.head(5)[['Segment_Combination', 'device_type', 'browsing_history', 'ROAS', 'Profit']])



✓ Saved: segment_performance.csv
Total Segments: 1641

Top 10 Segments by ROAS:
                                    Segment_Combination   ROAS  Profit  ROI_Percent
                 Mobile | News | Top | Unknown | Female 20.000    38.0      1900.00
           Tablet | Shopping | Unknown | Morning | Male 20.000    38.0      1900.00
Mobile | Entertainment | Unknown | Evening | Non-Binary 20.000    76.0      1900.00
                 Unknown | News | Side | Unknown | Male 20.000    38.0      1900.00
       Unknown | Shopping | Unknown | Unknown | Unknown 20.000    38.0      1900.00
             Mobile | News | Unknown | Evening | Female 13.333    74.0      1233.33
          Unknown | Education | Bottom | Unknown | Male 10.250    37.0       925.00
       Unknown | Education | Unknown | Morning | Female 10.250    37.0       925.00
         Unknown | Shopping | Bottom | Unknown | Female 10.250    37.0       925.00
              Mobile | News | Unknown | Night | Unknown 10.250    37.0       92

## Feature Engineering & Encoding

In [150]:
cat_cols = data.select_dtypes(include='object').columns
data = pd.get_dummies(data, columns=cat_cols, drop_first=True).astype(int)
print(f'Shape after encoding: {data.shape}')
print('Data ready for modeling')

Shape after encoding: (10000, 21)
Data ready for modeling


In [151]:
# Utility to build importance DataFrame
def importance_df(feature_names, importances, top_n=30):
    imp = pd.DataFrame({
        'feature': feature_names,
        'importance': np.array(importances, dtype=float)
    })
    imp = imp.sort_values('importance', ascending=False).reset_index(drop=True)
    return imp.head(top_n).copy(), imp

print('Computing feature importances...')

Computing feature importances...


## Model Training with MLflow

In [None]:


mlflow.set_tracking_uri("http://localhost:5000")
mlflow.start_run()

mlflow.set_experiment('Ad_Click_Prediction')
KNN_data = data.copy()
XGBoost_data = data.copy()
LightGBM_data = data.copy()
print('Data ready for 3 models')

http://localhost:5000


KeyboardInterrupt: 

### KNN Model

In [None]:
print('\\n=== KNN MODEL ===')
X = KNN_data.drop('click', axis=1)
y = KNN_data['click']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
smt = SMOTE()
X_train, y_train = smt.fit_resample(X_train, y_train)

mlflow.set_tracking_uri("http://localhost:5000")
with mlflow.start_run(run_name='KNN'):
    classifier_knn = KNeighborsClassifier(n_neighbors=8, weights='distance', algorithm='auto')
    classifier_knn.fit(X_train, y_train)
    cv_scores_knn = cross_val_score(classifier_knn, X, y, cv=5)
    y_pred = classifier_knn.predict(X_test)
    
    accuracy_knn = accuracy_score(y_test, y_pred)
    precision_knn = precision_score(y_test, y_pred)
    recall_knn = recall_score(y_test, y_pred)
    f1_knn = f1_score(y_test, y_pred)
    
    mlflow.log_params({'model': 'KNN', 'n_neighbors': 8, 'weights': 'distance'})
    mlflow.log_metrics({'accuracy': accuracy_knn, 'precision': precision_knn, 'recall': recall_knn, 'f1': f1_knn, 'cv_mean': cv_scores_knn.mean()})

    
    print(f'Accuracy: {accuracy_knn:.4f}, Precision: {precision_knn:.4f}, Recall: {recall_knn:.4f}, F1: {f1_knn:.4f}')
    print(f'CV Mean: {cv_scores_knn.mean():.4f}')

\n=== KNN MODEL ===
Accuracy: 0.8125, Precision: 0.8423, Recall: 0.8741, F1: 0.8579
CV Mean: 0.8036


In [154]:
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title('KNN Confusion Matrix')
ax.set_ylabel('True')
ax.set_xlabel('Predicted')
plt.tight_layout()
plt.savefig('plots/17_knn_cm.png', dpi=300, bbox_inches='tight')
plt.close()
print('Saved: plots/17_knn_cm.png')

Saved: plots/17_knn_cm.png


In [155]:
precision, recall, _ = precision_recall_curve(y_test, y_pred)
auc_score = auc(recall, precision)
fpr, tpr, _ = roc_curve(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
axes[0].plot(recall, precision, label=f'Precision-Recall (AUC={auc_score:.2f})', color='b')
axes[0].set_xlabel('Recall')
axes[0].set_ylabel('Precision')
axes[0].set_title('KNN PR Curve')
axes[0].legend()

axes[1].plot(fpr, tpr, label=f'ROC Curve (AUC={roc_auc:.2f})')
axes[1].plot([0, 1], [0, 1], 'k--')
axes[1].set_xlabel('FPR')
axes[1].set_ylabel('TPR')
axes[1].set_title('KNN ROC Curve')
axes[1].legend()
plt.tight_layout()
plt.savefig('plots/18_knn_curves.png', dpi=300, bbox_inches='tight')
plt.close()
plt.show()
print('Saved: plots/18_knn_curves.png')

Saved: plots/18_knn_curves.png


### XGBoost Model

In [156]:
from plotly import express as px

X = XGBoost_data.drop(['click','conversion'] ,axis=1)
y = XGBoost_data['click']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
smt = SMOTE()
X_train, y_train = smt.fit_resample(X_train, y_train)

with mlflow.start_run(run_name='XGBoost'):
    classifier_xgb = XGBClassifier(n_estimators=400, max_depth=10, learning_rate=0.15, subsample=0.9)
    classifier_xgb.fit(X_train, y_train, verbose=False)
    cv_scores_xgb = cross_val_score(classifier_xgb, X, y, cv=5)
    y_pred = classifier_xgb.predict(X_test)
    
    accuracy_xgb = accuracy_score(y_test, y_pred)
    precision_xgb = precision_score(y_test, y_pred)
    recall_xgb = recall_score(y_test, y_pred)
    f1_xgb = f1_score(y_test, y_pred)
    
    mlflow.log_params({'model': 'XGBoost', 'n_estimators': 400, 'max_depth': 10, 'learning_rate': 0.15})
    mlflow.log_metrics({'accuracy': accuracy_xgb, 'precision': precision_xgb, 'recall': recall_xgb, 'f1': f1_xgb, 'cv_mean': cv_scores_xgb.mean()})
    
    print(f'Accuracy: {accuracy_xgb:.4f}, Precision: {precision_xgb:.4f}, Recall: {recall_xgb:.4f}, F1: {f1_xgb:.4f}')
    print(f'CV Mean: {cv_scores_xgb.mean():.4f}')

xgb_importances = classifier_xgb.feature_importances_
xgb_top, xgb_full = importance_df(X.columns.tolist(), xgb_importances, top_n=30)

# Verify we have data
print(f'XGBoost total features: {len(xgb_importances)}')
print(f'Non-zero features: {(xgb_importances > 0).sum()}')
print(f'Top features:\n{xgb_top.head()}')

# Plot and save
fig_xgb = px.bar(
    xgb_top.sort_values('importance'),
    x='importance',
    y='feature',
    orientation='h',
    title='XGBoost Feature Importance - Top 30',
    color='importance',
    color_continuous_scale='Greens'
)
fig_xgb.update_layout(height=800, width=1000)
fig_xgb.write_image('plots/04_feature_importance_xgboost.png', width=1000, height=800)
xgb_full.to_csv('plots/feature_importance_xgboost_full.csv', index=False)
print('Saved: plots/04_feature_importance_xgboost.png')
print('Saved: plots/feature_importance_xgboost_full.csv')
fig_xgb.show()

Accuracy: 0.8300, Precision: 0.8429, Recall: 0.9077, F1: 0.8741
CV Mean: 0.8352
XGBoost total features: 19
Non-zero features: 19
Top features:
                          feature  importance
0           browsing_history_News    0.071327
1   browsing_history_Social Media    0.065326
2               gender_Non-Binary    0.058643
3  browsing_history_Entertainment    0.058376
4       browsing_history_Shopping    0.058089
Saved: plots/04_feature_importance_xgboost.png
Saved: plots/feature_importance_xgboost_full.csv


In [157]:
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title('XGBoost Confusion Matrix')
plt.tight_layout()
plt.savefig('plots/19_xgb_cm.png', dpi=300, bbox_inches='tight')
plt.close()

In [158]:
precision, recall, _ = precision_recall_curve(y_test, y_pred)
auc_score = auc(recall, precision)
fpr, tpr, _ = roc_curve(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
axes[0].plot(recall, precision, label=f'Precision-Recall (AUC={auc_score:.2f})', color='b')
axes[0].set_xlabel('Recall')
axes[0].set_ylabel('Precision')
axes[0].set_title('XGBoost PR Curve')
axes[0].legend()

axes[1].plot(fpr, tpr, label=f'ROC Curve (AUC={roc_auc:.2f})')
axes[1].plot([0, 1], [0, 1], 'k--')
axes[1].set_xlabel('FPR')
axes[1].set_ylabel('TPR')
axes[1].set_title('XGBoost ROC Curve')
axes[1].legend()
plt.tight_layout()
plt.savefig('plots/20_xgb_curves.png', dpi=300, bbox_inches='tight')
plt.close()

### LightGBM Model

In [159]:
print('\\n=== LIGHTGBM MODEL ===')
X = LightGBM_data.drop(['click','conversion'], axis=1)
y = LightGBM_data['click']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
smt = SMOTE()
X_train, y_train = smt.fit_resample(X_train, y_train)

num_leaves=250
learning_rate=0.15
n_estimators=300


with mlflow.start_run(run_name='LightGBM'):
    classifier_lgb = lgb.LGBMClassifier(num_leaves=num_leaves, learning_rate=learning_rate, n_estimators=n_estimators, max_depth=0, verbose=-1)
    classifier_lgb.fit(X_train, y_train)
    cv_scores_lgb = cross_val_score(classifier_lgb, X, y, cv=5)
    y_pred = classifier_lgb.predict(X_test)
    
    accuracy_lgb = accuracy_score(y_test, y_pred)
    precision_lgb = precision_score(y_test, y_pred)
    recall_lgb = recall_score(y_test, y_pred)
    f1_lgb = f1_score(y_test, y_pred)
    
    mlflow.log_params({'model': 'LightGBM', 'num_leaves': 116, 'learning_rate': 0.439, 'n_estimators': 170})
    mlflow.log_metrics({'accuracy': accuracy_lgb, 'precision': precision_lgb, 'recall': recall_lgb, 'f1': f1_lgb, 'cv_mean': cv_scores_lgb.mean()})
    
    print(f'Accuracy: {accuracy_lgb:.4f}, Precision: {precision_lgb:.4f}, Recall: {recall_lgb:.4f}, F1: {f1_lgb:.4f}')
    print(f'CV Mean: {cv_scores_lgb.mean():.4f}')

\n=== LIGHTGBM MODEL ===
Accuracy: 0.8365, Precision: 0.8518, Recall: 0.9062, F1: 0.8781
CV Mean: 0.8339


In [160]:
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title('LightGBM Confusion Matrix')
plt.tight_layout()
plt.savefig('plots/21_lgb_cm.png', dpi=300, bbox_inches='tight')
plt.close()

In [161]:
precision, recall, _ = precision_recall_curve(y_test, y_pred)
auc_score = auc(recall, precision)
fpr, tpr, _ = roc_curve(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
axes[0].plot(recall, precision, label=f'Precision-Recall (AUC={auc_score:.2f})', color='b')
axes[0].set_xlabel('Recall')
axes[0].set_ylabel('Precision')
axes[0].set_title('LightGBM PR Curve')
axes[0].legend()

axes[1].plot(fpr, tpr, label=f'ROC Curve (AUC={roc_auc:.2f})')
axes[1].plot([0, 1], [0, 1], 'k--')
axes[1].set_xlabel('FPR')
axes[1].set_ylabel('TPR')
axes[1].set_title('LightGBM ROC Curve')
axes[1].legend()
plt.tight_layout()
plt.savefig('plots/22_lgb_curves.png', dpi=300, bbox_inches='tight')
plt.close()

In [162]:
import plotly.express as px
lgb_gain = classifier_lgb.booster_.feature_importance(importance_type='gain')
lgb_top, lgb_full = importance_df(feature_names=X.columns.tolist(), importances=lgb_gain, top_n=10)

# Plot and save
fig_lgb = px.bar(
    lgb_top.sort_values('importance'),
    x='importance',
    y='feature',
    orientation='h',
    title='LightGBM Feature Importance (Gain) - Top 10',
    color='importance',
    color_continuous_scale='Blues'
)
fig_lgb.update_layout(height=800, width=1000)
fig_lgb.write_image('plots/03_feature_importance_lightgbm.png', width=1000, height=800)
lgb_full.to_csv('plots/feature_importance_lightgbm_full.csv', index=False)
print('Saved: plots/03_feature_importance_lightgbm.png')
print('Saved: plots/feature_importance_lightgbm_full.csv')
fig_lgb.show()

Saved: plots/03_feature_importance_lightgbm.png
Saved: plots/feature_importance_lightgbm_full.csv


## Summary

In [163]:
model_metrics_list = []
model_metrics_list.append({'Model': 'KNN', 'Accuracy': accuracy_knn, 'Precision': precision_knn, 'Recall': recall_knn, 'F1': f1_knn, 'CV Mean': cv_scores_knn.mean()})
model_metrics_list.append({'Model': 'XGBoost', 'Accuracy': accuracy_xgb, 'Precision': precision_xgb, 'Recall': recall_xgb, 'F1': f1_xgb, 'CV Mean': cv_scores_xgb.mean()})
model_metrics_list.append({'Model': 'LightGBM', 'Accuracy': accuracy_lgb, 'Precision': precision_lgb, 'Recall': recall_lgb, 'F1': f1_lgb, 'CV Mean': cv_scores_lgb.mean()})

all_metrics_df = pd.DataFrame(model_metrics_list)
all_metrics_df.to_csv('model_performance.csv', index=False)

print('\\n=== ALL MODEL METRICS ===')
print(all_metrics_df.to_string(index=False))

\n=== ALL MODEL METRICS ===
   Model  Accuracy  Precision   Recall       F1  CV Mean
     KNN    0.8125   0.842262 0.874131 0.857901   0.8036
 XGBoost    0.8300   0.842857 0.907692 0.874074   0.8352
LightGBM    0.8365   0.851772 0.906154 0.878122   0.8339


In [164]:
fig, ax = plt.subplots(figsize=(12, 6))
x = np.arange(len(all_metrics_df))
width = 0.15
ax.bar(x - 2*width, all_metrics_df['Accuracy'], width, label='Accuracy')
ax.bar(x - width, all_metrics_df['Precision'], width, label='Precision')
ax.bar(x, all_metrics_df['Recall'], width, label='Recall')
ax.bar(x + width, all_metrics_df['F1'], width, label='F1')
ax.bar(x + 2*width, all_metrics_df['CV Mean'], width, label='CV Mean')
ax.set_xlabel('Model')
ax.set_ylabel('Score')
ax.set_title('Model Comparison')
ax.set_xticks(x)
ax.set_xticklabels(all_metrics_df['Model'])
ax.legend()
ax.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.savefig('plots/23_model_comparison.png', dpi=300, bbox_inches='tight')
plt.close()
print('Saved: plots/23_model_comparison.png')

print('\\n=== ANALYSIS COMPLETE ===')
print('All plots saved in plots/ folder')
print('All model metrics logged to MLflow')
print('Run: mlflow ui')

Saved: plots/23_model_comparison.png
\n=== ANALYSIS COMPLETE ===
All plots saved in plots/ folder
All model metrics logged to MLflow
Run: mlflow ui


In [165]:

# Calculate metrics
total_impressions = len(data)
total_clicks = data['click'].sum()
total_conversions = data['conversion'].sum()
total_spend = 20000

revenue_from_conversions = total_conversions * 40
revenue_from_clicks = (total_clicks - total_conversions) * 1
total_revenue = revenue_from_conversions + revenue_from_clicks

ctr = (total_clicks / total_impressions * 100) if total_impressions > 0 else 0
cvr = (total_conversions / total_clicks * 100) if total_clicks > 0 else 0
roas = (total_revenue / total_spend) if total_spend > 0 else 0
cpa = (total_spend / total_conversions) if total_conversions > 0 else 0
ltv = (total_revenue / total_conversions) if total_conversions > 0 else 0
profit = total_revenue - total_spend
roi_pct = (profit / total_spend * 100) if total_spend > 0 else 0

# Create DataFrame with each metric as separate column
adtech_metrics = pd.DataFrame({
    'Total_Impressions': [total_impressions],
    'Total_Clicks': [total_clicks],
    'Total_Conversions': [total_conversions],
    'Total_Spend': [total_spend],
    'Total_Revenue': [round(total_revenue, 2)],
    'CTR_Percent': [round(ctr, 2)],
    'CVR_Percent': [round(cvr, 2)],
    'ROAS': [round(roas, 3)],
    'CPA': [round(cpa, 2)],
    'LTV': [round(ltv, 2)],
    'Profit': [round(profit, 2)],
    'ROI_Percent': [round(roi_pct, 2)]
})

adtech_metrics.to_csv('adtech_metrics.csv', index=False)

print('\n=== ADTECH METRICS ===')
print(adtech_metrics.to_string(index=False))

# Access individual metrics as separate variables
print(f'\nCTR: {ctr:.2f}%')
print(f'CVR: {cvr:.2f}%')
print(f'ROAS: {roas:.3f}')
print(f'CPA: ${cpa:.2f}')
print(f'LTV: ${ltv:.2f}')
print(f'ROI: {roi_pct:.2f}%')



=== ADTECH METRICS ===
 Total_Impressions  Total_Clicks  Total_Conversions  Total_Spend  Total_Revenue  CTR_Percent  CVR_Percent  ROAS   CPA   LTV  Profit  ROI_Percent
             10000          6500                531        20000          27209         65.0         8.17  1.36 37.66 51.24    7209        36.04

CTR: 65.00%
CVR: 8.17%
ROAS: 1.360
CPA: $37.66
LTV: $51.24
ROI: 36.05%
