In [34]:
import pandas as pd
from obp.dataset import OpenBanditDataset
import os
import plotly.express as px
import numpy as np
import string
import plotly.graph_objects as go
import math
from plotly.subplots import make_subplots

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score

# Show the working directory so the relative data paths used below can be checked.
print(os.getcwd())

# Raise pandas' display limits so wide frames are printed without truncation.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

/Users/armandoordoricadelatorre/Documents/U of T/PhD/PhD Research/OBP_Replication


### Import logged data from `all.csv` and `item_context.csv`

In [35]:

# BTS / ALL sample: the logged data and the item context table.
log_df = pd.read_csv("zr-obp/obd/bts/all/all.csv", index_col=0)
items_df = pd.read_csv("zr-obp/obd/bts/all/item_context.csv", index_col=0)

print("log_df shape:", log_df.shape)
print("items_df shape:", items_df.shape)

# Only the first 40 column names are printed to keep the output short.
leading_columns = list(log_df.columns[:40])
print("\nlog_df columns:")
print(leading_columns)

print("\nfirst 5 log rows:")
print(log_df.iloc[:5])

print("\nfirst 5 item rows:")
print(items_df.iloc[:5])

log_df shape: (10000, 89)
items_df shape: (80, 5)

log_df columns:
['timestamp', 'item_id', 'position', 'click', 'propensity_score', 'user_feature_0', 'user_feature_1', 'user_feature_2', 'user_feature_3', 'user-item_affinity_0', 'user-item_affinity_1', 'user-item_affinity_2', 'user-item_affinity_3', 'user-item_affinity_4', 'user-item_affinity_5', 'user-item_affinity_6', 'user-item_affinity_7', 'user-item_affinity_8', 'user-item_affinity_9', 'user-item_affinity_10', 'user-item_affinity_11', 'user-item_affinity_12', 'user-item_affinity_13', 'user-item_affinity_14', 'user-item_affinity_15', 'user-item_affinity_16', 'user-item_affinity_17', 'user-item_affinity_18', 'user-item_affinity_19', 'user-item_affinity_20', 'user-item_affinity_21', 'user-item_affinity_22', 'user-item_affinity_23', 'user-item_affinity_24', 'user-item_affinity_25', 'user-item_affinity_26', 'user-item_affinity_27', 'user-item_affinity_28', 'user-item_affinity_29', 'user-item_affinity_30']

first 5 log rows:
           

### Remapping categorical features to readable categories

In [36]:

def _letter_code(position):
    """Return the spreadsheet-style letter label for a 0-based position (0 -> A, 25 -> Z, 26 -> AA)."""
    letters = ""
    remaining = position + 1
    while remaining > 0:
        remaining, offset = divmod(remaining - 1, len(string.ascii_uppercase))
        letters = string.ascii_uppercase[offset] + letters
    return letters


def remap_user_features(df, feature_cols):
    """
    Map hash values in user_feature_N to short readable codes like A1, B1, ...

    Codes are assigned in order of first appearance of each distinct non-missing
    value. Features with more than 26 distinct values continue with AA, AB, ...
    so that every value receives a code (a plain A-Z alphabet would leave the
    27th and later values unmapped, i.e. NaN).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns listed in ``feature_cols``. It is not modified.
    feature_cols : iterable of str
        Column names to remap. The text after the last underscore of each name
        is appended to every code of that column.

    Returns
    -------
    tuple (pandas.DataFrame, dict)
        A copy of ``df`` with the listed columns replaced by their codes
        (missing values stay missing), and a dict
        ``{column: {original_value: code}}`` with the mappings used.
    """
    df_copy = df.copy()
    mapping_dicts = {}

    for col in feature_cols:
        # Extract the feature index (N from 'user_feature_N')
        feature_idx = col.split("_")[-1]
        uniques = df[col].dropna().unique()

        # Build codes A{N}, B{N}, ..., Z{N}, AA{N}, ... — one per distinct value
        mapping = {
            value: f"{_letter_code(position)}{feature_idx}"
            for position, value in enumerate(uniques)
        }

        df_copy[col] = df[col].map(mapping)
        mapping_dicts[col] = mapping

    return df_copy, mapping_dicts

# Example usage: remap every user_feature* column of the raw log.
user_feature_cols = [name for name in log_df.columns if name.startswith("user_feature")]
log_df_readable, mappings = remap_user_features(log_df, user_feature_cols)

print("Sample remapped features:")
print(log_df_readable[user_feature_cols].head())

# One line per feature: original value -> readable code.
print("\nMappings used:")
for feat in mappings:
    print(f"{feat}: {mappings[feat]}")

Sample remapped features:
  user_feature_0 user_feature_1 user_feature_2 user_feature_3
0             A0             A1             A2             A3
1             A0             B1             B2             B3
2             A0             A1             C2             A3
3             A0             A1             A2             B3
4             A0             A1             C2             B3

Mappings used:
user_feature_0: {'81ce123cbb5bd8ce818f60fb3586bba5': 'A0', 'cef3390ed299c09874189c387777674a': 'B0', '4ae385d792f81dde128124a925a830de': 'C0'}
user_feature_1: {'03a5648a76832f83c859d46bc06cb64a': 'A1', '2d03db5543b14483e52d761760686b64': 'B1', '6ff54aa8ff7a9dde75161c20a3ee4231': 'C1', 'f1c2d6a32ec39249160cf784b63f4c6f': 'D1', '8b50621825ffd909dd8d8317d366271f': 'E1'}
user_feature_2: {'7bc94a2da491829b777c49c4b5e480f2': 'A2', '2723d2eb8bba04e0362098011fa3997b': 'B2', 'c2e4f76cdbabecd33b8c762aeef386b3': 'C2', '719dab53a7560218a9d1f96b25d6fa32': 'D2', '9b2d331c329ceb74d3dcfb48d8798c

In [37]:
# Preview the first rows of the remapped log.
log_df_readable.head()

Unnamed: 0,timestamp,item_id,position,click,propensity_score,user_feature_0,user_feature_1,user_feature_2,user_feature_3,user-item_affinity_0,user-item_affinity_1,user-item_affinity_2,user-item_affinity_3,user-item_affinity_4,user-item_affinity_5,user-item_affinity_6,user-item_affinity_7,user-item_affinity_8,user-item_affinity_9,user-item_affinity_10,user-item_affinity_11,user-item_affinity_12,user-item_affinity_13,user-item_affinity_14,user-item_affinity_15,user-item_affinity_16,user-item_affinity_17,user-item_affinity_18,user-item_affinity_19,user-item_affinity_20,user-item_affinity_21,user-item_affinity_22,user-item_affinity_23,user-item_affinity_24,user-item_affinity_25,user-item_affinity_26,user-item_affinity_27,user-item_affinity_28,user-item_affinity_29,user-item_affinity_30,user-item_affinity_31,user-item_affinity_32,user-item_affinity_33,user-item_affinity_34,user-item_affinity_35,user-item_affinity_36,user-item_affinity_37,user-item_affinity_38,user-item_affinity_39,user-item_affinity_40,user-item_affinity_41,user-item_affinity_42,user-item_affinity_43,user-item_affinity_44,user-item_affinity_45,user-item_affinity_46,user-item_affinity_47,user-item_affinity_48,user-item_affinity_49,user-item_affinity_50,user-item_affinity_51,user-item_affinity_52,user-item_affinity_53,user-item_affinity_54,user-item_affinity_55,user-item_affinity_56,user-item_affinity_57,user-item_affinity_58,user-item_affinity_59,user-item_affinity_60,user-item_affinity_61,user-item_affinity_62,user-item_affinity_63,user-item_affinity_64,user-item_affinity_65,user-item_affinity_66,user-item_affinity_67,user-item_affinity_68,user-item_affinity_69,user-item_affinity_70,user-item_affinity_71,user-item_affinity_72,user-item_affinity_73,user-item_affinity_74,user-item_affinity_75,user-item_affinity_76,user-item_affinity_77,user-item_affinity_78,user-item_affinity_79
0,2019-11-24 00:00:17.004101+00:00,79,2,0,0.087125,A0,A1,A2,A3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2019-11-24 00:00:19.715857+00:00,14,1,0,0.006235,A0,B1,B2,B3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2019-11-24 00:01:04.303227+00:00,18,2,0,0.0613,A0,A1,C2,A3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2019-11-24 00:01:11.571162+00:00,28,1,0,0.01943,A0,A1,A2,B3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2019-11-24 00:02:41.811768+00:00,65,2,0,0.019375,A0,A1,C2,B3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
# List the column names of the remapped log.
print(log_df_readable.columns)

Index(['timestamp', 'item_id', 'position', 'click', 'propensity_score', 'user_feature_0', 'user_feature_1', 'user_feature_2', 'user_feature_3', 'user-item_affinity_0', 'user-item_affinity_1', 'user-item_affinity_2', 'user-item_affinity_3', 'user-item_affinity_4', 'user-item_affinity_5', 'user-item_affinity_6', 'user-item_affinity_7', 'user-item_affinity_8', 'user-item_affinity_9', 'user-item_affinity_10', 'user-item_affinity_11', 'user-item_affinity_12', 'user-item_affinity_13', 'user-item_affinity_14', 'user-item_affinity_15', 'user-item_affinity_16', 'user-item_affinity_17', 'user-item_affinity_18', 'user-item_affinity_19', 'user-item_affinity_20', 'user-item_affinity_21', 'user-item_affinity_22', 'user-item_affinity_23', 'user-item_affinity_24', 'user-item_affinity_25', 'user-item_affinity_26', 'user-item_affinity_27', 'user-item_affinity_28', 'user-item_affinity_29', 'user-item_affinity_30', 'user-item_affinity_31', 'user-item_affinity_32', 'user-item_affinity_33',
       'user-ite

In [39]:
# Preview the first rows of log_df_readable (same call as the In [37] cell).
log_df_readable.head()

Unnamed: 0,timestamp,item_id,position,click,propensity_score,user_feature_0,user_feature_1,user_feature_2,user_feature_3,user-item_affinity_0,user-item_affinity_1,user-item_affinity_2,user-item_affinity_3,user-item_affinity_4,user-item_affinity_5,user-item_affinity_6,user-item_affinity_7,user-item_affinity_8,user-item_affinity_9,user-item_affinity_10,user-item_affinity_11,user-item_affinity_12,user-item_affinity_13,user-item_affinity_14,user-item_affinity_15,user-item_affinity_16,user-item_affinity_17,user-item_affinity_18,user-item_affinity_19,user-item_affinity_20,user-item_affinity_21,user-item_affinity_22,user-item_affinity_23,user-item_affinity_24,user-item_affinity_25,user-item_affinity_26,user-item_affinity_27,user-item_affinity_28,user-item_affinity_29,user-item_affinity_30,user-item_affinity_31,user-item_affinity_32,user-item_affinity_33,user-item_affinity_34,user-item_affinity_35,user-item_affinity_36,user-item_affinity_37,user-item_affinity_38,user-item_affinity_39,user-item_affinity_40,user-item_affinity_41,user-item_affinity_42,user-item_affinity_43,user-item_affinity_44,user-item_affinity_45,user-item_affinity_46,user-item_affinity_47,user-item_affinity_48,user-item_affinity_49,user-item_affinity_50,user-item_affinity_51,user-item_affinity_52,user-item_affinity_53,user-item_affinity_54,user-item_affinity_55,user-item_affinity_56,user-item_affinity_57,user-item_affinity_58,user-item_affinity_59,user-item_affinity_60,user-item_affinity_61,user-item_affinity_62,user-item_affinity_63,user-item_affinity_64,user-item_affinity_65,user-item_affinity_66,user-item_affinity_67,user-item_affinity_68,user-item_affinity_69,user-item_affinity_70,user-item_affinity_71,user-item_affinity_72,user-item_affinity_73,user-item_affinity_74,user-item_affinity_75,user-item_affinity_76,user-item_affinity_77,user-item_affinity_78,user-item_affinity_79
0,2019-11-24 00:00:17.004101+00:00,79,2,0,0.087125,A0,A1,A2,A3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2019-11-24 00:00:19.715857+00:00,14,1,0,0.006235,A0,B1,B2,B3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2019-11-24 00:01:04.303227+00:00,18,2,0,0.0613,A0,A1,C2,A3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2019-11-24 00:01:11.571162+00:00,28,1,0,0.01943,A0,A1,A2,B3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2019-11-24 00:02:41.811768+00:00,65,2,0,0.019375,A0,A1,C2,B3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## EDA 

### Histogram of the number of rows per day

In [40]:
# Build the frame in one chain so no column is ever assigned onto a filtered
# slice (that pattern can raise SettingWithCopyWarning when rows are dropped):
#   1. parse timestamps as timezone-aware UTC; unparseable values become NaT
#   2. drop the rows whose timestamp could not be parsed
#   3. convert to your local timezone (example: Toronto)
log_df = (
    log_df_readable
    .assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp'], utc=True, errors='coerce'))
    .dropna(subset=['timestamp'])
    .assign(timestamp=lambda frame: frame['timestamp'].dt.tz_convert('America/Toronto'))
)

# Count rows per day, including days with zero rows
daily_counts = (
    log_df
    .set_index('timestamp')
    .resample('D')
    .size()
    .rename('rows')
    .reset_index()
)

fig = px.bar(
    daily_counts,
    x='timestamp',
    y='rows',
    text='rows',
    title='Number of Rows per Day',
    labels={'timestamp': 'Date', 'rows': 'Rows'}
)
# Show values on top of each bar
fig.update_traces(
    texttemplate='%{text:,}',   # thousands separator
    textposition='outside',
    cliponaxis=False
)
# Add a bit of headroom so labels don't get clipped
max_y = daily_counts['rows'].max()
fig.update_layout(yaxis_range=[0, max(1, max_y * 1.1)])

fig.show()


In [41]:
# Split the column names into the two prefixed families used throughout the EDA.
user_feature_cols = list(
    filter(lambda name: name.startswith('user_feature_'), log_df_readable.columns)
)
affinity_cols = list(
    filter(lambda name: name.startswith('user-item_affinity_'), log_df_readable.columns)
)

# Display the user-feature names and how many affinity columns there are.
(user_feature_cols, len(affinity_cols))


(['user_feature_0', 'user_feature_1', 'user_feature_2', 'user_feature_3'], 80)

In [42]:
def _summarize_feature(name):
    """Return one summary record (cardinality, missing count, most frequent value) for a column."""
    values = log_df_readable[name]
    counts = values.value_counts(dropna=False)
    has_values = not counts.empty
    return {
        'feature': name,
        'n_unique': values.nunique(dropna=True),
        'n_missing': int(values.isna().sum()),
        'top_value': counts.index[0] if has_values else None,
        'top_count': int(counts.iloc[0]) if has_values else 0,
        'top_share': float(counts.iloc[0] / len(values)) if has_values else 0.0,
    }


feat_summary = [_summarize_feature(name) for name in user_feature_cols]
pd.DataFrame(feat_summary).sort_values('n_unique', ascending=False)

Unnamed: 0,feature,n_unique,n_missing,top_value,top_count,top_share
2,user_feature_2,9,0,B2,2439,0.2439
3,user_feature_3,9,0,B3,3667,0.3667
1,user_feature_1,5,0,A1,8322,0.8322
0,user_feature_0,3,0,A0,8123,0.8123


In [43]:
# Compare the number of distinct item ids in the log with the number of rows in the item context table.
log_df_readable['item_id'].nunique(), items_df.shape[0]

(80, 80)

In [44]:
# Hash each row's combination of user-feature codes into a single uint64 "user key".
feature_strings = log_df_readable[user_feature_cols].astype(str)
user_key = pd.util.hash_pandas_object(feature_strings, index=False).astype('uint64')
log_df_readable = log_df_readable.assign(user_key=user_key)

# Number of distinct feature combinations present in the log.
n_distinct_users = log_df_readable['user_key'].nunique()
n_distinct_users

253

In [45]:
# Frequency of each user_feature_0 category; dropna=False also counts missing values.
# (A preceding head() call was removed: only a cell's last expression is displayed.)
log_df_readable['user_feature_0'].value_counts(dropna=False)

A0    8123
B0    1808
C0      69
Name: user_feature_0, dtype: int64

In [46]:
# Frequency of each user_feature_1 category; dropna=False also counts missing values.
log_df_readable['user_feature_1'].value_counts(dropna=False)

A1    8322
B1     854
D1     681
C1     138
E1       5
Name: user_feature_1, dtype: int64

### Cardinality per user feature

In [47]:
# User-feature columns only
user_feature_cols = [name for name in log_df_readable.columns if name.startswith("user_feature")]

# Number of distinct values per feature, then the same data as a two-column frame
cardinality = {name: log_df_readable[name].nunique() for name in user_feature_cols}
cardinality_df = pd.DataFrame(
    {"feature": list(cardinality.keys()), "n_unique": list(cardinality.values())}
)

# One vertical bar per feature, labelled with its count
fig = px.bar(
    cardinality_df,
    x="feature",
    y="n_unique",
    text="n_unique",
    title="Cardinality of User Features",
    labels={"n_unique": "Number of Unique Values", "feature": "User Feature"},
)
fig.update_traces(textposition="outside")

layout_options = dict(
    xaxis_tickangle=-30,
    yaxis=dict(title="Number of Unique Values"),
    xaxis=dict(title="User Feature"),
    margin=dict(t=40),  # top margin, leaving room for the title and bar labels
)
fig.update_layout(**layout_options)
fig.show()

In [48]:
# Preview the first rows of log_df_readable.
log_df_readable.head()

Unnamed: 0,timestamp,item_id,position,click,propensity_score,user_feature_0,user_feature_1,user_feature_2,user_feature_3,user-item_affinity_0,user-item_affinity_1,user-item_affinity_2,user-item_affinity_3,user-item_affinity_4,user-item_affinity_5,user-item_affinity_6,user-item_affinity_7,user-item_affinity_8,user-item_affinity_9,user-item_affinity_10,user-item_affinity_11,user-item_affinity_12,user-item_affinity_13,user-item_affinity_14,user-item_affinity_15,user-item_affinity_16,user-item_affinity_17,user-item_affinity_18,user-item_affinity_19,user-item_affinity_20,user-item_affinity_21,user-item_affinity_22,user-item_affinity_23,user-item_affinity_24,user-item_affinity_25,user-item_affinity_26,user-item_affinity_27,user-item_affinity_28,user-item_affinity_29,user-item_affinity_30,user-item_affinity_31,user-item_affinity_32,user-item_affinity_33,user-item_affinity_34,user-item_affinity_35,user-item_affinity_36,user-item_affinity_37,user-item_affinity_38,user-item_affinity_39,user-item_affinity_40,user-item_affinity_41,user-item_affinity_42,user-item_affinity_43,user-item_affinity_44,user-item_affinity_45,user-item_affinity_46,user-item_affinity_47,user-item_affinity_48,user-item_affinity_49,user-item_affinity_50,user-item_affinity_51,user-item_affinity_52,user-item_affinity_53,user-item_affinity_54,user-item_affinity_55,user-item_affinity_56,user-item_affinity_57,user-item_affinity_58,user-item_affinity_59,user-item_affinity_60,user-item_affinity_61,user-item_affinity_62,user-item_affinity_63,user-item_affinity_64,user-item_affinity_65,user-item_affinity_66,user-item_affinity_67,user-item_affinity_68,user-item_affinity_69,user-item_affinity_70,user-item_affinity_71,user-item_affinity_72,user-item_affinity_73,user-item_affinity_74,user-item_affinity_75,user-item_affinity_76,user-item_affinity_77,user-item_affinity_78,user-item_affinity_79,user_key
0,2019-11-24 00:00:17.004101+00:00,79,2,0,0.087125,A0,A1,A2,A3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15330215301336017999
1,2019-11-24 00:00:19.715857+00:00,14,1,0,0.006235,A0,B1,B2,B3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5927047800053972895
2,2019-11-24 00:01:04.303227+00:00,18,2,0,0.0613,A0,A1,C2,A3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2347760887220027378
3,2019-11-24 00:01:11.571162+00:00,28,1,0,0.01943,A0,A1,A2,B3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2328851061653931531
4,2019-11-24 00:02:41.811768+00:00,65,2,0,0.019375,A0,A1,C2,B3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3462025713095781014


In [49]:
# User-feature columns to profile
user_feature_cols = [name for name in log_df_readable.columns if name.startswith("user_feature")]

for col in user_feature_cols:
    # Mean of `click` per category, ordered alphabetically by category label
    ctr_df = (
        log_df_readable.groupby(col)["click"]
        .mean()
        .reset_index()
        .rename(columns={"click": "CTR"})
        .sort_values(col, ascending=True)
    )
    # The frame is already sorted and has one row per category
    category_order = list(ctr_df[col])

    # One bar per category, labelled with its rate
    bar_options = dict(
        x=col,
        y="CTR",
        text="CTR",
        title=f"Click-Through Rate by {col}",
        labels={col: col, "CTR": "Click Rate"},
        category_orders={col: category_order},
    )
    fig = px.bar(ctr_df, **bar_options)
    fig.update_traces(texttemplate="%{text:.3f}", textposition="outside")
    fig.update_layout(yaxis=dict(tickformat=".2%"), margin=dict(t=100))
    fig.show()

In [50]:

# User-feature columns to include
user_feature_cols = [name for name in log_df_readable.columns if name.startswith("user_feature")]

# Long table with one row per (feature, value) pair and its mean click rate
per_feature_ctr = [
    log_df_readable.groupby(name)["click"].mean().reset_index()
    .rename(columns={name: "value", "click": "CTR"})
    .assign(feature=name)
    for name in user_feature_cols
]
long = pd.concat(per_feature_ctr, ignore_index=True)

# Wide table: features as rows, category labels as alphabetically ordered columns
heat = long.pivot(index="feature", columns="value", values="CTR")
heat = heat.sort_index(axis=1)

# Cell annotations as percentages; (feature, value) pairs that do not exist stay blank
z = heat.values
zmax = float(np.nanmax(z))
as_percent = np.vectorize(lambda v: f"{v:.2%}")
text = np.where(np.isnan(z), "", as_percent(z))

heatmap_trace = go.Heatmap(
    z=z, x=heat.columns, y=heat.index,
    colorscale="Blues", zmin=0, zmax=zmax,
    colorbar=dict(title="CTR", tickformat=".2%"),
    text=text, texttemplate="%{text}", textfont=dict(size=11)
)
fig = go.Figure(data=heatmap_trace)
fig.update_layout(
    title="CTR by User Feature & Category",
    xaxis_title="Category (mapped)",
    yaxis_title="User Feature",
    margin=dict(t=90, l=80, r=20, b=60),
)
fig.show()

In [51]:
user_feature_cols = [c for c in log_df_readable.columns if c.startswith("user_feature")]

# compute CTR per (feature, value)
long = []
for col in user_feature_cols:
    tmp = (
        log_df_readable.groupby(col)["click"].mean().reset_index()
        .rename(columns={col: "value", "click": "CTR"})
        .sort_values("value")  # alphabetical bars
    )
    tmp["feature"] = col
    long.append(tmp)
long = pd.concat(long, ignore_index=True)

max_ctr = long["CTR"].max()

# Labels are drawn above the bars, so the axis must extend past the tallest bar
# or its label is clipped. Fall back to a unit range when no click was recorded
# (a [0, 0] range would be degenerate).
y_top = max_ctr * 1.15 if max_ctr > 0 else 1.0

# grid size (e.g., 2x2 for 4 features)
n = len(user_feature_cols)
cols = 2
rows = math.ceil(n / cols)

fig = make_subplots(rows=rows, cols=cols, subplot_titles=user_feature_cols)

for i, col in enumerate(user_feature_cols, start=1):
    r = (i - 1) // cols + 1
    c = (i - 1) % cols + 1
    sub = long[long["feature"] == col]
    fig.add_bar(
        x=sub["value"], y=sub["CTR"], text=[f"{v:.2%}" for v in sub["CTR"]],
        textposition="outside", name=col, row=r, col=c, showlegend=False
    )

# shared y-axis scale for comparability (no row/col given: applies to every subplot)
fig.update_yaxes(range=[0, y_top], tickformat=".2%")

fig.update_layout(
    height=320*rows, width=520*cols, title="CTR by Category (per User Feature, shared Y-scale)",
    margin=dict(t=90), bargap=0.25
)
fig.show()

In [52]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from statsmodels.stats.proportion import proportion_confint

# Select only the columns whose name starts with "user_feature_"
user_feature_cols = [c for c in log_df_readable.columns if c.startswith("user_feature_")]

def ctr_with_ci(df, feature_col):
    """
    Click-through rate per category of ``feature_col`` with a Wilson confidence interval.

    Returns one row per category with columns: the feature itself, ``sum``
    (clicks), ``count`` (non-missing click rows), ``ctr`` (sum / count), and
    ``ci_low`` / ``ci_upp`` (Wilson interval bounds at proportion_confint's
    default significance level).
    """
    grouped = (
        df.groupby(feature_col)["click"]
        .agg(sum="sum", count="count")
        .reset_index()
    )
    grouped["ctr"] = grouped["sum"] / grouped["count"]
    lower, upper = proportion_confint(grouped["sum"], grouped["count"], method="wilson")
    return grouped.assign(ci_low=lower, ci_upp=upper)

# Size the grid from the number of features (2 columns wide) instead of assuming
# exactly four features; max(1, ...) keeps the grid valid when the list is empty.
cols = 2
rows = max(1, math.ceil(len(user_feature_cols) / cols))
fig = make_subplots(rows=rows, cols=cols, subplot_titles=user_feature_cols, shared_yaxes=True)

for i, feature in enumerate(user_feature_cols):
    stats = ctr_with_ci(log_df_readable, feature)
    stats = stats.sort_values(feature)  # keep alphabetical order

    # 1-based grid position, filled row by row
    row = i // cols + 1
    col = i % cols + 1

    fig.add_trace(
        go.Scatter(
            x=stats[feature],
            y=stats["ctr"],
            mode="markers+lines",
            # Asymmetric error bars: distance from the point estimate to each CI bound
            error_y=dict(
                type="data",
                symmetric=False,
                array=stats["ci_upp"] - stats["ctr"],
                arrayminus=stats["ctr"] - stats["ci_low"]
            ),
            name=feature,
            showlegend=False,
            text=[
                f"{feature}={val}<br>"
                f"CTR={ctr:.3%}<br>"
                f"95% CI=({lo:.3%}, {hi:.3%})<br>"
                f"Clicks={s} / Impressions={n}"
                for val, ctr, lo, hi, s, n in zip(
                    stats[feature], stats["ctr"], stats["ci_low"], stats["ci_upp"], stats["sum"], stats["count"]
                )
            ],
            hoverinfo="text"
        ),
        row=row, col=col
    )

    fig.update_xaxes(title_text=feature, row=row, col=col)
    fig.update_yaxes(title_text="CTR", row=row, col=col)

fig.update_layout(
    height=325*rows, width=950,  # 650 px for the usual two rows
    title="Click-Through Rate with 95% CI per Feature Value (Hover for counts)",
)

fig.show()

In [53]:
import math
import pandas as pd
from plotly.subplots import make_subplots
import plotly.graph_objects as go

user_feature_cols = [c for c in log_df_readable.columns if c.startswith("user_feature")]

# compute CTR + counts for each feature/value
long = []
for col in user_feature_cols:
    g = log_df_readable.groupby(col)["click"].agg(sum="sum", count="count").reset_index()
    g["CTR"] = g["sum"] / g["count"]
    g["feature"] = col
    g = g.rename(columns={col: "value"}).sort_values("value")  # alphabetical
    long.append(g)
long = pd.concat(long, ignore_index=True)

max_ctr = long["CTR"].max()

# Labels are drawn above the bars, so the axis must extend past the tallest bar
# or its label is clipped. Fall back to a unit range when no click was recorded
# (a [0, 0] range would be degenerate).
y_top = max_ctr * 1.15 if max_ctr > 0 else 1.0

# grid size (e.g., 2x2 for 4 features)
n = len(user_feature_cols)
cols = 2
rows = math.ceil(n / cols)

fig = make_subplots(rows=rows, cols=cols, subplot_titles=user_feature_cols)

for i, col in enumerate(user_feature_cols, start=1):
    r = (i - 1) // cols + 1
    c = (i - 1) % cols + 1
    sub = long[long["feature"] == col]

    # Grey out categories with fewer than 5 clicks
    colors = ["blue" if clicks >= 5 else "lightgray" for clicks in sub["sum"]]

    fig.add_trace(
        go.Bar(
            x=sub["value"],
            y=sub["CTR"],
            text=[f"{v:.2%}" for v in sub["CTR"]],
            textposition="outside",
            marker_color=colors,
            showlegend=False,
            hovertemplate=(
                f"{col}=%{{x}}<br>"
                "CTR=%{y:.2%}<br>"
                "Clicks=%{customdata[0]} / Impressions=%{customdata[1]}<extra></extra>"
            ),
            customdata=sub[["sum", "count"]].values,
        ),
        row=r, col=c
    )

# shared y-axis for comparability (no row/col given: applies to every subplot)
fig.update_yaxes(range=[0, y_top], tickformat=".2%")

fig.update_layout(
    height=320*rows, width=520*cols,
    title="CTR by Category (per User Feature) — Blue: Clicks ≥ 5, Gray: Clicks < 5",
    margin=dict(t=90), bargap=0.25
)

fig.show()

### Joint click probabilities (CTR for pairs of user features)

In [54]:
import pandas as pd
import plotly.express as px

# The pair of features to cross-tabulate
f1, f2 = "user_feature_1", "user_feature_2"

# Per (f1, f2) combination: mean click rate, number of rows, number of clicks
joint = (
    log_df_readable.groupby([f1, f2])
    .agg(click_rate=("click", "mean"), count=("click", "size"), clicks=("click", "sum"))
    .reset_index()
)

# Categorical dtype on both keys, then a consistent alphabetical ordering
joint = joint.astype({f1: "category", f2: "category"}).sort_values([f1, f2])

# Heatmap of joint click rates: f1 as rows, f2 as columns
click_rate_grid = joint.pivot(index=f1, columns=f2, values="click_rate")
fig = px.imshow(
    click_rate_grid,
    text_auto=".2%",
    aspect="auto",
    color_continuous_scale="Blues",
    labels=dict(color="CTR"),
    title=f"Joint CTR: {f1} × {f2}"
)
fig.show()

# Bubble view: marker size is the row count, colour is the click rate,
# and clicks / counts are available on hover
fig2 = px.scatter(
    joint, x=f2, y=f1, size="count", color="click_rate",
    hover_data=["clicks", "count"],
    color_continuous_scale="Blues",
    title=f"Joint CTR with sample sizes: {f1} × {f2}"
)
fig2.show()

### One-hot encoding the DataFrame

In [55]:
# Preview the first rows of log_df_readable.
log_df_readable.head()

Unnamed: 0,timestamp,item_id,position,click,propensity_score,user_feature_0,user_feature_1,user_feature_2,user_feature_3,user-item_affinity_0,user-item_affinity_1,user-item_affinity_2,user-item_affinity_3,user-item_affinity_4,user-item_affinity_5,user-item_affinity_6,user-item_affinity_7,user-item_affinity_8,user-item_affinity_9,user-item_affinity_10,user-item_affinity_11,user-item_affinity_12,user-item_affinity_13,user-item_affinity_14,user-item_affinity_15,user-item_affinity_16,user-item_affinity_17,user-item_affinity_18,user-item_affinity_19,user-item_affinity_20,user-item_affinity_21,user-item_affinity_22,user-item_affinity_23,user-item_affinity_24,user-item_affinity_25,user-item_affinity_26,user-item_affinity_27,user-item_affinity_28,user-item_affinity_29,user-item_affinity_30,user-item_affinity_31,user-item_affinity_32,user-item_affinity_33,user-item_affinity_34,user-item_affinity_35,user-item_affinity_36,user-item_affinity_37,user-item_affinity_38,user-item_affinity_39,user-item_affinity_40,user-item_affinity_41,user-item_affinity_42,user-item_affinity_43,user-item_affinity_44,user-item_affinity_45,user-item_affinity_46,user-item_affinity_47,user-item_affinity_48,user-item_affinity_49,user-item_affinity_50,user-item_affinity_51,user-item_affinity_52,user-item_affinity_53,user-item_affinity_54,user-item_affinity_55,user-item_affinity_56,user-item_affinity_57,user-item_affinity_58,user-item_affinity_59,user-item_affinity_60,user-item_affinity_61,user-item_affinity_62,user-item_affinity_63,user-item_affinity_64,user-item_affinity_65,user-item_affinity_66,user-item_affinity_67,user-item_affinity_68,user-item_affinity_69,user-item_affinity_70,user-item_affinity_71,user-item_affinity_72,user-item_affinity_73,user-item_affinity_74,user-item_affinity_75,user-item_affinity_76,user-item_affinity_77,user-item_affinity_78,user-item_affinity_79,user_key
0,2019-11-24 00:00:17.004101+00:00,79,2,0,0.087125,A0,A1,A2,A3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15330215301336017999
1,2019-11-24 00:00:19.715857+00:00,14,1,0,0.006235,A0,B1,B2,B3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5927047800053972895
2,2019-11-24 00:01:04.303227+00:00,18,2,0,0.0613,A0,A1,C2,A3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2347760887220027378
3,2019-11-24 00:01:11.571162+00:00,28,1,0,0.01943,A0,A1,A2,B3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2328851061653931531
4,2019-11-24 00:02:41.811768+00:00,65,2,0,0.019375,A0,A1,C2,B3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3462025713095781014


In [56]:
# Mean of `click` per user_feature_0 category, highest first.
log_df_readable.groupby('user_feature_0')['click'].mean().sort_values(ascending=False)

user_feature_0
C0    0.014493
A0    0.004678
B0    0.001659
Name: click, dtype: float64

In [57]:
import pandas as pd

# Per-category click summary for each user feature, keyed by feature name.
# Every table has one row per category with clicks (sum), impressions (row
# count) and ctr (mean of click), ordered from highest to lowest ctr.
ctr_tables = {}

for col in ("user_feature_0", "user_feature_1", "user_feature_2", "user_feature_3"):
    clicks_by_category = log_df_readable.groupby(col)["click"]
    ctr = clicks_by_category.agg(clicks="sum", impressions="count", ctr="mean")
    ctr = ctr.reset_index().sort_values("ctr", ascending=False)
    ctr_tables[col] = ctr

# Example: show CTRs for user_feature_1
ctr_tables["user_feature_1"]

Unnamed: 0,user_feature_1,clicks,impressions,ctr
0,A1,39,8322,0.004686
1,B1,3,854,0.003513
2,C1,0,138,0.0
3,D1,0,681,0.0
4,E1,0,5,0.0


In [58]:


# Columns to one-hot encode
categorical_cols = ["user_feature_0", "user_feature_1", "user_feature_2", "user_feature_3"]

# Expand only those columns into indicator columns; every other column of
# log_df_readable is carried over unchanged.
log_df_encoded = pd.get_dummies(
    log_df_readable,
    columns=categorical_cols,
    prefix=categorical_cols,  # keeps nice column names like user_feature_0_A0
    drop_first=False,         # keep all categories; set True if you want k-1 encoding
    # Pin the indicator dtype: pandas 2.0 changed the get_dummies default from
    # uint8 to bool, which would turn the 0/1 indicators into True/False.
    dtype="uint8",
)

# Preview: original columns plus the new user_feature_* indicator columns
log_df_encoded.head()

Unnamed: 0,timestamp,item_id,position,click,propensity_score,user-item_affinity_0,user-item_affinity_1,user-item_affinity_2,user-item_affinity_3,user-item_affinity_4,user-item_affinity_5,user-item_affinity_6,user-item_affinity_7,user-item_affinity_8,user-item_affinity_9,user-item_affinity_10,user-item_affinity_11,user-item_affinity_12,user-item_affinity_13,user-item_affinity_14,user-item_affinity_15,user-item_affinity_16,user-item_affinity_17,user-item_affinity_18,user-item_affinity_19,user-item_affinity_20,user-item_affinity_21,user-item_affinity_22,user-item_affinity_23,user-item_affinity_24,user-item_affinity_25,user-item_affinity_26,user-item_affinity_27,user-item_affinity_28,user-item_affinity_29,user-item_affinity_30,user-item_affinity_31,user-item_affinity_32,user-item_affinity_33,user-item_affinity_34,user-item_affinity_35,user-item_affinity_36,user-item_affinity_37,user-item_affinity_38,user-item_affinity_39,user-item_affinity_40,user-item_affinity_41,user-item_affinity_42,user-item_affinity_43,user-item_affinity_44,user-item_affinity_45,user-item_affinity_46,user-item_affinity_47,user-item_affinity_48,user-item_affinity_49,user-item_affinity_50,user-item_affinity_51,user-item_affinity_52,user-item_affinity_53,user-item_affinity_54,user-item_affinity_55,user-item_affinity_56,user-item_affinity_57,user-item_affinity_58,user-item_affinity_59,user-item_affinity_60,user-item_affinity_61,user-item_affinity_62,user-item_affinity_63,user-item_affinity_64,user-item_affinity_65,user-item_affinity_66,user-item_affinity_67,user-item_affinity_68,user-item_affinity_69,user-item_affinity_70,user-item_affinity_71,user-item_affinity_72,user-item_affinity_73,user-item_affinity_74,user-item_affinity_75,user-item_affinity_76,user-item_affinity_77,user-item_affinity_78,user-item_affinity_79,user_key,user_feature_0_A0,user_feature_0_B0,user_feature_0_C0,user_feature_1_A1,user_feature_1_B1,user_feature_1_C1,user_feature_1_D1,user_feature_1_E1,user_feature_2_A2,user_feature_2_B2,
user_feature_2_C2,user_feature_2_D2,user_feature_2_E2,user_feature_2_F2,user_feature_2_G2,user_feature_2_H2,user_feature_2_I2,user_feature_3_A3,user_feature_3_B3,user_feature_3_C3,user_feature_3_D3,user_feature_3_E3,user_feature_3_F3,user_feature_3_G3,user_feature_3_H3,user_feature_3_I3
0,2019-11-24 00:00:17.004101+00:00,79,2,0,0.087125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15330215301336017999,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
1,2019-11-24 00:00:19.715857+00:00,14,1,0,0.006235,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5927047800053972895,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
2,2019-11-24 00:01:04.303227+00:00,18,2,0,0.0613,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2347760887220027378,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
3,2019-11-24 00:01:11.571162+00:00,28,1,0,0.01943,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2328851061653931531,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
4,2019-11-24 00:02:41.811768+00:00,65,2,0,0.019375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3462025713095781014,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0


In [59]:
import pandas as pd
import numpy as np
import plotly.express as px
from statsmodels.stats.proportion import proportion_confint

feature_cols = ["user_feature_0", "user_feature_1", "user_feature_2", "user_feature_3"]

# Work on a copy restricted to rows where every user feature and the click
# label are present.
df = log_df_readable.dropna(subset=[*feature_cols, "click"]).copy()

# Global baseline CTR: the denominator for every lift value computed below.
baseline_ctr = df["click"].mean()
if baseline_ctr == 0:
    raise ValueError("Global CTR is zero; cannot compute lift vs baseline.")

# One table per feature: clicks / impressions / CTR per category, the 95%
# Wilson interval for the CTR, and all three rescaled by the baseline.
tables = []
for col in feature_cols:
    g = (
        df.groupby(col)["click"]
          .agg(clicks="sum", impressions="count")
          .reset_index()
          .rename(columns={col: "category"})
    )

    # Wilson 95% CI on the per-category click proportion
    ci_low, ci_high = proportion_confint(
        g["clicks"], g["impressions"], alpha=0.05, method="wilson"
    )

    g = g.assign(
        ctr=g["clicks"] / g["impressions"],
        ci_low=ci_low,
        ci_high=ci_high,
        feature=col,
    )
    # Lift columns divide by the baseline only; the baseline's own sampling
    # uncertainty is not propagated into the interval.
    g = g.assign(
        lift_vs_baseline=g["ctr"] / baseline_ctr,
        lift_ci_low=g["ci_low"] / baseline_ctr,
        lift_ci_high=g["ci_high"] / baseline_ctr,
    )

    tables.append(g)

# Stack the per-feature tables, add a readable "<category> (<feature>)" label
# and order the rows alphabetically by that label.
ctr_all = (
    pd.concat(tables, ignore_index=True)
      .assign(label=lambda t: t["category"].astype(str) + " (" + t["feature"] + ")")
      .sort_values("label")
)
# Ordered categorical whose category order is the sorted row order.
ctr_all["label"] = pd.Categorical(ctr_all["label"], categories=ctr_all["label"], ordered=True)

# ---- Plot with error bars ----
# Asymmetric error bars: distance from the point estimate to each CI bound.
fig = px.bar(
    ctr_all,
    x="label",
    y="lift_vs_baseline",
    color="feature",
    hover_data=dict(
        ctr=":.4f",
        clicks=True,
        impressions=True,
        ci_low=":.4f",
        ci_high=":.4f",
        label=False,
    ),
    error_y=ctr_all["lift_ci_high"] - ctr_all["lift_vs_baseline"],
    error_y_minus=ctr_all["lift_vs_baseline"] - ctr_all["lift_ci_low"],
)

# Dashed reference line at lift = 1, i.e. CTR equal to the global baseline.
fig.add_hline(y=1.0, line_dash="dash")

fig.update_layout(
    title="Per-category CTR vs Global Baseline (95% Wilson CI)",
    xaxis_title="Category (feature)",
    yaxis_title="Lift = CTR / Global CTR",
    bargap=0.15,
    legend_title_text="Feature",
).update_xaxes(tickangle=45)

fig.show()

display(ctr_all)

Unnamed: 0,category,clicks,impressions,ctr,ci_low,ci_high,feature,lift_vs_baseline,lift_ci_low,lift_ci_high,label
0,A0,38,8123,0.004678,0.003410294,0.006414,user_feature_0,1.113827,0.8119748,1.527171,A0 (user_feature_0)
3,A1,39,8322,0.004686,0.003430204,0.0064,user_feature_1,1.115803,0.8167153,1.523716,A1 (user_feature_1)
8,A2,8,1569,0.005099,0.002585873,0.010029,user_feature_2,1.213997,0.6156841,2.387897,A2 (user_feature_2)
17,A3,14,3523,0.003974,0.002368682,0.00666,user_feature_3,0.946163,0.5639719,1.585628,A3 (user_feature_3)
1,B0,3,1808,0.001659,0.0005644659,0.004867,user_feature_0,0.39507,0.1343967,1.158876,B0 (user_feature_0)
4,B1,3,854,0.003513,0.0011954,0.010277,user_feature_1,0.8364,0.2846191,2.446893,B1 (user_feature_1)
9,B2,9,2439,0.00369,0.001942572,0.006998,user_feature_2,0.87858,0.4625171,1.666294,B2 (user_feature_2)
18,B3,19,3667,0.005181,0.003319624,0.008079,user_feature_3,1.233654,0.7903867,1.923501,B3 (user_feature_3)
2,C0,1,69,0.014493,0.002562925,0.077631,user_feature_0,3.450656,0.6102202,18.483625,C0 (user_feature_0)
5,C1,0,138,0.0,0.0,0.027083,user_feature_1,0.0,0.0,6.448277,C1 (user_feature_1)


In [60]:
# Fraction of rows in the encoded log whose click flag is set.
mean_ctr = log_df_encoded["click"].mean()
print(f"Overall CTR: {mean_ctr:.2%}")

Overall CTR: 0.42%


In [61]:
# Number of rows per click value, to show the class balance.
(
    log_df_encoded
    .groupby("click")
    .size()
)

click
0    9958
1      42
dtype: int64

In [62]:

# Indicator columns created by one-hot encoding the user features
one_hot_cols = [name for name in log_df_encoded.columns if name.startswith("user_feature_")]

# Correlation of every indicator column with the click label
corr_values = log_df_encoded[one_hot_cols].corrwith(log_df_encoded["click"])

# Two-column table (feature, correlation), most positive correlation first
corr_df = (
    corr_values
    .rename_axis("feature")
    .reset_index(name="correlation_with_click")
    .sort_values(by="correlation_with_click", ascending=False)
    .reset_index(drop=True)
)

# Render as a styled table with in-cell bars
corr_df.style.bar(align="mid", color=["red", "lightgreen"])

Unnamed: 0,feature,correlation_with_click
0,user_feature_1_A1,0.016749
1,user_feature_0_A0,0.015378
2,user_feature_0_C0,0.013266
3,user_feature_3_B3,0.011547
4,user_feature_2_A2,0.005995
5,user_feature_2_G2,0.003915
6,user_feature_3_D3,0.000536
7,user_feature_2_D2,0.000403
8,user_feature_2_F2,-0.000445
9,user_feature_2_C2,-0.000456


In [63]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Select user features (categorical)
user_feature_cols = [c for c in log_df_readable.columns if c.startswith("user_feature")]

# One-hot encode user features into a dense array, keeping every category.
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` and 1.4
# removed the old spelling, so try the new name first and fall back to the
# old one on releases that do not know it yet.
try:
    encoder = OneHotEncoder(sparse_output=False, drop=None)
except TypeError:
    encoder = OneHotEncoder(sparse=False, drop=None)
encoded = encoder.fit_transform(log_df_readable[user_feature_cols])

# Wrap the encoded array in a DataFrame that shares the log's index so it can
# be concatenated column-wise with the numeric features below.
encoded_cols = encoder.get_feature_names_out(user_feature_cols)
encoded_df = pd.DataFrame(encoded, columns=encoded_cols, index=log_df_readable.index)

# Numeric features you might want to keep
numeric_cols = ["position"]  # add more if relevant

# Target
y = log_df_readable["click"]

# Final feature matrix: numeric columns first, then the one-hot indicators
X = pd.concat([log_df_readable[numeric_cols], encoded_df], axis=1)

print("X shape:", X.shape)
print("y mean CTR:", y.mean())
print("Example features:", X.columns[:15].tolist())

X shape: (10000, 27)
y mean CTR: 0.0042
Example features: ['position', 'user_feature_0_A0', 'user_feature_0_B0', 'user_feature_0_C0', 'user_feature_1_A1', 'user_feature_1_B1', 'user_feature_1_C1', 'user_feature_1_D1', 'user_feature_1_E1', 'user_feature_2_A2', 'user_feature_2_B2', 'user_feature_2_C2', 'user_feature_2_D2', 'user_feature_2_E2', 'user_feature_2_F2']


In [64]:
# Preview the first five rows of the one-hot encoded user features.
encoded_df.head(n=5)

Unnamed: 0,user_feature_0_A0,user_feature_0_B0,user_feature_0_C0,user_feature_1_A1,user_feature_1_B1,user_feature_1_C1,user_feature_1_D1,user_feature_1_E1,user_feature_2_A2,user_feature_2_B2,user_feature_2_C2,user_feature_2_D2,user_feature_2_E2,user_feature_2_F2,user_feature_2_G2,user_feature_2_H2,user_feature_2_I2,user_feature_3_A3,user_feature_3_B3,user_feature_3_C3,user_feature_3_D3,user_feature_3_E3,user_feature_3_F3,user_feature_3_G3,user_feature_3_H3,user_feature_3_I3
0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [65]:


# Hold out 20% of the rows for evaluation; stratify on the label so both
# splits keep the same click / no-click proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Random forest: 200 unrestricted-depth trees trained on all cores.
# Clicks are rare, so class_weight="balanced" reweights the two classes.
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

# Evaluate on the held-out split.
# Column 1 of predict_proba is taken as the click probability
# (assumes the labels are 0/1 so class 1 is the second column).
y_pred_proba = rf.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, y_pred_proba)
# NOTE(review): with so few positive rows, accuracy says little about how
# well clicks are ranked; read it together with the ROC AUC.
acc = accuracy_score(y_test, rf.predict(X_test))

print(f"ROC AUC: {auc:.4f}")
print(f"Accuracy: {acc:.4f}")

# Feature importances reported by the fitted forest, largest first
importances = pd.DataFrame(
    {"feature": X.columns, "importance": rf.feature_importances_}
).sort_values(by="importance", ascending=False)

print(importances.head(20))

ROC AUC: 0.4925
Accuracy: 0.8670
              feature  importance
0            position    0.257640
19  user_feature_3_B3    0.063316
18  user_feature_3_A3    0.059978
12  user_feature_2_D2    0.059688
11  user_feature_2_C2    0.059336
10  user_feature_2_B2    0.058485
4   user_feature_1_A1    0.056193
21  user_feature_3_D3    0.053186
1   user_feature_0_A0    0.051209
13  user_feature_2_E2    0.050083
9   user_feature_2_A2    0.041608
2   user_feature_0_B0    0.033259
22  user_feature_3_E3    0.028176
7   user_feature_1_D1    0.028068
5   user_feature_1_B1    0.026675
14  user_feature_2_F2    0.024883
20  user_feature_3_C3    0.021283
15  user_feature_2_G2    0.015812
23  user_feature_3_F3    0.007044
6   user_feature_1_C1    0.002777


### Plotting Feature Importance

In [66]:
# First 20 rows of `importances`, re-sorted ascending for the horizontal bar
# plot (assumes `importances` is already ordered from largest to smallest).
topk = importances.head(20).sort_values(by="importance", ascending=True)

# Horizontal bar chart of importance per feature, with a fixed figure size
# and a wide left margin for the feature names.
fig = px.bar(
    topk,
    x="importance",
    y="feature",
    orientation="h",
    title="Top 20 Random Forest Feature Importances",
    labels={"importance": "Importance", "feature": "Feature"},
).update_layout(
    height=600,
    width=700,
    margin={"t": 60, "l": 150, "r": 20, "b": 40},
)

fig.show()

In [68]:
# Mean click rate at each value of `position`.
(
    log_df_readable
    .groupby("position")["click"]
    .mean()
)

position
1    0.003272
2    0.004522
3    0.004818
Name: click, dtype: float64