In [28]:
# Standard library imports
import importlib
import math
import os
import string

# Third-party imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, r2_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# OBP imports
from obp.dataset import OpenBanditDataset
from obp.ope import (
    DirectMethod as DM_OBP,
    DoublyRobust as DR_OBP,
    InverseProbabilityWeighting as IPW,
    OffPolicyEvaluation
)
from obp.policy import IPWLearner

# Local imports
import visualizations
importlib.reload(visualizations)
from visualizations import (
    plot_bar_chart,
    plot_boxplot_with_stats,
    plot_distribution_with_cdf,
    plot_histogram_with_stats
)
from stats import (
    calculate_distribution_stats as _calculate_distribution_stats,
    compute_item_feature_distribution,
    compute_item_propensity_stats,
    compute_manual_propensity,
    compute_propensity_variance as _compute_propensity_variance
)

# Pandas display options
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

print(os.getcwd())

/Users/armandoordoricadelatorre/Documents/U of T/PhD/PhD Research/OBP_Replication


### Helper functions

In [None]:


def remap_user_features(df, feature_cols):
    """
    Map hash values in user_feature_N to short readable codes like A1, B1, ...
    """
    df_copy = df.copy()
    mapping_dicts = {}

    for col in feature_cols:
        # Extract the feature index (N from 'user_feature_N')
        feature_idx = col.split("_")[-1]
        uniques = df[col].dropna().unique()

        # Build codes A{N}, B{N}, C{N}...
        codes = [f"{letter}{feature_idx}" for letter in string.ascii_uppercase[:len(uniques)]]
        mapping = dict(zip(uniques, codes))

        df_copy[col] = df[col].map(mapping)
        mapping_dicts[col] = mapping

    return df_copy, mapping_dicts


def calculate_propensity_stats(df, groupby_col='item_id'):
    """
    Calculate manual propensity statistics for a given groupby column.
    
    Parameters:
    -----------
    df : pd.DataFrame
        The dataframe containing the logged data
    groupby_col : str or list
        Column(s) to group by (default: 'item_id')
    
    Returns:
    --------
    propensity_df : pd.DataFrame
        DataFrame with counts, total, and manual propensity
    stats_dict : dict
        Dictionary with summary statistics (mean, p25, p50, p75, p99)
    stats_df : pd.DataFrame
        DataFrame formatted for plotting statistics
    """
    # Calculate counts and propensities
    propensity_df = df.groupby(groupby_col).size().to_frame()
    propensity_df.columns = ['count_of_occurrences']
    propensity_df['total'] = propensity_df['count_of_occurrences'].sum()
    propensity_df['manual_propensity'] = propensity_df['count_of_occurrences'] / propensity_df['total']
    
    # Calculate summary statistics
    stats_dict = {
        'mean': propensity_df['manual_propensity'].mean(),
        'p25': propensity_df['manual_propensity'].quantile(0.25),
        'p50': propensity_df['manual_propensity'].quantile(0.50),  # median
        'p75': propensity_df['manual_propensity'].quantile(0.75),
        'p99': propensity_df['manual_propensity'].quantile(0.99),
    }
    
    # Create DataFrame for plotting
    stats_df = pd.DataFrame([stats_dict]).T.reset_index()
    stats_df.columns = ['statistic', 'value']
    
    # Print summary
    print("Manual Propensity Statistics:")
    for stat, value in stats_dict.items():
        print(f"  {stat}: {value:.6f}")
    
    return propensity_df, stats_dict, stats_df


def calculate_distribution_stats(df, value_col, groupby_col, groupby_col_as_string=True):
    """
    Wrapper for stats module calculate_distribution_stats that adds summary_df.
    Returns 3 values for compatibility with existing notebook code.
    """
    # Call stats module function (returns 2 values)
    plot_df, stats_df = _calculate_distribution_stats(df, value_col, groupby_col, groupby_col_as_string)
    
    # Calculate detailed summary statistics
    summary_df = df.groupby(groupby_col)[value_col].agg([
        ('count', 'count'),
        ('mean', 'mean'),
        ('std', 'std'),
        ('min', 'min'),
        ('25%', lambda x: x.quantile(0.25)),
        ('50%', lambda x: x.quantile(0.50)),
        ('75%', lambda x: x.quantile(0.75)),
        ('max', 'max')
    ]).round(4)
    
    return plot_df, stats_df, summary_df


def compute_propensity_variance(df, groupby_col):
    """
    Wrapper for stats module compute_propensity_variance that adds group_stats.
    Returns 2 values for compatibility with existing notebook code.
    """
    # Calculate statistics per group
    group_stats = df.groupby(groupby_col)['propensity_score'].agg([
        ('mean', 'mean'),
        ('variance', 'var'),
        ('std', 'std'),
        ('count', 'count')
    ]).reset_index()
    
    # Total variance (overall)
    total_variance = df['propensity_score'].var()
    
    # Within-group variance (weighted average of group variances)
    total_count = df.shape[0]
    within_group_variance = (group_stats['variance'] * group_stats['count']).sum() / total_count
    
    # Between-group variance (variance of group means, weighted by group sizes)
    overall_mean = df['propensity_score'].mean()
    between_group_variance = ((group_stats['mean'] - overall_mean) ** 2 * group_stats['count']).sum() / total_count
    
    # Variance explained ratio (R-squared analog)
    variance_explained_ratio = between_group_variance / total_variance if total_variance > 0 else 0
    
    variance_metrics = {
        'groupby_variable': groupby_col,
        'n_groups': len(group_stats),
        'total_variance': total_variance,
        'within_group_variance': within_group_variance,
        'between_group_variance': between_group_variance,
        'variance_explained_ratio': variance_explained_ratio
    }
    
    return variance_metrics, group_stats




### Import logged data from `all.csv` and `item_context.csv`

In [3]:

# BTS / ALL sample
log_df   = pd.read_csv("zr-obp/obd/bts/all/all.csv", index_col=0)
items_df = pd.read_csv("zr-obp/obd/bts/all/item_context.csv", index_col=0)

print("log_df shape:", log_df.shape)
print("items_df shape:", items_df.shape)

print("\nlog_df columns:")
print(log_df.columns.tolist()[:40])  # peek first ~40 col names

print("\nfirst 5 log rows:")
print(log_df.head())

print("\nfirst 5 item rows:")
print(items_df.head())

log_df shape: (10000, 89)
items_df shape: (80, 5)

log_df columns:
['timestamp', 'item_id', 'position', 'click', 'propensity_score', 'user_feature_0', 'user_feature_1', 'user_feature_2', 'user_feature_3', 'user-item_affinity_0', 'user-item_affinity_1', 'user-item_affinity_2', 'user-item_affinity_3', 'user-item_affinity_4', 'user-item_affinity_5', 'user-item_affinity_6', 'user-item_affinity_7', 'user-item_affinity_8', 'user-item_affinity_9', 'user-item_affinity_10', 'user-item_affinity_11', 'user-item_affinity_12', 'user-item_affinity_13', 'user-item_affinity_14', 'user-item_affinity_15', 'user-item_affinity_16', 'user-item_affinity_17', 'user-item_affinity_18', 'user-item_affinity_19', 'user-item_affinity_20', 'user-item_affinity_21', 'user-item_affinity_22', 'user-item_affinity_23', 'user-item_affinity_24', 'user-item_affinity_25', 'user-item_affinity_26', 'user-item_affinity_27', 'user-item_affinity_28', 'user-item_affinity_29', 'user-item_affinity_30']

first 5 log rows:
           

### Remapping categorical features to readable categories

In [4]:
user_feature_cols = [c for c in log_df.columns if c.startswith("user_feature")]
log_df_readable, mappings = remap_user_features(log_df, user_feature_cols)

print("Sample remapped features:")
display(log_df_readable[user_feature_cols].head())

print("\nMappings used:")
for feat, mapping in mappings.items():
    print(f"{feat}: {mapping}")

Sample remapped features:


Unnamed: 0,user_feature_0,user_feature_1,user_feature_2,user_feature_3
0,A0,A1,A2,A3
1,A0,B1,B2,B3
2,A0,A1,C2,A3
3,A0,A1,A2,B3
4,A0,A1,C2,B3



Mappings used:
user_feature_0: {'81ce123cbb5bd8ce818f60fb3586bba5': 'A0', 'cef3390ed299c09874189c387777674a': 'B0', '4ae385d792f81dde128124a925a830de': 'C0'}
user_feature_1: {'03a5648a76832f83c859d46bc06cb64a': 'A1', '2d03db5543b14483e52d761760686b64': 'B1', '6ff54aa8ff7a9dde75161c20a3ee4231': 'C1', 'f1c2d6a32ec39249160cf784b63f4c6f': 'D1', '8b50621825ffd909dd8d8317d366271f': 'E1'}
user_feature_2: {'7bc94a2da491829b777c49c4b5e480f2': 'A2', '2723d2eb8bba04e0362098011fa3997b': 'B2', 'c2e4f76cdbabecd33b8c762aeef386b3': 'C2', '719dab53a7560218a9d1f96b25d6fa32': 'D2', '9b2d331c329ceb74d3dcfb48d8798c78': 'E2', '302deff13f835d731df1c842eed95971': 'F2', '9f4e8271d3d3014af5f35124c2de5082': 'G2', '7ae37150e596e6e8f19e27a06bd4d359': 'H2', 'c7cce49040b6630e9b5484dfcc0e6cd1': 'I2'}
user_feature_3: {'c39b0c7dd5d4eb9a18e7db6ba2f258f8': 'A3', '9bde591ffaab8d54c457448e4dca6f53': 'B3', '05b76f5e97e51128862059ac7df9e42a': 'C3', 'f97571b9c14a786aab269f0b427d2a85': 'D3', '06128286bcc64b6a4b0fb7bc0328fe17'

In [5]:
log_df_readable.head()

Unnamed: 0,timestamp,item_id,position,click,propensity_score,user_feature_0,user_feature_1,user_feature_2,user_feature_3,user-item_affinity_0,user-item_affinity_1,user-item_affinity_2,user-item_affinity_3,user-item_affinity_4,user-item_affinity_5,user-item_affinity_6,user-item_affinity_7,user-item_affinity_8,user-item_affinity_9,user-item_affinity_10,user-item_affinity_11,user-item_affinity_12,user-item_affinity_13,user-item_affinity_14,user-item_affinity_15,user-item_affinity_16,user-item_affinity_17,user-item_affinity_18,user-item_affinity_19,user-item_affinity_20,user-item_affinity_21,user-item_affinity_22,user-item_affinity_23,user-item_affinity_24,user-item_affinity_25,user-item_affinity_26,user-item_affinity_27,user-item_affinity_28,user-item_affinity_29,user-item_affinity_30,user-item_affinity_31,user-item_affinity_32,user-item_affinity_33,user-item_affinity_34,user-item_affinity_35,user-item_affinity_36,user-item_affinity_37,user-item_affinity_38,user-item_affinity_39,user-item_affinity_40,user-item_affinity_41,user-item_affinity_42,user-item_affinity_43,user-item_affinity_44,user-item_affinity_45,user-item_affinity_46,user-item_affinity_47,user-item_affinity_48,user-item_affinity_49,user-item_affinity_50,user-item_affinity_51,user-item_affinity_52,user-item_affinity_53,user-item_affinity_54,user-item_affinity_55,user-item_affinity_56,user-item_affinity_57,user-item_affinity_58,user-item_affinity_59,user-item_affinity_60,user-item_affinity_61,user-item_affinity_62,user-item_affinity_63,user-item_affinity_64,user-item_affinity_65,user-item_affinity_66,user-item_affinity_67,user-item_affinity_68,user-item_affinity_69,user-item_affinity_70,user-item_affinity_71,user-item_affinity_72,user-item_affinity_73,user-item_affinity_74,user-item_affinity_75,user-item_affinity_76,user-item_affinity_77,user-item_affinity_78,user-item_affinity_79
0,2019-11-24 00:00:17.004101+00:00,79,2,0,0.087125,A0,A1,A2,A3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2019-11-24 00:00:19.715857+00:00,14,1,0,0.006235,A0,B1,B2,B3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2019-11-24 00:01:04.303227+00:00,18,2,0,0.0613,A0,A1,C2,A3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2019-11-24 00:01:11.571162+00:00,28,1,0,0.01943,A0,A1,A2,B3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2019-11-24 00:02:41.811768+00:00,65,2,0,0.019375,A0,A1,C2,B3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## EDA 

In [6]:
# Calculate manual propensity statistics using the helper function
propensity_manual_df, propensity_stats, stats_df = calculate_propensity_stats(
    log_df_readable, 
    groupby_col='item_id'
)

# Display the statistics DataFrame
display(stats_df)

# Plot the statistics using the reusable plot_bar_chart function
fig = plot_bar_chart(
    data=stats_df,
    x_col='statistic',
    y_col='value',
    x_title='Statistic',
    y_title='Propensity Value',
    title='Manual Propensity Statistics (Mean and Percentiles)',
    height=500,
    show_values=True,
    value_format='{:.6f}'
)

# Customize x-axis ordering
fig.update_layout(
    xaxis={'categoryorder': 'array', 'categoryarray': ['mean', 'p25', 'p50', 'p75', 'p99']},
    yaxis_title='Propensity Score'
)

fig.show()

Manual Propensity Statistics:
  mean: 0.012500
  p25: 0.001700
  p50: 0.003850
  p75: 0.012175
  p99: 0.082929


Unnamed: 0,statistic,value
0,mean,0.0125
1,p25,0.0017
2,p50,0.00385
3,p75,0.012175
4,p99,0.082929


### Distribution of manual propensity scores with statistics

In [7]:
# Create a histogram with vertical lines for each statistic using the reusable function
fig = plot_histogram_with_stats(
    data=propensity_manual_df,
    column_name='manual_propensity',
    stats_dict=propensity_stats,
    title='Distribution of Manual Propensity Scores with Statistics',
    xlabel='Manual Propensity Score',
    height=500,
    nbins=50
)

fig.show()

### Propensity Score Distribution - Rounded to 2 Decimals

To make the PDF/CDF readable and interpretable, we'll round propensity scores to 2 decimal places.
This allows us to see what propensity score value corresponds to a given cumulative percentage.

In [8]:
# Create a working dataframe with rounded propensity scores
propensity_rounded_df = log_df_readable.copy()

# Round propensity scores to 2 decimal places
propensity_rounded_df['propensity_score_rounded'] = propensity_rounded_df['propensity_score'].round(2)

# Show summary statistics
print("Propensity Score Summary (Rounded to 2 Decimals):")
print("="*60)
print(f"Original unique values: {log_df_readable['propensity_score'].nunique():,}")
print(f"Rounded unique values:  {propensity_rounded_df['propensity_score_rounded'].nunique():,}")
print(f"Total observations:     {len(propensity_rounded_df):,}")
print("="*60)

# Show some key statistics
print("\nKey Percentiles (Rounded Propensity Scores):")
for p in [10, 25, 50, 75, 90, 95, 99]:
    val = propensity_rounded_df['propensity_score_rounded'].quantile(p/100)
    print(f"  {p:2d}th percentile: {val:.2f} ({p}% of rows have propensity score ≤ {val:.2f})")

print("="*60)

# Plot the PDF and CDF of rounded propensity scores using our reusable function
plot_distribution_with_cdf(
    data=propensity_rounded_df,
    column_name='propensity_score_rounded',
    title='Propensity Score (from original dataset) Distribution (Rounded to 2 Decimals) - All Rows',
    sort_by='value',  # Sort by propensity score value
    height=600
)

Propensity Score Summary (Rounded to 2 Decimals):
Original unique values: 7,883
Rounded unique values:  90
Total observations:     10,000

Key Percentiles (Rounded Propensity Scores):
  10th percentile: 0.01 (10% of rows have propensity score ≤ 0.01)
  25th percentile: 0.02 (25% of rows have propensity score ≤ 0.02)
  50th percentile: 0.06 (50% of rows have propensity score ≤ 0.06)
  75th percentile: 0.15 (75% of rows have propensity score ≤ 0.15)
  90th percentile: 0.24 (90% of rows have propensity score ≤ 0.24)
  95th percentile: 0.34 (95% of rows have propensity score ≤ 0.34)
  99th percentile: 0.71 (99% of rows have propensity score ≤ 0.71)

Distribution Summary: propensity_score_rounded
Total observations: 10,000
Unique values: 90

Top 5 values:
 propensity_score_rounded  count  percentage  cumulative_pct
                     0.00    750        7.50            7.50
                     0.01   1087       10.87           18.37
                     0.02    844        8.44           2

In [9]:
log_df_readable['propensity_score'].max()

0.95424

In [10]:
log_df_readable[log_df_readable['propensity_score']== log_df_readable['propensity_score'].max()]

Unnamed: 0,timestamp,item_id,position,click,propensity_score,user_feature_0,user_feature_1,user_feature_2,user_feature_3,user-item_affinity_0,user-item_affinity_1,user-item_affinity_2,user-item_affinity_3,user-item_affinity_4,user-item_affinity_5,user-item_affinity_6,user-item_affinity_7,user-item_affinity_8,user-item_affinity_9,user-item_affinity_10,user-item_affinity_11,user-item_affinity_12,user-item_affinity_13,user-item_affinity_14,user-item_affinity_15,user-item_affinity_16,user-item_affinity_17,user-item_affinity_18,user-item_affinity_19,user-item_affinity_20,user-item_affinity_21,user-item_affinity_22,user-item_affinity_23,user-item_affinity_24,user-item_affinity_25,user-item_affinity_26,user-item_affinity_27,user-item_affinity_28,user-item_affinity_29,user-item_affinity_30,user-item_affinity_31,user-item_affinity_32,user-item_affinity_33,user-item_affinity_34,user-item_affinity_35,user-item_affinity_36,user-item_affinity_37,user-item_affinity_38,user-item_affinity_39,user-item_affinity_40,user-item_affinity_41,user-item_affinity_42,user-item_affinity_43,user-item_affinity_44,user-item_affinity_45,user-item_affinity_46,user-item_affinity_47,user-item_affinity_48,user-item_affinity_49,user-item_affinity_50,user-item_affinity_51,user-item_affinity_52,user-item_affinity_53,user-item_affinity_54,user-item_affinity_55,user-item_affinity_56,user-item_affinity_57,user-item_affinity_58,user-item_affinity_59,user-item_affinity_60,user-item_affinity_61,user-item_affinity_62,user-item_affinity_63,user-item_affinity_64,user-item_affinity_65,user-item_affinity_66,user-item_affinity_67,user-item_affinity_68,user-item_affinity_69,user-item_affinity_70,user-item_affinity_71,user-item_affinity_72,user-item_affinity_73,user-item_affinity_74,user-item_affinity_75,user-item_affinity_76,user-item_affinity_77,user-item_affinity_78,user-item_affinity_79
871,2019-11-24 12:11:57.430499+00:00,49,1,0,0.95424,A0,D1,B2,B3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
879,2019-11-24 12:14:03.832273+00:00,49,1,0,0.95424,A0,A1,C2,A3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
880,2019-11-24 12:14:50.257724+00:00,49,1,0,0.95424,A0,A1,B2,B3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
884,2019-11-24 12:16:03.655893+00:00,49,1,0,0.95424,A0,B1,A2,D3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
886,2019-11-24 12:16:53.154005+00:00,49,1,0,0.95424,A0,A1,B2,B3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
893,2019-11-24 12:20:23.535788+00:00,49,1,0,0.95424,A0,A1,D2,B3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Propensity Score Distribution by Item ID

Visualize how propensity scores are distributed across different items using boxplots.

In [11]:
# Process data and calculate statistics
plot_df, stats_by_item, summary_by_item = calculate_distribution_stats(
    df=log_df_readable,
    value_col='propensity_score',
    groupby_col='item_id',
    groupby_col_as_string=True
)

# Create the boxplot visualization
fig = plot_boxplot_with_stats(
    plot_df=plot_df,
    stats_df=stats_by_item,
    value_col='propensity_score',
    groupby_col_str='item_id_str',
    title='Distribution of Propensity Scores by Item ID',
    xlabel='Item ID',
    ylabel='Propensity Score',
    height=600,
    show_mean_overlay=True
)

fig.show()

# Print summary statistics
print("\nPropensity Score Statistics by Item ID:")
print("="*60)

print("\nFirst 10 items:")
display(summary_by_item.head(10))

print("\nItems with highest mean propensity:")
display(summary_by_item.nlargest(5, 'mean'))

print("\nItems with lowest mean propensity:")
display(summary_by_item.nsmallest(5, 'mean'))


Propensity Score Statistics by Item ID:

First 10 items:


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,125,0.0509,0.0531,0.0024,0.0176,0.0364,0.0632,0.2786
1,31,0.0077,0.0073,0.0006,0.0026,0.0059,0.0102,0.0349
2,17,0.0049,0.0053,0.0001,0.0012,0.003,0.0066,0.0189
3,46,0.0175,0.02,0.0002,0.0044,0.0115,0.0217,0.1062
4,17,0.0133,0.0204,0.0006,0.0018,0.0049,0.0133,0.069
5,22,0.0062,0.0047,0.0008,0.0027,0.0053,0.0093,0.0172
6,14,0.0064,0.0068,0.0001,0.0013,0.0044,0.0082,0.0205
7,741,0.1593,0.1568,0.0038,0.0767,0.1219,0.17,0.8672
8,48,0.0122,0.0118,0.0004,0.0034,0.0065,0.0174,0.043
9,18,0.0036,0.0033,0.0003,0.001,0.0023,0.0065,0.0099



Items with highest mean propensity:


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
49,408,0.2924,0.3096,0.0021,0.042,0.1394,0.5897,0.9542
39,756,0.2248,0.1828,0.0016,0.0894,0.1821,0.2767,0.7691
51,1105,0.1875,0.0961,0.0043,0.1243,0.1843,0.2408,0.6199
7,741,0.1593,0.1568,0.0038,0.0767,0.1219,0.17,0.8672
61,704,0.1358,0.0945,0.0041,0.0735,0.1207,0.1789,0.6469



Items with lowest mean propensity:


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
56,9,0.0019,0.0014,0.0001,0.0009,0.0016,0.0029,0.0047
20,11,0.0034,0.0043,0.0004,0.0012,0.0014,0.0031,0.012
9,18,0.0036,0.0033,0.0003,0.001,0.0023,0.0065,0.0099
74,13,0.0037,0.0037,0.0005,0.0015,0.0017,0.0038,0.0127
10,9,0.0041,0.0023,0.001,0.0027,0.0032,0.0069,0.0072


### Propensity Score Distribution by Position

Visualize how propensity scores are distributed across different positions (1, 2, 3) using the same reusable functions.

In [12]:
# Process data and calculate statistics for position
plot_df_pos, stats_by_position, summary_by_position = calculate_distribution_stats(
    df=log_df_readable,
    value_col='propensity_score',
    groupby_col='position',
    groupby_col_as_string=True
)

# Create the boxplot visualization
fig = plot_boxplot_with_stats(
    plot_df=plot_df_pos,
    stats_df=stats_by_position,
    value_col='propensity_score',
    groupby_col_str='position_str',
    title='Distribution of Propensity Scores by Position',
    xlabel='Position',
    ylabel='Propensity Score',
    height=500,
    show_mean_overlay=True
)

fig.show()

# Print summary statistics
print("\nPropensity Score Statistics by Position:")
print("="*60)
display(summary_by_position)

print("\nKey Insights:")
print(f"  • Position with highest mean propensity: {summary_by_position['mean'].idxmax()}")
print(f"    Mean: {summary_by_position['mean'].max():.4f}")
print(f"  • Position with lowest mean propensity: {summary_by_position['mean'].idxmin()}")
print(f"    Mean: {summary_by_position['mean'].min():.4f}")


Propensity Score Statistics by Position:


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
position,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,3362,0.1467,0.1977,0.0,0.0189,0.0599,0.1821,0.9542
2,3317,0.0956,0.085,0.0001,0.0251,0.068,0.1514,0.4563
3,3321,0.0838,0.0692,0.0001,0.0248,0.0649,0.1313,0.3211



Key Insights:
  • Position with highest mean propensity: 1
    Mean: 0.1467
  • Position with lowest mean propensity: 3
    Mean: 0.0838


### Propensity Score Distribution by User Features

Loop through all user features to analyze propensity score distributions across each feature's values.

In [13]:
# Loop through all user features and create visualizations
for i in range(4):  # user_feature_0 through user_feature_3
    feature_name = f'user_feature_{i}'
    
    print(f"\n{'='*70}")
    print(f"Processing: {feature_name}")
    print(f"{'='*70}\n")
    
    # Process data and calculate statistics
    plot_df, stats_by_feature, summary_by_feature = calculate_distribution_stats(
        df=log_df_readable,
        value_col='propensity_score',
        groupby_col=feature_name,
        groupby_col_as_string=False  # Already a string/categorical
    )
    
    # Create the boxplot visualization
    fig = plot_boxplot_with_stats(
        plot_df=plot_df,
        stats_df=stats_by_feature,
        value_col='propensity_score',
        groupby_col_str=feature_name,
        title=f'Distribution of Propensity Scores by User Feature {i}',
        xlabel=f'User Feature {i}',
        ylabel='Propensity Score',
        height=500,
        show_mean_overlay=True
    )
    
    fig.show()
    
    # Print summary statistics
    print(f"\nPropensity Score Statistics by User Feature {i}:")
    print("="*60)
    display(summary_by_feature)
    print("\n")


Processing: user_feature_0




Propensity Score Statistics by User Feature 0:


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
user_feature_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
A0,8123,0.1086,0.1324,0.0001,0.0223,0.0646,0.1486,0.9542
B0,1808,0.1102,0.1393,0.0,0.0233,0.0638,0.1505,0.9177
C0,69,0.107,0.1388,0.0018,0.022,0.0604,0.1368,0.8863





Processing: user_feature_1




Propensity Score Statistics by User Feature 1:


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
user_feature_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
A1,8322,0.1095,0.1344,0.0,0.0224,0.0648,0.1506,0.9542
B1,854,0.1099,0.1395,0.0003,0.022,0.0644,0.1428,0.9542
C1,138,0.089,0.1099,0.0001,0.0143,0.0588,0.1305,0.8863
D1,681,0.1039,0.1214,0.0001,0.0249,0.0638,0.1438,0.9542
E1,5,0.064,0.0633,0.0069,0.0279,0.0447,0.0717,0.169





Processing: user_feature_2




Propensity Score Statistics by User Feature 2:


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
user_feature_2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
A2,1569,0.1034,0.1234,0.0,0.0218,0.0617,0.1452,0.9542
B2,2439,0.111,0.1355,0.0001,0.0227,0.0659,0.1521,0.9542
C2,2411,0.1097,0.1352,0.0001,0.0225,0.0657,0.1509,0.9542
D2,1407,0.1147,0.1425,0.0001,0.025,0.0662,0.1534,0.9542
E2,1514,0.1044,0.1301,0.0002,0.0208,0.06,0.1454,0.9177
F2,491,0.1115,0.1355,0.0001,0.0218,0.0665,0.1411,0.8226
G2,162,0.1024,0.1282,0.0004,0.018,0.0615,0.1367,0.8487
H2,6,0.0921,0.0838,0.0357,0.0383,0.0392,0.1572,0.204
I2,1,0.0155,,0.0155,0.0155,0.0155,0.0155,0.0155





Processing: user_feature_3




Propensity Score Statistics by User Feature 3:


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
user_feature_3,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
A3,3523,0.1096,0.1343,0.0001,0.0237,0.0648,0.1455,0.9542
B3,3667,0.1093,0.136,0.0001,0.0217,0.0642,0.1508,0.9542
C3,549,0.1178,0.1481,0.0001,0.0244,0.0682,0.1521,0.8487
D3,1164,0.1033,0.1208,0.0002,0.0212,0.062,0.1468,0.9542
E3,913,0.1079,0.1298,0.0,0.0206,0.0668,0.1537,0.9144
F3,157,0.0968,0.1206,0.0006,0.021,0.055,0.1287,0.7691
G3,1,0.2622,,0.2622,0.2622,0.2622,0.2622,0.2622
H3,25,0.1063,0.1653,0.003,0.0232,0.0518,0.1351,0.8122
I3,1,0.0827,,0.0827,0.0827,0.0827,0.0827,0.0827






### Variance of Propensity Scores Across Variables

Compare the variance of propensity scores when grouped by different variables (item_id, position, user features).

In [14]:




# Compare variance across different groupings
variables_to_compare = ['item_id', 'position', 'user_feature_0', 'user_feature_1', 
                        'user_feature_2', 'user_feature_3']

variance_comparison = []
all_group_stats = {}

print("="*80)
print("PROPENSITY SCORE VARIANCE ANALYSIS")
print("="*80)

for var in variables_to_compare:
    print(f"\n{'='*80}")
    print(f"Variable: {var}")
    print(f"{'='*80}")
    
    metrics, group_stats = compute_propensity_variance(log_df_readable, var)
    variance_comparison.append(metrics)
    all_group_stats[var] = group_stats
    
    print(f"  Number of groups: {metrics['n_groups']}")
    print(f"  Total variance: {metrics['total_variance']:.6f}")
    print(f"  Within-group variance: {metrics['within_group_variance']:.6f}")
    print(f"  Between-group variance: {metrics['between_group_variance']:.6f}")
    print(f"  Variance explained: {metrics['variance_explained_ratio']:.2%}")
    
    # Show groups with highest and lowest variance
    print(f"\n  Top 3 groups by variance:")
    top_var = group_stats.nlargest(3, 'variance')
    for _, row in top_var.iterrows():
        print(f"    {var}={row[var]}: var={row['variance']:.6f}, mean={row['mean']:.6f}, n={int(row['count'])}")
    
    print(f"\n  Top 3 groups by mean propensity:")
    top_mean = group_stats.nlargest(3, 'mean')
    for _, row in top_mean.iterrows():
        print(f"    {var}={row[var]}: mean={row['mean']:.6f}, var={row['variance']:.6f}, n={int(row['count'])}")
        
# Create summary DataFrame
variance_summary_df = pd.DataFrame(variance_comparison)
variance_summary_df = variance_summary_df.sort_values('variance_explained_ratio', ascending=False)

print(f"\n{'='*80}")
print("SUMMARY: Variance Comparison Across All Variables")
print(f"{'='*80}\n")
display(variance_summary_df)

print("\nKey Insights:")
print(f"  • Variable explaining most variance: {variance_summary_df.iloc[0]['groupby_variable']}")
print(f"    Explains {variance_summary_df.iloc[0]['variance_explained_ratio']:.2%} of total variance")
print(f"  • Variable explaining least variance: {variance_summary_df.iloc[-1]['groupby_variable']}")
print(f"    Explains {variance_summary_df.iloc[-1]['variance_explained_ratio']:.2%} of total variance")

PROPENSITY SCORE VARIANCE ANALYSIS

Variable: item_id
  Number of groups: 80
  Total variance: 0.017876
  Within-group variance: 0.012112
  Between-group variance: 0.005787
  Variance explained: 32.37%

  Top 3 groups by variance:
    item_id=49.0: var=0.095865, mean=0.292416, n=408
    item_id=39.0: var=0.033423, mean=0.224788, n=756
    item_id=7.0: var=0.024598, mean=0.159300, n=741

  Top 3 groups by mean propensity:
    item_id=49.0: mean=0.292416, var=0.095865, n=408
    item_id=39.0: mean=0.224788, var=0.033423, n=756
    item_id=51.0: mean=0.187483, var=0.009236, n=1105

Variable: position
  Number of groups: 3
  Total variance: 0.017876
  Within-group variance: 0.017130
  Between-group variance: 0.000750
  Variance explained: 4.19%

  Top 3 groups by variance:
    position=1.0: var=0.039080, mean=0.146746, n=3362
    position=2.0: var=0.007231, mean=0.095555, n=3317
    position=3.0: var=0.004795, mean=0.083811, n=3321

  Top 3 groups by mean propensity:
    position=1.0: mean

Unnamed: 0,groupby_variable,n_groups,total_variance,within_group_variance,between_group_variance,variance_explained_ratio
0,item_id,80,0.017876,0.012112,0.00578704,0.323732
1,position,3,0.017876,0.01713,0.0007496589,0.041937
4,user_feature_2,9,0.017876,0.017872,1.571984e-05,0.000879
5,user_feature_3,9,0.017876,0.017874,1.304707e-05,0.00073
3,user_feature_1,5,0.017876,0.017872,8.571148e-06,0.000479
2,user_feature_0,3,0.017876,0.017879,3.900129e-07,2.2e-05



Key Insights:
  • Variable explaining most variance: item_id
    Explains 32.37% of total variance
  • Variable explaining least variance: user_feature_0
    Explains 0.00% of total variance


In [15]:
# Visualize variance comparison
fig = go.Figure()

# Create grouped bar chart for variance components
fig.add_trace(go.Bar(
    name='Between-Group Variance',
    x=variance_summary_df['groupby_variable'],
    y=variance_summary_df['between_group_variance'],
    marker_color='#3498db',
    text=variance_summary_df['between_group_variance'].round(6),
    textposition='auto',
))

fig.add_trace(go.Bar(
    name='Within-Group Variance',
    x=variance_summary_df['groupby_variable'],
    y=variance_summary_df['within_group_variance'],
    marker_color='#e74c3c',
    text=variance_summary_df['within_group_variance'].round(6),
    textposition='auto',
))

fig.update_layout(
    title='Propensity Score Variance Decomposition by Grouping Variable',
    xaxis_title='Grouping Variable',
    yaxis_title='Variance',
    barmode='group',
    height=500,
    showlegend=True,
    hovermode='x unified'
)

fig.show()

# Create bar chart for variance explained ratio
fig2 = px.bar(
    variance_summary_df,
    x='groupby_variable',
    y='variance_explained_ratio',
    title='Proportion of Propensity Score Variance Explained by Each Variable',
    labels={
        'groupby_variable': 'Grouping Variable',
        'variance_explained_ratio': 'Variance Explained (R²)'
    },
    text=variance_summary_df['variance_explained_ratio'].apply(lambda x: f'{x:.2%}'),
    color='variance_explained_ratio',
    color_continuous_scale='Blues',
    height=500
)

fig2.update_traces(textposition='outside')
fig2.update_layout(showlegend=False)
fig2.show()

print("\n✓ Variance comparison visualizations complete")


✓ Variance comparison visualizations complete


In [None]:
# Compute VIF for user features to demonstrate the difference
print("="*80)
print("VARIANCE INFLATION FACTOR (VIF) ANALYSIS")
print("="*80)
print("\nVIF measures multicollinearity: How much can each feature be predicted")
print("by the other features?\n")

# Encode categorical features
le_vif = {}
user_features_encoded = log_df_readable.copy()

for col in user_feature_cols:
    le = LabelEncoder()
    user_features_encoded[f'{col}_encoded'] = le.fit_transform(user_features_encoded[col].astype(str))
    le_vif[col] = le

# Get encoded feature columns
encoded_cols = [f'{col}_encoded' for col in user_feature_cols]
X_vif = user_features_encoded[encoded_cols]

# Compute VIF for each feature
vif_results = []

for i, target_col in enumerate(encoded_cols):
    # Use other features to predict this feature
    other_cols = [col for j, col in enumerate(encoded_cols) if j != i]
    
    X_others = X_vif[other_cols]
    y_target = X_vif[target_col]
    
    # Fit regression model
    lr = LinearRegression()
    lr.fit(X_others, y_target)
    
    # Calculate R²
    r2 = r2_score(y_target, lr.predict(X_others))
    
    # Calculate VIF
    vif = 1 / (1 - r2) if r2 < 0.9999 else float('inf')
    
    original_col = user_feature_cols[i]
    vif_results.append({
        'feature': original_col,
        'R²_with_others': r2,
        'VIF': vif
    })
    
    print(f"{original_col}:")
    print(f"  R² (predicted by other features): {r2:.4f}")
    print(f"  VIF: {vif:.2f}")
    
    if vif < 5:
        print(f"  ✓ Low multicollinearity")
    elif vif < 10:
        print(f"  ⚠ Moderate multicollinearity")
    else:
        print(f"  ✗ High multicollinearity - consider removing")
    print()

# Create VIF summary DataFrame
vif_df = pd.DataFrame(vif_results)

print("="*80)
print("VIF SUMMARY")
print("="*80)
display(vif_df)

print("\nInterpretation Guide:")
print("  VIF = 1     → No correlation with other features")
print("  VIF = 1-5   → Low correlation (acceptable)")
print("  VIF = 5-10  → Moderate correlation (caution)")
print("  VIF > 10    → High correlation (problematic for regression)")

# Compare with our variance decomposition results
print("\n" + "="*80)
print("COMPARISON: Variance Decomposition R² vs VIF")
print("="*80)
print("\nVariance Decomposition R² (from earlier):")
print("  → How much does each feature explain propensity score variance?")
display(variance_summary_df[['groupby_variable', 'variance_explained_ratio']])

print("\nVIF (R² with other features):")
print("  → How much can each feature be predicted by other features?")
display(vif_df[['feature', 'R²_with_others', 'VIF']])

print("\n" + "="*80)
print("KEY INSIGHT:")
print("="*80)
print("• Variance Decomposition R² tells us which features matter for propensity scores")
print("• VIF tells us if features are redundant with each other")
print("• Both are useful but answer different questions!")
print("="*80)

VARIANCE INFLATION FACTOR (VIF) ANALYSIS

VIF measures multicollinearity: How much can each feature be predicted
by the other features?

user_feature_0:
  R² (predicted by other features): 0.0126
  VIF: 1.01
  ✓ Low multicollinearity

user_feature_1:
  R² (predicted by other features): 0.0619
  VIF: 1.07
  ✓ Low multicollinearity

user_feature_2:
  R² (predicted by other features): 0.0609
  VIF: 1.06
  ✓ Low multicollinearity

user_feature_3:
  R² (predicted by other features): 0.0050
  VIF: 1.01
  ✓ Low multicollinearity

VIF SUMMARY


Unnamed: 0,feature,R²_with_others,VIF
0,user_feature_0,0.012644,1.012806
1,user_feature_1,0.06187,1.065951
2,user_feature_2,0.060916,1.064867
3,user_feature_3,0.005023,1.005049



Interpretation Guide:
  VIF = 1     → No correlation with other features
  VIF = 1-5   → Low correlation (acceptable)
  VIF = 5-10  → Moderate correlation (caution)
  VIF > 10    → High correlation (problematic for regression)

COMPARISON: Variance Decomposition R² vs VIF

Variance Decomposition R² (from earlier):
  → How much does each feature explain propensity score variance?


Unnamed: 0,groupby_variable,variance_explained_ratio
0,item_id,0.323732
1,position,0.041937
4,user_feature_2,0.000879
5,user_feature_3,0.00073
3,user_feature_1,0.000479
2,user_feature_0,2.2e-05



VIF (R² with other features):
  → How much can each feature be predicted by other features?


Unnamed: 0,feature,R²_with_others,VIF
0,user_feature_0,0.012644,1.012806
1,user_feature_1,0.06187,1.065951
2,user_feature_2,0.060916,1.064867
3,user_feature_3,0.005023,1.005049



KEY INSIGHT:
• Variance Decomposition R² tells us which features matter for propensity scores
• VIF tells us if features are redundant with each other
• Both are useful but answer different questions!


### Item Based Propensity Scores (logged vs manually computed)

In [17]:
# Test all stats module functions to ensure they work correctly after refactoring

print("=" * 80)
print("TESTING STATS MODULE FUNCTIONS")
print("=" * 80)

# Test 1: compute_item_feature_distribution
print("\n1. Testing compute_item_feature_distribution...")
test_result_1 = compute_item_feature_distribution(log_df_readable, item_id=0, feature_col='user_feature_0')
print("✓ Success! Result:")
print(test_result_1)

# Test 2: compute_manual_propensity
print("\n2. Testing compute_manual_propensity...")
test_result_2 = compute_manual_propensity(log_df_readable, categorical_col='position')
print(f"✓ Success! Computed {len(test_result_2)} rows. First 5:")
print(test_result_2.head())

# Test 3: compute_item_propensity_stats
print("\n3. Testing compute_item_propensity_stats...")
test_result_3 = compute_item_propensity_stats(log_df_readable, item_id=0)
print("✓ Success! Result:")
print(test_result_3)

# Test 4: calculate_distribution_stats (wrapper returns 3 values)
print("\n4. Testing calculate_distribution_stats...")
test_plot_df, test_stats_df, test_summary_df = calculate_distribution_stats(log_df_readable, 'propensity_score', 'item_id')
print(f"✓ Success! Got plot_df with {len(test_plot_df)} rows, stats_df with {len(test_stats_df)} rows, and summary_df with {len(test_summary_df)} rows")
print("Stats summary (first 3 items):")
print(test_stats_df.head(3))

# Test 5: compute_propensity_variance (wrapper returns 2 values: variance_metrics dict, group_stats)
print("\n5. Testing compute_propensity_variance...")
test_var_metrics_item, test_group_stats_item = compute_propensity_variance(log_df_readable, 'item_id')
test_var_metrics_pos, test_group_stats_pos = compute_propensity_variance(log_df_readable, 'position')
print(f"✓ Success!")
print(f"   Variance by item_id: {test_var_metrics_item['between_group_variance']:.6f}")
print(f"   Variance explained: {test_var_metrics_item['variance_explained_ratio']:.2%}")
print(f"   Variance by position: {test_var_metrics_pos['between_group_variance']:.6f}")
print(f"   Variance explained: {test_var_metrics_pos['variance_explained_ratio']:.2%}")

print("\n" + "=" * 80)
print("ALL TESTS PASSED! ✓ Stats module is working correctly.")
print("=" * 80)

TESTING STATS MODULE FUNCTIONS

1. Testing compute_item_feature_distribution...
✓ Success! Result:
  user_feature_0  count  total_occurrences  proportion
0             A0     94                125       0.752
1             B0     31                125       0.248

2. Testing compute_manual_propensity...
✓ Success! Computed 239 rows. First 5:
   item_id  position  count_of_occurrences  total  manual_propensity
0        0         1                    39  10000             0.0039
1        0         2                    39  10000             0.0039
2        0         3                    47  10000             0.0047
3        1         1                    15  10000             0.0015
4        1         2                    12  10000             0.0012

3. Testing compute_item_propensity_stats...
✓ Success! Result:
   item_id  count_of_occurrences  total  manual_propensity  mean_propensity_score
0        0                   125  10000             0.0125               0.050874

4. Testing ca

In [18]:
# Example: Use the function for item_id=0 and user_feature_0
temp_df_item_0 = compute_item_feature_distribution(log_df_readable, item_id=0, feature_col='user_feature_0')
temp_df_item_0

Unnamed: 0,user_feature_0,count,total_occurrences,proportion
0,A0,94,125,0.752
1,B0,31,125,0.248


In [19]:
# Example 1: Compute manual propensity for item_id and user_feature_0
manual_prop_uf0 = compute_manual_propensity(log_df_readable, categorical_col='user_feature_0')
print("Manual propensity by item_id and user_feature_0:")
display(manual_prop_uf0.head(10))
print(f"\nTotal rows: {len(manual_prop_uf0)}")

Manual propensity by item_id and user_feature_0:


Unnamed: 0,item_id,user_feature_0,count_of_occurrences,total,manual_propensity
0,0,A0,94,10000,0.0094
1,0,B0,31,10000,0.0031
2,1,A0,24,10000,0.0024
3,1,B0,7,10000,0.0007
4,2,A0,14,10000,0.0014
5,2,B0,3,10000,0.0003
6,3,A0,42,10000,0.0042
7,3,B0,4,10000,0.0004
8,4,A0,13,10000,0.0013
9,4,B0,3,10000,0.0003



Total rows: 187


In [20]:
item_id_0_manual_propensity_global_feature_0 =log_df_readable.groupby(['item_id', 'user_feature_0'])['propensity_score'].mean().reset_index()
item_id_0_manual_propensity_global_feature_0 = item_id_0_manual_propensity_global_feature_0[item_id_0_manual_propensity_global_feature_0['item_id'] == 0]
item_id_0_manual_propensity_global_feature_0

Unnamed: 0,item_id,user_feature_0,propensity_score
0,0,A0,0.05419
1,0,B0,0.040818


In [21]:
log_df_readable[(log_df_readable['item_id'] == 0) & (log_df_readable['user_feature_0'] == 'A0')]['propensity_score'].mean()

0.05419

In [22]:
log_df_readable[(log_df_readable['item_id'] == 0) & (log_df_readable['user_feature_0'] == 'B0')]['propensity_score'].mean()

0.040818225806451607

In [23]:
log_df_readable[(log_df_readable['item_id'] == 0)]['click'].mean()

0.0

In [24]:
log_df_readable[(log_df_readable['item_id'] == 49)]['click'].mean()

0.0024509803921568627