In [1]:
import sys
from pathlib import Path

# Absolute path of the parent of the current working directory; it is put at
# the front of sys.path so that the `src` package can be imported.
PROJECT_ROOT = Path("..").resolve()
root_entry = str(PROJECT_ROOT)
# Only insert once, so re-running the cell does not add duplicate entries.
if root_entry not in sys.path:
    sys.path.insert(0, root_entry)

from src.data_prep import load_sessions, flatten_events, build_session_features

# Input file (relative to the working directory) and the value passed as
# `n` to load_sessions -- presumably the number of sessions to read; confirm
# against src.data_prep.
DATA_PATH = "../data/otto-recsys-train.jsonl"
N_SESSIONS = 50000

# Pipeline: raw sessions -> flattened event table -> per-session features.
sessions = load_sessions(DATA_PATH, n=N_SESSIONS)
event_df = flatten_events(sessions)
session_features = build_session_features(event_df)

接下来进行 A/B Test。首先进行随机分组（模拟线上流量切分）。

In [2]:
import numpy as np

# Seed NumPy's global RNG so the random split is the same on every run.
np.random.seed(42)

# Draw one arm label per row of session_features (equal probability for
# each label, since no weights are passed to np.random.choice).
arm_labels = ['control', 'treatment']
n_rows = len(session_features)
session_features['group'] = np.random.choice(arm_labels, size=n_rows)

# Display how many rows landed in each arm.
session_features['group'].value_counts()

group
control      25046
treatment    24954
Name: count, dtype: int64

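假设：新算法能提升高 cart 用户的转化概率（设想的提升幅度约为 20%）。这里人为制造 uplift：把 treatment 组中 cart_cnt 高于 75% 分位数的会话全部标记为已转化。注意：这相当于把这部分会话的转化率直接设为 100%，是比“提升 20%”更极端的简化模拟。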

In [3]:
# Start from a copy of the observed outcome; rows not selected below keep
# their original `converted` value.
session_features['new_converted'] = session_features['converted'].copy()

# "High cart" means cart_cnt strictly above its 75th percentile.
high_cart_threshold = session_features['cart_cnt'].quantile(0.75)

in_treatment = session_features['group'].eq('treatment')
above_threshold = session_features['cart_cnt'].gt(high_cart_threshold)
mask = in_treatment & above_threshold

# Force every high-cart treatment row to converted (synthetic uplift).
session_features.loc[mask, 'new_converted'] = True

计算两组转化率。

In [4]:
# Average new_converted within each group to get the per-arm conversion rate.
by_group = session_features.groupby('group')
ab_result = by_group['new_converted'].mean()
ab_result

group
control      0.228220
treatment    0.316021
Name: new_converted, dtype: float64

显著性检验。

In [5]:
from statsmodels.stats.proportion import proportions_ztest

# Split the rows into the two experiment arms.
group_labels = session_features['group']
control = session_features[group_labels == 'control']
treatment = session_features[group_labels == 'treatment']

# Converted rows and total rows per arm. Treatment is listed first, so a
# positive z-statistic means the treatment rate is above the control rate.
count = [arm['new_converted'].sum() for arm in (treatment, control)]
nobs = [len(arm) for arm in (treatment, control)]

# Two-sample z-test on proportions, using the statsmodels defaults.
stat, pval = proportions_ztest(count, nobs)

print("Z-stat:", stat)
print("p-value:", pval)

Z-stat: 22.058988433056772
p-value: 7.831456913511434e-108


p-value ≈ 7.8e-108，远小于 0.05，两组转化率的差异在统计上显著（该差异来自前面人为注入的 uplift）。

计算 uplift（treatment 与 control 转化率之差）及其 95% 置信区间（CI）。

In [6]:
import statsmodels.api as sm

# Conversion rate of each arm.
p1 = treatment['new_converted'].mean()
p2 = control['new_converted'].mean()

# Absolute uplift: treatment rate minus control rate.
uplift = p1 - p2

# Standard error of the difference, from each arm's own (unpooled) variance.
var_treatment = p1*(1-p1)/len(treatment)
var_control = p2*(1-p2)/len(control)
se = np.sqrt(var_treatment + var_control)

# 95% CI using the normal approximation (z = 1.96).
half_width = 1.96*se
ci_lower = uplift - half_width
ci_upper = uplift + half_width

print("Uplift:", uplift)
print("95% CI:", ci_lower, ci_upper)

Uplift: 0.08780140446043494
95% CI: 0.08003661234957973 0.09556619657129015
