# Effect Size + CUPED (Cookie Cats)
Compute diff-in-means and show CUPED example with a pre-period covariate.

In [1]:
import pandas as pd, numpy as np
from scipy import stats

df = pd.read_csv('../data/cookie_cats.csv')
df['variant'] = df['version'].map({'gate_30':'A','gate_40':'B'})
A = df.query("variant=='A'")
B = df.query("variant=='B'")

# Target metric: retention 7 as mean of 0/1
mA, mB = A['retention_7'].mean(), B['retention_7'].mean()
nA, nB = len(A), len(B)
diff = mB - mA

# 95% CI for diff-in-means (Bernoulli -> normal approx)
se = np.sqrt(mA*(1-mA)/nA + mB*(1-mB)/nB)
lo, hi = diff - 1.96*se, diff + 1.96*se
print(f"Δ retention_7d = {diff:.4f}  (95% CI [{lo:.4f}, {hi:.4f}])")

# CUPED using retention_1 as covariate
xA, xB = A['retention_1'], B['retention_1']
theta = np.cov(df['retention_7'], df['retention_1'], ddof=0)[0,1] / np.var(df['retention_1'], ddof=0)
A_cuped = A['retention_7'] - theta*(xA - xA.mean())
B_cuped = B['retention_7'] - theta*(xB - xB.mean())
mA_c, mB_c = A_cuped.mean(), B_cuped.mean()
diff_c = mB_c - mA_c
se_c = np.sqrt(A_cuped.var()/nA + B_cuped.var()/nB)  # conservative
lo_c, hi_c = diff_c - 1.96*se_c, diff_c + 1.96*se_c
print(f"CUPED Δ = {diff_c:.4f}  (95% CI [{lo_c:.4f}, {hi_c:.4f}])")

# save artifct
with open('../docs/effect_size_ci.txt','w') as f:
    f.write(f"diff={diff:.4f}, CI=[{lo:.4f},{hi:.4f}], diff_cuped={diff_c:.4f}, CI_cuped=[{lo_c:.4f},{hi_c:.4f}]\n")


Δ retention_7d = -0.0082  (95% CI [-0.0133, -0.0031])
CUPED Δ = -0.0082  (95% CI [-0.0130, -0.0034])
