# Effect Size + CUPED (Cookie Cats)
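Compute the difference in means (B − A) for 7-day retention with a 95% confidence interval, then show a CUPED example that uses `retention_1d` as the covariate, standing in for a pre-period measurement.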

In [ ]:
import pandas as pd, numpy as np, pathlib
from scipy import stats
def diff_in_means_ci(a, b, z=1.96):
    """Return ``mean(b) - mean(a)`` with a large-sample normal CI.

    The standard error is built from the *unpooled* per-group sample
    variances (``var_a / n_a + var_b / n_b``).  Missing values are dropped
    first so that the sample sizes match the observations that actually
    enter the means and variances.

    Args:
        a: Control-group outcomes (Series or sequence; bools are cast to float).
        b: Treatment-group outcomes (same conventions as ``a``).
        z: Normal critical value; 1.96 gives a ~95% interval.

    Returns:
        ``(diff, variance, se, (ci_low, ci_high))`` as plain floats.

    Raises:
        ValueError: If either group has fewer than two non-missing values,
            because the sample variance is undefined in that case.
    """
    a = pd.Series(a).astype(float).dropna()
    b = pd.Series(b).astype(float).dropna()
    n_a, n_b = len(a), len(b)
    if min(n_a, n_b) < 2:
        raise ValueError(
            'need at least 2 non-missing observations per group; '
            f'got {n_a} and {n_b}'
        )
    diff = float(b.mean() - a.mean())
    variance = float(a.var(ddof=1) / n_a + b.var(ddof=1) / n_b)
    se = float(np.sqrt(variance))
    return diff, variance, se, (diff - z * se, diff + z * se)


def cuped_adjust(y, x):
    """Return ``(theta, y_adj)`` for the CUPED adjustment of ``y`` by ``x``.

    ``theta = cov(y, x) / var(x)`` and
    ``y_adj = y - theta * (x - mean(x))``.  When the covariate has zero or
    undefined variance (or the covariance is undefined), ``theta`` falls
    back to 0.0 so that ``y_adj`` equals ``y`` instead of becoming NaN.

    Args:
        y: Outcome values (Series or sequence; bools are cast to float).
        x: Covariate values, aligned with ``y``.

    Returns:
        ``(theta, y_adj)`` where ``theta`` is a float and ``y_adj`` a Series.
    """
    y = pd.Series(y).astype(float)
    x = pd.Series(x).astype(float)
    x_var = x.var(ddof=1)
    cov = y.cov(x)
    if np.isfinite(x_var) and x_var > 0 and np.isfinite(cov):
        theta = float(cov / x_var)
    else:
        theta = 0.0
    return theta, y - theta * (x - x.mean())


# Script: load the data, report the raw effect size, then the CUPED estimate.
p = pathlib.Path('product_ab_cookiecats/data/cookie_cats.csv')
if not p.exists():
    print('Place cookie_cats.csv in', p.parent)
else:
    df = pd.read_csv(p)
    needed = {'variant','retention_7d','retention_1d'}
    print('Columns:', list(df.columns))
    if not needed.issubset(df.columns):
        print('Add/rename columns to include variant, retention_7d, retention_1d')
    else:
        A = df[df.variant=='A']; B = df[df.variant=='B']
        # Count non-missing outcomes: these are the observations that the
        # means and variances are actually computed from.
        nA = int(A['retention_7d'].count())
        nB = int(B['retention_7d'].count())
        if min(nA, nB) < 2:
            # Typically means the variant column is not coded as 'A'/'B'.
            print("Need at least 2 non-missing retention_7d values for each of "
                  "variant 'A' and 'B'; found", nA, 'and', nB)
        else:
            # Simple large-sample CI using unpooled per-group variances.
            diff, var, se, ci = diff_in_means_ci(A['retention_7d'], B['retention_7d'])
            print('Diff-in-means:', diff, '95% CI:', ci)
            # CUPED: covariate adjustment.
            # NOTE(review): CUPED assumes the covariate is unaffected by the
            # treatment; retention_1d is presumably observed after assignment,
            # so confirm it is a valid covariate before trusting diff2.
            theta, df['y_adj'] = cuped_adjust(df['retention_7d'], df['retention_1d'])
            A2 = df[df.variant=='A']['y_adj']; B2 = df[df.variant=='B']['y_adj']
            if min(A2.count(), B2.count()) < 2:
                # y_adj is missing wherever the outcome or covariate is missing.
                print('CUPED skipped: fewer than 2 rows per variant have both '
                      'retention_7d and retention_1d')
            else:
                diff2, var2, se2, ci2 = diff_in_means_ci(A2, B2)
                print('CUPED diff:', diff2, '95% CI:', ci2)
