In [1]:
import pandas as pd
import numpy as np
from scipy.stats import norm

In [2]:
import hashlib

def test_group(x, seed, size):
    hash_sha256 = hashlib.sha256(f"{seed}_{x}".encode())
    return int(hash_sha256.hexdigest(), 16)%size

# ログデータ

In [3]:
# Simulate an impression-level click log: one row per impression, with
# click = 1 for clicked impressions and 0 otherwise.
np.random.seed(0)  # fixed seed so that Restart & Run All reproduces the same log

N_USERS = 10000
impressions = np.empty(N_USERS, dtype=np.int64)
clicks = np.empty(N_USERS, dtype=np.int64)
for i in range(N_USERS):
    impression = np.random.randint(1000)
    # With probability 1/2 the click count may reach the impression count;
    # otherwise it is capped at sqrt(impression). This makes click rates vary
    # strongly between users.
    upper = impression + 1 if np.random.randint(2) else int(np.sqrt(impression)) + 1
    impressions[i] = impression
    clicks[i] = np.random.randint(upper)

# Expand the per-user counts into rows without materialising millions of
# Python objects: for each user, `click` ones followed by
# `impression - click` zeros (same row order as appending user by user).
log = pd.DataFrame({
    "user_id": np.repeat(np.arange(N_USERS, dtype=np.int64), impressions),
    "click": np.repeat(
        np.tile(np.array([1, 0], dtype=np.int64), N_USERS),
        np.column_stack([clicks, impressions - clicks]).ravel(),
    ),
})
log

Unnamed: 0,user_id,click
0,0,1
1,0,1
2,0,1
3,0,1
4,0,1
...,...,...
5005826,9999,0
5005827,9999,0
5005828,9999,0
5005829,9999,0


# 間違った分散

In [4]:
# Attach each row's experiment group. The group depends only on user_id, so
# hash every distinct user once and broadcast the result to the rows instead
# of running SHA-256 for every single log row.
user_groups = {
    uid: test_group(uid, "example", size=2)
    for uid in log["user_id"].unique().tolist()
}
log["group"] = log["user_id"].map(user_groups)

In [5]:
# Per-group sample variance of the click flag, computed over individual log
# rows (i.e. every impression is treated as its own observation).
naive_var = log["click"].groupby(log["group"]).var(ddof=1)
naive_var

group
0    0.187713
1    0.192405
Name: click, dtype: float64

In [6]:
# Number of log rows (impressions) in each group
size = log["click"].groupby(log["group"]).size()
size

group
0    2515694
1    2490137
Name: click, dtype: int64

In [7]:
# Standard error of the difference in click rates implied by the row-level
# variance: sqrt(var_0 / n_0 + var_1 / n_1)
np.sqrt(naive_var.div(size).sum())

0.00038972230109674433

In [8]:
# Two-sided z-test using the wrong (row-level) variance: it reports a
# significant difference.
mean = log.groupby("group")["click"].mean()
z = (mean.loc[0] - mean.loc[1]) / np.sqrt(naive_var.div(size).sum())
p = 2 * norm.cdf(-abs(z))
p

1.5566062339444403e-133

# ブートストラップ

In [9]:
# Build a table aggregated at the randomization unit: one row per user with
# that user's impression count and click count.
df = (
    log.groupby("user_id")["click"]
    .agg(impression="count", click="sum")
    .reset_index()
)
df

Unnamed: 0,user_id,impression,click
0,0,775,6
1,1,821,17
2,2,122,33
3,3,236,99
4,4,725,20
...,...,...,...
9984,9995,176,151
9985,9996,772,674
9986,9997,191,114
9987,9998,142,64


In [10]:
# Split the users into two groups with the hash-based assignment
df["group"] = df["user_id"].apply(test_group, args=("example",), size=2)

In [11]:
# Count how many rows of the per-user table (i.e. users) fall into each group
df.groupby("group").size()

group
0    5030
1    4959
dtype: int64

In [12]:
def sampling_click_rate(df, group, random_state=None):
    """Draw one bootstrap replicate of a group's overall click rate.

    The rows of ``df`` belonging to ``group`` are resampled with replacement
    (as many rows as the group has), and the click rate of the resample is
    computed as total clicks divided by total impressions.

    Args:
        df: DataFrame with ``group``, ``click`` and ``impression`` columns.
        group: Value of the ``group`` column selecting the rows to resample.
        random_state: Optional seed or random generator forwarded to
            ``DataFrame.sample`` so a draw can be reproduced. The default
            ``None`` keeps the previous behaviour of using NumPy's global
            random state.

    Returns:
        ``sum(click) / sum(impression)`` over the resampled rows.
    """
    members = df[df["group"] == group]
    resampled = members.sample(
        n=len(members), replace=True, random_state=random_state
    )
    return resampled["click"].sum() / resampled["impression"].sum()

In [13]:
# Estimate the correct standard error: bootstrap the difference in click rate
# between the two groups by resampling users, then take the standard deviation
# of the replicates.
# Seed NumPy's global RNG (which DataFrame.sample falls back to when it is
# given no random_state) so the estimate is reproducible across runs.
np.random.seed(1)
diffs = [
    sampling_click_rate(df, 0) - sampling_click_rate(df, 1)
    for _ in range(1000)
]
np.std(diffs, ddof=1)

0.007097540247443568

In [14]:
# Two-sided z-test with the bootstrap standard error as the denominator
z = (mean.loc[0] - mean.loc[1]) / np.std(diffs, ddof=1)
p = 2 * norm.cdf(-abs(z))
p

0.17691725047652063

# デルタ法

In [15]:
def var_delta(df, group):
    """Delta-method variance of a group's click rate.

    The click rate is the ratio mean(click) / mean(impression) over the rows
    of ``df`` in ``group``. Its variance is approximated from the sample
    means, sample variances (ddof=1) and sample covariance of the two
    columns, divided by the number of rows in the group.

    Args:
        df: DataFrame with ``group``, ``click`` and ``impression`` columns.
        group: Value of the ``group`` column selecting the rows to use.

    Returns:
        The approximate variance of the ratio of means for that group.
    """
    members = df[df["group"] == group]
    clicks = members["click"]
    impressions = members["impression"]
    n_rows = members.shape[0]

    mu_x, mu_y = clicks.mean(), impressions.mean()
    s2_x = clicks.var(ddof=1)
    s2_y = impressions.var(ddof=1)
    # Off-diagonal entry of the 2x2 covariance matrix
    s_xy = np.cov(impressions, clicks, ddof=1)[0, 1]

    linearized = (
        s2_x / mu_y**2
        + mu_x**2 / mu_y**4 * s2_y
        - 2 * mu_x / mu_y**3 * s_xy
    )
    return linearized / n_rows

In [16]:
# Estimate the correct standard error: the two groups' delta-method variances
# are added and the square root is taken.
np.sqrt(sum(var_delta(df, g) for g in (0, 1)))

0.007181788719058018

In [17]:
# Two-sided z-test with the delta-method standard error as the denominator
z = (mean.loc[0] - mean.loc[1]) / np.sqrt(sum(var_delta(df, g) for g in (0, 1)))
p = 2 * norm.cdf(-abs(z))
p

0.18205065546297083