In [1]:
import numpy as np
import pandas as pd
import warnings

warnings.filterwarnings('ignore')

# Q1

In [2]:
df = pd.read_csv('AB_test_data.csv')
df.head()

Unnamed: 0,purchase_TF,Variant,date,id
0,False,A,2019-11-08,0x25b44a
1,False,B,2020-08-27,0x46271e
2,False,A,2020-06-11,0x80b8f1
3,False,B,2020-08-22,0x8d736d
4,False,A,2020-08-05,0x96c9c8


In [3]:
df['date'] = pd.to_datetime(df['date'])

In [4]:
df['purchase_TF'].value_counts()

False    110415
True      19585
Name: purchase_TF, dtype: int64

In [5]:
summary = df.iloc[:, [0, 1, 2]].pivot_table(values='purchase_TF', index='Variant', aggfunc=np.sum)
summary['total'] = df.pivot_table(values='purchase_TF', index='Variant', aggfunc=lambda x: len(x))
summary['rate'] = summary.iloc[:, 0]/summary.iloc[:, 1]
summary['std'] = df.iloc[:, [0, 1, 2]].pivot_table(values='purchase_TF', index='Variant', aggfunc=np.std)
summary

Unnamed: 0_level_0,purchase_TF,total,rate,std
Variant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,18702,125000,0.149616,0.356696
B,883,5000,0.1766,0.381368


## Hypothesis Test

In [6]:
from statsmodels.stats.proportion import proportions_ztest, proportion_confint

In [7]:
from statsmodels.stats.proportion import proportions_ztest, proportion_confint
control_results = df[df['Variant'] == 'A']['purchase_TF']
treatment_results = df[df['Variant'] == 'B']['purchase_TF']
n_con = control_results.count()
n_treat = treatment_results.count()
successes = [control_results.sum(), treatment_results.sum()]
nobs = [n_con, n_treat]

z_stat, pval = proportions_ztest(successes, nobs=nobs)
(lower_con, lower_treat), (upper_con, upper_treat) = proportion_confint(successes, nobs=nobs, alpha=0.05)

print(f'z statistic: {z_stat:.2f}')
print(f'p-value: {pval:.3f}')
print(f'ci 95% for control group: [{lower_con:.3f}, {upper_con:.3f}]')
print(f'ci 95% for treatment group: [{lower_treat:.3f}, {upper_treat:.3f}]')

z statistic: -5.23
p-value: 0.000
ci 95% for control group: [0.148, 0.152]
ci 95% for treatment group: [0.166, 0.187]


# Q2
## Optimal Sample Size

In [8]:
import scipy.stats as stats

t_alpha = stats.norm.ppf(1-0.05/2)
t_beta = stats.norm.ppf(1-0.2)
print(t_alpha, t_beta)

1.959963984540054 0.8416212335729143


In [9]:
# Calculate optimal sample size
rate_avg = summary['rate'].mean()
rate_A = summary['rate'][0]
rate_B = summary['rate'][1]

Var_avg = rate_avg * (1-rate_avg)
Var_A = rate_A * (1-rate_A)
Var_B = rate_B * (1-rate_B)

diff = abs(rate_A - rate_B)

N = 1/diff**2 * (t_alpha * np.sqrt(2 * Var_avg) + t_beta * np.sqrt(Var_A + Var_B)) ** 2
N = int(np.ceil(N))
N

2942

## Test Hypothesis for 10 times

In [10]:
def random_sample(data, size, random_seeds):
    control_sample = data[data['Variant'] == 'A'].sample(n=size, random_state=random_seeds)
    treat_sample = data[data['Variant'] == 'B'].sample(n=size, random_state=random_seeds)
    ab_test = ab_test = pd.concat([control_sample, treat_sample], axis=0)
    
    return ab_test

In [11]:
def AB_test(data, size, random_seeds):
    ab_test = random_sample(data, size, random_seeds)
    
    control_results = ab_test[ab_test['Variant'] == 'A']['purchase_TF']
    treatment_results = ab_test[ab_test['Variant'] == 'B']['purchase_TF']
    
    n_con = control_results.count()
    n_treat = treatment_results.count()
    successes = [control_results.sum(), treatment_results.sum()]
    nobs = [n_con, n_treat]
    
    z_stat, pval = proportions_ztest(successes, nobs=nobs)
    (lower_con, lower_treat), (upper_con, upper_treat) = proportion_confint(successes, nobs=nobs, alpha=0.05)

    print(f'z statistic: {z_stat:.2f}')
    print(f'p-value: {pval:.3f}')
    print(f'ci 95% for control group: [{lower_con:.3f}, {upper_con:.3f}]')
    print(f'ci 95% for treatment group: [{lower_treat:.3f}, {upper_treat:.3f}]')

In [12]:
for i in range(10):
    print('Result for Test', i)
    print()
    AB_test(df, N, i)
    print()

Result for Test 0

z statistic: -1.89
p-value: 0.059
ci 95% for control group: [0.144, 0.171]
ci 95% for treatment group: [0.162, 0.189]

Result for Test 1

z statistic: -1.98
p-value: 0.048
ci 95% for control group: [0.141, 0.167]
ci 95% for treatment group: [0.159, 0.186]

Result for Test 2

z statistic: -2.43
p-value: 0.015
ci 95% for control group: [0.140, 0.166]
ci 95% for treatment group: [0.162, 0.190]

Result for Test 3

z statistic: -2.36
p-value: 0.018
ci 95% for control group: [0.145, 0.171]
ci 95% for treatment group: [0.167, 0.195]

Result for Test 4

z statistic: -3.13
p-value: 0.002
ci 95% for control group: [0.137, 0.162]
ci 95% for treatment group: [0.166, 0.194]

Result for Test 5

z statistic: -3.03
p-value: 0.002
ci 95% for control group: [0.136, 0.161]
ci 95% for treatment group: [0.164, 0.192]

Result for Test 6

z statistic: -1.26
p-value: 0.209
ci 95% for control group: [0.148, 0.175]
ci 95% for treatment group: [0.160, 0.187]

Result for Test 7

z statistic: -3

# Q3
## Sequential Test

In [13]:
ln_A = np.log(1/0.05)
ln_B = np.log(0.2)

print(ln_A, ln_B)

2.995732273553991 -1.6094379124341003


In [14]:
summary

Unnamed: 0_level_0,purchase_TF,total,rate,std
Variant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,18702,125000,0.149616,0.356696
B,883,5000,0.1766,0.381368


In [15]:
def stopping(x):
    if x >= ln_A:
        print("Accept H_1 with lambda", x)
        return True
    if x <= ln_B:
        print("Accept H_2 with lambda", x)
        return True
    return False

In [18]:
for i in range(10):
    sample = random_sample(df, N, i)
    lam = 0
    for j in range(len(sample)):
        if sample['purchase_TF'].iloc[j] == 0:
            update = np.log((1-0.177)/(1-0.15))
        else:
            update = np.log(0.177/0.15)
            
        lam += update
        if stopping(lam):
            print("Stop at iteration", j)
            print()
            break


Accept H_2 with lambda -1.6264387266907763
Stop at iteration 1281

Accept H_2 with lambda -1.6197172023604345
Stop at iteration 233

Accept H_2 with lambda -1.636801386846481
Stop at iteration 160

Accept H_2 with lambda -1.6143446944761173
Stop at iteration 337

Accept H_2 with lambda -1.639364115234294
Stop at iteration 589

Accept H_1 with lambda 2.9966362296924594
Stop at iteration 427

Accept H_2 with lambda -1.6358798275147926
Stop at iteration 546

Accept H_2 with lambda -1.6347112170743856
Stop at iteration 1894

Accept H_2 with lambda -1.6260563710778524
Stop at iteration 368

Accept H_2 with lambda -1.629923014410272
Stop at iteration 1324

