# A/B Testing

In [72]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from statsmodels.sandbox.stats.multicomp import multipletests 

In [23]:
df = pd.read_csv('ab_browser_test.csv', header = 0)
print(df.shape)
df.head()

(566134, 6)


Unnamed: 0,userID,browser,slot,n_clicks,n_queries,n_nonclk_queries
0,1,Browser #2,exp,23,32,19
1,3,Browser #4,exp,3,4,2
2,5,Browser #4,exp,29,35,16
3,6,Browser #4,control,12,6,0
4,7,Browser #4,exp,54,68,30


In [24]:
df.describe()

Unnamed: 0,userID,n_clicks,n_queries,n_nonclk_queries
count,566134.0,566134.0,566134.0,566134.0
mean,301377.214027,11.431658,10.720524,4.703987
std,175526.333101,20.251494,16.262827,8.109958
min,1.0,0.0,1.0,0.0
25%,148627.25,1.0,2.0,1.0
50%,299362.5,4.0,5.0,2.0
75%,455698.75,13.0,13.0,6.0
max,603137.0,863.0,665.0,581.0


In [36]:
df_exp = df[df['slot'] == 'exp']
df_ctrl = df[df['slot'] == 'control']


In [37]:
diff_clicks = 100 * (np.sum(df_exp['n_clicks']) - np.sum(df_ctrl['n_clicks'])) / np.sum(df_ctrl['n_clicks'])
print(diff_clicks)

1.6135689824415806


In [41]:
print(df_exp.shape, df_ctrl.shape)
print(np.mean(df_exp['n_clicks']) - np.mean(df_ctrl['n_clicks']), 
      np.median(df_exp['n_clicks']) - np.median(df_ctrl['n_clicks']))

(281580, 6) (284554, 6)
0.3030947340656329 1.0


In [77]:
def get_bootstrap_samples(data, n_samples):
    indices = np.random.randint(0, len(data), (n_samples, len(data)))
    samples = data[indices]
    return samples
np.random.seed(0)
boot_exp = get_bootstrap_samples(df_exp['n_clicks'].values, 500)
boot_ctrl = get_bootstrap_samples(df_ctrl['n_clicks'].values, 500)

In [78]:
boot_exp_mean = np.mean(boot_exp, axis = 1)
boot_exp_median = np.median(boot_exp, axis = 1)
boot_ctrl_mean = np.mean(boot_ctrl, axis = 1)
boot_ctrl_median = np.median(boot_ctrl, axis = 1)

In [79]:
diff_mean = boot_exp_mean - boot_ctrl_mean
diff_median = boot_exp_median - boot_ctrl_median
def stat_intervals(stat, alpha):
    boundaries = np.percentile(stat, [100 * alpha / 2., 100 * (1 - alpha / 2.)])
    return boundaries
print(stat_intervals(diff_mean, 0.05), stat_intervals(diff_median, 0.05))
print(stats.ttest_ind(boot_exp_mean, boot_ctrl_mean, equal_var = False),
      stats.ttest_ind(boot_exp_median, boot_ctrl_median, equal_var = False))

[ 0.19960335  0.40496719] [ 1.  1.]
Ttest_indResult(statistic=126.13284465906047, pvalue=0.0) Ttest_indResult(statistic=inf, pvalue=0.0)


In [80]:
stats.probplot(boot_ctrl_mean, sparams=499, plot=plt)[1][2]

0.99917553992634967

In [81]:
boot_ctrl_sq = np.var(boot_ctrl, axis = 1) * len(boot_ctrl)
stats.probplot(boot_ctrl_sq, dist="chi2", sparams=499, plot=plt)[1][2]

0.99851575631942646

In [61]:
n_click_ctrl = df_ctrl.groupby(by='userID')['n_clicks'].agg(np.sum)
n_click_exp = df_exp.groupby(by='userID')['n_clicks'].agg(np.sum)

In [62]:
stats.mannwhitneyu(n_click_ctrl, n_click_exp)

MannwhitneyuResult(statistic=38901259929.0, pvalue=4.3471471887604393e-75)

In [71]:
p_values = []
for i in [2, 4, 14, 17, 20, 22]:
    p_values.append(stats.mannwhitneyu(df_exp[df_exp['browser'] == 'Browser #' + str(i)]['n_clicks'].values,
                                       df_ctrl[df_ctrl['browser'] == 'Browser #' + str(i)]['n_clicks'].values,
                                       alternative='two-sided')[1])
print(p_values)

[0.054487720839448202, 0.81521235316909968, 0.0, 0.074801202860216034, 0.90733312777470754, 0.51481103044200638]


In [73]:
multipletests(p_values, alpha = 0.05/6, method = 'holm')[1]

array([ 0.2724386 ,  1.        ,  0.        ,  0.29920481,  1.        ,  1.        ])

In [76]:
exp_nonclick = []
ctrl_nonclick = []
for i in [2, 4, 14, 17, 20, 22]:
    exp_nonclick.append(100 * df_exp[df_exp['browser'] == 'Browser #' + str(i)]['n_nonclk_queries'].sum() / df_exp[df_exp['browser'] == 'Browser #' + str(i)]['n_queries'].sum())
    exp_nonclick.append(100 * df_ctrl[df_ctrl['browser'] == 'Browser #' + str(i)]['n_nonclk_queries'].sum() / df_ctrl[df_ctrl['browser'] == 'Browser #' + str(i)]['n_queries'].sum())
print(exp_nonclick)
print(ctrl_nonclick)

[44.982746948554706, 45.96274717919465, 45.14294190358467, 46.97092963514274, 43.755617361273295, 57.59041136008114, 36.93741284866483, 36.29936674628209, 38.97737648371716, 40.540484743383296, 39.85394721969546, 40.593976593513354]
[]
