In [32]:
import numpy as np
import pandas as pd
from scipy import stats
from scipy.special import comb 

## Measuring Variability

In [4]:
baseline = {'Cookies': 5000,
            'Click': 400,
            'Enrollment': 82.5,
            'CTP': 0.08,
            'GC': 0.20625,
            'R': 0.53,
            'NC': 0.1093125}

In [5]:
std_GC = np.sqrt(baseline['GC']*(1-baseline['GC'])/baseline['Click'])
std_R  = np.sqrt(baseline['R']*(1-baseline['R'])/baseline['Enrollment'])
std_NC = np.sqrt(baseline['NC']*(1-baseline['NC'])/baseline['Click'])
print("The standard variability of gross conversion is {}, retention is {}, and net conversion is {}".format(std_GC, std_R, std_NC))

The standard variability of gross conversion is 0.020230604137049392, retention is 0.05494901217850908, and net conversion is 0.01560154458248846


## Designing Experiment

### Sizing

The [Online_Sample_Size_Calculator](http://www.evanmiller.org/ab-testing/sample-size.html) is a convenient way to determine sample size. alpha=0.5, beta=0.2 

In [10]:
index   = ['GC', 'R', 'NC']
columns = ['Sample Size', 'Number of Cookies']
data    = [[25835, round(2*25835/baseline['CTP'])], 
           [39115, round(2*39115/baseline['GC']/baseline['CTP'])],
           [27413, round(2*27413/baseline['CTP'])]]
sample_size = pd.DataFrame(index=index, columns=columns, data=data)
sample_size

Unnamed: 0,Sample Size,Number of Cookies
GC,25835,645875
R,39115,4741212
NC,27413,685325


## Analyzing Results

### Sanity Check

In [13]:
control   = pd.read_excel('Final Project Results.xlsx', 0, index_col=0)
treatment = pd.read_excel('Final Project Results.xlsx', 1, index_col=0)

#### Pageviews

In [19]:
pageviews_control = control.Pageviews.sum()
pageviews_treatment = treatment.Pageviews.sum()
pageviews_total = pageviews_control + pageviews_treatment
stand_pageviews = (pageviews_control/pageviews_total - 0.5)/np.sqrt((0.5**2/pageviews_total))
p_pageviews = 1-stats.norm.cdf(stand_pageviews) 
p_pageviews

0.14392482085331415

In [20]:
sd_pageviews = np.sqrt(0.5*(1-0.5)/(pageviews_control+pageviews_treatment))
me_pageviews = sd_pageviews*1.96
ci_pageviews = [0.5-me_pageviews, 0.5+me_pageviews]
prop_pageviews = pageviews_control/pageviews_total
print(ci_pageviews, prop_pageviews)

[0.49882039214902313, 0.5011796078509769] 0.5006396668806133


#### Clicks

In [21]:
click_control = control.Clicks.sum()
click_treatment = treatment.Clicks.sum()
click_total = click_control + click_treatment
stand_click = (click_control/click_total - 0.5)/np.sqrt((0.5**2/click_total))
p_click = 1-stats.norm.cdf(stand_click) 
p_click

0.41193385199077437

#### Click-through-probability

In [23]:
ctp_control = click_control/pageviews_control
ctp_treatment = click_treatment/pageviews_treatment
ctp_pool = click_total/pageviews_total
ctp_std = np.sqrt(ctp_pool*(1-ctp_pool)*(1/pageviews_control + 1/pageviews_treatment))
stand_ctp = (ctp_control-ctp_treatment)/ctp_std
p_ctp = stats.norm.cdf(stand_ctp) 
p_ctp

0.4658679762236956

### Effect Size Tests

#### Gross Conversion

In [25]:
# Numbers of enrollments in two groups
enroll_control   = control.Enrollments.sum()
enroll_treatment = treatment.Enrollments.sum()
# Nubmers of clicks in two groups
click_control = control.loc[control.Enrollments.notnull(), 'Clicks'].sum()
click_treatment= treatment.loc[treatment.Enrollments.notnull(), 'Clicks'].sum()
# Gross conversion of two groups
gc_control   = enroll_control/click_control
gc_treatment = enroll_treatment/click_treatment
# p_value
gc_diff   = gc_treatment - gc_control
gc_pooled = (enroll_control + enroll_treatment)/(click_control + click_treatment)
gc_sd     = np.sqrt(gc_pooled*(1-gc_pooled)*(1/click_control + 1/click_treatment))
stand_gc  = gc_diff/gc_sd
p_gc      = stats.norm.cdf(stand_gc)
p_gc

1.2892005168602965e-06

In [27]:
ci_gc = [gc_diff-stats.norm.ppf(1-0.05/2)*gc_sd, gc_diff+stats.norm.ppf(1-0.05/2)*gc_sd]
ci_gc

[-0.02912320088750467, -0.011986548273218463]

#### Net Conversion

In [28]:
# Numbers of payments in two groups
pay_control   = control.Payments.sum()
pay_treatment = treatment.Payments.sum()
# Nubmers of clicks in two groups
click_control = control.loc[control.Payments.notnull(), 'Clicks'].sum()
click_treatment= treatment.loc[treatment.Payments.notnull(), 'Clicks'].sum()
# Net conversion of two groups
nc_control   = pay_control/click_control
nc_treatment = pay_treatment/click_treatment
# p_value
nc_diff   = nc_treatment - nc_control
nc_pooled = (pay_control + pay_treatment)/(click_control + click_treatment)
nc_sd     = np.sqrt(nc_pooled*(1-nc_pooled)*(1/click_control + 1/click_treatment))
stand_nc  = nc_diff/nc_sd
p_nc      = stats.norm.cdf(stand_nc)
p_nc

0.07792034131075103

In [29]:
ci_nc = [nc_diff-stats.norm.ppf(1-0.05/2)*nc_sd, nc_diff+stats.norm.ppf(1-0.05/2)*nc_sd]
ci_nc

[-0.011604500677993734, 0.0018570553289053993]

### Sign Test

In [41]:
c_t = control.merge(treatment, left_index=True, right_index=True)
c_t = c_t.loc[c_t.Payments_x.notnull(),:]
c_t['GC'] = c_t['Enrollments_y']/c_t['Clicks_y'] > c_t['Enrollments_x']/c_t['Clicks_x']
c_t['NC'] = c_t['Payments_y']/c_t['Clicks_y'] > c_t['Payments_x']/c_t['Clicks_x']

def sign_test(x, n):
    def binomial(x, n):
        return comb(n,x)*0.5**n
    p = 0
    for i in range(x+1):
        p = p + binomial(i, n)
    return p

In [43]:
p_gc_sign = 2 * sign_test(c_t['GC'].sum(), c_t['GC'].count())
p_gc_sign

0.002599477767944336

In [45]:
p_nc_sign = 2 * sign_test(c_t['NC'].sum(), c_t['NC'].count())
p_nc_sign

0.6776394844055176