In [1]:
import numpy as np
import pandas as pd
import math

## Calculating significance of difference in an experiment

In [25]:
def se_pooled(n_cont, x_cont, n_exp, x_exp):
    p_pool=(x_cont+x_exp)/(n_cont+n_exp)
    return p_pool,(p_pool*(1-p_pool)*((1/n_cont)+(1/n_exp)))**(1/2)

In [29]:
nz=se_pooled(6021, 302, 5979, 374)
g=se_pooled(50000+6021,2500+302,50000+5979,2500+374)

In [30]:
nz, g

((0.05633333333333333, 0.00420953442023799),
 (0.05067857142857143, 0.0013108102809227253))

In [31]:
n_contg, x_contg, n_expg, x_expg=50000+6021,2500+302,50000+5979,2500+374
n_contnz, x_contnz, n_expnz, x_expnz=6021, 302, 5979, 374

In [41]:
#global difference between exp and control
g_diff=x_expg/n_expg-x_contg/n_contg
m_g = g[1]*1.96
g_diff, m_g, m_g>g_diff

(0.0013237234004343165, 0.0025691881506085417, True)

In [42]:
#nz difference between exp and control
nz_diff=x_expnz/n_expnz-x_contnz/n_expnz; nz_diff
m_nz = nz[1]*1.96
nz_diff, m_nz, m_nz>nz_diff

(0.012042147516307077, 0.00825068746366646, False)

## Other

In [3]:
Xs_cont = np.array([196, 200, 200, 216, 212, 185, 225, 187, 205, 211, 192, 196, 223, 192])
Ns_cont = np.array([2029, 1991, 1951, 1985, 1973, 2021, 2041, 1980, 1951, 1988, 1977, 2019, 2035, 2007])
Xs_exp = np.array([179, 208, 205, 175, 191, 291, 278, 216, 225, 207, 205, 200, 297, 299])
Ns_exp = np.array([1971, 2009, 2049, 2015, 2027, 1979, 1959, 2020, 2049, 2012, 2023, 1981, 1965, 1993])

In [28]:
emp_se=0.0062
emp_sf=((1/5000)+(1/5000))**(1/2)
x_sf=((1/Ns_cont.sum())+(1/Ns_exp.sum()))**(1/2)

In [29]:
SE=(emp_se/emp_sf)*x_sf; SE

0.002619982707795742

In [30]:
m=1.96*SE
Xp=(Xs_exp.sum()/Ns_exp.sum())
Np=((Xs_cont.sum()/Ns_cont.sum()))
d=Xp-Np
Xp,Np,d,(d-m,d+m)

(0.1132183088549836,
 0.10161728925146701,
 0.011601019603516588,
 (0.006465853496236934, 0.016736185710796242))

In [34]:
p_exp=Xs_exp/Ns_exp
p_cont=Xs_cont/Ns_cont
(p_exp>p_cont).sum(),p_exp.size

(9, 14)

In [42]:
(math.factorial(14)/(math.factorial(9)*math.factorial(5)))*(.5**9)*(.5**5)

0.1221923828125

In [43]:
6.5-0.98*(14**(1/2))

2.8331757609615376

In [46]:
def binomial(n,k,ps):
    combs=math.factorial(n)/(math.factorial(k)*math.factorial(n-k))
    p_success=ps**k
    p_nsuccess=(1-ps)**(n-k)
    return combs*p_success*p_nsuccess

In [49]:
binomial(10,,.5)

0.1221923828125

In [23]:
1-0.99**(10),1-0.95**(10)

(0.09561792499119559, 0.4012630607616213)

# Bonferroni correction

In [26]:
# sample data
diffs=np.array([0.03,-.5,-.01,10])
se=np.array([.013,.21,.0045,6.85])
metrics=['prob of clicking though to course overview','avg time spent reading course overview page',
         'prob of enrolling','avg time in classroom during the first week']

In [57]:
z_alpha=1.96
z_bonferroni = 2.5

In [58]:
def includes_zero(diffs, se, metrics, z_alpha):
    m=se*z_alpha
    ub=diffs+m
    lb=diffs-m
    print('metrics where confidence interval includes zero')
    return {k:v for k,v in zip(metrics,(0>lb)*(0<ub))}

In [54]:
includes_zero(diffs, se, metrics, z_alpha)

metrics where confidence interval includes zero


{'avg time in classroom during the first week': True,
 'avg time spent reading course overview page': False,
 'prob of clicking though to course overview': False,
 'prob of enrolling': False}

In [59]:
includes_zero(diffs, se, metrics, z_bonferroni)

metrics where confidence interval includes zero


{'avg time in classroom during the first week': True,
 'avg time spent reading course overview page': True,
 'prob of clicking though to course overview': True,
 'prob of enrolling': True}

In [60]:
0.05/8

0.00625

# Final project calculations

In [2]:
# load data
control = pd.read_excel('final_project_data.xlsx', sheet_name='Control')
experiment = pd.read_excel('final_project_data.xlsx', sheet_name='Experiment')

In [60]:
df = pd.merge(left=control, right=experiment, how='outer', on='Date', suffixes=('_control','_experiment'))

In [61]:
df.columns = df.columns.map(lambda x: x.lower())

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 37 entries, 0 to 36
Data columns (total 9 columns):
date                      37 non-null object
pageviews_control         37 non-null int64
clicks_control            37 non-null int64
enrollments_control       23 non-null float64
payments_control          23 non-null float64
pageviews_experiment      37 non-null int64
clicks_experiment         37 non-null int64
enrollments_experiment    23 non-null float64
payments_experiment       23 non-null float64
dtypes: float64(4), int64(4), object(1)
memory usage: 2.9+ KB


In [41]:
df.sum()

Date                      Sat, Oct 11Sun, Oct 12Mon, Oct 13Tue, Oct 14We...
Pageviews_control                                                    345543
Clicks_control                                                        28378
Enrollments_control                                                    3785
Payments_control                                                       2033
Pageviews_experiment                                                 344660
Clicks_experiment                                                     28325
Enrollments_experiment                                                 3423
Payments_experiment                                                    1945
dtype: object

## Sanity Checks

### Number of pageviews

In [7]:
total_pageviews=df.pageviews_control.sum()+df.pageviews_experiment.sum()
p_control=df.pageviews_control.sum()/total_pageviews

In [8]:
# checking proportion of observations in sample and control group
expected_sd=((0.5*0.5)/total_pageviews)**(1/2)
m = expected_sd*1.96
ci=(.5-m, .5+m)
expected_sd, m, ci

(0.0006018407402943247,
 0.0011796078509768765,
 (0.49882039214902313, 0.5011796078509769))

In [9]:
p_control>ci[0] and p_control<ci[1], p_control

(True, 0.5006396668806133)

### Number of clicks

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 37 entries, 0 to 36
Data columns (total 9 columns):
date                      37 non-null object
pageviews_control         37 non-null int64
clicks_control            37 non-null int64
enrollments_control       23 non-null float64
payments_control          23 non-null float64
pageviews_experiment      37 non-null int64
clicks_experiment         37 non-null int64
enrollments_experiment    23 non-null float64
payments_experiment       23 non-null float64
dtypes: float64(4), int64(4), object(1)
memory usage: 2.9+ KB


In [11]:
total_clicks=df.clicks_control.sum()+df.clicks_experiment.sum()

In [12]:
p_control=df.clicks_control.sum()/total_clicks; p_control

0.5004673474066628

In [13]:
expected_sd = ((.5*.5)/total_clicks)**(1/2)
m = expected_sd*1.96
ci=(.5-m, .5+m)
ci

(0.49588449572378945, 0.5041155042762105)

In [14]:
p_control>ci[0] and p_control<ci[1], p_control

(True, 0.5004673474066628)

## Evaluation - statistical and practical significance

In [15]:
df.head()

Unnamed: 0,date,pageviews_control,clicks_control,enrollments_control,payments_control,pageviews_experiment,clicks_experiment,enrollments_experiment,payments_experiment
0,"Sat, Oct 11",7723,687,134.0,70.0,7716,686,105.0,34.0
1,"Sun, Oct 12",9102,779,147.0,70.0,9288,785,116.0,91.0
2,"Mon, Oct 13",10511,909,167.0,95.0,10480,884,145.0,79.0
3,"Tue, Oct 14",9871,836,156.0,105.0,9867,827,138.0,92.0
4,"Wed, Oct 15",10014,837,163.0,64.0,9793,832,140.0,94.0


In [62]:
# gross conversion = enrollments/clicks
# net conversion = payments/clicks
df['gross_conversion_contol']=df['enrollments_control']/df['clicks_control']
df['gross_conversion_experiment']=df['enrollments_experiment']/df['clicks_experiment']
df['net_conversion_control']=df['payments_control']/df['clicks_control']
df['net_conversion_experiment']=df['payments_experiment']/df['clicks_experiment']

In [63]:
df['gross_conversion_diff']=df['gross_conversion_experiment']-df['gross_conversion_contol']
df['net_conversion_diff']=df['net_conversion_experiment']-df['net_conversion_control']

#### Gross conversion test
$H_O: \mu_c-\mu_e=0$

$H_A: \mu_c-\mu_e\neq0$

$\alpha=0.05$

$z=1.96$

In [64]:
df=df.loc[:22]
exp_clicks, exp_enrol, cont_clicks, cont_enrol = df.clicks_experiment.sum(),\
df.enrollments_experiment.sum(), df.clicks_control.sum(), df.enrollments_control.sum()

In [48]:
def p_se(N_exp, N_cont, X_exp, X_cont):
    p_pooled = (X_cont+X_exp)/(N_cont+N_exp)
    se = ((p_pooled*(1-p_pooled))*((1/N_exp)+(1/N_cont)))**(1/2)
    return p_pooled, se

In [49]:
p_pooled, se = p_se(exp_clicks, cont_clicks, exp_enrol, cont_enrol)
p_pooled, se

(0.20860706740369866, 0.004371675385225936)

In [50]:
d = exp_enrol/exp_clicks - cont_enrol/cont_clicks
m = 1.96*se
ci = (d-m, d+m)

In [51]:
d, m, ci

(-0.020554874580361565,
 0.008568483755042836,
 (-0.0291233583354044, -0.01198639082531873))

#### Net conversion test
$H_O: \mu_c-\mu_e=0$

$H_A: \mu_c-\mu_e\neq0$

$\alpha=0.05$

$z=1.96$

In [52]:
# net conversion = payments/clicks
X_exp, X_cont, N_exp, N_cont = df.payments_experiment.sum(), df.payments_control.sum(),\
df.clicks_experiment.sum(), df.clicks_control.sum()

In [53]:
p_pooled, se = p_se(N_exp, N_cont, X_exp, X_cont)
p_pooled, se

(0.1151274853124186, 0.0034341335129324238)

In [56]:
d = X_exp/N_exp - X_cont/N_cont
m = 1.96*se
ci = (d-m, d+m)

In [57]:
d, m, ci

(-0.0048737226745441675,
 0.0067309016853475505,
 (-0.011604624359891718, 0.001857179010803383))

### Sign Test

In [68]:
(df.gross_conversion_diff==0).sum(), (df.net_conversion_diff==0).sum()

(0, 0)

In [70]:
df['gc_sign']=df.gross_conversion_diff.apply(lambda x: 'positive' if x>0 else 'negative')
df['nc_sign']=df.net_conversion_diff.apply(lambda x: 'positive' if x>0 else 'negative')

In [71]:
df.gc_sign.value_counts()

negative    19
positive     4
Name: gc_sign, dtype: int64

In [72]:
df.nc_sign.value_counts()

negative    13
positive    10
Name: nc_sign, dtype: int64