In [1]:
import pandas as pd
import numpy as np

In [2]:
pageviews = 5000

In [3]:
df_basevals = pd.read_csv("data/baseline_vals.csv", index_col=False,header = None, names = ['metric','baseline_val'])
df_basevals.metric = df_basevals.metric.map(lambda x: x.lower())
df_basevals

Unnamed: 0,metric,baseline_val
0,unique cookies to view page per day:,40000.0
1,"unique cookies to click ""start free trial"" per...",3200.0
2,enrollments per day:,660.0
3,"click-through-probability on ""start free trial"":",0.08
4,"probability of enrolling, given click:",0.20625
5,"probability of payment, given enroll:",0.53
6,"probability of payment, given click",0.109313


In [4]:
round(np.sqrt((.206250*(1-.206250))/(5000*3200/40000)),4)

0.0202

In [5]:
round(np.sqrt((.53*(1-.53))/(5000*660/40000)),4)

0.0551

In [6]:
round(np.sqrt((.109313*(1-.109313))/(5000*3200/40000)),4)

0.0156

## Pageviews without Bonferroni Correction ##

### Gross Conversion ###

- Baseline Conversion: 20.625%
- Minimum Detectable Effect: 1%
- alpha: 5%
- beta: 20%
- 1 - beta: 80%
- sample size = 25,835 enrollments/group
- Number of groups = 2 (experiment and control)
- total sample size =  51,670 enrollments
- clicks/pageview: 3200/40000 = .08 clicks/pageview
- pageviews = 645,875



### Retention ###

- Baseline Conversion: 53%
- Minimum Detectable Effect: 1%
- alpha: 5%
- beta: 20%
- 1 - beta: 80%
- sample size = 39,155 enrollments/group
- Number of groups = 2 (experiment and control)
- total sample size = 78,230 enrollments
- enrollments/pageview: 660/40000 = .0165 enrollments/pageview
- pageviews = 78,230/.0165 = 4,741,212

### Net Conversion ###

- Baseline Conversion: 10.9313%
- Minimum Detectable Effect: .75%
- alpha: 5%
- beta: 20%
- 1 - beta: 80%
- sample size = 27,413 enrollments/group
- Number of groups = 2 (experiment and control)
- total sample size = 54,826
- clicks/pageview: 3200/40000 = .08 clicks/pageview
- pageviews = 685,325






### Duration and Exposure ###

In [7]:
4741212.0/40000

118.5303

If we divert 100% off traffic, given 40,000 page views per day, the experiment would take 119 days.  That is a long time.  If we eliminate retention, we are left with Gross Conversion and Net Conversion.  This reduces the number of required pageviews to 685,325, and an 18 day experiment with 100% diversion.  There may be other experiments to run, so let's say 50% diversion for 35 days.

In [8]:
685325.0/40000

17.133125

In [9]:
df_control = pd.read_csv("data/Final Project Results - Control.csv")
df_experiment = pd.read_csv("data/Final Project Results - Experiment.csv")

In [10]:
results = {"Control":pd.Series([df_control.Pageviews.sum(),df_control.Clicks.sum(),
                                  df_control.Enrollments.sum(),df_control.Payments.sum()],
                                  index = ["cookies","clicks","enrollments","payments"]),
           "Experiment":pd.Series([df_experiment.Pageviews.sum(),df_experiment.Clicks.sum(),
                               df_experiment.Enrollments.sum(),df_experiment.Payments.sum()],
                               index = ["cookies","clicks","enrollments","payments"])}
df_results = pd.DataFrame(results)
df_results

Unnamed: 0,Control,Experiment
cookies,345543,344660
clicks,28378,28325
enrollments,3785,3423
payments,2033,1945


### Sanity Checks

#### Count Metrics

In [11]:
df_results['Total']=df_results.Control + df_results.Experiment
df_results['Prob'] = 0.5
df_results['StdErr'] = np.sqrt((df_results.Prob * (1- df_results.Prob))/df_results.Total)
df_results["MargErr"] = 1.96 * df_results.StdErr
df_results["CI_lower"] = df_results.Prob - df_results.MargErr
df_results["CI_upper"] = df_results.Prob + df_results.MargErr
df_results["Obs_val"] = df_results.Experiment/df_results.Total
df_results["Pass_Sanity"] = df_results.apply(lambda x: (x.Obs_val > x.CI_lower) and (x.Obs_val < x.CI_upper),axis=1)
df_results['Diff'] = abs((df_results.Experiment - df_results.Control)/df_results.Total)

df_results

Unnamed: 0,Control,Experiment,Total,Prob,StdErr,MargErr,CI_lower,CI_upper,Obs_val,Pass_Sanity,Diff
cookies,345543,344660,690203,0.5,0.000602,0.00118,0.49882,0.50118,0.49936,True,0.001279
clicks,28378,28325,56703,0.5,0.0021,0.004116,0.495884,0.504116,0.499533,True,0.000935
enrollments,3785,3423,7208,0.5,0.005889,0.011543,0.488457,0.511543,0.474889,False,0.050222
payments,2033,1945,3978,0.5,0.007928,0.015538,0.484462,0.515538,0.488939,True,0.022122


#### Other Metrics

In [45]:
# click through probability (clicks/cookies)

control_cookies = df_results.loc['cookies','Control']
control_clicks = df_results.loc['clicks','Control']

exp_cookies = df_results.loc['cookies','Experiment']
exp_clicks = df_results.loc['clicks', 'Experiment']

## control value 
cont_p_hat = control_clicks/control_cookies

## observed value (experimental value)
exp_p_hat = exp_clicks/exp_cookies

## Standard Error
SE_ClickProb = np.sqrt((cont_p_hat * (1- cont_p_hat))/control_cookies)


## margin of error for 95% confidence interval (z = 1.96)

ME_ClickProb = SE_ClickProb * 1.96

## CI
upper_ClickProb = exp_p_hat + ME_ClickProb
lower_ClickProb = exp_p_hat - ME_ClickProb

## Sane in the membrane (yes, it passes)
print(cont_p_hat,exp_p_hat,lower_ClickProb,upper_ClickProb, SE_ClickProb, ME_ClickProb)


(0.082125813574576823, 0.082182440666163759, 0.081266986844116651, 0.083097894488210866, 0.00046706827655464432, 0.0009154538220471028)


### Evaluation Metric Results Calculations

In [12]:
df_control_notnull = df_control[pd.isnull(df_control.Enrollments) != True]
df_experiment_notnull = df_experiment[pd.isnull(df_control.Enrollments) != True]

In [13]:
results_notnull = {"Control":pd.Series([df_control_notnull.Pageviews.sum(),df_control_notnull.Clicks.sum(),
                                  df_control_notnull.Enrollments.sum(),df_control_notnull.Payments.sum()],
                                  index = ["cookies","clicks","enrollments","payments"]),
           "Experiment":pd.Series([df_experiment_notnull.Pageviews.sum(),df_experiment_notnull.Clicks.sum(),
                               df_experiment_notnull.Enrollments.sum(),df_experiment_notnull.Payments.sum()],
                               index = ["cookies","clicks","enrollments","payments"])}
df_results_notnull = pd.DataFrame(results_notnull)
df_results_notnull

Unnamed: 0,Control,Experiment
cookies,212163,211362
clicks,17293,17260
enrollments,3785,3423
payments,2033,1945


In [14]:
df_results_notnull['Total']=df_results_notnull.Control + df_results_notnull.Experiment

df_results_notnull

Unnamed: 0,Control,Experiment,Total
cookies,212163,211362,423525
clicks,17293,17260,34553
enrollments,3785,3423,7208
payments,2033,1945,3978


In [15]:
# experiment values

enrollments_exp = df_results_notnull.loc["enrollments"].Experiment
clicks_exp = df_results_notnull.loc["clicks"].Experiment
payments_exp = df_results_notnull.loc["payments"].Experiment

# control values

enrollments_cont = df_results_notnull.loc["enrollments"].Control
clicks_cont = df_results_notnull.loc["clicks"].Control
payments_cont = df_results_notnull.loc["payments"].Control



# metrics

GrossConversion_exp = enrollments_exp/clicks_exp
NetConversion_exp = payments_exp/clicks_exp
GrossConversion_cont = enrollments_cont/clicks_cont
NetConversion_cont = payments_cont/clicks_cont

GrossConversion = (enrollments_exp + enrollments_cont)/(clicks_cont + clicks_exp)
NetConversion = (payments_cont + payments_exp)/(clicks_cont + clicks_exp)




In [16]:
print('GrossConversion: {} \nNetConversion:{}'.format(GrossConversion,NetConversion))

GrossConversion: 0.208607067404 
NetConversion:0.115127485312


In [17]:
GrossConversion_cont

0.2188746891805933

In [18]:
GrossConversion_exp

0.19831981460023174

In [19]:
def stats_prop(p_hat,z_score,N_cont,N_exp,diff):
    std_err = np.sqrt((p_hat * (1- p_hat ))*(1/N_cont + 1/N_exp))
    marg_err = z_score * std_err
    ci_lower = diff - marg_err
    ci_upper = diff + marg_err
    
    return std_err,marg_err,ci_lower,ci_upper
    
    

In [20]:
GrossConversion_diff = GrossConversion_exp - GrossConversion_cont
GrossConversion_diff

-0.020554874580361565

In [21]:
se_gross,me_gross,cil_gross,ciu_gross = stats_prop(GrossConversion,1.96,clicks_cont,
                                                   clicks_exp,GrossConversion_diff)

In [22]:
print(se_gross,me_gross,cil_gross,ciu_gross)

(0.0043716753852259364, 0.0085684837550428355, -0.029123358335404401, -0.01198639082531873)


In [23]:
NetConversion_diff = NetConversion_exp - NetConversion_cont
NetConversion_diff

-0.0048737226745441675

In [24]:
se_net,me_net,cil_net,ciu_net = stats_prop(NetConversion,1.96,clicks_cont,
                                           clicks_exp,NetConversion_diff)

In [25]:
print(se_net,me_net,cil_net,ciu_net)

(0.0034341335129324238, 0.0067309016853475505, -0.011604624359891718, 0.001857179010803383)


In [26]:
df_SignTest = pd.merge(df_control_notnull,df_experiment_notnull,on="Date")
df_SignTest['GrossConversion_cont'] = df_SignTest.Enrollments_x/df_SignTest.Clicks_x
df_SignTest['GrossConversion_exp'] = df_SignTest.Enrollments_y/df_SignTest.Clicks_y
df_SignTest['NetConversion_cont'] = df_SignTest.Payments_x/df_SignTest.Clicks_x
df_SignTest['NetConversion_exp'] = df_SignTest.Payments_y/df_SignTest.Clicks_y

cols = ['Date','GrossConversion_cont','GrossConversion_exp','NetConversion_cont','NetConversion_exp']


In [27]:
df_SignTest = df_SignTest[cols]

In [28]:
df_SignTest.head()

Unnamed: 0,Date,GrossConversion_cont,GrossConversion_exp,NetConversion_cont,NetConversion_exp
0,"Sat, Oct 11",0.195051,0.153061,0.101892,0.049563
1,"Sun, Oct 12",0.188703,0.147771,0.089859,0.115924
2,"Mon, Oct 13",0.183718,0.164027,0.10451,0.089367
3,"Tue, Oct 14",0.186603,0.166868,0.125598,0.111245
4,"Wed, Oct 15",0.194743,0.168269,0.076464,0.112981


In [29]:
df_SignTest['GC_Sign'] = df_SignTest.GrossConversion_cont - df_SignTest.GrossConversion_exp
df_SignTest['NC_Sign'] = df_SignTest.NetConversion_cont - df_SignTest.NetConversion_exp

In [30]:
len(df_SignTest)

23

In [31]:
len(df_SignTest[df_SignTest.GC_Sign > 0])

19

In [32]:
len(df_SignTest[df_SignTest.NC_Sign > 0])

13