In [1]:
import math as mt
import numpy as np
import pandas as pd
from scipy.stats import norm

In [2]:
# Original baseline values (counts and rates) before any rescaling.

baseline = {
    "Cookies": 40000,
    "Clicks": 3200,
    "Enrollments": 660,
    "CTR": 0.08,
    "GConversion": 0.20625,
    "Retention": 0.53,
    "NConversion": 0.109313,
}

In [3]:
# Rescale the count metrics to a sample of 5000 cookies.
# The factor is taken from the cookie count currently stored in `baseline`,
# so re-running this cell is a no-op (5000/5000 == 1) instead of shrinking
# Clicks and Enrollments a second time.

target_cookies = 5000
scale = target_cookies / baseline["Cookies"]
baseline["Cookies"] = target_cookies
baseline["Clicks"] = baseline["Clicks"] * scale
baseline["Enrollments"] = baseline["Enrollments"] * scale
baseline

{'CTR': 0.08,
 'Clicks': 400.0,
 'Cookies': 5000,
 'Enrollments': 82.5,
 'GConversion': 0.20625,
 'NConversion': 0.109313,
 'Retention': 0.53}

In [4]:
# Gross Conversion (GC): collect p, n and the
# standard deviation (sd), rounded to 4 decimal digits.

GC = {
    "d_min": 0.01,
    # p is given in this case - or we could calculate it from "enrollments/clicks"
    "p": baseline["GConversion"],
    "n": baseline["Clicks"],
}
GC["sd"] = round(mt.sqrt(GC["p"] * (1 - GC["p"]) / GC["n"]), 4)
GC["sd"]

0.0202

In [5]:
# Retention (R): p comes from the baseline retention rate, n from the
# baseline enrollments; sd is rounded to 4 decimal digits.

R = {
    "d_min": 0.01,
    "p": baseline["Retention"],
    "n": baseline["Enrollments"],
}
R["sd"] = round(mt.sqrt(R["p"] * (1 - R["p"]) / R["n"]), 4)
R["sd"]

0.0549

In [6]:
# Net Conversion (NC): p comes from the baseline net conversion rate,
# n from the baseline clicks; sd is rounded to 4 decimal digits.

NC = {
    "d_min": 0.0075,
    "p": baseline["NConversion"],
    "n": baseline["Clicks"],
}
NC["sd"] = round(mt.sqrt(NC["p"] * (1 - NC["p"]) / NC["n"]), 4)
NC["sd"]

0.0156

In [7]:
def get_sds(p,d):
    """Return the two standard-deviation terms used by the sample-size formula.

    Note: this function is defined a second time, with the same body, in the
    following cell.

    Args:
        p: baseline probability.
        d: change in probability to detect.

    Returns:
        A two-element list:
        [sqrt(2*p*(1-p)), sqrt(p*(1-p) + (p+d)*(1-(p+d)))].
    """
    shifted = p + d
    sd_same_rate = mt.sqrt(2 * p * (1 - p))
    sd_shifted_rate = mt.sqrt(p * (1 - p) + shifted * (1 - shifted))
    return [sd_same_rate, sd_shifted_rate]

In [8]:
# z-score lookup

def get_z_score(alpha):
    """Return the standard-normal quantile at cumulative probability `alpha`.

    The argument is passed straight to `norm.ppf`, so callers supply the
    cumulative probability itself (e.g. 1 - alpha/2 for a two-sided level).
    """
    quantile = norm.ppf(alpha)
    return quantile

def get_sds(p,d):
    """Return the two standard-deviation terms used by the sample-size formula.

    Args:
        p: baseline probability.
        d: change in probability to detect.

    Returns:
        A two-element list:
        [sqrt(2*p*(1-p)), sqrt(p*(1-p) + (p+d)*(1-(p+d)))].
    """
    shifted = p + d
    sd_same_rate = mt.sqrt(2 * p * (1 - p))
    sd_shifted_rate = mt.sqrt(p * (1 - p) + shifted * (1 - shifted))
    return [sd_same_rate, sd_shifted_rate]

def get_sampleSize(sds,alpha,beta,d):
    """Evaluate (z(1-alpha/2)*sds[0] + z(1-beta)*sds[1])**2 / d**2.

    Args:
        sds: two-element sequence of standard deviations (as returned by get_sds).
        alpha: significance level; the z-score is taken at 1 - alpha/2.
        beta: the z-score is taken at 1 - beta.
        d: change to detect; the result is divided by its square.

    Returns:
        The (unrounded) sample size given by the formula above.
    """
    z_alpha = get_z_score(1 - alpha / 2)
    z_beta = get_z_score(1 - beta)
    combined = z_alpha * sds[0] + z_beta * sds[1]
    return combined ** 2 / d ** 2

In [9]:
# Effect size used for the sample-size calculation of each metric.
# Read it from the metric's own d_min instead of repeating the literal,
# so the two values cannot drift apart.
GC["d"] = GC["d_min"]
R["d"] = R["d_min"]
NC["d"] = NC["d_min"]

In [10]:
# Gross Conversion sample size at alpha=0.05, beta=0.2,
# rounded to an integer value for simplicity.

GC['SampSize'] = round(
    get_sampleSize(sds=get_sds(p=GC['p'], d=GC['d']), alpha=0.05, beta=0.2, d=GC['d'])
)

GC['SampSize']

25835

In [11]:
# Same computation as the preceding cell: Gross Conversion sample size
# at alpha=0.05, beta=0.2, rounded to an integer.
GC['SampSize'] = round(
    get_sampleSize(sds=get_sds(p=GC['p'], d=GC['d']), alpha=0.05, beta=0.2, d=GC['d'])
)
GC['SampSize']

25835

In [13]:
# Scale the Gross Conversion sample size by 1/CTR and double it
# (presumably clicks -> pageviews, times two groups - confirm).
# The click-level size is recomputed here rather than read back from
# GC['SampSize'], so running this cell more than once gives the same answer
# instead of inflating the value on every run.
gc_clicks_needed = round(get_sampleSize(get_sds(GC['p'], GC['d']), 0.05, 0.2, GC['d']))
GC['SampSize'] = round(gc_clicks_needed / baseline["CTR"] * 2)
GC["SampSize"]

In [19]:
# Retention sample size at alpha=0.05, beta=0.2, rounded to an integer.
R['SampSize'] = round(
    get_sampleSize(sds=get_sds(p=R['p'], d=R['d']), alpha=0.05, beta=0.2, d=R['d'])
)
R['SampSize']

39087

In [20]:
# Scale the Retention sample size by 1/CTR and 1/GConversion, then double it
# (presumably enrollments -> clicks -> pageviews, times two groups - confirm).
# The enrollment-level size is recomputed here rather than read back from
# R['SampSize'], so running this cell more than once gives the same answer.
r_enrollments_needed = round(get_sampleSize(get_sds(R['p'], R['d']), 0.05, 0.2, R['d']))
R['SampSize'] = r_enrollments_needed / baseline["CTR"] / baseline["GConversion"] * 2
R["SampSize"]

4737818.181818182

In [21]:
# Net Conversion sample size at alpha=0.05, beta=0.2, rounded to an integer.
NC['SampSize'] = round(
    get_sampleSize(sds=get_sds(p=NC['p'], d=NC['d']), alpha=0.05, beta=0.2, d=NC['d'])
)
NC['SampSize']

27413

In [22]:
# Scale the Net Conversion sample size by 1/CTR and double it
# (presumably clicks -> pageviews, times two groups - confirm).
# The click-level size is recomputed here rather than read back from
# NC['SampSize'], so running this cell more than once gives the same answer.
nc_clicks_needed = round(get_sampleSize(get_sds(NC['p'], NC['d']), 0.05, 0.2, NC['d']))
NC['SampSize'] = nc_clicks_needed / baseline["CTR"] * 2
NC["SampSize"]

685325.0

In [23]:
# load in data

# Folder holding both CSV files. It is kept in one place so the notebook can
# be pointed at another location by editing a single line.
DATA_DIR = '/content/drive/MyDrive/ML 60 DAYS'

control = pd.read_csv(f'{DATA_DIR}/control_data.csv')
experiment = pd.read_csv(f'{DATA_DIR}/experiment_data.csv')
control.head()

Unnamed: 0,Date,Pageviews,Clicks,Enrollments,Payments
0,"Sat, Oct 11",7723,687,134.0,70.0
1,"Sun, Oct 12",9102,779,147.0,70.0
2,"Mon, Oct 13",10511,909,167.0,95.0
3,"Tue, Oct 14",9871,836,156.0,105.0
4,"Wed, Oct 15",10014,837,163.0,64.0


In [24]:
# Preview the first rows of the experiment data for comparison with the control preview.
experiment.head()

Unnamed: 0,Date,Pageviews,Clicks,Enrollments,Payments
0,"Sat, Oct 11",7716,686,105.0,34.0
1,"Sun, Oct 12",9288,785,116.0,91.0
2,"Mon, Oct 13",10480,884,145.0,79.0
3,"Tue, Oct 14",9867,827,138.0,92.0
4,"Wed, Oct 15",9793,832,140.0,94.0


In [25]:
# Total pageviews per group and overall.
pageviews_cont = control.Pageviews.sum()
pageviews_exp = experiment['Pageviews'].sum()
pageviews_total = pageviews_cont + pageviews_exp
print('number of pageviews in control:', pageviews_cont)
print('number of pageviews in experiment:', pageviews_exp)

number of pageviews in control: 345543
number of pageviews in experiment: 344660


In [26]:
# Sum clicks only over the rows whose Enrollments value is not null.
clicks_cont = control.loc[control['Enrollments'].notnull(), 'Clicks'].sum()
clicks_exp = experiment.loc[experiment['Enrollments'].notnull(), 'Clicks'].sum()
clicks_cont, clicks_exp

(17293, 17260)

In [29]:
#Gross Conversion - number of enrollments divided by number of clicks

enrollments_cont = control.Enrollments.sum()
enrollments_exp = experiment.Enrollments.sum()

# Significance level; the Net Conversion cell below reads this name as well.
alpha = 0.05

# Per-group rates, pooled rate and the pooled standard error of the difference.
GC_cont = enrollments_cont/clicks_cont
GC_exp = enrollments_exp/clicks_exp
GC_pooled = (enrollments_cont+enrollments_exp)/(clicks_cont+clicks_exp)
GC_sd_pooled = mt.sqrt(GC_pooled*(1-GC_pooled)*(1/clicks_cont+1/clicks_exp))
# Margin of error and observed difference, both rounded to 4 decimals.
GC_ME = round(get_z_score(1-alpha/2)*GC_sd_pooled,4)
GC_diff = round(GC_exp-GC_cont,4)
print('The change due to the experiment is',GC_diff*100,'%')
print('Confidence Interval: [',GC_diff-GC_ME,',',GC_diff+GC_ME,']')
# Report the Gross Conversion threshold GC['d_min']; this line previously
# printed NC['d_min'], which is the Net Conversion threshold.
# NOTE(review): the observed change is negative, so the boundary that matters
# for practical significance may be -d_min rather than +d_min - confirm.
print('The change is statistically significant if the CI does not include 0. In this case, it is practically significant if',GC['d_min'],'is not in the CI as well')

The change due to the experiment is -2.06 %
Confidence Interval: [ -0.0292 , -0.012 ]
The change is statistically significant if the CI does not include 0. In this case, it is practically significant if 0.0075 is not in the CI as well


In [30]:
# Net Conversion - number of payments divided by number of clicks
payments_cont = control.Payments.sum()
payments_exp = experiment.Payments.sum()

# Per-group rates, pooled rate and the pooled standard error of the difference.
NC_cont = payments_cont / clicks_cont
NC_exp = payments_exp / clicks_exp
NC_pooled = (payments_cont + payments_exp) / (clicks_cont + clicks_exp)
NC_sd_pooled = mt.sqrt(NC_pooled * (1 - NC_pooled) * (1 / clicks_cont + 1 / clicks_exp))

# Margin of error and observed difference, both rounded to 4 decimals.
NC_ME = round(get_z_score(1 - alpha / 2) * NC_sd_pooled, 4)
NC_diff = round(NC_exp - NC_cont, 4)

print("The change due to the experiment is", NC_diff * 100, "%")
print("Confidence Interval: [", NC_diff - NC_ME, ",", NC_diff + NC_ME, "]")
print(
    "The change is statistically significant if the CI doesn't include 0. In that case, it is practically significant if",
    NC["d_min"],
    "is not in the CI as well.",
)

The change due to the experiment is -0.49 %
Confidence Interval: [ -0.0116 , 0.0018000000000000004 ]
The change is statistically significant if the CI doesn't include 0. In that case, it is practically significant if 0.0075 is not in the CI as well.
