In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math as mt

In [14]:
control =  pd.read_csv('data/control.csv', delimiter = ',')
exposed =  pd.read_csv('data/exposed.csv', delimiter = ',')

In [16]:
yes_cont=control['yes'].sum()
yes_exp=exposed['yes'].sum()
yes_total=yes_cont+yes_exp
print ("number of yes in control:", yes_cont)
print ("number of yes in experiment:" ,yes_exp)

number of yes in control: 264
number of yes in experiment: 308


In [17]:
results = {"Control":pd.Series([control.yes.sum(),control.no.sum()],
                                  index = ["Yes","No"]),
           "Experiment":pd.Series([exposed.yes.sum(),exposed.no.sum()],
                               index = ["Yes","No"])}
df_results = pd.DataFrame(results)
df_results

Unnamed: 0,Control,Experiment
Yes,264,308
No,322,349


In [18]:
df_results['Total']=df_results.Control + df_results.Experiment
df_results['Prob'] = 0.5
df_results['StdErr'] = np.sqrt((df_results.Prob * (1- df_results.Prob))/df_results.Total)
df_results["MargErr"] = 1.96 * df_results.StdErr
df_results["CI_lower"] = df_results.Prob - df_results.MargErr
df_results["CI_upper"] = df_results.Prob + df_results.MargErr
df_results["Obs_val"] = df_results.Experiment/df_results.Total
df_results["Pass_Sanity"] = df_results.apply(lambda x: (x.Obs_val > x.CI_lower) and (x.Obs_val < x.CI_upper),axis=1)
df_results['Diff'] = abs((df_results.Experiment - df_results.Control)/df_results.Total)
df_results

Unnamed: 0,Control,Experiment,Total,Prob,StdErr,MargErr,CI_lower,CI_upper,Obs_val,Pass_Sanity,Diff
Yes,264,308,572,0.5,0.020906,0.040976,0.459024,0.540976,0.538462,True,0.076923
No,322,349,671,0.5,0.019302,0.037832,0.462168,0.537832,0.520119,True,0.040238


In [37]:
control_yes = df_results.loc['Yes','Control']
control_no = df_results.loc['No','Control']

exp_yes = df_results.loc['Yes','Experiment']
exp_no = df_results.loc['No', 'Experiment']

## control value 
cont_p_hat = control_no/control_yes

## observed value (experimental value)
exp_p_hat = exp_no/exp_yes

## Standard Error
SE_Prob = np.sqrt(np.abs((cont_p_hat * (1- cont_p_hat))/control_yes))


## margin of error for 95% confidence interval (z = 1.96)

ME_Prob = SE_Prob * 1.96

## CI
upper_Prob = exp_p_hat + ME_Prob
lower_Prob = exp_p_hat - ME_Prob

## Sane in the membrane (yes, it passes)
print(cont_p_hat,exp_p_hat,lower_Prob,upper_Prob, SE_Prob, ME_Prob)

1.2196969696969697 1.1331168831168832 1.0706726820183998 1.1955610842153666 0.03185928627473644 0.06244420109848343


In [39]:
df_control_notnull_yes = control[pd.isnull(control.yes) != True]
df_exposed_notnull_yes = exposed[pd.isnull(control.yes) != True]

df_control_notnull_no = control[pd.isnull(control.no) != True]
df_exposed_notnull_no = exposed[pd.isnull(control.no) != True]

  df_exposed_notnull_yes = exposed[pd.isnull(control.yes) != True]
  df_exposed_notnull_no = exposed[pd.isnull(control.no) != True]


In [41]:
results_notnull = {"Control":pd.Series([df_control_notnull_yes.yes.sum(),df_control_notnull_no.no.sum()],
                                  index = ["Yes","No"]),
           "Experiment":pd.Series([df_exposed_notnull_yes.yes.sum(),df_exposed_notnull_no.no.sum()],
                               index = ["Yes","No"])}
df_results_notnull = pd.DataFrame(results_notnull)
df_results_notnull

Unnamed: 0,Control,Experiment
Yes,264,308
No,322,349


In [43]:
yes_exp = df_results_notnull.loc["Yes"].Experiment
no_exp = df_results_notnull.loc["No"].Experiment

# control values

yes_cont = df_results_notnull.loc["Yes"].Control
no_cont = df_results_notnull.loc["No"].Control


# metrics

GrossConversion_exp = yes_exp/no_exp
GrossConversion_cont = yes_cont/no_cont


GrossConversion = (no_exp + yes_cont)/(no_exp + no_cont)


In [44]:
print('GrossConversion: {} '.format(GrossConversion))

GrossConversion: 0.9135618479880775 


In [45]:
GrossConversion_cont

0.8198757763975155

In [46]:
GrossConversion_exp

0.8825214899713467

In [47]:
def stats_prop(p_hat,z_score,N_cont,N_exp,diff):
    std_err = np.sqrt((p_hat * (1- p_hat ))*(1/N_cont + 1/N_exp))
    marg_err = z_score * std_err
    ci_lower = diff - marg_err
    ci_upper = diff + marg_err
    
    return std_err,marg_err,ci_lower,ci_upper

In [50]:
GrossConversion_diff = GrossConversion_exp - GrossConversion_cont
GrossConversion_diff

0.0626457135738312

## Applying Sequential Test

In [None]:
class SequentialTest:
  def __init__(self,exposed,control,...):
    '''
    initialise startup variables
    '''

  
  def stoppingRule(self, ...):
    '''
    This function should take current observation and return statistical decision made. 
    Consider truncate rule for longer tests
    '''
    S, a, b,

  def computeBoundaries(self,):
    '''
    This function shoud compute boundaries 
    '''

  def plotTest(self,):
    '''
    showing the cumulative statistical test (e.g., log probability ratio) and the uper and lower limits.
    '''

  def plotBoundaries(self,):
    '''cumulative sums of exposed successes, bounded by the critical limits.
    '''

In [21]:
def transform_data(df):
  '''
  segment data into exposed and control groups
  consider that SmartAd runs the experment hourly, group data into hours. 
      Hint: create new column to hold date+hour and use df.column.map(lambda x:  pd.Timestamp(x,tz=None).strftime('%Y-%m-%d:%H'))
  create two dataframes with bernouli series 1 for posetive(yes) and 0 for negative(no)
    Hint: Given engagement(sum of yes and no until current observation as an array) and success (yes countas an array), the method generates random binomial distribution
        #Example
           engagement = np.array([5, 3, 3])
           yes = np.array([2, 0, 3])       
         Output is "[1] 1 0 1 0 0 0 0 0 1 1 1", showing a binary array of 5+3+3 values
         of which 2 of the first 5 are ones, 0 of the next 3 are ones, and all 3 of
         the last 3 are ones where position the ones is randomly distributed within each group.
  '''
  return exposed,control

def plotDataSummary(exposed, control):
  'This function plots cummulated success'

def pretyPrintTestResult(self, test):
  '''This function print final test result. Json format is recommended. For example
  {
    "name": "",
    "engagementCountControl": ,
    "engagementCountExposed": ,
    "positiveCountControl": ,
    "positiveCountExposed": ,
    "ControlSuccessProbability": ,
    "ExposedSuccessProbability": ,
    "basePositiveRate": ,
    "significanceSign": ".",
    "lift": ,
    "oddRatio": ,
    "exactSuccessOddRate":,
    "confidenceIntervalLevel": ,
    "alpha": ,
    "beta": ,
    "power": ,
    "criticalValue": ,
    "lower critical(a)": 
    "upper critical(b)": ,
    "TotalObservation": 
  }'''

In [None]:
'Define statistical parameters such as alpha, beta, sample size if evan approach is used, odd ratio for SPRT'
alpha=
beta=
#other variables here
'Compute statistical lower and upper decision points such as a and b'
a=
b=
#other variables here

##data processing here
exposed,control=transform_data(data)
##plot data summary
plotDataSummary(exposed,control)

'Perform test. Loop over each of data entry and perform test. Accumulate result into dataframe and print out test journey'
test=SequentialTest(...)

'Print test result.'
pretyPrintTestResult(resultObject)