In [None]:
## General Structure for all Notebooks 
## 1) Describe the problem being tackled, source of data, output 
## 2) Import libraries/set up wd; general working setup
## 3) Examine the quality of the data 
## 4) Build the right dataset 
## 5) Apply the structure

# Problem 
### What are we solving for? 

### Background 
* Spain-based users have a much higher conversion rate than any other Spanish-speaking country.
* MVT test where each page has its own translation, e.g. MX = MX, AR = AR, etc. 
* Initial results come back that the experiment was negative (null hypothesis accepted)


### Tasks 
* Confirm that test is actually negative

#### Explain why that might be happening. Are the localized translations really worse?

*If you identified what was wrong, design an algorithm that would return FALSE if the same problem*
*is happening in the future and TRUE if everything is good and results can be trusted.*

In [13]:
import pandas as pd
import numpy as np 
import os 

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import seaborn as sns
from termcolor import colored


from scipy.stats import chi2_contingency
from scipy import stats

wd = os.getcwd()
print(wd)

## Github with Answers: https://github.com/JifuZhao/DS-Take-Home/blob/master/02.%20Spanish%20Translation%20AB%20Test.ipynb

/Users/annadudek/00_DataMasked


In [14]:
def describe_data(data):
    print(f" shape: {data.shape}")
    print('')
    print(data.dtypes)

    
    ## assume first id column in the dataset (in terms of left to right order)
    id_column = data.filter(like='id').columns.to_list()[0]
    print('')
    print(f"Column taken as id: {id_column}")
    print(f"If {len(data)} = {len(data[id_column].unique())}, then dataframe is at this level")  
    
    
    print('')
    return data.head(3)

In [15]:
## get data

users_raw = pd.read_csv("/Users/annadudek/Documents/GitHub/ABTesting_Practice/user_table.csv")
browsing_raw = pd.read_csv("/Users/annadudek/Documents/GitHub/ABTesting_Practice/test_table.csv")


In [16]:
describe_data(users_raw)

 shape: (452867, 4)

user_id     int64
sex        object
age         int64
country    object
dtype: object

Column taken as id: user_id
If 452867 = 452867, then dataframe is at this level



Unnamed: 0,user_id,sex,age,country
0,765821,M,20,Mexico
1,343561,F,27,Nicaragua
2,118744,M,23,Colombia


In [17]:
describe_data(browsing_raw)

 shape: (453321, 9)

user_id              int64
date                object
source              object
device              object
browser_language    object
ads_channel         object
browser             object
conversion           int64
test                 int64
dtype: object

Column taken as id: user_id
If 453321 = 453321, then dataframe is at this level



Unnamed: 0,user_id,date,source,device,browser_language,ads_channel,browser,conversion,test
0,315281,2015-12-03,Direct,Web,ES,,IE,1,0
1,497851,2015-12-04,Ads,Web,ES,Google,IE,0,1
2,848402,2015-12-04,Ads,Web,ES,Facebook,Chrome,0,0


In [18]:
### Notes 
## there are a couple more user_ids in the browsing data than in the user data. Either there are some that don't match
## or some users appear multiple times in browsing behaviour 

data=pd.merge(users_raw, browsing_raw, how = 'right', on='user_id')
data=data[data['country'].notna()]
print(data.shape)
data.head()



(452867, 12)


Unnamed: 0,user_id,sex,age,country,date,source,device,browser_language,ads_channel,browser,conversion,test
0,765821,M,20.0,Mexico,2015-12-02,Ads,Mobile,ES,Yahoo,Android_App,0,1
1,343561,F,27.0,Nicaragua,2015-12-04,Ads,Web,ES,Facebook,Safari,0,0
2,118744,M,23.0,Colombia,2015-11-30,Ads,Mobile,ES,Facebook,Android_App,0,1
3,987753,F,27.0,Venezuela,2015-12-04,SEO,Web,ES,,IE,0,1
4,554597,F,20.0,Spain,2015-12-04,Direct,Web,ES,,Chrome,0,0


In [19]:

### does test(is in experiment or not) differ by market, e.g. 50-50 in test and 50-50 in experiment
data.groupby('country')['test'].size()

### what would be the ideal experiment setup for this? 

country
Argentina       46733
Bolivia         11124
Chile           19737
Colombia        54060
Costa Rica       5309
Ecuador         15895
El Salvador      8175
Guatemala       15125
Honduras         8568
Mexico         128484
Nicaragua        6723
Panama           3951
Paraguay         7347
Peru            33666
Spain           51782
Uruguay          4134
Venezuela       32054
Name: test, dtype: int64

# AB Testing Template 


* Control: test = 0
* Variant: test = 1 
* Cleaning actions: 
    1. Omit Spainish records entirely; just find out what the conversion rate is 
    
* Test Type: Needs to be chi-square because it is converted 0/1
    1. The conversion rate to beat is the Spanish one? 

* Dates: min = '2015-11-30' / max = '2015-12-04'
    * no date cleaning needed / though this experiment is too short 

## Significance Testing

In [20]:
## SPAIN ONLY CONVERSION 
########################################################################
ES_denominator = len(data[(data.country=='Spain') & (data.conversion == 0)])
ES_numerator = len(data[(data.country=='Spain') & (data.conversion == 1)])

print(f"Spanish conversion: {(ES_numerator/(ES_denominator + ES_numerator))*100}")
print('')



# data['exp_group'] = np.where(data['country'] == 'Spain', 'Control', 'Variant')
data['exp_group'] = np.where(data['test'] == 0, 'Control', 'Variant')



## CHI SQUARE FOR REMAINING GROUPS 
########################################################################
obs_table=pd.DataFrame(data[data.country != 'Spain'][['conversion', 'exp_group']].pivot_table(index='exp_group', columns='conversion', 
                                aggfunc=len, fill_value=0))
obs_table.reset_index(inplace=True)
obs_table.columns = obs_table.columns.get_level_values(0)

obs_table.columns = ['exp_group', 'not_converted', 'converted']

print(f"Conversion Rate for {obs_table.exp_group[0]}:\
     {(obs_table.converted[0]/(obs_table.not_converted[0]+obs_table.converted[0]))*100}")

print(f"Conversion Rate for {obs_table.exp_group[1]}:\
     {(obs_table.converted[1]/(obs_table.not_converted[0]+obs_table.converted[1]))*100}")
print("")

result = chi2_contingency(obs_table[['not_converted','converted']])
chisq, p = result[:2]
print ('chisq = {}, p = {}'.format(chisq, p))
print (f"Is p-value less than 0.05: {p < 0.05}")




Spanish conversion: 7.971882121200418

Conversion Rate for Control:     4.829179055749524
Conversion Rate for Variant:     5.043369640713082

chisq = 54.37858181419832, p = 1.653553456785947e-13
Is p-value less than 0.05: True


In [21]:
### T-Test 
  
#t-test of test vs control for our target metric 

print(data[data.country != 'Spain'].groupby('exp_group')['conversion'].mean()*100)
print('')

test = stats.ttest_ind(data[(data.country != 'Spain') & (data.exp_group == 'Variant')]['conversion'], 
                       data[(data.country != 'Spain') & (data.exp_group == 'Control')]['conversion'], 
                       equal_var=False
                       )
  


print(f"test statistic: {test.statistic}")
print(f"p-value {test.pvalue}")
print (f"Is p-value less than 0.05: {test.pvalue < 0.05}")




exp_group
Control    4.829179
Variant    4.341116
Name: conversion, dtype: float64

test statistic: -7.353895203080277
p-value 1.928917857779903e-13
Is p-value less than 0.05: True


## Why is there such a difference 

In [22]:
##The most likely reason for weird A/B test results are:

##1) We didn’t collect enough data
##2) Some bias has been introduced in the experiment so that test/control people are not really random

In [23]:
def categorical_randomization(data):
    
    """ Function takes all the categorical variables in the dataframe and creates frequency tables by 
    Control v Variant in order to check randomization. 
    
    Could introduce a % test to see if they are really that far off. E.g. if greater than 5% relative difference 
    between groups flag as problematic"""

    categorical_variables = data.select_dtypes('object').columns.tolist()
    categorical_variables.remove('exp_group')
#     print(f"Categorical Variables: {categorical_variables}")
    
    freq_table=pd.DataFrame()
    for i in categorical_variables: 
        var_table=pd.DataFrame(data[['exp_group',i]].pivot_table(index='exp_group', columns=i, 
                                aggfunc=len, fill_value=0)).reset_index()

        levels_list = var_table.columns.tolist()
        levels_list.remove('exp_group')

        for l in levels_list: 
            var_table[str("pct_" + l)] = var_table[l]/var_table.sum(axis=1)
        
        freq_table = pd.concat([freq_table, var_table], axis =1)
    
    
    
    
    """Clean up freq_table & flag problematic randomization VARIANCE BETWEEN GROUPS > 5% """
    
    freq_table=freq_table.transpose()
    freq_table.reset_index(inplace=True)
    freq_table.columns = ['categorical_variable', 'Control', 'Variant']
    freq_table=freq_table[freq_table.categorical_variable !='exp_group']
    
    freq_table['pct_variable']=freq_table['categorical_variable'].str.contains("pct")
    freq_table['group_variance'] = np.where(freq_table['pct_variable'] == True,
                                            abs((freq_table['Control'] - freq_table['Variant'])/freq_table['Control']) , 0)


    print(colored('Following categorical variables have greater than 5% variance between Control & Variant',
                  'red', attrs=['bold']))
    print("")
    print(freq_table[freq_table.group_variance >= 0.05])
    
    
    pd.options.display.max_rows = 1000                              ## increase the length you can see in notebooks
    return freq_table                                


In [24]:
randomization_results=categorical_randomization(data[data.country != 'Spain'])
randomization_results


## the other alternative to going through each of the variables is to build a decision tree predicting on 


[1m[31mFollowing categorical variables have greater than 5% variance between Control & Variant[0m

   categorical_variable     Control     Variant  pct_variable group_variance
22        pct_Argentina   0.0504881    0.173223          True        2.43097
23          pct_Bolivia   0.0299496   0.0258326          True       0.137467
24            pct_Chile   0.0531701   0.0458071          True       0.138478
25         pct_Colombia    0.146176    0.125001          True       0.144858
26       pct_Costa Rica   0.0143542   0.0122767          True       0.144732
27          pct_Ecuador   0.0433649   0.0364223          True       0.160097
28      pct_El Salvador   0.0221681   0.0188484          True       0.149752
29        pct_Guatemala   0.0411308   0.0347724          True       0.154589
30         pct_Honduras   0.0235334   0.0194972          True       0.171508
31           pct_Mexico    0.346492     0.29788          True       0.140297
32        pct_Nicaragua     0.01845   0.0153123    

Unnamed: 0,categorical_variable,Control,Variant,pct_variable,group_variance
1,F,77096.0,89909.0,False,0.0
2,M,108215.0,125865.0,False,0.0
3,pct_F,0.416036,0.416681,True,0.00155178
4,pct_M,0.583963,0.583318,True,0.00110523
6,Argentina,9356.0,37377.0,False,0.0
7,Bolivia,5550.0,5574.0,False,0.0
8,Chile,9853.0,9884.0,False,0.0
9,Colombia,27088.0,26972.0,False,0.0
10,Costa Rica,2660.0,2649.0,False,0.0
11,Ecuador,8036.0,7859.0,False,0.0


# Stratified Sampling 

In [None]:
## Is that possible to stratify across so many variables 

## Novely Effect

In [None]:

## What does the conversion rate look like by group over time 

daily_conversion=data[data.country != 'Spain'].groupby(['date', 'exp_group']).agg({'conversion': ['sum','count']})
daily_conversion.reset_index(inplace=True)
daily_conversion.columns = [['date', 'exp_group', 'converted_users', 'total_users']]
daily_conversion.columns = daily_conversion.columns.get_level_values(0)

daily_conversion['CVR']= (daily_conversion['converted_users']/daily_conversion['total_users'])*100
daily_conversion['date']=pd.to_datetime(daily_conversion.date)
daily_conversion

In [None]:

fig, (ax1, ax2) = plt.subplots(2,figsize=(8,6))


################################ Control 

ax1.title.set_text('Control')
ax1.plot(daily_conversion[daily_conversion.exp_group == 'Control']['date'],
           daily_conversion[daily_conversion.exp_group == 'Control']['CVR'])
ax1.set_ylim([3,6])



xs= daily_conversion[daily_conversion.exp_group == 'Control']['date']
ys= daily_conversion[daily_conversion.exp_group == 'Control']['CVR']

for x,y in zip(xs,ys):

    label = "{:.2f}".format(y)

    # this method is called for each point
    ax1.annotate(label, # this is the text
                 (x,y), # this is the point to label
                 textcoords="offset points", # how to position the text
                 xytext=(0,10), # distance from text to points (x,y)
                 ha='center') # horizontal alignment can be left, right or center

    
################################ Variant

ax2.title.set_text('Variant')
ax2.plot(daily_conversion[daily_conversion.exp_group == 'Variant']['date'],
           daily_conversion[daily_conversion.exp_group == 'Variant']['CVR'])
ax2.set_ylim([3, 6])

xs= daily_conversion[daily_conversion.exp_group == 'Variant']['date']
ys= daily_conversion[daily_conversion.exp_group == 'Variant']['CVR']

for x,y in zip(xs,ys):

    label = "{:.2f}".format(y)

    # this method is called for each point
    ax2.annotate(label, # this is the text
                 (x,y), # this is the point to label
                 textcoords="offset points", # how to position the text
                 xytext=(0,10), # distance from text to points (x,y)
                 ha='center') # horizontal alignment can be left, right or center


# ax1.plot(daily_conversion.date,daily_conversion.CVR)