### Data Science Interviews Group Study (Spring 2017)
- Brian Goodness
- Assignment 2: A/B Testing

# Objectives:

- Confirm that the test (i.e., each country has its own translation by a local) is actually negative. That is, it appears that the old version of the site, with just one translation across Spain and LatAm, performs better.

- Explain why that might be happening. Are the localized translations really worse?

- If you identified what was wrong, design an algorithm that would return FALSE if the same problem is happening in the future and TRUE if everything is good and the results can be trusted.

In [2]:
import numpy as np
import pandas as pd
import pylab as pl
#import statsmodels.api as sm
from scipy.stats import ttest_ind

### load data; inspect

In [3]:
# Both tables are read from the Translation_Test/ directory, relative to the
# notebook's working directory.
test_data, user_data = (
    pd.read_csv(csv_path)
    for csv_path in ('Translation_Test/test_table.csv',
                     'Translation_Test/user_table.csv')
)

In [4]:
# Row count of the test table, then a preview of its first rows
print(test_data.shape[0])
test_data.head()

453321


Unnamed: 0,user_id,date,source,device,browser_language,ads_channel,browser,conversion,test
0,315281,2015-12-03,Direct,Web,ES,,IE,1,0
1,497851,2015-12-04,Ads,Web,ES,Google,IE,0,1
2,848402,2015-12-04,Ads,Web,ES,Facebook,Chrome,0,0
3,290051,2015-12-03,Ads,Mobile,Other,Facebook,Android_App,0,1
4,548435,2015-11-30,Ads,Web,ES,Google,FireFox,0,1


In [5]:
# Row count of the user table, then a preview of its first rows
print(user_data.shape[0])
user_data.head()

452867


Unnamed: 0,user_id,sex,age,country
0,765821,M,20,Mexico
1,343561,F,27,Nicaragua
2,118744,M,23,Colombia
3,987753,F,27,Venezuela
4,554597,F,20,Spain


In [6]:
# Join test records to user attributes on user_id. merge() defaults to an
# inner join, so only user_ids present in both tables are kept.
merged = pd.merge(test_data, user_data, on='user_id')
print(merged.shape[0])

452867


In [7]:
# Column dtypes of the merged frame. Note that `date` is reported as object
# (i.e. it was not parsed as a datetime when the CSV was read).
merged.dtypes

user_id              int64
date                object
source              object
device              object
browser_language    object
ads_channel         object
browser             object
conversion           int64
test                 int64
sex                 object
age                  int64
country             object
dtype: object

In [8]:
# tabulate method
def tab(frame, field):
    """Tabulate one column of a DataFrame as counts and percentages.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame containing the column to tabulate.
    field : label
        Name of the column in ``frame`` whose values are counted.

    Returns
    -------
    pandas.DataFrame
        Indexed by the distinct values of ``frame[field]`` (ordered as
        ``value_counts()`` returns them), with two columns: ``field``
        (the count of each value) and ``'percent'`` (that count as a
        percentage of the total count).
    """
    counts = frame[field].value_counts()
    # Name the count column explicitly and clear the index name. pandas >= 2.0
    # names the value_counts() result 'count' (and moves the column name onto
    # the index), so wrapping it in DataFrame() and looking up result[field]
    # raised KeyError there. This form gives the same layout on old and new
    # pandas.
    result = counts.rename_axis(None).to_frame(name=field)
    result['percent'] = result[field] / result[field].sum() * 100
    return result

In [9]:
# crosstab method (to include percentages)
def crosstab_pcts(frame, row, column):
    """Cross-tabulate two columns and append per-column percentages.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame holding both columns.
    row : label
        Column whose values become the rows of the table.
    column : label
        Column whose values become the columns of the table.

    Returns
    -------
    pandas.DataFrame
        The count table from ``pd.crosstab``, plus one ``'<value>_pct'``
        column per original column holding each cell as a percentage of
        its column total.
    """
    counts = pd.crosstab(frame[row], frame[column])
    # Snapshot the count columns first so the *_pct columns added below are
    # never themselves treated as inputs.
    count_columns = list(counts.columns)
    for col in count_columns:
        column_total = counts[col].sum()
        counts['%s_pct' % col] = counts[col] / column_total * 100
    return counts

In [10]:
# Overall conversion counts and column percentages, split by the `test` flag
# (all countries pooled).
crosstab_pcts(merged, 'conversion', 'test')

test,0,1,0_pct,1_pct
conversion,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,224016,206407,94.484443,95.658884
1,13077,9367,5.515557,4.341116


In [66]:
# Conversion by test flag for Uruguay only.
# Uses exact equality: str.contains() treats its argument as a regex and
# matches substrings, which is broader than "country is Uruguay".
crosstab_pcts(merged[merged.country == 'Uruguay'], 'conversion', 'test')

test,0,1,0_pct,1_pct
conversion,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,410,3671,98.795181,98.70933
1,5,48,1.204819,1.29067


In [68]:
# Browser-language mix by test flag for Argentina only.
# Uses exact equality: str.contains() treats its argument as a regex and
# matches substrings, which is broader than "country is Argentina".
crosstab_pcts(merged[merged.country == 'Argentina'], 'browser_language', 'test')

test,0,1,0_pct,1_pct
browser_language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
EN,1291,5128,13.798632,13.719667
ES,7793,31204,83.294143,83.484496
Other,272,1045,2.907225,2.795837


In [61]:
# Conversion by browser language, one table per country.
# groupby(sort=False) yields countries in order of first appearance (the same
# order unique() gives) and splits the frame once, instead of passing each
# country name to str.contains() as a regex and rescanning the whole frame.
for country, country_rows in merged.groupby('country', sort=False):
    print('\nCountry: %s' % country)
    print(crosstab_pcts(country_rows, 'conversion', 'browser_language'))


Country: Spain
browser_language    EN     ES  Other     EN_pct     ES_pct  Other_pct
conversion                                                           
0                 6634  39703   1317  92.190106  92.020118  91.458333
1                  562   3443    123   7.809894   7.979882   8.541667

Country: Mexico
browser_language     EN      ES  Other     EN_pct     ES_pct  Other_pct
conversion                                                             
0                 16913  101683   3420  95.075609  94.954523  94.763092
1                   876    5403    189   4.924391   5.045477   5.236908

Country: Venezuela
browser_language    EN     ES  Other    EN_pct     ES_pct  Other_pct
conversion                                                          
0                 4237  25389    836  95.17071  94.972506  96.202532
1                  215   1344     33   4.82929   5.027494   3.797468

Country: Bolivia
browser_language    EN    ES  Other     EN_pct     ES_pct  Other_pct
conversion      

In [11]:
# User counts per country split by the `test` flag; the *_pct columns give each
# country's share of that flag's total, so the two arms' country mix can be
# compared side by side.
crosstab_pcts(merged, 'country', 'test')

test,0,1,0_pct,1_pct
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Argentina,9356,37377,3.946131,17.322291
Bolivia,5550,5574,2.340854,2.583258
Chile,9853,9884,4.155753,4.580719
Colombia,27088,26972,11.425053,12.500116
Costa Rica,2660,2649,1.121923,1.227673
Ecuador,8036,7859,3.389387,3.642237
El Salvador,4108,4067,1.732653,1.884842
Guatemala,7622,7503,3.214772,3.477249
Honduras,4361,4207,1.839363,1.949725
Mexico,64209,64275,27.081778,29.788112


In [33]:
#cross-tab: Spain
# Exact equality instead of str.contains(), which does regex/substring matching.
crosstab_pcts(merged[merged.country == 'Spain'], 'conversion', 'test')

test,0,0_pct
conversion,Unnamed: 1_level_1,Unnamed: 2_level_1
0,47654,92.028118
1,4128,7.971882


In [40]:
#cross-tab: Rest of World
# Exact inequality instead of negating str.contains(), which does
# regex/substring matching and would also drop any country whose name merely
# contains 'Spain'.
non_spain = merged[merged.country != 'Spain']
crosstab_pcts(non_spain, 'conversion', 'test')

test,0,1,0_pct,1_pct
conversion,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,176362,206407,95.170821,95.658884
1,8949,9367,4.829179,4.341116


In [44]:
# Two-sample t-test on conversion, Spain excluded: control (test == 0) vs.
# test (test == 1). ttest_ind is imported in the notebook's top import cell.
# ttest_ind defaults to equal_var=True, i.e. a standard Student's t-test.
# NOTE(review): this pools every non-Spain country. The country-by-test table
# shows Argentina with 9,356 control vs. 37,377 test users, so the pooled
# difference may reflect country mix rather than the translation itself --
# confirm with a per-country comparison.
control = non_spain[non_spain['test'] == 0]
test = non_spain[non_spain['test'] == 1]

ttest_ind(control['conversion'], test['conversion'])

Ttest_indResult(statistic=7.3822521630539679, pvalue=1.559329277881674e-13)