In [225]:
import pandas as pd
import numpy as np
import collections

import scipy
from scipy.stats import ttest_ind, mannwhitneyu
from random import sample, seed
# assume power analysis has been complete and that the sample size given is sufficient for this test
# effect size - minimum size of the effect that you want to detect
# power - probablility of detecting a given effect size TRUE POSITIVE
# significanct level (alpha)

In [82]:
#loading exercise file into DF (fm default directory)
df = pd.read_csv('Senior Data Analyst Second Round Practical Exercise Data.csv')
df.head()

Unnamed: 0,unique_id,test_group,booked,type_of_care,account_type,relationship_length_sought,schedule
0,2WPVxZ0,Intervention,False,Disability Support,Self Managed,Ongoing,Flexible
1,2Wpgia8,Intervention,False,Disability Support,Self Managed,Ongoing,Specific
2,2WK9Bq4,Intervention,False,Other,Self Managed,Ongoing,Flexible
3,2Vv-cok,Intervention,False,Other,Self Managed,Once_Off,Flexible
4,2vRLGyQ,Control,True,Aged Care,Self Managed,Ongoing,Flexible


In [66]:
df.describe()
#checking for missing values: the empty rows present in the csv have been removed in loading process: nill missing values
#unique id is not unique
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 7 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   unique_id                   299 non-null    object
 1   test_group                  299 non-null    object
 2   booked                      299 non-null    bool  
 3   type_of_care                299 non-null    object
 4   account_type                299 non-null    object
 5   relationship_length_sought  299 non-null    object
 6   schedule                    299 non-null    object
dtypes: bool(1), object(6)
memory usage: 14.4+ KB


In [243]:
# checking the number of duplicated unique_id's. 40 are present
df.duplicated('unique_id').sum()
df2 = df.drop_duplicates(subset =['unique_id'], keep = 'first')

In [241]:
summary_table = pd.crosstab(df.test_group,df.booked)
summary_table['total'] = summary_table.sum(axis=1)
summary_table['pc_booked'] = 100* summary_table[True]/ summary_table['total']
summary_table

booked,False,True,total,pc_booked
test_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Control,102,47,149,31.543624
Intervention,98,52,150,34.666667


In [246]:
summary_table2 = pd.crosstab(df2.test_group,df2.booked)
summary_table2['total'] = summary_table2.sum(axis=1)
summary_table2['pc_booked'] = 100* summary_table2[True]/ summary_table2['total']
summary_table2

booked,False,True,total,pc_booked
test_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Control,86,41,127,32.283465
Intervention,88,44,132,33.333333


In [213]:
invtervention_value_count = summary_table.loc['Intervention','total']
control_value_count = summary_table.loc['Control','total']
ground_truth = summary_table.loc['Intervention','pc_booked'] - summary_table.loc['Control','pc_booked']
print(ground_truth)

3.1230425055928386


In [253]:
control_values_slice = df[df.test_group == 'Control']
intervention_values_slice = df[df.test_group == 'Intervention']

control_values = control_values_slice['booked'].astype(int).to_numpy()
intervention_values = intervention_values_slice['booked'].astype(int).to_numpy()
all_values = df['booked'].astype(int).to_list()

In [254]:
stat, pvalue=mannwhitneyu(control_values, intervention_values)
print('p =',pvalue)
#This is not the same as a normality of data assumption. This assumption is saying that if we hypothetically repeated this test many times and computed the mean each time, then the distribution of mean is Normal. 
# This is called the Central Limit Theorem. It states that as you get more and more data points, the distribution of mean is more and more Normally distributed. 

p = 0.5673455997228845


In [258]:
zscore, prob = ttest_ind(control_values, intervention_values, equal_var=False)
print(f"Zscore is {zscore:0.2f}, p-value is {prob:0.3f} (two tailed), {prob/2:0.3f} (one tailed)")

Zscore is -0.57, p-value is 0.568 (two tailed), 0.284 (one tailed)


In [259]:
permuntations = 10000 # number of permutations
permuntations_results = [0]*permuntations # list for permuntation results

for i in range(permuntations):
    bag = sample(all_values, k=len(all_values)) # shuffle the bag
    a_rs2 = bag[:invtervention_value_count]
    b_rs2 = bag[invtervention_value_count:]
    permuntations_results[i] = 100*sum(a_rs2)/invtervention_value_count - 100*sum(b_rs2)/invtervention_value_count

print(permuntations_results[:10])

[-0.6666666666666714, -4.666666666666668, -2.0, -8.666666666666668, -3.333333333333332, 6.0, 0.6666666666666714, 2.0, -0.6666666666666714, 7.333333333333332]


In [250]:
permuntations_results_series = pd.Series(permuntations_results)
print(pd.pivot_table(permuntations_results_series.value_counts().reset_index(),values=0,columns='index').to_string(index=False))

 -22.000000  -20.666667  -19.333333  -18.000000  -16.666667  -15.333333  -14.000000  -12.666667  -11.333333  -10.000000  -8.666667   -7.333333   -6.000000   -4.666667   -3.333333   -2.000000   -0.666667    0.666667    2.000000    3.333333    4.666667    6.000000    7.333333    8.666667    10.000000   11.333333   12.666667   14.000000   15.333333   16.666667   18.000000   19.333333
          1           2           1           4           8          15          39          68          91         183         253         385         509         655         774         912         992         935         928         802         707         587         420         287         188         108          84          28          15          10           8           1


In [256]:
pos_extreme_count = sum(permuntations_results_series >= ground_truth)
print('One=way: Extreme count: ', pos_extreme_count,'\nOne-way: Extreme ratio (p-value):', pos_extreme_count / permuntations)
print(ground_truth)

One=way: Extreme count:  3245 
One-way: Extreme ratio (p-value): 0.3245
3.1230425055928386
