# **Setup**

In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
import scipy.stats as stats 

# **Data Import**

In [2]:
# Seed NumPy's global random number generator so that any later random
# operation in this notebook is repeatable from a fresh kernel.
np.random.seed(42)

# Read the experiment data from the CSV next to the notebook, then report
# its (rows, columns) shape and its column labels, one per line.
df = pd.read_csv('dataset.csv')
print(df.shape, df.columns, sep='\n')

# Preview the first rows as the cell's rich output.
df.head()

(1000, 5)
Index(['user_id', 'group', 'latitude', 'longitude', 'conversion'], dtype='object')


Unnamed: 0,user_id,group,latitude,longitude,conversion
0,1,A,35.669109,6.869443,1
1,2,B,6.497346,-7.494524,1
2,3,A,-34.285029,-170.768856,0
3,4,A,56.483104,-57.150782,1
4,5,A,33.251611,-43.129577,0


# **Calc Avg Conversion Rates**

In [7]:
# Summarise the `conversion` column per experiment group:
#   mean  -> the group's average conversion (its conversion rate)
#   count -> number of rows in the group
conversion_rates = (
    df
    .groupby('group')['conversion']
    .agg(['mean', 'count'])
)

# Show the summary table as the cell output.
conversion_rates

Unnamed: 0_level_0,mean,count
group,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0.3,490
B,0.315686,510


In [11]:
# Look up each group's mean conversion in the summary table by
# (row label, column label) instead of chaining two [] lookups.
ConvRate_GroupA = conversion_rates.loc['A', 'mean']
ConvRate_GroupB = conversion_rates.loc['B', 'mean']

# Report both rates side by side for a quick visual comparison.
print(f"Conversion rate of Group A: {ConvRate_GroupA}")
print(f"Conversion rate of Group B: {ConvRate_GroupB}")

Conversion rate of Group A: 0.3
Conversion rate of Group B: 0.3156862745098039


# **Step 01: Set Significance Level**

In [12]:
# Alpha: the p-value threshold used throughout this notebook.
# A test whose p-value falls below this value leads to rejecting H0.
sig_level = 0.05

# **Step 02: Select Test**

- Statistical Test 01: Two-Sample t-Test

## **Student's T-Test: Assumptions Check**

- Normality
- Homoscedasticity (Equal Variance)
- Independence 

In [14]:
# Descriptive statistics of `conversion` for each group: sample size, mean,
# standard deviation and variance. The std/var columns are the inputs for
# judging the equal-variance assumption listed above.
(
    df
    .groupby('group')['conversion']
    .agg(['count', 'mean', 'std', 'var'])
)

Unnamed: 0_level_0,count,mean,std,var
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,490,0.3,0.458726,0.210429
B,510,0.315686,0.465245,0.216453


### **Normality Check**

- Use the Shapiro-Wilk test (H0: the sample comes from a normal distribution)

In [15]:
# Normality check: run the Shapiro-Wilk test on each group's `conversion`
# values. Rows are selected with a boolean mask through .loc, which picks the
# group and the column in a single indexing step.
# NOTE(review): the preview rows show `conversion` as 0/1; a two-valued sample
# is expected to fail a normality test - confirm this check is the intended one.
shapiro_A = stats.shapiro(df.loc[df['group'] == 'A', 'conversion'])
shapiro_B = stats.shapiro(df.loc[df['group'] == 'B', 'conversion'])

# Print the full result objects (test statistic and p-value) for both groups.
print('Shapiro-Wilk Test Group A', shapiro_A)
print('Shapiro-Wilk Test Group B', shapiro_B)

Shapiro-Wilk Test Group A ShapiroResult(statistic=0.5755484295691086, pvalue=8.305357450336786e-33)
Shapiro-Wilk Test Group B ShapiroResult(statistic=0.585129503103347, pvalue=4.39191381945773e-33)


In [16]:
# Shapiro-Wilk conclusion, derived from the computed results instead of a
# hand-typed string. H0 of the test: the sample comes from a normal
# distribution. Reading the p-values from shapiro_A / shapiro_B and the
# threshold from sig_level keeps the stated verdict in sync with the numbers
# if the data or alpha changes, and print() renders real line breaks rather
# than the escaped repr of a bare string.
print('Shapiro-Wilk Test Conclusion:')
for group_name, shapiro_result in (('A', shapiro_A), ('B', shapiro_B)):
    p_value = shapiro_result.pvalue
    if p_value < sig_level:
        # Evidence against normality at the chosen significance level.
        verdict = f'pvalue={p_value} < {sig_level} => Reject H0 => Non-Normal Distribution'
    else:
        # Failing to reject H0 does not prove normality; it only means the
        # test found no significant departure from it.
        verdict = f'pvalue={p_value} >= {sig_level} => Fail to Reject H0 => Normality Not Rejected'
    print(f'Group {group_name}: {verdict}')

'\nShapiro-Wilk Test Conclusion: \nGroup A: pvalue=8.305357450336786e-33 < 0.05 => Reject H0 => Non-Normal Distribution\nGroup B: pvalue=4.39191381945773e-33 < 0.05 => Reject H0 => Non-Normal Distribution\n'

# **Step 04: Define Hypothesis Statements**

- H0: There is no difference in the conversion rate between the two groups.
- H1: There is a difference in the conversion rate between the two groups.

# **Step 05:**

# **References**

- https://towardsdatascience.com/a-comprehensive-guided-project-to-a-b-testing-notebook-91e5a300966c 
- https://github.com/Anello92/Geospatial-AB-Testing 