# **Setup**

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import scipy.stats as stats
import seaborn as sns

# **Data Import**

In [2]:
# Seed NumPy's global random generator so any random draws are repeatable
np.random.seed(42)

# Read the experiment data, then report its dimensions and column names
df = pd.read_csv('dataset.csv')
print(df.shape, df.columns, sep='\n')

# Preview the first rows as the cell output
df.head()

(1000, 5)
Index(['user_id', 'group', 'latitude', 'longitude', 'conversion'], dtype='object')


Unnamed: 0,user_id,group,latitude,longitude,conversion
0,1,A,35.669109,6.869443,1
1,2,B,6.497346,-7.494524,1
2,3,A,-34.285029,-170.768856,0
3,4,A,56.483104,-57.150782,1
4,5,A,33.251611,-43.129577,0


# **Calc Avg Conversion Rates**

In [3]:
# Mean of the conversion column (used below as the conversion rate) and row count per group
conversion_by_group = df.groupby('group')['conversion']
conversion_rates = conversion_by_group.agg(['mean', 'count'])
conversion_rates

Unnamed: 0_level_0,mean,count
group,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0.3,490
B,0.315686,510


In [4]:
# Pull each group's mean conversion out of the summary table with one label-based lookup
ConvRate_GroupA = conversion_rates.loc['A', 'mean']
ConvRate_GroupB = conversion_rates.loc['B', 'mean']

print(f"Conversion rate of Group A: {ConvRate_GroupA}")
print(f"Conversion rate of Group B: {ConvRate_GroupB}")

Conversion rate of Group A: 0.3
Conversion rate of Group B: 0.3156862745098039


# **Set Significance Level**

In [5]:
# Significance level (alpha): p-values below this threshold lead to rejecting H0
sig_level = 0.05

# **Select Test**

- Statistical Test 01 (Parametric): 2 Sample T-Test
- Statistical Test 02 (Non-Parametric): Mann-Whitney U Test

# **01 Student's T-Test**

**Assumptions**
- Normality
- Homoscedasticity (Equal Variance)
- Independence 

In [6]:
# Per-group sample size, mean, standard deviation and variance of the conversion column
summary_stats = ['count', 'mean', 'std', 'var']
df.groupby('group')['conversion'].agg(summary_stats)

Unnamed: 0_level_0,count,mean,std,var
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,490,0.3,0.458726,0.210429
B,510,0.315686,0.465245,0.216453


### **Normality Check**

- Use Shapiro-Wilk test

In [7]:
# Shapiro-Wilk normality test, run separately on each group's conversion values
conversion_A = df.loc[df['group'] == 'A', 'conversion']
conversion_B = df.loc[df['group'] == 'B', 'conversion']

shapiro_A = stats.shapiro(conversion_A)
shapiro_B = stats.shapiro(conversion_B)

# Print the test statistic and p-value for both groups
print('Shapiro-Wilk Test Group A', shapiro_A)
print('Shapiro-Wilk Test Group B', shapiro_B)

Shapiro-Wilk Test Group A ShapiroResult(statistic=0.5755484295691086, pvalue=8.305357450336786e-33)
Shapiro-Wilk Test Group B ShapiroResult(statistic=0.585129503103347, pvalue=4.39191381945773e-33)


In [8]:
# Interpretation note kept as a bare string literal: being the cell's only
# expression, it is echoed back verbatim as the cell output.
'''
Shapiro-Wilk Test Conclusion: 
Group A: pvalue=8.305357450336786e-33 < 0.05 => Reject H0 => Non-Normal Distribution
Group B: pvalue=4.39191381945773e-33 < 0.05 => Reject H0 => Non-Normal Distribution
'''

'\nShapiro-Wilk Test Conclusion: \nGroup A: pvalue=8.305357450336786e-33 < 0.05 => Reject H0 => Non-Normal Distribution\nGroup B: pvalue=4.39191381945773e-33 < 0.05 => Reject H0 => Non-Normal Distribution\n'

### **Homogeneity of Variance**

- Levene's Test

In [9]:
# Levene's test: compare the variance of conversion between group A and group B
is_group_A = df['group'] == 'A'
is_group_B = df['group'] == 'B'

levene_test = stats.levene(
    df.loc[is_group_A, 'conversion'],
    df.loc[is_group_B, 'conversion'],
)
print(f'Levene Test for Homogeneity of Variances:\n{levene_test}')

Levene Test for Homogeneity of Variances:
LeveneResult(statistic=0.2880082452976037, pvalue=0.5916193377766581)


In [10]:
# Interpretation note kept as a bare string literal: being the cell's only
# expression, it is echoed back verbatim as the cell output.
''' 
Levene Test Conclusion & Interpretation:
pvalue=0.59 > 0.05 => Fail to Reject H0 => Homogeneity of Variances Assumed
'''

' \nLevene Test Conclusion & Interpretation:\npvalue=0.59 > 0.05 => Fail to Reject H0 => Homogeneity of Variances Assumed\n'

## **T-Test**

In [11]:
# Independent two-sample t-test on conversion, group A vs. group B
# (called with scipy's default settings)
group_A_conversion = df.loc[df['group'] == 'A', 'conversion']
group_B_conversion = df.loc[df['group'] == 'B', 'conversion']
t_stat, p_val = stats.ttest_ind(group_A_conversion, group_B_conversion)

print(f't-statistic: {t_stat}')
print(f'p-value: {p_val}')

t-statistic: -0.5366639966474401
p-value: 0.5916193377768859


# **02 Mann Whitney U Test**

**Hypotheses Statements**
- H₀ : The distributions of the two populations are equal, meaning the two samples come from the same distribution
- H₁ : The distributions of the two populations are not equal, meaning the two samples come from different distributions

**Assumptions**
- Independence of Sample Groups
- Ordinal Measurement
- No normality requirement (suitable for non-normally distributed data)

In [12]:
# Mann-Whitney U test on conversion, group A vs. group B.
# NOTE: p_val is reused from the t-test cell above and is overwritten here.
u_stat, p_val = stats.mannwhitneyu(
    df.loc[df['group'] == 'A', 'conversion'],
    df.loc[df['group'] == 'B', 'conversion'],
)

# The statistic returned by mannwhitneyu is U, not t, so label it accordingly
print(f'U-statistic: {u_stat}')
print(f'p-value: {p_val}')

t-statistic: 122990.0
p-value: 0.5914621529131736


In [13]:
# Interpretation note kept as a bare string literal: being the cell's only
# expression, it is echoed back verbatim as the cell output.
''' 
p-value=0.59146 > 0.05 => Fail to Reject H0 => No Significant Difference in distribution of group A and B
'''

' \np-value=0.59146 > 0.05 => Fail to Reject H0 => No Significant Difference in distribution of group A and B\n'

# **Conclusion**

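Neither the t-test nor the Mann-Whitney U test found a statistically significant difference in conversion rate between groups A and B (p ≈ 0.59 > 0.05). We therefore have no evidence that running the campaign in California or New York impacted the conversion rate.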

# **Visualize Data Distribution**

In [14]:
# Re-display the first five rows to recall the columns available for plotting
df.head(n=5)

Unnamed: 0,user_id,group,latitude,longitude,conversion
0,1,A,35.669109,6.869443,1
1,2,B,6.497346,-7.494524,1
2,3,A,-34.285029,-170.768856,0
3,4,A,56.483104,-57.150782,1
4,5,A,33.251611,-43.129577,0


In [16]:
# Geographical distribution of groups A and B.
# Interactive Plotly scatter of each row's longitude/latitude, colored by the
# 'group' column. `px` (plotly.express) comes from the notebook's import cell.
fig = px.scatter(
    df,
    x='longitude',
    y='latitude',
    color='group',
    title='Geographical Distribution of Groups A and B',
    labels={'longitude': 'Longitude', 'latitude': 'Latitude'},
    opacity=0.7,  # semi-transparent markers so overlapping points stay visible
)

# Layout adjustments: white theme, legend title, fixed figure size
fig.update_layout(
    template='plotly_white',
    legend=dict(title='Groups'),
    height=600,
    width=800,
)

fig.show()

# **References**

- https://towardsdatascience.com/a-comprehensive-guided-project-to-a-b-testing-notebook-91e5a300966c 
- https://github.com/Anello92/Geospatial-AB-Testing 