## A/B Hypothesis Testing

#### Importing 

In [11]:
import sys
sys.path.append('../')
import pandas as pd
import numpy as np
from tabulate import tabulate
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ttest_ind, chi2_contingency, f_oneway
from src.data_quality_checks import get_total_missing_percentage, check_missing_data
from src.descriptive_statistics import describe_numerical, summarize_categorical
from src.plots import plot_histograms, plot_bar_charts, plot_histogram, plot_bar_chart, display_summary_table, plot_time_series, plot_time_series_dots, scatter_plot_advanced, scatter_plot, box_plots

#### Data Preprocessing 

In [12]:
# Loading historical data
file_path = "../data/MachineLearningRating_v3.txt"

df = pd.read_csv(file_path, sep='|', low_memory=False)

selecting relevant columns

In [13]:
relevant_cols = ['TransactionMonth','TotalPremium', 'TotalClaims', 'Gender', 'PostalCode', 'Citizenship', 'Province', 'VehicleType', 'RegistrationYear', 'make', 'Model', 'NewVehicle']

df_relevant_cols = df[relevant_cols]

Data cleaning

In [14]:
# checking for columns with missing values
missing_data = check_missing_data(df_relevant_cols)
print(missing_data)

    Column Name  Missing Values  Percentage Missing
3        Gender            9536            0.953507
7   VehicleType             552            0.055195
9          make             552            0.055195
10        Model             552            0.055195
11   NewVehicle          153295           15.327998


In [15]:
df_relevant_cols.loc[:, 'VehicleType'] = df_relevant_cols['VehicleType'].fillna(df_relevant_cols['VehicleType'].mode()[0])
df_relevant_cols.loc[:, 'make'] = df_relevant_cols['make'].fillna(df_relevant_cols['make'].mode()[0])
df_relevant_cols.loc[:, 'Model'] = df_relevant_cols['Model'].fillna(df_relevant_cols['Model'].mode()[0])
df_relevant_cols.loc[:, 'NewVehicle'] = df_relevant_cols['NewVehicle'].fillna(df_relevant_cols['NewVehicle'].mode()[0])
df_relevant_cols.loc[:, 'Gender'] = df_relevant_cols['Gender'].fillna(df_relevant_cols['Gender'].mode()[0])

In [16]:
# checking for columns with missing values
missing_data = check_missing_data(df_relevant_cols)
print(missing_data)

Success: No missing values.


#### Testing Risk Difference Across Provinces

In [17]:
# Select metrics
province_data = df_relevant_cols[['Province', 'TotalClaims']]

In [18]:
# data segementation 
grouped_data = province_data.groupby('Province')['TotalClaims']

In [19]:
# Extract values for each province
province_values = [group for name, group in grouped_data]

In [20]:
# Perform ANOVA test
f_stat, p_value = f_oneway(*province_values)
print(f'F-statistic: {f_stat}, p-value: {p_value}')

F-statistic: 5.84941376240761, p-value: 1.6782057588675906e-07


In [25]:
alpha = 0.05  # Significance level
if p_value < alpha:
    print("Reject the null hypothesis: There are significant risk differences across provinces.")
else:
    print("Fail to reject the null hypothesis: There are no significant risk differences across provinces.")

Fail to reject the null hypothesis: There are no significant risk differences across provinces.


#### Testing risk differences between zip codes

In [21]:
# Select metrics
zipcode_data = df_relevant_cols[['PostalCode', 'TotalClaims']]

In [22]:
# data segementation
grouped_zipcode_data = zipcode_data.groupby('PostalCode')['TotalClaims']

In [23]:
# Extract values for each zip code
zipcode_values = [group for name, group in grouped_zipcode_data]

In [24]:
# Perform ANOVA test
f_stat, p_value = f_oneway(*zipcode_values)
print(f'F-statistic: {f_stat}, p-value: {p_value}')

F-statistic: 0.9419762214391832, p-value: 0.8906511279164051


In [26]:
if p_value < alpha:
    print("Reject the null hypothesis: There are significant risk differences between zip codes.")
else:
    print("Fail to reject the null hypothesis: There are no significant risk differences between zip codes.")

Fail to reject the null hypothesis: There are no significant risk differences between zip codes.


#### Testing margin differences between zip codes

In [29]:
# Calculate margin
df_relevant_cols['Margin'] = df_relevant_cols['TotalPremium'] - df_relevant_cols['TotalClaims']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_relevant_cols['Margin'] = df_relevant_cols['TotalPremium'] - df_relevant_cols['TotalClaims']


In [31]:
# Select metric
margin_data = df_relevant_cols[['PostalCode', 'Margin']]

In [32]:
# Group data by zip code
grouped_margin_data = margin_data.groupby('PostalCode')['Margin']

In [33]:
# Extract values for each zip code
margin_values = [group for name, group in grouped_margin_data]

In [34]:
# Perform ANOVA test
f_stat, p_value = f_oneway(*margin_values)
print(f'F-statistic: {f_stat}, p-value: {p_value}')

F-statistic: 0.8707474893589258, p-value: 0.9976859758015036


In [35]:
if p_value < alpha:
    print("Reject the null hypothesis: There are significant margin differences between zip codes.")
else:
    print("Fail to reject the null hypothesis: There are no significant margin differences between zip codes.")

Fail to reject the null hypothesis: There are no significant margin differences between zip codes.


#### Testing risk differences between women and men

In [38]:
# select metrics 
gender_data = df_relevant_cols[['Gender', 'TotalClaims']]

In [39]:
# Separate data by gender
male_data = gender_data[gender_data['Gender'] == 'Male']['TotalClaims']
female_data = gender_data[gender_data['Gender'] == 'Female']['TotalClaims']

In [40]:
# Perform T-test
t_stat, p_value = ttest_ind(male_data, female_data)
print(f'T-statistic: {t_stat}, p-value: {p_value}')


T-statistic: -0.24803623812388725, p-value: 0.8041073961270343


In [41]:
if p_value < alpha:
    print("Reject the null hypothesis: There are significant risk differences between women and men.")
else:
    print("Fail to reject the null hypothesis: There are no significant risk differences between women and men.")

Fail to reject the null hypothesis: There are no significant risk differences between women and men.
