## A/B Hypothesis Testing

#### Importing Libraries

In [1]:
import sys
sys.path.append('../')
import pandas as pd
import numpy as np
from tabulate import tabulate
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ttest_ind, chi2_contingency, f_oneway
from src.data_quality_checks import get_total_missing_percentage, check_missing_data
from src.descriptive_statistics import describe_numerical, summarize_categorical
from src.plots import plot_histograms, plot_bar_charts, plot_histogram, plot_bar_chart, display_summary_table, plot_time_series, plot_time_series_dots, scatter_plot_advanced, scatter_plot, box_plots

#### Data Preprocessing 

In [2]:
# Load the historical dataset (pipe-delimited text file).
# low_memory=False has pandas parse the file in a single pass instead of in
# chunks, which avoids mixed-dtype inference within a column.
file_path = "../data/MachineLearningRating_v3.txt"
df = pd.read_csv(
    file_path,
    sep='|',
    low_memory=False,
)

Selecting the relevant columns

In [3]:
# Subset of columns kept for the rest of this notebook's analysis.
relevant_cols = [
    'TransactionMonth', 'TotalPremium', 'TotalClaims', 'Gender',
    'PostalCode', 'Citizenship', 'Province', 'VehicleType',
    'RegistrationYear', 'make', 'Model', 'NewVehicle',
]

# Take an explicit copy: the cleaning step further down assigns into this
# frame, and writing into a bare column selection of `df` triggers pandas'
# SettingWithCopyWarning and leaves it ambiguous whether `df` is modified too.
df_relevant_cols = df[relevant_cols].copy()

Data cleaning

In [4]:
# Check which of the selected columns contain missing values, using the
# project helper imported from src.data_quality_checks.
# NOTE(review): the helper's return type is not visible here — presumably a
# per-column summary (missing count and percentage) that prints as a table; confirm.
missing_data = check_missing_data(df_relevant_cols)
print(missing_data)

    Column Name  Missing Values  Percentage Missing
3        Gender            9536            0.953507
7   VehicleType             552            0.055195
9          make             552            0.055195
10        Model             552            0.055195
11   NewVehicle          153295           15.327998


In [5]:
# Fill the gaps in every column that reported missing values with that
# column's most frequent value (mode). Series.mode() returns the modes in
# sorted order, so [0] picks the first one if several values tie.
# NOTE(review): NewVehicle is ~15% missing; imputing that large a share with
# the mode may bias any analysis that uses the column — confirm this is acceptable.
cols_to_impute = ['VehicleType', 'make', 'Model', 'NewVehicle', 'Gender']
for col in cols_to_impute:
    most_frequent = df_relevant_cols[col].mode()[0]
    df_relevant_cols.loc[:, col] = df_relevant_cols[col].fillna(most_frequent)

In [6]:
# Re-run the missing-value check after the imputation step to confirm that
# no gaps remain in the selected columns.
missing_data = check_missing_data(df_relevant_cols)
print(missing_data)

Success: No missing values.


#### Select Metrics

In [7]:
# Narrow the cleaned frame to the claim and premium metrics plus Province,
# the column used for grouping in the test below.
selected_cols = ['TotalClaims', 'TotalPremium', 'Province']
df_selected_cols = df_relevant_cols.loc[:, selected_cols]

#### Testing Risk Difference Across Provinces

In [8]:
# Keep only the grouping column (Province) and the metric under test (TotalClaims)
province_data = df_selected_cols.loc[:, ['Province', 'TotalClaims']]

In [9]:
# Data segmentation: one group of TotalClaims values per province
grouped_data = province_data.groupby(by='Province')['TotalClaims']

In [10]:
# Collect each province's TotalClaims Series into a list; the province label
# yielded by the groupby iterator is not needed and is discarded.
province_values = [claims for _, claims in grouped_data]

In [11]:
# One-way ANOVA on TotalClaims across provinces.
#   H0: mean TotalClaims is the same in every province.
#   H1: at least one province's mean TotalClaims differs.
# NOTE(review): ANOVA assumes roughly normal, equal-variance groups; if the
# claim amounts turn out to be heavily skewed, consider confirming the result
# with a non-parametric test such as scipy.stats.kruskal.
alpha = 0.05  # significance level used for the decision below
f_stat, p_value = f_oneway(*province_values)
print(f'F-statistic: {f_stat}, p-value: {p_value}')

# State the decision explicitly so the cell can be read without interpreting
# the raw p-value. A NaN p-value must not be silently reported as "not significant".
if np.isnan(p_value):
    print('ANOVA result is undefined (NaN): check for empty groups or missing values.')
elif p_value < alpha:
    print(f'Reject H0 at alpha={alpha}: mean TotalClaims differs across provinces.')
else:
    print(f'Fail to reject H0 at alpha={alpha}: no significant difference in mean TotalClaims across provinces.')

F-statistic: 5.84941376240761, p-value: 1.6782057588675906e-07
