In [1]:
# Import necessary libraries
import sys
import os

# Add the parent directory to sys.path
sys.path.append(os.path.abspath('../scripts'))

from stats_utils import segment_data, perform_ttest, perform_chi_square_test, report_hypothesis_result
from data_processing import load_data


In [2]:
# Load the data
file_path = '../data/MachineLearningRating_v3.txt'
df = load_data(file_path)

In [3]:
# Hypotheses Testing

# 1. Risk Differences Across Provinces (using T-test on 'TotalClaims' feature)
control_province = 'Gauteng'
test_province = 'Western Cape'
control_group, test_group = segment_data(df, 'Province', control_province, test_province)
p_value, t_statistic = perform_ttest(control_group, test_group, 'TotalClaims')

# Report Result
print(f"Risk Differences Across Provinces (T-test): p-value = {p_value}, t-statistic = {t_statistic}")
print(report_hypothesis_result(p_value))
print("--------------------------------------------------")



Risk Differences Across Provinces (T-test): p-value = 0.05632044649871883, t-statistic = 1.908551760101624
Fail to reject the null hypothesis (no significant result).
--------------------------------------------------


In [4]:
# 2. Risk Differences Between Zip Codes (using T-test on 'TotalClaims')
control_zip = 2000
test_zip = 122
control_group_zip, test_group_zip = segment_data(df, 'PostalCode', control_zip, test_zip)
p_value_zip, t_statistic_zip = perform_ttest(control_group_zip, test_group_zip, 'TotalClaims')

# Report Result
print(f"Risk Differences Between Zip Codes (T-test): p-value = {p_value_zip}, t-statistic = {t_statistic_zip}")
print(report_hypothesis_result(p_value_zip))
print("--------------------------------------------------")




Risk Differences Between Zip Codes (T-test): p-value = 0.5022856048770837, t-statistic = -0.6708991834146607
Fail to reject the null hypothesis (no significant result).
--------------------------------------------------


In [5]:
# 3. Margin Differences Between Zip Codes (using T-test on 'TotalPremium')
p_value_margin, t_statistic_margin = perform_ttest(control_group_zip, test_group_zip, 'TotalPremium')

# Report Result
print(f"Margin Differences Between Zip Codes (T-test): p-value = {p_value_margin}, t-statistic = {t_statistic_margin}")
print(report_hypothesis_result(p_value_margin))
print("--------------------------------------------------")



Margin Differences Between Zip Codes (T-test): p-value = 1.4837221465292035e-20, t-statistic = 9.295272259634377
Reject the null hypothesis (significant result).
--------------------------------------------------


In [6]:
# 4. Gender-Based Risk Differences (using Chi-square on 'Gender' and 'TotalClaims')
p_value_gender, chi2_statistic_gender = perform_chi_square_test(df, 'Gender', 'TotalClaims')

# Report Result
print(f"Risk Differences Between Men and Women (Chi-square): p-value = {p_value_gender}, chi2-statistic = {chi2_statistic_gender}")
print(report_hypothesis_result(p_value_gender))

Risk Differences Between Men and Women (Chi-square): p-value = 1.0, chi2-statistic = 2511.4776608232637
Fail to reject the null hypothesis (no significant result).
