In [10]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
from scipy.stats import ttest_ind
# Load cleaned data (same one used in EDA)
df = pd.read_csv("../data/raw/ml_ratings.csv", low_memory=False)


# Filter and clean
df = df[df['TotalClaims'].notnull() & (df['TotalClaims'] >= 0)]
df = df[df['TotalPremium'].notnull() & (df['TotalPremium'] >= 0)]

# Metrics
df['ClaimOccurred'] = df['TotalClaims'] > 0
df['Margin'] = df['TotalPremium'] - df['TotalClaims']


In [7]:

# Create contingency table
province_table = pd.crosstab(df['Province'], df['ClaimOccurred'])

# Chi-squared test
chi2, p, dof, expected = chi2_contingency(province_table)
print(f"H₀₁ - Province Risk Test: p = {p:.4f}")


H₀₁ - Province Risk Test: p = 0.0000


In [8]:
# Choose top 2 zip codes
top_zips = df['PostalCode'].value_counts().head(2).index.tolist()
zip_df = df[df['PostalCode'].isin(top_zips)]

# Chi-squared test
zip_table = pd.crosstab(zip_df['PostalCode'], zip_df['ClaimOccurred'])
chi2, p, dof, expected = chi2_contingency(zip_table)
print(f"H₀₂ - Zip Code Risk Test: p = {p:.4f}")


H₀₂ - Zip Code Risk Test: p = 0.0580


In [11]:
z1, z2 = top_zips
m1 = df[df['PostalCode'] == z1]['Margin']
m2 = df[df['PostalCode'] == z2]['Margin']

# Independent two-sample t-test
t_stat, p = ttest_ind(m1, m2, equal_var=False)
print(f"H₀₃ - Zip Code Margin Test: p = {p:.4f}")

H₀₃ - Zip Code Margin Test: p = 0.2443


In [12]:
gender_table = pd.crosstab(df['Gender'], df['ClaimOccurred'])
chi2, p, dof, expected = chi2_contingency(gender_table)
print(f"H₀₄ - Gender Risk Test: p = {p:.4f}")


H₀₄ - Gender Risk Test: p = 0.0302


In [13]:
def interpret_result(p, threshold=0.05):
    return "Reject H₀ — significant difference" if p < threshold else " Fail to reject H₀ — no significant difference"
results = {
    "H₀₁: Province Risk": interpret_result(p),      
    "H₀₂: Zip Code Risk": interpret_result(p),
    "H₀₃: Zip Code Margin": interpret_result(p),
    "H₀₄: Gender Risk": interpret_result(p)
}

for hypothesis, conclusion in results.items():
    print(f"{hypothesis} → {conclusion}")
pd.DataFrame(results.items(), columns=["Hypothesis", "Result"]).to_csv("../data/processed/hypothesis_test_results.csv", index=False)


H₀₁: Province Risk → Reject H₀ — significant difference
H₀₂: Zip Code Risk → Reject H₀ — significant difference
H₀₃: Zip Code Margin → Reject H₀ — significant difference
H₀₄: Gender Risk → Reject H₀ — significant difference
