In [9]:
from scipy.stats import chi2_contingency, ttest_ind
import pandas as pd


df = pd.read_csv("../data/insurance.txt", sep="|")

# Quick check
df.head()

# Ensure your key columns exist
df['HasClaim'] = df['TotalClaims'] > 0
df['Margin'] = df['TotalPremium'] - df['TotalClaims']

print("=== TASK 3: Statistical Hypothesis Testing ===\n")


  df = pd.read_csv("../data/insurance.txt", sep="|")


=== TASK 3: Statistical Hypothesis Testing ===



In [10]:
print("1️⃣ Claim Frequency Differences Across Provinces")
freq_table = pd.crosstab(df['Province'], df['HasClaim'])
chi2, p, dof, ex = chi2_contingency(freq_table)
if p < 0.05:
    print(f"Reject H0 (p={p:.4f}): Claim frequency differs across provinces.")
else:
    print(f"Fail to reject H0 (p={p:.4f}): No significant frequency difference across provinces.")
print()

print("Claim Severity Differences Across Provinces")
provinces = df['Province'].unique()
for i in range(len(provinces)):
    for j in range(i+1, len(provinces)):
        prov1, prov2 = provinces[i], provinces[j]
        sev1 = df.loc[df['Province']==prov1, 'TotalClaims']
        sev2 = df.loc[df['Province']==prov2, 'TotalClaims']
        t_stat, p = ttest_ind(sev1, sev2, equal_var=False)
        if p < 0.05:
            print(f"Reject H0 between {prov1} and {prov2} (p={p:.4f}): Severity differs.")
    
print()

1️⃣ Claim Frequency Differences Across Provinces
Reject H0 (p=0.0000): Claim frequency differs across provinces.

Claim Severity Differences Across Provinces
Reject H0 between Gauteng and Mpumalanga (p=0.0000): Severity differs.
Reject H0 between Gauteng and Eastern Cape (p=0.0327): Severity differs.
Reject H0 between Gauteng and Limpopo (p=0.0023): Severity differs.
Reject H0 between Gauteng and North West (p=0.0000): Severity differs.
Reject H0 between Gauteng and Northern Cape (p=0.0000): Severity differs.
Reject H0 between KwaZulu-Natal and Mpumalanga (p=0.0000): Severity differs.
Reject H0 between KwaZulu-Natal and Eastern Cape (p=0.0085): Severity differs.
Reject H0 between KwaZulu-Natal and Western Cape (p=0.0105): Severity differs.
Reject H0 between KwaZulu-Natal and Limpopo (p=0.0004): Severity differs.
Reject H0 between KwaZulu-Natal and North West (p=0.0000): Severity differs.
Reject H0 between KwaZulu-Natal and Northern Cape (p=0.0000): Severity differs.
Reject H0 between M

In [11]:

print("3️⃣ Claim Frequency Differences Between Genders")
freq_table_gender = pd.crosstab(df['Gender'], df['HasClaim'])
chi2, p, dof, ex = chi2_contingency(freq_table_gender)
if p < 0.05:
    print(f"Reject H0 (p={p:.4f}): Claim frequency differs between genders.")
else:
    print(f"Fail to reject H0 (p={p:.4f}): No significant difference in claim frequency between genders.")
print()


3️⃣ Claim Frequency Differences Between Genders
Reject H0 (p=0.0266): Claim frequency differs between genders.



In [12]:
genders = df['Gender'].unique()
if len(genders) == 2:
    sev1 = df.loc[df['Gender']==genders[0], 'TotalClaims']
    sev2 = df.loc[df['Gender']==genders[1], 'TotalClaims']
    t_stat, p = ttest_ind(sev1, sev2, equal_var=False)
    if p < 0.05:
        print(f"Reject H0 (p={p:.4f}): Claim severity differs between {genders[0]} and {genders[1]}.")
    else:
        print(f"Fail to reject H0 (p={p:.4f}): No significant severity difference between genders.")

In [13]:
zipcode_freq = pd.crosstab(df['PostalCode'], df['HasClaim'])
chi2, p, dof, ex = chi2_contingency(zipcode_freq)
print(f"H0: No risk differences across zip codes | p-value = {p:.4f}")
if p < 0.05:
    print("Reject H0 → Zip codes have significantly different claim frequencies.\n")
else:
    print("Fail to reject H0 → No significant difference across zip codes.\n")


zipcode_margin = df.groupby('PostalCode')['Margin'].mean()

top_zip = zipcode_margin.nlargest(50)
bottom_zip = zipcode_margin.nsmallest(50)
t_stat, p = ttest_ind(top_zip, bottom_zip, equal_var=False)
print(f"H0: No margin differences across zip codes | p-value = {p:.4f}")
if p < 0.05:
    print("Reject H0 → Margin differs significantly across zip codes.\n")
else:
    print("Fail to reject H0 → No significant margin difference.\n")

H0: No risk differences across zip codes | p-value = 0.0000
Reject H0 → Zip codes have significantly different claim frequencies.

H0: No margin differences across zip codes | p-value = 0.0000
Reject H0 → Margin differs significantly across zip codes.



In [14]:
gender_freq = pd.crosstab(df['Gender'], df['HasClaim'])
chi2, p, dof, ex = chi2_contingency(gender_freq)
print(f"H0: No risk difference between genders | p-value = {p:.4f}")
if p < 0.05:
    print("Reject H0 → Gender has a significant effect on claim frequency.\n")
else:
    print("Fail to reject H0 → No significant gender effect.\n")

H0: No risk difference between genders | p-value = 0.0266
Reject H0 → Gender has a significant effect on claim frequency.

