In [1]:
# Switch the working directory to the week-3 project folder.
# NOTE(review): this is a hard-coded absolute path (looks like a Colab Drive
# mount — confirm); presumably needed so the `scripts` package import and the
# data file lookup in later cells resolve. Adjust it when running elsewhere.
%cd /content/drive/MyDrive/Tenx program/week-3/

/content/drive/MyDrive/Tenx program/week-3


In [18]:
# Imports & Data Load
from itertools import combinations

import numpy as np
import pandas as pd
from statsmodels.stats.multitest import multipletests

from scripts.data_loader import load_clean_data
from scripts.hypothesis_testing import (
    add_claim_indicators,
    test_chi2_claim_frequency,
    test_anova_margin,
    test_proportion_z
)

In [19]:
# Read the cleaned claims dataset through the project loader.
# The file name is kept in one place so it is easy to spot and change.
CLAIMS_FILE = 'claims_clean.csv'
df = load_clean_data(CLAIMS_FILE)

In [12]:
# Derive the per-row indicator columns via the project helper, record the
# total row count for later cells, and preview the first rows.
# NOTE(review): `df` is rebound to the helper's result, so re-running this
# cell feeds the already-augmented frame back in — confirm the helper is
# safe to apply twice.
df = add_claim_indicators(df)
n_total = df.shape[0]
df.head(5)

Unnamed: 0,UnderwrittenCoverID,PolicyID,TransactionMonth,IsVATRegistered,Citizenship,LegalType,Title,Language,Bank,AccountType,...,CoverType,CoverGroup,Section,Product,StatutoryClass,StatutoryRiskType,TotalPremium,TotalClaims,has_claim,margin
0,145249,12827,2015-03-01,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,21.929825,0.0,0,21.929825
1,145249,12827,2015-05-01,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,21.929825,0.0,0,21.929825
2,145249,12827,2015-07-01,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,0.0,0.0,0,0.0
3,145255,12827,2015-05-01,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Own Damage,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,54.824561,0.0,0,54.824561
4,145255,12827,2015-07-01,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Own Damage,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,0.0,0.0,0,0.0


In [13]:
# χ² Test by Province + Assumptions + Cramér’s V

chi2_pov, p_pov, dof_pov, exp_pov = test_chi2_claim_frequency(df, "Province")

# Assumption check: how many cells have an expected frequency below 5.
low_exp = (exp_pov < 5).sum()
total_cells = exp_pov.size
pct_low = 100 * low_exp / total_cells

# Cramér’s V = sqrt(χ² / (n · min(r-1, k-1))), where n is the number of
# observations that actually entered the contingency table.
# Expected counts of a contingency table sum to that n, so it is taken from
# exp_pov rather than from len(df): rows excluded from the table (e.g. a
# missing Province) would otherwise inflate n and understate V, and the cell
# no longer depends on `n_total` having been set by an earlier cell.
# NOTE(review): assumes exp_pov is the 2-D expected-frequency array of the
# table built by test_chi2_claim_frequency — confirm against the helper.
r, k = exp_pov.shape
n_pov = float(np.asarray(exp_pov).sum())
min_dim = min(r - 1, k - 1)
if n_pov > 0 and min_dim > 0:
    cramers_v = np.sqrt(chi2_pov / (n_pov * min_dim))
else:
    # Degenerate table (single row/column or no observations): V is undefined.
    cramers_v = np.nan

print(f"Province χ² = {chi2_pov:.2f}, p = {p_pov:.4g}")
print(f"  → {low_exp}/{total_cells} cells ({pct_low:.1f}%) have expected < 5")
print(f"  → Cramér’s V = {cramers_v:.3f}")
print("  →", "Reject H₀" if p_pov < 0.05 else "Fail to reject H₀")


Province χ² = 110.73, p = 2.694e-20
  → 0/18 cells (0.0%) have expected < 5
  → Cramér’s V = 0.011
  → Reject H₀


In [14]:
# χ² Test by PostalCode (top 20 zips) + Cramér’s V

# Restrict the data to the 20 most frequent postal codes.
top_zips = df["PostalCode"].value_counts().nlargest(20).index
in_top_zips = df["PostalCode"].isin(top_zips)
sub = df[in_top_zips]

chi2_zip, p_zip, dof_zip, exp_zip = test_chi2_claim_frequency(sub, "PostalCode")

# Cramér’s V: χ² scaled by the sample size and the smaller table dimension - 1.
n_zip = len(sub)
smaller_dim_zip = min(exp_zip.shape) - 1
cramers_v_zip = np.sqrt(chi2_zip / (n_zip * smaller_dim_zip))

decision_zip = "Reject H₀" if p_zip < 0.05 else "Fail to reject H₀"
print(f"Top 20 Zips χ² = {chi2_zip:.2f}, p = {p_zip:.4g}")
print(f"  → Cramér’s V ≈ {cramers_v_zip:.3f}")
print("  →", decision_zip)


Top 20 Zips χ² = 103.97, p = 1.015e-13
  → Cramér’s V ≈ 0.016
  → Reject H₀


In [15]:
# Pairwise Province Z-tests + Bonferroni

# Distinct provinces, excluding missing values: a NaN "province" would
# otherwise produce comparisons against an empty group and inflate the number
# of tests that the Bonferroni correction accounts for.
provs = df.Province.dropna().unique()

# Every unordered pair exactly once, in first-appearance order.
pairs = list(combinations(provs, 2))

results = []
for a, b in pairs:
    z, p = test_proportion_z(df, "Province", a, b)
    results.append({"A": a, "B": b, "z": z, "p_raw": p})

res = pd.DataFrame(results)
# multipletests returns (reject, p_adjusted, ...); index 1 is the adjusted p-values.
res["p_adj"] = multipletests(res["p_raw"], method="bonferroni")[1]
sig = res[res.p_adj < 0.05].sort_values("p_adj")
print(f"Significant province pairs ({len(sig)}/{len(res)}):")
display(sig.head(10))


Significant province pairs (6/36):


Unnamed: 0,A,B,z,p_raw,p_adj
3,Gauteng,Western Cape,8.014176,1.108775e-15,3.99159e-14
5,Gauteng,North West,5.369911,7.877568e-08,2.835924e-06
2,Gauteng,Eastern Cape,-5.053876,4.329319e-07,1.558555e-05
10,KwaZulu-Natal,Western Cape,4.41956,9.89021e-06,0.0003560476
9,KwaZulu-Natal,Eastern Cape,-3.724781,0.0001954847,0.00703745
1,Gauteng,Mpumalanga,3.50922,0.0004494227,0.01617922


In [16]:
# ANOVA Margin by PostalCode + η²

f_stat, p_margin = test_anova_margin(sub, "PostalCode")

# η² effect size = SS_between / SS_total.
# Work only on rows where both the group label and the margin are present, so
# the group sizes used as weights match the observations the means are
# computed from (len(group) would also count rows with a missing margin).
margin_obs = sub[["PostalCode", "margin"]].dropna()
grand_mean = margin_obs["margin"].mean()
group_stats = margin_obs.groupby("PostalCode")["margin"].agg(["count", "mean"])
ss_between = (group_stats["count"] * (group_stats["mean"] - grand_mean) ** 2).sum()
ss_total   = ((margin_obs["margin"] - grand_mean) ** 2).sum()
# With zero total variance the ratio is undefined; report NaN instead of 0/0.
eta2 = ss_between / ss_total if ss_total > 0 else np.nan

print(f"PostalCode ANOVA F = {f_stat:.2f}, p = {p_margin:.4g}")
print(f"  → η² = {eta2:.3f}")
print("  →", "Reject H₀" if p_margin < 0.05 else "Fail to reject H₀")


PostalCode ANOVA F = 2.69, p = 9.242e-05
  → η² = 0.000
  → Reject H₀


In [17]:
# Z-test by Gender + Cohen’s h

zg, p_g = test_proportion_z(df, "Gender", "Female", "Male")

# Observed claim rate within each gender group.
p_f = df.loc[df["Gender"] == "Female", "has_claim"].mean()
p_m = df.loc[df["Gender"] == "Male", "has_claim"].mean()

# Cohen’s h: difference between the arcsine-transformed proportions.
phi_f = 2 * np.arcsin(np.sqrt(p_f))
phi_m = 2 * np.arcsin(np.sqrt(p_m))
h = phi_f - phi_m

decision_gender = "Reject H₀" if p_g < 0.05 else "Fail to reject H₀"
print(f"Gender Z = {zg:.2f}, p = {p_g:.4g}")
print(f"  → Cohen’s h = {h:.3f}")
print("  →", decision_gender)


Gender Z = -0.20, p = 0.8405
  → Cohen’s h = -0.003
  → Fail to reject H₀
