In [1]:

import pandas as pd
from scipy.stats import ttest_ind, chi2_contingency


df = pd.read_csv("../data/insurance.txt", sep="|")


  df = pd.read_csv("../data/insurance.txt", sep="|")


In [2]:
df["HasClaim"] = df["TotalClaims"] > 0
df["ClaimFrequency"] = df["HasClaim"].astype(int)
df["ClaimSeverity"] = df["TotalClaims"] / df["HasClaim"].replace({0:1})  # avoids division by zero
df["Margin"] = df["TotalPremium"] - df["TotalClaims"]

In [3]:
province_groups = df["Province"].unique()
if len(province_groups) >= 2:
    province_a, province_b = province_groups[:2]
    group_prov_a = df[df["Province"] == province_a]
    group_prov_b = df[df["Province"] == province_b]


group_male = df[df["Gender"].str.lower() == "male"]
group_female = df[df["Gender"].str.lower() == "female"]

postal_groups = df["PostalCode"].dropna().unique()
if len(postal_groups) >= 2:
    postal_a, postal_b = postal_groups[:2]
    group_postal_a = df[df["PostalCode"] == postal_a]
    group_postal_b = df[df["PostalCode"] == postal_b]

In [4]:
results = []

contingency_prov = pd.crosstab(df["Province"], df["HasClaim"])
chi2, p_freq_prov, _, _ = chi2_contingency(contingency_prov)
results.append({
    "Feature": "Province",
    "Metric": "ClaimFrequency",
    "p_value": p_freq_prov,
    "Reject_H0": p_freq_prov < 0.05,
    "Interpretation": f"{province_a} vs {province_b}" 
})


t_stat, p_sev_prov = ttest_ind(group_prov_a["ClaimSeverity"], group_prov_b["ClaimSeverity"], equal_var=False)
results.append({
    "Feature": "Province",
    "Metric": "ClaimSeverity",
    "p_value": p_sev_prov,
    "Reject_H0": p_sev_prov < 0.05,
    "Interpretation": f"{province_a} vs {province_b}"
})

# Margin (t-test)
t_stat, p_margin_prov = ttest_ind(group_prov_a["Margin"], group_prov_b["Margin"], equal_var=False)
results.append({
    "Feature": "Province",
    "Metric": "Margin",
    "p_value": p_margin_prov,
    "Reject_H0": p_margin_prov < 0.05,
    "Interpretation": f"{province_a} vs {province_b}"
})


contingency_gender = pd.crosstab(df["Gender"], df["HasClaim"])
chi2, p_freq_gender, _, _ = chi2_contingency(contingency_gender)
results.append({
    "Feature": "Gender",
    "Metric": "ClaimFrequency",
    "p_value": p_freq_gender,
    "Reject_H0": p_freq_gender < 0.05,
    "Interpretation": "Male vs Female"
})


t_stat, p_sev_gender = ttest_ind(group_male["ClaimSeverity"], group_female["ClaimSeverity"], equal_var=False)
results.append({
    "Feature": "Gender",
    "Metric": "ClaimSeverity",
    "p_value": p_sev_gender,
    "Reject_H0": p_sev_gender < 0.05,
    "Interpretation": "Male vs Female"
})

# Margin (t-test)
t_stat, p_margin_gender = ttest_ind(group_male["Margin"], group_female["Margin"], equal_var=False)
results.append({
    "Feature": "Gender",
    "Metric": "Margin",
    "p_value": p_margin_gender,
    "Reject_H0": p_margin_gender < 0.05,
    "Interpretation": "Male vs Female"
})

# ---- PostalCode ----
# Claim Frequency (Chi-squared)
contingency_postal = pd.crosstab(df["PostalCode"], df["HasClaim"])
chi2, p_freq_postal, _, _ = chi2_contingency(contingency_postal)
results.append({
    "Feature": "PostalCode",
    "Metric": "ClaimFrequency",
    "p_value": p_freq_postal,
    "Reject_H0": p_freq_postal < 0.05,
    "Interpretation": f"{postal_a} vs {postal_b}"
})

# Claim Severity (t-test)
t_stat, p_sev_postal = ttest_ind(group_postal_a["ClaimSeverity"], group_postal_b["ClaimSeverity"], equal_var=False)
results.append({
    "Feature": "PostalCode",
    "Metric": "ClaimSeverity",
    "p_value": p_sev_postal,
    "Reject_H0": p_sev_postal < 0.05,
    "Interpretation": f"{postal_a} vs {postal_b}"
})

# Margin (t-test)
t_stat, p_margin_postal = ttest_ind(group_postal_a["Margin"], group_postal_b["Margin"], equal_var=False)
results.append({
    "Feature": "PostalCode",
    "Metric": "Margin",
    "p_value": p_margin_postal,
    "Reject_H0": p_margin_postal < 0.05,
    "Interpretation": f"{postal_a} vs {postal_b}"
})

In [5]:
results_df = pd.DataFrame(results)
print(results_df)

      Feature          Metric       p_value  Reject_H0  \
0    Province  ClaimFrequency  5.925511e-19       True   
1    Province   ClaimSeverity           NaN      False   
2    Province          Margin  3.530213e-01      False   
3      Gender  ClaimFrequency  2.657025e-02       True   
4      Gender   ClaimSeverity           NaN      False   
5      Gender          Margin  8.015464e-01      False   
6  PostalCode  ClaimFrequency  3.152172e-30       True   
7  PostalCode   ClaimSeverity           NaN      False   
8  PostalCode          Margin  6.630316e-01      False   

             Interpretation  
0  Gauteng vs KwaZulu-Natal  
1  Gauteng vs KwaZulu-Natal  
2  Gauteng vs KwaZulu-Natal  
3            Male vs Female  
4            Male vs Female  
5            Male vs Female  
6              1459 vs 1513  
7              1459 vs 1513  
8              1459 vs 1513  


In [8]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt

# Metrics
df["HasClaim"] = df["TotalClaims"] > 0
df["ClaimFrequency"] = df["HasClaim"].astype(int)
df["ClaimSeverity"] = df["TotalClaims"] / df["HasClaim"].replace({False: np.nan})
df["Margin"] = df["TotalPremium"] - df["TotalClaims"]


In [9]:
# Example for Province
kpi = "ClaimFrequency"
group_feature = "Province"


In [10]:
contingency_table = pd.crosstab(df[group_feature], df["HasClaim"])
chi2, p, dof, ex = stats.chi2_contingency(contingency_table)
print(f"Chi2: {chi2}, p-value: {p}")


Chi2: 104.19088107029361, p-value: 5.925510718204678e-19


In [11]:
# ANOVA for ClaimSeverity across provinces
groups = [group["ClaimSeverity"].dropna() for name, group in df.groupby(group_feature)]
f_stat, p_val = stats.f_oneway(*groups)
print(f"ANOVA F-statistic: {f_stat}, p-value: {p_val}")


ANOVA F-statistic: 4.830165899976341, p-value: 6.304916760425176e-06


In [12]:
if p_val < 0.05:
    print(f"Reject H0: {group_feature} has a significant impact on {kpi}.")
    # Optional: check top/bottom groups
    summary = df.groupby(group_feature)[kpi].mean().sort_values(ascending=False)
    print(summary.head(5))
else:
    print(f"Fail to reject H0: {group_feature} does not significantly affect {kpi}.")


Reject H0: Province has a significant impact on ClaimFrequency.
Province
Gauteng          0.003356
KwaZulu-Natal    0.002845
Limpopo          0.002698
North West       0.002436
Mpumalanga       0.002428
Name: ClaimFrequency, dtype: float64
