In [39]:
# Import necessary libraries
import pandas as pd
import numpy as np
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt

# Render floats with three decimal places wherever pandas displays them.
pd.options.display.float_format = '{:.3f}'.format

# Read the cleaned insurance dataset from its relative path into a DataFrame.
df = pd.read_csv('../data/cleaned_insurance_data.csv')

# Show the first rows as a quick check that the load produced what we expect.
df.head()


Unnamed: 0,UnderwrittenCoverID,PolicyID,TransactionMonth,IsVATRegistered,Citizenship,LegalType,Title,Language,Bank,AccountType,...,mmcode_was_missing,make_was_missing,Model_was_missing,bodytype_was_missing,NumberOfDoors_was_missing,VehicleIntroDate_was_missing,Cylinders_was_missing,cubiccapacity_was_missing,kilowatts_was_missing,LossRatio
0,145249,12827,2015-03-01,True,,Close Corporation,Mr,English,First National Bank,Current account,...,0,0,0,0,0,0,0,0,0,0.0
1,145249,12827,2015-05-01,True,,Close Corporation,Mr,English,First National Bank,Current account,...,0,0,0,0,0,0,0,0,0,0.0
2,145255,12827,2015-05-01,True,,Close Corporation,Mr,English,First National Bank,Current account,...,0,0,0,0,0,0,0,0,0,0.0
3,145247,12827,2015-01-01,True,,Close Corporation,Mr,English,First National Bank,Current account,...,0,0,0,0,0,0,0,0,0,0.0
4,145247,12827,2015-04-01,True,,Close Corporation,Mr,English,First National Bank,Current account,...,0,0,0,0,0,0,0,0,0,0.0


In [40]:
# Metric definitions used throughout this notebook:
#   Claim Frequency = share of records with at least one claim (HasClaim is True)
#   Claim Severity  = mean TotalClaims among records that have a claim
#   Margin          = TotalPremium - TotalClaims
# NOTE(review): HasClaim is set per row of df, and the same PolicyID appears on
# several rows (one per TransactionMonth), so frequencies derived from it are
# per-record rather than per-policy -- confirm this is the intended unit.

# Fail fast, naming every absent column, instead of hitting a bare KeyError
# part-way through the analysis.
required_columns = ['TotalPremium', 'TotalClaims', 'Gender', 'Province', 'PostalCode']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise KeyError(f'Missing required columns: {missing_columns}')

# Add derived metrics
df['Margin'] = df['TotalPremium'] - df['TotalClaims']
# Boolean flag: True where the record carries a positive claim amount.
df['HasClaim'] = df['TotalClaims'] > 0


In [41]:
# Claim frequency per province: the mean of the boolean HasClaim flag is the
# share of rows in that province with a claim.
claim_rate_by_province = df.groupby('Province')['HasClaim'].mean()

# Turn the Province index back into a regular column for display.
province_freq = claim_rate_by_province.reset_index()
print(province_freq)


        Province  HasClaim
0   Eastern Cape     0.002
1     Free State     0.002
2        Gauteng     0.005
3  KwaZulu-Natal     0.004
4        Limpopo     0.004
5     Mpumalanga     0.004
6     North West     0.004
7  Northern Cape     0.002
8   Western Cape     0.004


In [42]:
# Observed counts: one row per Province, one column per HasClaim value.
province_contingency = pd.crosstab(index=df['Province'], columns=df['HasClaim'])

# Chi-square test of independence on the observed counts; the four results
# are the statistic, the p-value, the degrees of freedom and the expected counts.
province_chi2, province_chi_p, province_dof, province_expected = stats.chi2_contingency(
    province_contingency
)

# Report the statistic and p-value.
print(f'Chi-square Statistic: {province_chi2:.3f}, p-value: {province_chi_p:.5f}')


Chi-square Statistic: 93.695, p-value: 0.00000


In [43]:
# Keep only the rows flagged as having a claim.
severity_data = df.loc[df['HasClaim']]

# Average claim amount per province, with Province restored as a column.
mean_claim_by_province = severity_data.groupby('Province')['TotalClaims'].mean()
province_severity = mean_claim_by_province.reset_index()
print(province_severity)


        Province  TotalClaims
0   Eastern Cape    28644.846
1     Free State    29552.767
2        Gauteng    22520.738
3  KwaZulu-Natal    29859.496
4        Limpopo    15028.681
5     Mpumalanga    16286.340
6     North West    16747.673
7  Northern Cape    11186.314
8   Western Cape    28786.386


In [44]:
# Severity is only defined for rows with a positive claim amount.
severity_data = df[df['TotalClaims'].gt(0)]

# One array of claim amounts per province, in groupby order.
claims_by_province = [
    province_rows['TotalClaims'].values
    for _, province_rows in severity_data.groupby('Province')
]

# One-way ANOVA comparing claim amounts across the province groups.
anova_result = stats.f_oneway(*claims_by_province)

# Kept under its own name so the summary table can pick it up.
province_anova_p = anova_result.pvalue

print(f'F-statistic: {anova_result.statistic:.3f}, p-value: {province_anova_p:.5f}')


F-statistic: 4.813, p-value: 0.00001


In [45]:
# Restrict the postal-code analysis to the five most frequent postal codes;
# df_zip is the input to the postal-code tests in the following cells.
postal_code_counts = df['PostalCode'].value_counts()
top_zipcodes = postal_code_counts.head(5).index
df_zip = df[df['PostalCode'].isin(top_zipcodes)]


In [46]:
# Claim frequency by postal code: chi-square test of independence between
# PostalCode and HasClaim, restricted to the rows in df_zip.
zip_contingency = pd.crosstab(index=df_zip['PostalCode'], columns=df_zip['HasClaim'])

# chi2, dof and expected are generic names that the gender test reassigns
# later; only zip_chi_p is specific to this test.
chi2, zip_chi_p, dof, expected = stats.chi2_contingency(zip_contingency)

print(f'ZipCode - Chi-square Statistic: {chi2:.3f}, p-value: {zip_chi_p:.5f}')


ZipCode - Chi-square Statistic: 42.205, p-value: 0.00000


In [47]:
# Claim severity by postal code: keep only the df_zip rows flagged as claims.
severity_zip_data = df_zip.loc[df_zip['HasClaim']]

# One array of claim amounts per postal code, in groupby order.
claims_by_zip = [
    zip_rows['TotalClaims'].values
    for _, zip_rows in severity_zip_data.groupby('PostalCode')
]

# One-way ANOVA across the postal-code groups.
anova_zip = stats.f_oneway(*claims_by_zip)

# Kept under its own name so the summary table can pick it up.
zip_anova_p = anova_zip.pvalue

print(f'ZipCode Severity - F-statistic: {anova_zip.statistic:.3f}, p-value: {anova_zip.pvalue:.5f}')


ZipCode Severity - F-statistic: 4.468, p-value: 0.00142


In [48]:
# Margin by postal code: one array of Margin values per postal code in df_zip
# (all rows, not only those with claims), compared with a one-way ANOVA.
margins_by_zip = [
    zip_rows['Margin'].values
    for _, zip_rows in df_zip.groupby('PostalCode')
]
anova_margin = stats.f_oneway(*margins_by_zip)

print(f'Margin by ZipCode - F-statistic: {anova_margin.statistic:.3f}, p-value: {anova_margin.pvalue:.5f}')


Margin by ZipCode - F-statistic: 1.972, p-value: 0.09578


In [49]:
# Keep the Margin-by-PostalCode ANOVA p-value under its own name so the
# summary table can reference it.
zip_margin_p = anova_margin.pvalue


In [50]:
# Share of rows with a claim for each Gender value.
# NOTE(review): 'Not specified' is treated as its own category alongside
# Female and Male -- confirm it should take part in the comparison.
gender_freq = df['HasClaim'].groupby(df['Gender']).mean()
print("Claim Frequency by Gender:")
print(gender_freq)

# Observed counts: one row per Gender value, one column per HasClaim value.
gender_contingency = pd.crosstab(index=df['Gender'], columns=df['HasClaim'])

# Chi-square test of independence; this reassigns the generic chi2, dof and
# expected names, and only gender_chi_p is specific to this test.
chi2, gender_chi_p, dof, expected = stats.chi2_contingency(gender_contingency)
print(f'Gender Risk - Chi-square Statistic: {chi2:.3f}, p-value: {gender_chi_p:.5f}')


Claim Frequency by Gender:
Gender
Female          0.004
Male            0.004
Not specified   0.004
Name: HasClaim, dtype: float64
Gender Risk - Chi-square Statistic: 0.305, p-value: 0.85839


In [51]:
# Hypothesis label -> p-value from the corresponding test above, in the order
# the tests were run.
p_values_by_hypothesis = {
    'Risk differs by Province (Frequency)': province_chi_p,
    'Risk differs by Province (Severity)': province_anova_p,
    'Risk differs by ZipCode (Frequency)': zip_chi_p,
    'Risk differs by ZipCode (Severity)': zip_anova_p,
    'Margin differs by ZipCode': zip_margin_p,
    'Risk differs by Gender': gender_chi_p,
}

# One row per hypothesis.
summary_results = pd.DataFrame({
    'Hypothesis': list(p_values_by_hypothesis.keys()),
    'p-value': list(p_values_by_hypothesis.values()),
})

# Decision for each test at the 0.05 significance level.
summary_results['Result'] = [
    'Reject H₀' if p < 0.05 else 'Fail to Reject H₀'
    for p in summary_results['p-value']
]
summary_results


Unnamed: 0,Hypothesis,p-value,Result
0,Risk differs by Province (Frequency),0.0,Reject H₀
1,Risk differs by Province (Severity),0.0,Reject H₀
2,Risk differs by ZipCode (Frequency),0.0,Reject H₀
3,Risk differs by ZipCode (Severity),0.001,Reject H₀
4,Margin differs by ZipCode,0.096,Fail to Reject H₀
5,Risk differs by Gender,0.858,Fail to Reject H₀
