# AlphaCare Insurance Solutions (ACIS) 


AlphaCare Insurance Solutions (ACIS) is committed to developing cutting-edge risk and predictive analytics in the area of car insurance
planning and marketing in South Africa. This notebook is part of the final report on hypothesis testing.

In [None]:
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Add the parent directory to the Python path
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..')))

from scripts.data_loader import load_data

# Convert the pipe-delimited text export to CSV, then work from the CSV copy
raw_txt_path = '../data/data.txt'
converted_csv_path = '../data/data2.csv'

# `source` holds the frame exactly as parsed from the text file
source = pd.read_csv(raw_txt_path, delimiter='|')
source.to_csv(converted_csv_path, index=False)

# `df` is the frame re-read from the CSV copy; the cells below operate on it
df = pd.read_csv(converted_csv_path)
df.head()


## Null Hypotheses to be tested

There are no risk differences across provinces; there are no risk differences between zip codes; there is no significant margin (profit) difference between zip codes; and there is no significant risk difference between women and men.


## Identify KPIs 

In [None]:
# Loss ratio by province
# Per-row loss ratio = TotalClaims / TotalPremium. Rows whose premium is not
# strictly positive get NaN, so they are skipped by the group mean below.
positive_premium = df['TotalPremium'].where(df['TotalPremium'] > 0)
df['LossRatio'] = df['TotalClaims'].div(positive_premium)

# Average of the per-row ratios within each province
province_loss_ratio = df.groupby('Province')['LossRatio'].mean()
province_loss_ratio

In [None]:
# Loss ratio by postal code
# Average the per-row LossRatio column (added to df in the province cell)
# within each PostalCode, then preview the first ten postal codes.
loss_ratio_by_postal_code = df['LossRatio'].groupby(df['PostalCode'])
postalcode_loss_ratio = loss_ratio_by_postal_code.mean()
postalcode_loss_ratio.head(10)

In [None]:
# Profit margin by postal code
# Per-row profit margin = (TotalPremium - TotalClaims) / TotalPremium.
# The subtraction is parenthesised so it happens before the division; without
# the parentheses only TotalClaims is divided by the premium, which yields
# "premium minus loss ratio" rather than a margin.
# Rows whose premium is not strictly positive get NaN and are skipped by the mean.
positive_premium = df['TotalPremium'].where(df['TotalPremium'] > 0)
df['ProfitMargin'] = (df['TotalPremium'] - df['TotalClaims']) / positive_premium

# Average of the per-row margins within each postal code
postal_code_profit_margin = df.groupby('PostalCode')['ProfitMargin'].mean()
postal_code_profit_margin

In [None]:
# Loss ratio by gender
# Stored under its own name so that postalcode_loss_ratio (computed in the
# postal-code cell) is not overwritten with gender-level values.
gender_loss_ratio = df.groupby('Gender')['LossRatio'].mean()
gender_loss_ratio.head(10)

## Data Segmentation

In [None]:
feature_to_test = 'CoverType'

# Cover types assigned to each segment. One entry per line so that a missing
# comma (which silently concatenates adjacent string literals into one value
# that matches nothing) is easy to spot.
group_a_cover_types = [
    'Windscreen',
    'Own Damage',
    'Third Party',
    'Passenger Liability',
    'Signage and Vehicle Wraps',
    'Keys and Alarms',
    'Cleaning and Removal of Accident Debris',
    'Emergency Charges',
    'Credit Protection',
    'Roadside Assistance',
    'Accidental Death',
]
group_b_cover_types = [
    'Basic Excess Waiver',
    'Income Protector',
    'Deposit Cover',
    'Standalone passenger liability',
    'Baggage/Luggage',
    'Asset Value Preserver',
    'Cash Takings',
    'Third Party Only',
    'Trailer',
    'Fire and Theft',
    # NOTE(review): 'aner' looks like a garbled label; confirm the exact
    # spelling against df['CoverType'].unique() before relying on this entry.
    'Factory Fitted Sound aner Electronic Equipment',
]

# Create two groups, keeping only rows with a strictly positive premium.
# .copy() makes each group an independent frame, so adding columns to it later
# does not operate on a slice of df.
has_positive_premium = df['TotalPremium'] > 0
group_a = df[df[feature_to_test].isin(group_a_cover_types) & has_positive_premium].copy()
group_b = df[df[feature_to_test].isin(group_b_cover_types) & has_positive_premium].copy()


In [None]:
from scipy.stats import ttest_ind, chi2_contingency

# Numeric columns compared between group_a and group_b with a two-sample t-test
numerical_columns = ['SumInsured', 'CalculatedPremiumPerTerm', 'CustomValueEstimate']

# NOTE(review): ttest_ind is called with its defaults, i.e. the pooled-variance
# test (equal_var=True); confirm that assumption suits these two groups.
for col in numerical_columns:
    # Missing values are dropped per group before testing
    values_a = group_a[col].dropna()
    values_b = group_b[col].dropna()
    t_stat, p_val = ttest_ind(values_a, values_b)
    print(f"{col}: T-statistic = {t_stat:.2f}, P-value = {p_val:.3f}")

    # 0.05 is the significance threshold; anything not strictly above it
    # (including a NaN p-value) is reported as a significant difference
    verdict = (
        f"  -> No significant difference (groups are equivalent for {col})"
        if p_val > 0.05
        else f"  -> Significant difference (adjust groups for {col})"
    )
    print(verdict)

In [None]:
categorical_columns = ['Gender', 'VehicleType', 'MaritalStatus']

# Label every row with its segment so each chi-square test compares group A
# against group B. Cross-tabulating the whole of df against the raw
# feature_to_test column would instead test association with every individual
# cover type (and include rows excluded from both groups), which is not what
# the "groups are equivalent" verdict below describes.
# .assign returns new frames, so group_a / group_b themselves are not modified.
labelled_groups = pd.concat(
    [group_a.assign(Group='A'), group_b.assign(Group='B')],
    ignore_index=True,
)

for col in categorical_columns:
    # Rows: categories of `col`; columns: segment label (A / B)
    contingency_table = pd.crosstab(labelled_groups[col], labelled_groups['Group'])
    chi2, p_val, _, _ = chi2_contingency(contingency_table)
    print(f"{col}: Chi-square = {chi2:.2f}, P-value = {p_val:.3f}")
    if p_val > 0.05:
        print(f"  -> No significant difference (groups are equivalent for {col})")
    else:
        print(f"  -> Significant difference (adjust groups for {col})")

In [None]:
# Add a column to indicate the group.
# .assign returns a new frame with the column added (or replaced on re-run),
# which avoids assigning into a filtered slice of df (SettingWithCopyWarning)
# and keeps this cell safe to re-run.
group_a = group_a.assign(Group='A')  # Control group
group_b = group_b.assign(Group='B')  # Test group

# Combine into one dataset for further analysis
segmented_data = pd.concat([group_a, group_b], ignore_index=True)

# Save to a new file (written relative to the notebook's working directory)
segmented_data.to_csv('segmented_data.csv', index=False)


In [None]:
def _claims_frequency(group):
    """Return the share of rows in `group` whose TotalClaims is greater than zero.

    Counting non-null values instead would treat rows with a claim amount of 0
    as claims. Returns NaN for an empty group.
    """
    if len(group) == 0:
        return float('nan')
    return (group['TotalClaims'] > 0).mean()


def _loss_ratio(group):
    """Return sum(TotalClaims) / sum(TotalPremium) over rows with positive premium.

    Numerator and denominator are taken from the same rows. Returns NaN when
    the premium total is zero (e.g. an empty group).
    """
    paying_rows = group[group['TotalPremium'] > 0]
    total_premium = paying_rows['TotalPremium'].sum()
    if total_premium == 0:
        return float('nan')
    return paying_rows['TotalClaims'].sum() / total_premium


# Claims Frequency
group_a_claims_frequency = _claims_frequency(group_a)
group_b_claims_frequency = _claims_frequency(group_b)

print(f"Group A Claims Frequency: {group_a_claims_frequency:.2%}")
print(f"Group B Claims Frequency: {group_b_claims_frequency:.2%}")

# Loss Ratio
group_a_loss_ratio = _loss_ratio(group_a)
group_b_loss_ratio = _loss_ratio(group_b)

print(f"Group A Loss Ratio: {group_a_loss_ratio:.2%}")
print(f"Group B Loss Ratio: {group_b_loss_ratio:.2%}")