## A/B Hypothesis Testing

In [1]:
import pandas as pd
import sys
import os
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt

# Add src to path to import utils
sys.path.append(os.path.abspath(os.path.join('..', 'src')))
from utils import load_data, clean_data, get_summary_statistics

# Load the data
data_path = '../data/MachineLearningRating_v3.txt'
df = load_data(data_path)

# Every later cell indexes into df, so stop here with a clear message instead of
# letting a None result surface as an unrelated TypeError further down.
if df is None:
    raise RuntimeError(f"load_data returned None for {data_path!r}; cannot continue.")

print("Data loaded successfully!")
print(f"Shape of the dataset: {df.shape}")

  df = pd.read_csv(file_path, sep='|', encoding='utf-8')


Data loaded successfully!
Shape of the dataset: (1000098, 52)


In [3]:
# Define KPIs
# Claim frequency = proportion of policies with at least one claim.
# The boolean flag is stored on df (column name kept as-is because later cells use it);
# its per-province mean is the share of rows with TotalClaims > 0.
df["ClaimOccured"] = df["TotalClaims"].gt(0)
claim_frequency = (
    df.groupby("Province")["ClaimOccured"]
    .mean()
    .reset_index()
)

In [8]:
# Claim severity = average claim amount per province, restricted to rows
# where a claim occurred (TotalClaims > 0)
claim_severity = (
    df.loc[df["TotalClaims"].gt(0)]
    .groupby("Province")["TotalClaims"]
    .mean()
    .reset_index(name="ClaimSeverity")
)

# Margin = Total Premium - Total Claim Amount, computed per row and then
# averaged within each province
df["Margin"] = df["TotalPremium"].sub(df["TotalClaims"])
margin = (
    df.groupby("Province")["Margin"]
    .mean()
    .reset_index()
)

# Preview the three KPI tables, separated by a divider line
print(
    claim_frequency.head(),
    "---------------------",
    claim_severity.head(),
    "---------------------",
    margin.head(),
    sep="\n",
)

        Province  ClaimOccured
0   Eastern Cape      0.001648
1     Free State      0.001358
2        Gauteng      0.003356
3  KwaZulu-Natal      0.002845
4        Limpopo      0.002698
---------------------
        Province  ClaimSeverity
0   Eastern Cape   27128.533277
1     Free State   32265.661085
2        Gauteng   22243.878396
3  KwaZulu-Natal   29609.487473
4        Limpopo   15171.294187
---------------------
        Province     Margin
0   Eastern Cape  25.833240
1     Free State  20.550805
2        Gauteng -13.558894
3  KwaZulu-Natal  -6.433598
4        Limpopo  20.971484


In [9]:
# Hypothesis Testing
from scipy.stats import chi2_contingency, ttest_ind

def hypothesis_testing_kpi(group_col, kpi_col, df, test_type="t-test"):
    """
    Compare a KPI between the first two groups of a categorical column.

    Only the first two distinct, non-missing values of ``group_col`` (in order
    of appearance) are compared. Pass a DataFrame pre-filtered to the two groups
    of interest to control which pair is tested. The p-value and the decision at
    the 5% significance level are printed; nothing is returned.

    :param group_col: categorical column to split the groups (e.g., 'Province')
    :param kpi_col: KPI to test (numerical or boolean for 't-test',
                    categorical for 'chi2')
    :param df: DataFrame
    :param test_type: 't-test' (two-sample t-test with equal_var=False) or
                      'chi2' (chi-square test on the group x KPI contingency table)
    :return: None
    :raises ValueError: if test_type is neither 't-test' nor 'chi2'
    """
    alpha = 0.05  # significance level used for the printed decision

    # Missing labels can never match an equality filter, so a NaN/None "group"
    # would always be empty; exclude them before picking the pair.
    groups = df[group_col].dropna().unique()
    if len(groups) < 2:
        print("Not enough groups to compare.")
        return None

    # Validate before doing any filtering work on a potentially large frame.
    if test_type not in ("t-test", "chi2"):
        raise ValueError("Invalid test_type. Use 't-test' or 'chi2'.")

    label_a, label_b = groups[0], groups[1]
    if len(groups) > 2:
        # Make the silent truncation visible to the reader of the output.
        print(
            f"Note: {group_col} has {len(groups)} groups; "
            f"only {label_a} and {label_b} are compared."
        )

    if test_type == "t-test":
        # Drop missing KPI values (they would turn the statistic into NaN) and
        # cast to float so boolean KPIs are treated as 0/1.
        group_a = df.loc[df[group_col] == label_a, kpi_col].dropna().astype(float)
        group_b = df.loc[df[group_col] == label_b, kpi_col].dropna().astype(float)
        if len(group_a) < 2 or len(group_b) < 2:
            # A variance cannot be estimated from fewer than two observations.
            p_value = float("nan")
        else:
            stat, p_value = ttest_ind(group_a, group_b, equal_var=False)
    else:
        # Restrict the table to the two groups named in the message so the
        # reported comparison matches what is actually tested.
        pair = df[df[group_col].isin([label_a, label_b])]
        contingency = pd.crosstab(pair[group_col], pair[kpi_col])
        # Remove all-zero rows/columns (e.g. unused categorical levels), which
        # would produce zero expected frequencies.
        contingency = contingency.loc[
            contingency.sum(axis=1) > 0, contingency.sum(axis=0) > 0
        ]
        if contingency.size == 0:
            p_value = float("nan")
        else:
            stat, p_value, dof, expected = chi2_contingency(contingency)

    print(f"Testing {kpi_col} between {groups[0]} and {groups[1]}: p_value = {p_value}")

    if pd.isna(p_value):
        # NaN compares False against alpha; do not report that as "no difference".
        print("Test could not be computed (insufficient data): no conclusion drawn.")
    elif p_value < alpha:
        print("Reject null hypothesis: Significant difference found.")
    else:
        print("Fail to reject null hypothesis: No significant difference found.")




In [10]:
# Test Hypotheses
from itertools import combinations

# Province: run a t-test on each KPI for every unordered pair of provinces.
# Each call receives a frame filtered to just that pair, so the helper's
# "first two groups" are exactly the two provinces being compared.
provinces = df["Province"].unique()
for province_a, province_b in combinations(provinces, 2):
    group_df = df.loc[df["Province"].isin([province_a, province_b])]
    print(f"\nComparing {province_a} and {province_b}:")
    for kpi in ("ClaimOccured", "TotalClaims"):
        hypothesis_testing_kpi("Province", kpi, group_df, test_type="t-test")

# Gender: the full frame is passed, so the helper compares the first two
# distinct Gender values it encounters.
for kpi in ("ClaimOccured", "Margin"):
    hypothesis_testing_kpi("Gender", kpi, df, test_type="t-test")


Comparing Gauteng and KwaZulu-Natal:
Testing ClaimOccured between Gauteng and KwaZulu-Natal: p_value = 0.0012690609180115516
Reject null hypothesis: Significant difference found.
Testing TotalClaims between Gauteng and KwaZulu-Natal: p_value = 0.21269135333580616
Fail to reject null hypothesis: No significant difference found.

Comparing Gauteng and Mpumalanga:
Testing ClaimOccured between Gauteng and Mpumalanga: p_value = 6.917217553664877e-05
Reject null hypothesis: Significant difference found.
Testing TotalClaims between Gauteng and Mpumalanga: p_value = 1.1873384663398435e-05
Reject null hypothesis: Significant difference found.

Comparing Gauteng and Eastern Cape:
Testing ClaimOccured between Gauteng and Eastern Cape: p_value = 9.221360495930388e-12
Reject null hypothesis: Significant difference found.
Testing TotalClaims between Gauteng and Eastern Cape: p_value = 0.03274492060339594
Reject null hypothesis: Significant difference found.

Comparing Gauteng and Western Cape:
Test