In [1]:
import os, sys
# Put the directory above the notebook's working directory on the import path
# so that the `scripts` package imported further down can be resolved.
# NOTE(review): this assumes the kernel's working directory is one level below
# the repository root (the same assumption the "../data/..." path makes).
curr_dir = os.getcwd()
parent_dir = os.path.dirname(curr_dir)

# Re-running this cell must not pile up duplicate entries, but the repository
# root still has to take priority over any same-named installed package, so
# the insert is only skipped when the entry is already in first position.
if not sys.path or sys.path[0] != parent_dir:
    sys.path.insert(0, parent_dir)

In [2]:
import itertools

import numpy as np
import pandas as pd

In [3]:
# Read the cleaned dataset; the path is relative to the kernel's working
# directory, so the notebook has to be started from its own folder.
data_path = "../data/final_cleaned_data.csv"
df = pd.read_csv(data_path)

In [4]:
from scripts.hypothesis_testing import ABHypothesisTesting
# One tester instance is shared by every cell below. Later cells call
# segment/test methods without re-selecting a KPI, so the instance evidently
# keeps the selected KPI and current segments between calls.
hypothesis_tester = ABHypothesisTesting(df)

In [5]:
# Example 1 - categorical segmentation.
# Compare the TotalClaims KPI between the two Gender groups.
hypothesis_tester.select_metrics(kpi="TotalClaims")

gender_split = dict(
    feature="Gender",
    group_a_value="Female",
    group_b_value="Male",
)
hypothesis_tester.segment_by_category(**gender_split)

# Run the t-test on the segments chosen above and keep the p-value around so
# it can be handed to the interpretation step.
p_value = hypothesis_tester.perform_statistical_test(test_type="t-test")

# Print the accept/reject verdict for this p-value.
hypothesis_tester.analyze_results(p_value)




KPI selected: TotalClaims
Data segmented by Gender: Group A (Female), Group B (Male)
Performed t-test: p-value = 0.8041073961270343
Fail to reject the null hypothesis: The feature does not have a significant effect.


In [6]:
# Example 2: Numerical Segmentation
# Split the rows at the median of a numeric column and compare TotalClaims
# between the two halves.
# NOTE(review): PostalCode is an identifier rather than a measured quantity,
# so a median split on it is an arbitrary partition - confirm this is the
# intended grouping.

# Select the KPI explicitly instead of inheriting whatever an earlier (or
# later, if cells are re-run out of order) cell left on the shared tester.
hypothesis_tester.select_metrics(kpi="TotalClaims")

# Segment by the feature's median (group A: >= median, group B: < median,
# as reported in the tester's own output).
hypothesis_tester.segment_by_numeric_median(feature="PostalCode")

# Perform the t-test on the two segments.
p_value = hypothesis_tester.perform_statistical_test(test_type="t-test")

# Analyze the result
hypothesis_tester.analyze_results(p_value)

Data segmented by PostalCode median: Group A (>= 2000.0), Group B (< 2000.0)
Performed t-test: p-value = 0.0031039309970063243
Reject the null hypothesis: The feature has a statistically significant effect.


In [7]:
# Pairwise comparison of TotalClaims between every pair of provinces.
#
# Running one t-test per pair inflates the chance of a false positive, so in
# addition to the tester's per-pair verdict this cell collects all p-values
# and flags which ones survive a Bonferroni correction.
# NOTE(review): the per-pair messages printed by analyze_results appear to use
# an uncorrected threshold (p ~ 0.040 is rejected, p ~ 0.056 is not); read the
# summary table below for the corrected decision.
FAMILYWISE_ALPHA = 0.05

# Select the KPI explicitly rather than relying on state left by another cell.
hypothesis_tester.select_metrics(kpi="TotalClaims")

# A missing province is not a real group; comparing against NaN would only
# produce an empty segment.
provinces = df["Province"].dropna().unique()

pairwise_rows = []
# combinations() yields each unordered pair once, in the same order as a
# nested i < j index loop.
for province_a, province_b in itertools.combinations(provinces, 2):
    # Pass the province values, not pre-filtered data.
    hypothesis_tester.segment_by_category(feature="Province",
                                          group_a_value=province_a,
                                          group_b_value=province_b)

    p_value = hypothesis_tester.perform_statistical_test(test_type="t-test")

    # Per-pair verdict as printed by the tester.
    hypothesis_tester.analyze_results(p_value)

    pairwise_rows.append(
        {"group_a": province_a, "group_b": province_b, "p_value": p_value}
    )

# Bonferroni: divide the family-wise alpha by the number of tests performed.
n_comparisons = len(pairwise_rows)
bonferroni_alpha = (
    FAMILYWISE_ALPHA / n_comparisons if n_comparisons else FAMILYWISE_ALPHA
)

province_pairwise = (
    pd.DataFrame(pairwise_rows, columns=["group_a", "group_b", "p_value"])
    .assign(significant_bonferroni=lambda t: t["p_value"] < bonferroni_alpha)
    .sort_values("p_value")
    .reset_index(drop=True)
)

print(f"{n_comparisons} pairwise tests; "
      f"Bonferroni-adjusted threshold = {bonferroni_alpha:.4g}")
province_pairwise


Data segmented by Province: Group A (Gauteng), Group B (KwaZulu-Natal)
Performed t-test: p-value = 0.19282435112283478
Fail to reject the null hypothesis: The feature does not have a significant effect.
Data segmented by Province: Group A (Gauteng), Group B (Mpumalanga)
Performed t-test: p-value = 0.0011087960706090535
Reject the null hypothesis: The feature has a statistically significant effect.
Data segmented by Province: Group A (Gauteng), Group B (Eastern Cape)
Performed t-test: p-value = 0.03983195104800561
Reject the null hypothesis: The feature has a statistically significant effect.
Data segmented by Province: Group A (Gauteng), Group B (Western Cape)
Performed t-test: p-value = 0.05632044649871912
Fail to reject the null hypothesis: The feature does not have a significant effect.
Data segmented by Province: Group A (Gauteng), Group B (Limpopo)
Performed t-test: p-value = 0.03249850185561555
Reject the null hypothesis: The feature has a statistically significant effect.
Data s

In [8]:
# Compare profit (premium minus claims) between the two PostalCode halves.
# NOTE(review): despite its name, "ProfitMargin" is an absolute difference,
# not a ratio.
# NOTE(review): the tester was built from `df` in an earlier cell; adding the
# column here relies on the tester seeing the same frame object - the recorded
# output suggests it does, but confirm against ABHypothesisTesting.
premium_minus_claims = df['TotalPremium'].sub(df['TotalClaims'])
df['ProfitMargin'] = premium_minus_claims

# Switch the shared tester to the new KPI before segmenting.
hypothesis_tester.select_metrics(kpi="ProfitMargin")
hypothesis_tester.segment_by_numeric_median(feature="PostalCode")

# Test, then print the verdict for the resulting p-value.
p_value = hypothesis_tester.perform_statistical_test(test_type="t-test")
hypothesis_tester.analyze_results(p_value)


KPI selected: ProfitMargin
Data segmented by PostalCode median: Group A (>= 2000.0), Group B (< 2000.0)
Performed t-test: p-value = 0.2885925183148811
Fail to reject the null hypothesis: The feature does not have a significant effect.
