## Chi-Squared test

In [49]:
import sys
sys.path.append('../')
import pandas as pd
import numpy as np
from statsmodels.stats.power import TTestIndPower
from statsmodels.stats.power import tt_ind_solve_power
from statsmodels.stats.power import FTestAnovaPower
from src.data_quality_checks import check_missing_data
from scipy.stats import chi2_contingency

#### Data preparation

In [36]:
# Loading historical data
file_path = "../data/MachineLearningRating_v3.txt"

df = pd.read_csv(file_path, sep='|', low_memory=False)

In [37]:
# selecting relevant columns
relevant_cols = ['TotalPremium', 'TotalClaims', 'Gender', 'PostalCode', 'Province', 'VehicleType', 'RegistrationYear', 'NewVehicle']

df_relevant_cols = df[relevant_cols]

In [38]:
# checking for columns with missing values
missing_data = check_missing_data(df_relevant_cols)
print(missing_data)

   Column Name  Missing Values  Percentage Missing
2       Gender            9536            0.953507
5  VehicleType             552            0.055195
7   NewVehicle          153295           15.327998


In [39]:
# data cleaning
df_relevant_cols.loc[:, 'VehicleType'] = df_relevant_cols['VehicleType'].fillna(df_relevant_cols['VehicleType'].mode()[0])
df_relevant_cols.loc[:, 'NewVehicle'] = df_relevant_cols['NewVehicle'].fillna(df_relevant_cols['NewVehicle'].mode()[0])
df_relevant_cols.loc[:, 'Gender'] = df_relevant_cols['Gender'].fillna(df_relevant_cols['Gender'].mode()[0])

In [40]:
# checking for columns with missing values
missing_data = check_missing_data(df_relevant_cols)
print(missing_data)

Success: No missing values.


#### Testing Risk Difference Across Provinces

In [66]:
H0 = "There is no relationship between province and total claims. They are independent"
H1 = "There is a relationship between province and total claims. They are not independent"

In [67]:
province_data = df_relevant_cols[['Province', 'TotalClaims']]

# Create 'NoClaim' and 'ClaimTaken' columns
province_data['NoClaim'] = province_data['TotalClaims'].apply(lambda x: 1 if x == 0 else 0)
province_data['ClaimTaken'] = province_data['TotalClaims'].apply(lambda x: 1 if x > 0 else 0)

# Group by 'Province' and sum 'NoClaim' and 'ClaimTaken'
grouped_data = province_data.groupby('Province')[['NoClaim', 'ClaimTaken']].sum()

print(grouped_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  province_data['NoClaim'] = province_data['TotalClaims'].apply(lambda x: 1 if x == 0 else 0)


               NoClaim  ClaimTaken
Province                          
Eastern Cape     30286          50
Free State        8088          11
Gauteng         392541        1322
KwaZulu-Natal   169298         483
Limpopo          24769          67
Mpumalanga       52588         128
North West      142938         349
Northern Cape     6372           8
Western Cape    170425         370


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  province_data['ClaimTaken'] = province_data['TotalClaims'].apply(lambda x: 1 if x > 0 else 0)


In [68]:
# Convert grouped DataFrame to a list of lists
grouped_data_list = grouped_data.values.tolist()

print(grouped_data_list)


[[30286, 50], [8088, 11], [392541, 1322], [169298, 483], [24769, 67], [52588, 128], [142938, 349], [6372, 8], [170425, 370]]


In [69]:
# Perform Chi-square test of independence
chi2_stat, p_value, dof, expected_values = chi2_contingency(grouped_data_list)

print(f'Chi-square statistic: {chi2_stat}')
print(f'p-value: {p_value}')
print(f'Degrees of freedom: {dof}')
print('Expected values:')
print(expected_values)

Chi-square statistic: 104.19041870171219
p-value: 5.926803398618003e-19
Degrees of freedom: 8
Expected values:
[[3.02514311e+04 8.45689031e+01]
 [8.07642209e+03 2.25779123e+01]
 [3.92765012e+05 1.09798793e+03]
 [1.69307695e+05 4.73305411e+02]
 [2.47667637e+04 6.92363290e+01]
 [5.25690415e+04 1.46958541e+02]
 [1.42887553e+05 3.99447007e+02]
 [6.36221421e+03 1.77857859e+01]
 [1.70318868e+05 4.76132180e+02]]


In [70]:
pro=0.95
alpha = 1- pro
print('Significance=%0.3f, p = %0.3f' % (alpha, p_value))
if p_value <= alpha:
    print('Dependent, we reject the null hypothesis')
    print(H1)
    
else:
    print('Independent, we fail to reject the null hypothesis')
    print(H0)
    

Significance=0.050, p = 0.000
Dependent, we reject the null hypothesis
There is a relationship between province and total claims. They are not independent


#### Testing risk differences between zip codes

In [71]:
H0 = "There is no relationship between postal code and total claims. They are independent"
H1 = "There is a relationship between postal code and total claims. They are not independent"

In [73]:
postalcode_data = df_relevant_cols[['PostalCode', 'TotalClaims']]

# Create 'NoClaim' and 'ClaimTaken' columns
postalcode_data['NoClaim'] = postalcode_data['TotalClaims'].apply(lambda x: 1 if x == 0 else 0)
postalcode_data['ClaimTaken'] = postalcode_data['TotalClaims'].apply(lambda x: 1 if x > 0 else 0)

# Group by 'Province' and sum 'NoClaim' and 'ClaimTaken'
grouped_data = postalcode_data.groupby('PostalCode')[['NoClaim', 'ClaimTaken']].sum()

print(grouped_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  postalcode_data['NoClaim'] = postalcode_data['TotalClaims'].apply(lambda x: 1 if x == 0 else 0)


            NoClaim  ClaimTaken
PostalCode                     
1              5329          12
2              1482           6
4                77           0
5               396           4
6               438           2
...             ...         ...
9781            640           3
9830             56           0
9868            100           0
9869           1414           1
9870            220           0

[888 rows x 2 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  postalcode_data['ClaimTaken'] = postalcode_data['TotalClaims'].apply(lambda x: 1 if x > 0 else 0)


In [74]:
# Convert grouped DataFrame to a list of lists
grouped_data_list = grouped_data.values.tolist()

print(grouped_data_list)

[[5329, 12], [1482, 6], [77, 0], [396, 4], [438, 2], [356, 0], [1278, 1], [160, 0], [412, 3], [605, 0], [453, 1], [132, 3], [343, 0], [484, 2], [373, 1], [22, 0], [119, 1], [230, 0], [308, 2], [216, 0], [662, 7], [700, 1], [181, 0], [50, 0], [100, 0], [156, 0], [1813, 7], [100, 0], [48961, 210], [1031, 7], [510, 0], [99, 1], [108, 2], [250, 0], [214, 0], [50, 0], [360, 0], [9391, 32], [263, 1], [192, 0], [792, 7], [467, 3], [3546, 10], [906, 1], [585, 0], [380, 0], [478, 2], [1353, 8], [921, 6], [154, 4], [1655, 9], [2502, 13], [395, 2], [1735, 7], [330, 0], [3283, 7], [274, 0], [161, 1], [73, 0], [121, 0], [81, 0], [366, 6], [108, 2], [2040, 4], [805, 5], [1764, 1], [250, 0], [907, 0], [63, 0], [188, 0], [192, 0], [802, 0], [328, 3], [1, 0], [108, 0], [243, 0], [25479, 67], [4983, 7], [90, 0], [9510, 21], [2278, 6], [2547, 8], [1001, 2], [1935, 2], [1563, 5], [160, 0], [543, 1], [5043, 4], [5602, 9], [386, 0], [3500, 3], [728, 1], [1, 0], [60, 0], [7, 0], [831, 3], [714, 3], [60, 0], 

In [75]:
# Perform Chi-square test of independence
chi2_stat, p_value, dof, expected_values = chi2_contingency(grouped_data_list)

print(f'Chi-square statistic: {chi2_stat}')
print(f'p-value: {p_value}')
print(f'Degrees of freedom: {dof}')
print('Expected values:')
print(expected_values)

Chi-square statistic: 1454.456517818413
p-value: 3.159060974473238e-30
Degrees of freedom: 887
Expected values:
[[5.32611068e+03 1.48893233e+01]
 [1.48385184e+03 4.14815822e+00]
 [7.67853440e+01 2.14656037e-01]
 ...
 [9.97212259e+01 2.78774074e-01]
 [1.41105535e+03 3.94465315e+00]
 [2.19386697e+02 6.13302963e-01]]


In [76]:
pro=0.95
alpha = 1- pro
print('Significance=%0.3f, p = %0.3f' % (alpha, p_value))
if p_value <= alpha:
    print('Dependent, we reject the null hypothesis')
    print(H1)
    
else:
    print('Independent, we fail to reject the null hypothesis')
    print(H0)
    

Significance=0.050, p = 0.000
Dependent, we reject the null hypothesis
There is a relationship between postal code and total claims. They are not independent


#### Testing margin differences between zip codes

In [87]:
H0 = "There is no relationship between postal code and margin. They are independent"
H1 = "There is a relationship between postal code and margin. They are not independent"

In [83]:
# Calculate margin
df_relevant_cols['Margin'] = df_relevant_cols['TotalPremium'] - df_relevant_cols['TotalClaims']
postalcode_data = df_relevant_cols[['PostalCode', 'Margin']]

# Create 'Loss' and 'Profit' columns
postalcode_data['Loss'] = postalcode_data['Margin'].apply(lambda x: 1 if x < 0 else 0)
postalcode_data['Profit'] = postalcode_data['Margin'].apply(lambda x: 1 if x > 0 else 0)

# Group by 'PostalCode' and sum 'Loss' and 'Profit'
grouped_data = postalcode_data.groupby('PostalCode')[['Loss', 'Profit']].sum()

# Add a small constant to avoid zero frequencies
grouped_data += 0.5

print(grouped_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_relevant_cols['Margin'] = df_relevant_cols['TotalPremium'] - df_relevant_cols['TotalClaims']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  postalcode_data['Loss'] = postalcode_data['Margin'].apply(lambda x: 1 if x < 0 else 0)


            Loss  Profit
PostalCode              
1           12.5  2838.5
2            6.5   647.5
4            0.5    77.5
5            4.5   356.5
6            2.5   218.5
...          ...     ...
9781         3.5   429.5
9830         0.5    56.5
9868         0.5   100.5
9869         1.5   844.5
9870         0.5   220.5

[888 rows x 2 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  postalcode_data['Profit'] = postalcode_data['Margin'].apply(lambda x: 1 if x > 0 else 0)


In [84]:
# Convert grouped DataFrame to a list of lists
grouped_data_list = grouped_data.values.tolist()

print(grouped_data_list)

[[12.5, 2838.5], [6.5, 647.5], [0.5, 77.5], [4.5, 356.5], [2.5, 218.5], [0.5, 108.5], [1.5, 1023.5], [0.5, 160.5], [3.5, 398.5], [0.5, 384.5], [1.5, 384.5], [3.5, 132.5], [0.5, 204.5], [2.5, 457.5], [1.5, 329.5], [0.5, 11.5], [1.5, 39.5], [0.5, 230.5], [2.5, 308.5], [0.5, 36.5], [7.5, 596.5], [1.5, 296.5], [0.5, 181.5], [0.5, 50.5], [0.5, 100.5], [0.5, 111.5], [7.5, 1473.5], [0.5, 100.5], [209.5, 27709.5], [7.5, 803.5], [0.5, 150.5], [1.5, 99.5], [2.5, 108.5], [0.5, 40.5], [0.5, 58.5], [0.5, 50.5], [0.5, 90.5], [32.5, 5964.5], [1.5, 66.5], [0.5, 24.5], [7.5, 686.5], [3.5, 359.5], [10.5, 2073.5], [1.5, 185.5], [0.5, 362.5], [0.5, 150.5], [2.5, 296.5], [8.5, 700.5], [6.5, 791.5], [4.5, 133.5], [9.5, 1246.5], [13.5, 1082.5], [2.5, 295.5], [7.5, 1078.5], [0.5, 220.5], [7.5, 1856.5], [0.5, 0.5], [1.5, 137.5], [0.5, 73.5], [0.5, 121.5], [0.5, 0.5], [6.5, 266.5], [2.5, 108.5], [4.5, 1281.5], [5.5, 685.5], [1.5, 1042.5], [0.5, 130.5], [0.5, 772.5], [0.5, 27.5], [0.5, 170.5], [0.5, 192.5], [0.5

In [85]:
# Perform Chi-square test of independence
chi2_stat, p_value, dof, expected_values = chi2_contingency(grouped_data_list)

print(f'Chi-square statistic: {chi2_stat}')
print(f'p-value: {p_value}')
print(f'Degrees of freedom: {dof}')
print('Expected values:')
print(expected_values)

Chi-square statistic: 7611.3529282569325
p-value: 0.0
Degrees of freedom: 887
Expected values:
[[1.61257010e+01 2.83487430e+03]
 [3.69912607e+00 6.50300874e+02]
 [4.41180174e-01 7.75588198e+01]
 ...
 [5.71271763e-01 1.00428728e+02]
 [4.78510804e+00 8.41214892e+02]
 [1.25001049e+00 2.19749990e+02]]


In [89]:
pro=0.95
alpha = 1- pro
print('Significance=%0.3f, p = %0.3f' % (alpha, p_value))
if p_value <= alpha:
    print('Dependent, we reject the null hypothesis')
    print(H1)
    
else:
    print('Independent, we fail to reject the null hypothesis')
    print(H0)
    

Significance=0.050, p = 0.000
Dependent, we reject the null hypothesis
There is a relationship between postal code and margin. They are not independent


#### Testing risk differences between women and men

In [90]:
H0 = "There is no risk difference between women and men. They are independent"
H1 = "There is a risk difference between women and men. They are not independent"

In [94]:
gender_data = df_relevant_cols[['Gender', 'TotalClaims']]

# Create 'NoClaim' and 'ClaimTaken' columns
gender_data['NoClaim'] = gender_data['TotalClaims'].apply(lambda x: 1 if x == 0 else 0)
gender_data['ClaimTaken'] = gender_data['TotalClaims'].apply(lambda x: 1 if x > 0 else 0)

# Group by 'Province' and sum 'NoClaim' and 'ClaimTaken'
grouped_data = gender_data.groupby('Gender')[['NoClaim', 'ClaimTaken']].sum()

print(grouped_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gender_data['NoClaim'] = gender_data['TotalClaims'].apply(lambda x: 1 if x == 0 else 0)


               NoClaim  ClaimTaken
Gender                            
Female            6741          14
Male             42723          94
Not specified   947841        2680


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gender_data['ClaimTaken'] = gender_data['TotalClaims'].apply(lambda x: 1 if x > 0 else 0)


In [96]:
# Convert grouped DataFrame to a list of lists
grouped_data_list = grouped_data.values.tolist()

print(grouped_data_list)

[[6741, 14], [42723, 94], [947841, 2680]]


In [97]:
# Perform Chi-square test of independence
chi2_stat, p_value, dof, expected_values = chi2_contingency(grouped_data_list)

print(f'Chi-square statistic: {chi2_stat}')
print(f'p-value: {p_value}')
print(f'Degrees of freedom: {dof}')
print('Expected values:')
print(expected_values)

Chi-square statistic: 6.9921715769627895
p-value: 0.030315813998297487
Degrees of freedom: 2
Expected values:
[[6.73616881e+03 1.88311887e+01]
 [4.26976373e+04 1.19362695e+02]
 [9.47871194e+05 2.64980612e+03]]


In [98]:
pro=0.95
alpha = 1- pro
print('Significance=%0.3f, p = %0.3f' % (alpha, p_value))
if p_value <= alpha:
    print('Dependent, we reject the null hypothesis')
    print(H1)
    
else:
    print('Independent, we fail to reject the null hypothesis')
    print(H0)
    

Significance=0.050, p = 0.030
Dependent, we reject the null hypothesis
There is a risk difference between women and men. They are not independent
