In [0]:
import pandas as pd
import numpy as np
from google.colab import drive
from scipy import stats
from sklearn.feature_selection import SelectKBest, chi2

In [0]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
file_path = '/content/drive/My Drive/accident.xlsx'

In [0]:
df = pd.read_excel(file_path)

In [0]:
df.head()

Unnamed: 0,is_adult,is_male,accident
0,1,0,1
1,1,1,1
2,1,1,0
3,1,1,0
4,1,0,0


In [0]:
adult_accident_crosstab = pd.crosstab(df['is_adult'], df['accident'], 
                                      margins=True)
adult_accident_crosstab

accident,0,1,All
is_adult,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,781,146,927
All,781,146,927


In [0]:
gender_accident_crosstab = pd.crosstab(df['is_male'], df['accident'], 
                                       margins=True)
gender_accident_crosstab

accident,0,1,All
is_male,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,154,47,201
1,627,99,726
All,781,146,927


In [0]:
def check_categorical_dependency(crosstab_table, confidence_interval):
    stat, p, dof, expected = stats.chi2_contingency(crosstab_table)
    print ("Chi-Square Statistic value = {}".format(stat))
    print ("P - Value = {}".format(p))
    alpha = 1.0 - confidence_interval
    if p <= alpha:
        print('Dependent (reject H0)')
    else:
	      print('Independent (fail to reject H0)')
    return expected

In [0]:
exp_table_1 = check_categorical_dependency(adult_accident_crosstab, 0.95)

Chi-Square Statistic value = 0.0
P - Value = 1.0
Independent (fail to reject H0)


In [0]:
pd.DataFrame(exp_table_1)

Unnamed: 0,0,1,2
0,781.0,146.0,927.0
1,781.0,146.0,927.0


In [0]:
exp_table_2 = check_categorical_dependency(gender_accident_crosstab, 0.95)

Chi-Square Statistic value = 11.270043347013548
P - Value = 0.023691007358727482
Dependent (reject H0)


In [0]:
pd.DataFrame(exp_table_2)

Unnamed: 0,0,1,2
0,169.343042,31.656958,201.0
1,611.656958,114.343042,726.0
2,781.0,146.0,927.0


# **Feature Selection using Chi-Square**

In [0]:
X = df[["is_adult",	"is_male"]]

In [0]:
y = df[["accident"]]

In [0]:
X_new = SelectKBest(chi2, k=1).fit_transform(X, y)

In [0]:
X_new.shape

(927, 1)

In [0]:
pd.crosstab(np.squeeze(X_new), np.squeeze(y))

accident,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,154,47
1,627,99
