In [50]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.feature_selection import SelectKBest, chi2

In [51]:
file_path = 'accident.xlsx'

In [52]:
df = pd.read_excel(file_path)
df.head(11)

Unnamed: 0,is_adult,is_male,accident
0,1,0,1
1,1,1,1
2,1,1,0
3,1,1,0
4,1,0,0
5,1,1,0
6,1,1,0
7,1,1,0
8,1,1,0
9,1,1,0


In [53]:
df.describe()

Unnamed: 0,is_adult,is_male,accident
count,927.0,927.0,927.0
mean,1.0,0.783172,0.157497
std,0.0,0.412307,0.364466
min,1.0,0.0,0.0
25%,1.0,1.0,0.0
50%,1.0,1.0,0.0
75%,1.0,1.0,0.0
max,1.0,1.0,1.0


In [54]:
df.columns

Index(['is_adult', 'is_male', 'accident'], dtype='object')

In [55]:
df.loc[(df.accident == 1)].head(8)

Unnamed: 0,is_adult,is_male,accident
0,1,0,1
1,1,1,1
11,1,0,1
13,1,0,1
15,1,1,1
16,1,1,1
20,1,0,1
21,1,0,1


In [56]:
adult_acc_crosstab = pd.crosstab(df['is_adult'], df['accident'], margins=True)
print(adult_acc_crosstab)

accident    0    1  All
is_adult               
1         781  146  927
All       781  146  927


In [57]:
male_acc_crosstab = pd.crosstab(df['is_male'], df['accident'], margins=True)
print(male_acc_crosstab)

accident    0    1  All
is_male                
0         154   47  201
1         627   99  726
All       781  146  927


In [77]:
# defining function to pass crosstab tables and confidence interval
def check_categorical_dependency(crosstab_table, confidence_interval):
    stat, p, dof, expected = stats.chi2_contingency(crosstab_table)
    print("chi square statistic value = {}".format(stat))
    print("pvalue = {}".format(p))
    print('DOF ={}'.format(dof))
    #print('DOF ={}'.format(expected))
    alpha = 1.0 - confidence_interval
    if p <= alpha:
        print('Dependent (reject H0)')
    else:
        print('Independent (fail to reject H0)')
    return expected
    

In [78]:
exp1 = check_categorical_dependency(adult_acc_crosstab, 0.95)

chi square statistic value = 0.0
pvalue = 1.0
DOF =2
Independent (fail to reject H0)


In [61]:
pd.DataFrame(exp1)

Unnamed: 0,0,1,2
0,781.0,146.0,927.0
1,781.0,146.0,927.0


In [48]:
exp2 = check_categorical_dependency(male_acc_crosstab, 0.95)

chi square statistic value = 11.270043347013548
pvalue = 0.023691007358727482
Dependent (reject H0)


In [62]:
pd.DataFrame(exp2)

Unnamed: 0,0,1,2
0,169.343042,31.656958,201.0
1,611.656958,114.343042,726.0
2,781.0,146.0,927.0


# Feature selection using chi-square

In [63]:
a = df[['is_adult', 'is_male']]

In [64]:
b = df[['accident']]

In [65]:
a_new = SelectKBest(chi2, k=1).fit_transform(a, b)

In [66]:
a_new.shape

(927, 1)

In [71]:
x = pd.crosstab(np.squeeze(a_new), np.squeeze(b))

In [72]:
print(x)

accident    0   1
row_0            
0         154  47
1         627  99
