In [198]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import chi2
from scipy.stats import normaltest
pd.options.display.float_format = '{:.4f}'.format 

Loading the data

In [None]:
# Loading data for normality test
df = pd.read_csv('Data/loan.csv')
print('df is loaded')

# Remove the columns that are not used in the analysis below.
df = df.drop(columns=['issue_d', 'earliest_cr_line', 'last_pymnt_d',
                      'last_credit_pull_d', 'id', 'member_id', 'settlement_date',
                      'next_pymnt_d', 'zip_code'])

df = df.infer_objects()
# Keep only the columns that have at least 30% non-missing values.
# `thresh` is passed on its own: pandas does not allow combining it with `how`
# (recent versions raise a TypeError when both are given).
df = df.dropna(axis=1, thresh=int(0.3 * len(df)))
print(df.shape)

# Drop every row whose loan_status is one of these values, in a single pass.
excluded_statuses = ['Current', 'Late (31-120 days)', 'Late (16-30 days)',
                     'In Grace Period', 'Default']
# .copy() makes the filtered frame independent, so the column assignment below
# modifies `df` itself instead of a view/temporary of the original frame.
df = df.loc[~df.loan_status.isin(excluded_statuses)].copy()

dictionary = {'Does not meet the credit policy. Status:Fully Paid':'Fully Paid',
             'Does not meet the credit policy. Status:Charged Off':'Charged Off'}

# Assign the result back: an in-place replace on a selected column is not
# guaranteed to update `df` (it does not under pandas Copy-on-Write).
df['loan_status'] = df['loan_status'].replace(dictionary)
# A bare expression in the middle of a cell is never displayed, so print it.
print(df['loan_status'].value_counts(normalize=True))

categorical = df.select_dtypes(include=['object'])
numerical = df.select_dtypes(exclude=['object'])

# Fill missing values: the literal 'other' for object columns, the per-column
# median for all remaining columns.
categorical = categorical.fillna('other')
numerical = numerical.fillna(numerical.median())

In [182]:
# Loading data for chi2 test
# Read the training file and discard the 'Unnamed: 0' column in one expression.
df = pd.read_csv('Data/Binary_label_K_bins_train.csv').drop(columns='Unnamed: 0')

# Normality test

In [199]:
# D'Agostino's K^2 normality test, computed along axis 0 (one result per column
# of `numerical`).
stat, p = normaltest(numerical, axis=0)

# One row per column of `numerical`: its name, test statistic and p-value.
normality_test = pd.DataFrame(
    {
        'feature': numerical.columns,
        'statistic': stat,
        'p-value': p,
    }
)

In [207]:
# Display the table of normality-test results built in the previous cell.
normality_test

Unnamed: 0,feature,statistic,p-value
0,loan_amnt,105354.4942,0.0
1,funded_amnt,105660.8966,0.0
2,funded_amnt_inv,105419.5597,0.0
3,int_rate,98178.1892,0.0
4,installment,172480.4441,0.0
5,annual_inc,4032449.766,0.0
6,dti,3244214.3085,0.0
7,delinq_2yrs,1445901.387,0.0
8,inq_last_6mths,617958.816,0.0
9,mths_since_last_delinq,196813.0188,0.0


# chi2 test

* Identify the features with the highest predictive value for the training label

In [183]:
# Separate the label from the features: `y` is a one-column DataFrame holding
# 'training labels', `X` is every other column of `df`.
y = df.loc[:, ['training labels']]
X = df.drop(columns='training labels')

In [184]:
# Chi-squared test of X against y; unpacks into the statistics and p-values.
scores, pvalues = chi2(X=X, y=y)

In [205]:
# Collect the chi2 results per column of X, then discard any row that
# contains a missing value.
chi2_table = pd.DataFrame(
    {
        'feature': X.columns,
        'statistic': scores,
        'p-value': pvalues,
    }
).dropna()

In [206]:
# Show the chi2 results ordered from the largest statistic to the smallest.
chi2_table.sort_values(by='statistic', ascending=False)

Unnamed: 0,feature,statistic,p-value
22,last_pymnt_amnt,62822.9637,0.0
17,total_rec_prncp,42694.9679,0.0
15,total_pymnt,22972.1608,0.0
16,total_pymnt_inv,22906.4785,0.0
79,grade_A,20865.604,0.0
83,grade_E,14766.6788,0.0
97,sub_grade_other,13072.8918,0.0
3,int_rate,10572.7377,0.0
82,grade_D,9809.4045,0.0
80,grade_B,7833.3168,0.0
