In [1]:
import pandas as pd
import numpy as np
import scorecardpy as sc

%pylab inline
from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

import statsmodels.api as sm

import lightgbm
import xgboost as xgb

import warnings
# Silence every warning category for the rest of the session; this also hides
# deprecation / convergence warnings emitted by the libraries imported above.
warnings.filterwarnings('ignore')

from xgboost import XGBClassifier, plot_importance

import pickle
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

from matplotlib import rc
# Set Verdana as the default matplotlib font family for all figures.
rc('font', family = 'Verdana')

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Load the raw dataset from Excel.
# NOTE(review): the path is absolute and machine-specific; the notebook only
# runs as-is on the machine where this file exists.
data = pd.read_excel(r'C:\Users\Alfa\Desktop\1.xlsx')
print(data.shape)

# List every column together with its inferred dtype.
for column_name, column_dtype in data.dtypes.items():
    print(column_name, ' - ', column_dtype)

(30000, 25)
ID  -  int64
LIMIT_BAL  -  int64
SEX  -  object
EDUCATION  -  object
MARRIAGE  -  object
AGE  -  int64
PAY_0  -  int64
PAY_2  -  int64
PAY_3  -  int64
PAY_4  -  int64
PAY_5  -  int64
PAY_6  -  int64
BILL_AMT1  -  int64
BILL_AMT2  -  int64
BILL_AMT3  -  int64
BILL_AMT4  -  int64
BILL_AMT5  -  int64
BILL_AMT6  -  int64
PAY_AMT1  -  int64
PAY_AMT2  -  int64
PAY_AMT3  -  int64
PAY_AMT4  -  int64
PAY_AMT5  -  int64
PAY_AMT6  -  int64
target  -  int64


In [12]:
# Remove the row identifier: it is not a predictive feature.
# errors='ignore' keeps this cell idempotent -- re-running it after 'ID' has
# already been dropped no longer raises KeyError.
data.drop(columns = ['ID'], inplace = True, errors = 'ignore')

In [13]:
# Replace every missing value with the sentinel -9999, modifying `data` in place.
data.fillna(value=-9999, inplace=True)

In [14]:
# Split the columns by dtype: object columns are one-hot encoded, all others
# are carried over unchanged.
is_text = (data.dtypes == 'object').values
text_features = list(data.columns[is_text])
numeric_features = list(data.columns[~is_text])

# drop_first=True drops the first level of each categorical feature, so every
# feature with k levels contributes k-1 dummy columns.
text_data = pd.get_dummies(data[text_features], prefix=text_features, drop_first=True)

# Non-object columns first, dummy columns appended on the right.
data_numeric = pd.concat([data[numeric_features], text_data], axis=1)

print(data.shape, data_numeric.shape)
data.head(5)

(30000, 24) (30000, 31)


Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,target
0,20000,female,bachelor,married,24,2,2,-1,-1,-2,...,0,0,0,0,689,0,0,0,0,1
1,120000,female,bachelor,single,26,-1,2,0,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,90000,female,bachelor,single,34,0,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,50000,female,bachelor,married,37,0,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,50000,male,bachelor,married,57,-1,0,-1,0,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [15]:
# Display the encoded frame: the non-object columns of `data` followed by the
# dummy columns built from the object columns.
data_numeric

Unnamed: 0,LIMIT_BAL,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,...,SEX_male,EDUCATION_bachelor,EDUCATION_college,EDUCATION_doctoral studies,EDUCATION_graduate school,EDUCATION_high school,EDUCATION_magistracy,MARRIAGE_divorced,MARRIAGE_married,MARRIAGE_single
0,20000,24,2,2,-1,-1,-2,-2,3913,3102,...,0,1,0,0,0,0,0,0,1,0
1,120000,26,-1,2,0,0,0,2,2682,1725,...,0,1,0,0,0,0,0,0,0,1
2,90000,34,0,0,0,0,0,0,29239,14027,...,0,1,0,0,0,0,0,0,0,1
3,50000,37,0,0,0,0,0,0,46990,48233,...,0,1,0,0,0,0,0,0,1,0
4,50000,57,-1,0,-1,0,0,0,8617,5670,...,1,1,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,220000,39,0,0,0,0,0,0,188948,192815,...,1,0,0,0,0,1,0,0,1,0
29996,150000,43,-1,-1,-1,-1,0,0,1683,1828,...,1,0,0,0,0,1,0,0,0,1
29997,30000,37,4,3,2,-1,0,0,3565,3356,...,1,1,0,0,0,0,0,0,0,1
29998,80000,41,1,-1,0,0,0,-1,-1645,78379,...,1,0,0,0,0,1,0,0,1,0


In [16]:
from sklearn.metrics import roc_auc_score

In [17]:
from sklearn.model_selection import StratifiedKFold

In [18]:
def individual_gini(v, frame=None, target='target'):
    """Estimate the stand-alone Gini of a single column against a binary target.

    Two estimates are computed and the larger one is returned:

    * "obvious" Gini -- ``|2 * AUC - 1|`` with the raw column values used
      directly as the score. Only computed for non-object columns; it is 0
      for object (categorical) columns.
    * "true" Gini -- ``|2 * AUC - 1|`` where AUC is the best mean
      cross-validated ROC AUC of a decision tree fitted on that column alone.
      The tree is tuned over ``min_weight_fraction_leaf`` with 4-fold
      stratified cross-validation. Object columns are one-hot encoded first.

    Parameters
    ----------
    v : str
        Name of the column to evaluate.
    frame : pandas.DataFrame, optional
        Frame holding both the column and the target. Defaults to the
        notebook-level ``data`` frame, which keeps the original one-argument
        call (e.g. via ``Series.apply``) working unchanged.
    target : str, optional
        Name of the binary target column inside the frame.

    Returns
    -------
    float
        ``max(obvious_gini, true_gini)``.
    """
    # Resolve the frame at call time so the default follows the current `data`.
    source = data if frame is None else frame
    y = source[target].values
    column = source[v]

    if column.dtypes == 'O':
        # Categorical column: raw values cannot be ranked, so only the tree
        # estimate on the one-hot encoding is meaningful.
        x = pd.get_dummies(column).values
        obvious_gini = 0
    else:
        # The tree needs a 2-D feature matrix; the raw score is the column itself.
        x = column.values.reshape(-1, 1)
        obvious_gini = abs(roc_auc_score(y, column.values) * 2 - 1)

    parameters = {'min_weight_fraction_leaf': [0.01, 0.025, 0.05, 0.1]}
    dt = DecisionTreeClassifier(random_state=123)
    kfolds = StratifiedKFold(4)
    # Pass the splitter itself rather than a one-shot generator of splits;
    # GridSearchCV calls kfolds.split(x, y) internally and yields the same folds.
    clf = GridSearchCV(dt, parameters, cv=kfolds, scoring='roc_auc')
    clf.fit(x, y)
    true_gini = abs(clf.best_score_ * 2 - 1)
    return max(obvious_gini, true_gini)

In [19]:
# Score every column of `data` with individual_gini, one row per column.
# `target` itself is part of data.columns, so it is scored against itself and
# shows up with a gini of 1.0.
df_vars_ginis = pd.DataFrame({'vars': data.columns.values})
df_vars_ginis['gini'] = [individual_gini(var_name) for var_name in df_vars_ginis['vars']]

print(df_vars_ginis)

         vars      gini
0   LIMIT_BAL  0.236761
1         SEX  0.047092
2   EDUCATION  0.080282
3    MARRIAGE  0.037808
4         AGE  0.066473
5       PAY_0  0.421178
6       PAY_2  0.297225
7       PAY_3  0.266843
8       PAY_4  0.238613
9       PAY_5  0.218534
10      PAY_6  0.203236
11  BILL_AMT1  0.059442
12  BILL_AMT2  0.050444
13  BILL_AMT3  0.050912
14  BILL_AMT4  0.050309
15  BILL_AMT5  0.072859
16  BILL_AMT6  0.064760
17   PAY_AMT1  0.225657
18   PAY_AMT2  0.210601
19   PAY_AMT3  0.193110
20   PAY_AMT4  0.181013
21   PAY_AMT5  0.161249
22   PAY_AMT6  0.169705
23     target  1.000000


In [20]:
# Partition the columns of `data` into categorical and numeric variables.
# A column is treated as categorical when it has object dtype or fewer than
# 4 distinct values; everything else is numeric.
categorical_vars, numeric_vars = [], []

for col in data.columns:
    is_categorical = data[col].dtypes == 'O' or data[col].nunique() < 4
    bucket = categorical_vars if is_categorical else numeric_vars
    bucket.append(col)

In [21]:
# Show both variable groups, separated by a visually blank line.
print('categorical: ', categorical_vars)
print(' ')
print('numerical: ', numeric_vars)

categorical:  ['SEX', 'EDUCATION', 'MARRIAGE', 'target']
 
numerical:  ['LIMIT_BAL', 'AGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']
