In [34]:
import pandas as pd
import numpy as np
import seaborn as sns
import warnings
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import roc_auc_score
warnings.simplefilter(action='ignore')

In [9]:
data = pd.read_csv('data/clean_data_ohe.csv')
print(data.shape)
target = data['bad']
data.drop(['bad', 'DaysInCollection_one'], axis=1, inplace=True)

(17831, 42)


In [10]:
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.15, random_state=42)

In [13]:
num_feats = ['mob', 'MOB_term', 'Credit_TermApr', 'maxdelay_one', 'MA_AGE', 'MA_MONTH_AT_CURR_ADDRESS', 'MA_MONTH_AT_CURR_PASSP', 'MA_MONTH_AT_CURR_JOB',
            'MA_Time_Previous_Job', 'MA_Proposed_Amount', 'TOT_INCOME', 'PARTWH_INCOME', 'PAYMD2TOTPAYM', 'PTI', 'ratio_curr_cap_share', 'larger_diff_term', 'ratio_inst_amount', 'freq_nnkd']

In [6]:
data[num_feats]

Unnamed: 0,mob,MOB_term,Credit_TermApr,maxdelay_one,DaysInCollection_one,MA_AGE,MA_MONTH_AT_CURR_ADDRESS,MA_MONTH_AT_CURR_PASSP,MA_MONTH_AT_CURR_JOB,MA_Time_Previous_Job,MA_Proposed_Amount,TOT_INCOME,PARTWH_INCOME,PAYMD2TOTPAYM,PTI,ratio_curr_cap_share,larger_diff_term,ratio_inst_amount,freq_nnkd
0,11,0.180328,61,27,17,39,419,48,71,4,485000,44700.00,1.0,0.324721,0.312136,0.092341,1,228.728852,8
1,8,0.666667,12,16,18,54,245,108,15,84,175000,51370.00,1.0,0.823204,0.322132,0.886047,1,1378.994167,6
2,6,0.500000,12,16,4,54,245,108,15,84,175000,51370.00,1.0,0.802756,0.322132,0.441420,1,1378.994167,6
3,8,0.131148,61,29,32,37,60,49,54,156,400000,50000.00,1.0,0.323370,0.230144,0.059780,1,188.642459,8
4,6,0.162162,37,48,32,36,249,6,23,38,240000,36726.85,0.0,0.461872,0.282515,0.087343,1,280.429189,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17826,12,0.705882,17,3,2,25,88,52,9,2,50000,17000.00,0.0,0.781559,0.205794,1.207949,1,205.793529,6
17827,2,0.032787,61,0,15,23,89,39,5,6,200000,32000.00,1.0,0.319020,0.172618,0.009013,1,90.553770,11
17828,8,0.163265,49,0,1,26,107,57,6,26,150000,22900.00,1.0,0.435282,0.206372,0.106248,1,96.447143,1
17829,4,0.166667,24,0,4,41,96,65,83,60,40000,14500.00,1.0,0.616386,0.156886,0.111367,1,94.785000,1


In [43]:
sc = StandardScaler()
X_train_scaled, X_test_scaled = X_train.copy(), X_test.copy()

X_train_scaled[num_feats] = sc.fit_transform(X_train_scaled[num_feats])
X_test_scaled[num_feats] = sc.transform(X_test_scaled[num_feats])

In [96]:
#c_list = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5]
c_list = np.linspace(0.002, 0.005, num=50)
tol_list = [0.01]
l1_ration_list = [0.45, 0.5, 0.55]

In [None]:
%%time
max_par = [0, 0, 0]
for c in c_list:
    for tol in tol_list:
        log_reg = LogisticRegression(penalty='l2', solver='lbfgs', C=c, tol=tol, random_state=0, class_weight='balanced').fit(X_train_scaled, y_train)
        pred = log_reg.predict(X_test_scaled)
        score = roc_auc_score(y_test, pred)
        if max_par[2] < score:
            max_par[0] = c
            max_par[1] = tol
            max_par[2] = score
        print('C: {}, tol: {}, score: {:<8.7f}'.format(c, tol, score))

print(max_par)

In [75]:
log_reg = LogisticRegression(penalty='l2', solver='lbfgs', C=0.05931034482758621, tol=0.1, random_state=0, class_weight='balanced').fit(X_train_scaled, y_train)
pred = log_reg.predict(X_test_scaled)
roc_auc_score(y_test, pred)

0.6897996357012751

In [None]:
%%time
max_par = [0, 0, 0]
for c in c_list:
    for tol in tol_list:
        log_reg = LogisticRegression(penalty='l1', solver='liblinear', C=c, tol=tol, random_state=0, class_weight='balanced').fit(X_train_scaled, y_train)
        pred = log_reg.predict(X_test_scaled)
        score = roc_auc_score(y_test, pred)
        if max_par[2] < score:
            max_par[0] = c
            max_par[1] = tol
            max_par[2] = score
        print('C: {}, tol: {}, score: {:<8.7f}'.format(c, tol, score))

print(max_par)

In [85]:
log_reg = LogisticRegression(penalty='l1', solver='liblinear', C=0.0027894736842105266, tol=0.01, random_state=0, class_weight='balanced').fit(X_train_scaled, y_train)
pred = log_reg.predict(X_test_scaled)
roc_auc_score(y_test, pred)

0.6956826264203313

In [None]:
%%time
max_par = [0, 0, 0, 0]
for c in c_list:
    for tol in tol_list:
        for ll in l1_ration_list:
            log_reg = LogisticRegression(penalty='elasticnet', solver='saga', C=c, tol=tol, l1_ratio=ll, random_state=0, class_weight='balanced').fit(X_train_scaled, y_train)
            pred = log_reg.predict(X_test_scaled)
            score = roc_auc_score(y_test, pred)
            if max_par[3] < score:
                max_par[0] = c
                max_par[1] = tol
                max_par[2] = ll
                max_par[3] = score
            print('C: {}, tol: {}, ll: {}, score: {:<8.7f}'.format(c, tol, ll, score))

print(max_par)

In [98]:
log_reg = LogisticRegression(penalty='elasticnet', solver='saga', C=0.003408163265306122, tol=0.01, l1_ratio=0.45, random_state=0, class_weight='balanced').fit(X_train_scaled, y_train)
pred = log_reg.predict(X_test_scaled)
roc_auc_score(y_test, pred)

0.6973479920201231

In [120]:
log_reg.coef_[0]

array([-0.00466085,  0.        ,  0.        ,  0.98433908,  0.        ,
        0.02411937,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        , -0.07377623, -0.01678334,  0.        ,  0.        ,
        0.04257597,  0.        ,  0.        , -0.14719348,  0.00475934,
        0.05184266,  0.        ,  0.        , -0.53519138,  0.        ,
        0.        , -0.03201415,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        , -0.00884264,  0.        ])

In [112]:
indx = np.where(log_reg.coef_[0]!=0)

In [118]:
indx

(array([ 0,  3,  5, 11, 12, 15, 18, 19, 20, 23, 26, 38], dtype=int64),)

In [117]:
X_test.columns[indx]

Index(['mob', 'maxdelay_one', 'MA_AGE', 'MA_MONTH_AT_CURR_PASSP', 'MA_Exp_IND',
       'MA_Proposed_Amount', 'PAYMD2TOTPAYM', 'PTI', 'ratio_curr_cap_share',
       'freq_nnkd', 'MA_Education_2', 'MA_Residential_Status_4'],
      dtype='object')