In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler

from sklearn import linear_model
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

import warnings
warnings.filterwarnings('ignore')

np.random.seed(27)

In [13]:
df = pd.read_csv('loan_data.csv')
df.head()

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,1,debt_consolidation,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0
1,1,credit_card,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,0
2,1,debt_consolidation,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0
3,1,debt_consolidation,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0
4,1,credit_card,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,0


In [14]:
df=df.rename(columns={"not.fully.paid": "not_fully_paid"})
X=df.drop(['purpose', 'not_fully_paid'], axis = 1) 
y=df['not_fully_paid']

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0) 

In [16]:
# scaling data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [17]:
# define models
ridge = linear_model.Ridge()
lasso = linear_model.Lasso()
elastic = linear_model.ElasticNet()
lasso_lars = linear_model.LassoLars()
bayesian_ridge = linear_model.BayesianRidge()
logistic = linear_model.LogisticRegression(solver='liblinear')
sgd = linear_model.SGDClassifier()

In [18]:
models = [ridge, lasso, elastic, lasso_lars, bayesian_ridge, logistic, sgd]

In [19]:
def get_cv_scores(model):
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='roc_auc')
    print('CV Mean: ', np.mean(scores))
    print('STD: ', np.std(scores))
    print('\n')

In [20]:
for model in models:
    print(model)
    get_cv_scores(model)

Ridge()
CV Mean:  0.6711591932643927
STD:  0.007442083906043405


Lasso()
CV Mean:  0.5
STD:  0.0


ElasticNet()
CV Mean:  0.5
STD:  0.0


LassoLars()
CV Mean:  0.5
STD:  0.0


BayesianRidge()
CV Mean:  0.6710967563191155
STD:  0.0070153942752273655


LogisticRegression(solver='liblinear')
CV Mean:  0.668921807277892
STD:  0.008072460072523584


SGDClassifier()
CV Mean:  0.5559095443104864
STD:  0.0709359495032159




# Grid Search

In [21]:
penalty = ['l1', 'l2']
C = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
class_weight = [{1:0.5, 0:0.5}, {1:0.4, 0:0.6}, {1:0.6, 0:0.4}, {1:0.7, 0:0.3}]
solver = ['liblinear', 'saga']

param_grid = dict(penalty=penalty,
                  C=C,
                  class_weight=class_weight,
                  solver=solver)

grid = GridSearchCV(estimator=logistic, param_grid=param_grid, scoring='roc_auc', verbose=1, n_jobs=-1)
grid_result = grid.fit(X_train, y_train)

print('Best Score: ', grid_result.best_score_)
print('Best Params: ', grid_result.best_params_)

Fitting 5 folds for each of 128 candidates, totalling 640 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:   10.4s
[Parallel(n_jobs=-1)]: Done 626 tasks      | elapsed:   29.3s
[Parallel(n_jobs=-1)]: Done 633 out of 640 | elapsed:   29.6s remaining:    0.2s


Best Score:  0.6697812020796645
Best Params:  {'C': 0.01, 'class_weight': {1: 0.4, 0: 0.6}, 'penalty': 'l2', 'solver': 'liblinear'}


[Parallel(n_jobs=-1)]: Done 640 out of 640 | elapsed:   29.9s finished


In [22]:
logistic = linear_model.LogisticRegression(C=0.01, class_weight={1:0.4, 0:0.6}, penalty='l2', solver='liblinear')
get_cv_scores(logistic)

CV Mean:  0.6697812020796645
STD:  0.007152866460124461




# Random Search

In [24]:
randomSearch = RandomizedSearchCV(estimator=logistic, param_distributions=param_grid, scoring='roc_auc', verbose=1, n_jobs=-1)
randomSearch_result = randomSearch.fit(X_train, y_train)

print('Best Score: ', randomSearch_result.best_score_)
print('Best Params: ', randomSearch_result.best_params_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Best Score:  0.6697812020796645
Best Params:  {'solver': 'liblinear', 'penalty': 'l2', 'class_weight': {1: 0.4, 0: 0.6}, 'C': 0.01}


[Parallel(n_jobs=-1)]: Done  43 out of  50 | elapsed:    1.3s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    1.4s finished
