In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import mglearn
%matplotlib inline

In [2]:
import os
path = 'E:\Python Consumer Credit'
os.chdir(path)

In [3]:
df_tr=pd.read_csv('df__under_tr.csv',header=None)
df_ts=pd.read_csv('df__under_ts.csv',header=None)

In [4]:
X_train = df_tr.iloc[:,:-1].values
y_train = df_tr.iloc[:,-1].values
X_test = df_ts.iloc[:,:-1].values
y_test = df_ts.iloc[:,-1].values
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(557088, 14)
(139272, 14)
(557088,)
(139272,)


In [5]:
unique, counts = np.unique(y_test,return_counts=True)
print(np.asarray((unique,counts)).T)

[[0.0000e+00 6.9358e+04]
 [1.0000e+00 6.9914e+04]]


#### Logistic Regression

In [5]:
from sklearn.linear_model import LogisticRegression
log = LogisticRegression(C=0.01, penalty='l2', solver='saga').fit(X_train,y_train)
y_tr_log_pred = log.predict(X_train)
y_ts_log_pred = log.predict(X_test)

In [6]:
from sklearn.metrics import confusion_matrix, accuracy_score
log_cm_tr = confusion_matrix(y_train,y_tr_log_pred)
print(log_cm_tr)
accuracy_score(y_train, y_tr_log_pred)

[[163763 115059]
 [115401 162865]]


0.5863131139066

In [7]:
log_cm_ts = confusion_matrix(y_test,y_ts_log_pred)
print(log_cm_ts)
accuracy_score(y_test, y_ts_log_pred)

[[40905 28453]
 [29022 40892]]


0.587318341087943

## Applying k-fold Cross Validation

In [8]:
from sklearn.model_selection import KFold, cross_val_score
accuracies = cross_val_score(estimator = log, X = X_train, y = y_train, cv = 5)
print('Accuracy: {:.2f} %'.format(accuracies.mean()*100))
print('Standard deviation: {:.2f} %'.format(accuracies.std()*100))

Accuracy: 58.62 %
Standard deviation: 0.15 %


In [9]:
accuracies

array([0.58476189, 0.58759805, 0.58666463, 0.58784566, 0.58430042])

## Grid Search

In [10]:
from sklearn.model_selection import GridSearchCV
parameters = [{'penalty': ['none'], 'solver':['newton-cg', 'sag', 'saga', 'lbfgs']},
              {'penalty': ['elasticnet'], 'C': [0.01, 0.1, 0.25, 0.5, 0.75, 1, 5, 10], 'solver':['saga']},
              {'penalty': ['l2'], 'C': [0.01, 0.1, 0.25, 0.5, 0.75, 1, 5, 10], 'solver':['newton-cg', 'sag', 'saga', 'lbfgs']}]
grid_search = GridSearchCV(estimator = log, 
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 5,
                           n_jobs = -1)
grid_search.fit(X_train, y_train)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print('Best accuracy: {:.2f} %'.format(best_accuracy*100))
print('Best parameters: ',best_parameters)

Best accuracy: 58.63 %
Best parameters:  {'C': 0.01, 'penalty': 'l2', 'solver': 'saga'}


## Metrics

In [8]:
from sklearn.metrics import roc_auc_score, jaccard_score, f1_score, precision_score, recall_score
print(roc_auc_score(y_test,y_ts_log_pred))
print(jaccard_score(y_test,y_ts_log_pred))
print(f1_score(y_test,y_ts_log_pred))
print(precision_score(y_test,y_ts_log_pred))
print(recall_score(y_test,y_ts_log_pred))

0.5873280743079788
0.41570852013378473
0.5872798167443397
0.5896892349844978
0.584890007723775
