In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.utils import resample
from sklearn.metrics import accuracy_score, classification_report

In [2]:
data = pd.read_csv('/Users/amarjotsinghlohia/Documents/Dissertation/Data/home-credit-default-risk/application_train_upsampled.csv')
data.sample(10)

Unnamed: 0.1,Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,...,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,ORGANIZATION_TYPE,AGE,EMPLOYMENT_AGE,REGISTRATION_AGE,previous_loan_counts,NO_OF_PREV_APP
71965,79320,252767,0,0,0,0,1,0,112500.0,595903.5,...,1,0,3,1.0,0,27.939726,9.065753,4.353425,0.0,5.0
69305,76404,247046,0,1,1,0,0,0,81000.0,202500.0,...,0,0,3,2.0,0,54.178082,29.479452,12.205479,15.0,0.0
74602,82239,258263,0,0,1,0,1,1,90000.0,270000.0,...,0,0,1,3.0,1,33.734247,1.315068,1.608219,0.0,2.0
25870,28483,154762,0,0,1,1,1,0,157500.0,1125000.0,...,0,0,1,2.0,1,38.241096,6.684932,5.726027,3.0,0.0
123923,136574,362887,0,0,0,0,0,0,157500.0,178290.0,...,0,0,1,1.0,0,26.150685,2.832877,11.613699,0.0,8.0
89918,99101,290630,0,0,0,0,0,0,112500.0,360000.0,...,0,0,0,2.0,0,42.89589,12.008219,2.271233,3.0,1.0
37082,40819,178636,0,0,0,0,1,0,67500.0,526491.0,...,1,0,0,2.0,0,51.79726,9.189041,16.273973,0.0,1.0
144349,159038,406014,0,0,1,1,1,0,94500.0,328500.0,...,0,0,1,1.0,0,54.849315,1.221918,4.241096,0.0,5.0
180655,184655,455264,1,0,1,0,1,1,45000.0,755190.0,...,0,0,1,3.0,0,41.405479,14.191781,14.476712,0.0,1.0
48058,52917,202131,0,0,1,1,0,0,76500.0,225000.0,...,0,0,0,2.0,0,50.731507,10.734247,14.29863,0.0,2.0


In [3]:
data['TARGET'].value_counts()

0    168113
1    168113
Name: TARGET, dtype: int64

In [4]:
168113/17063

9.8524878391842

In [5]:
#Splitting into train and test dataset
X = data.iloc[:,3:]
Y = data['TARGET']
X.shape, Y.shape

((336226, 28), (336226,))

In [6]:
#Splitting into training and test data

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.30, stratify= Y)
print(X.shape, X_train.shape, X_test.shape, Y.shape, Y_train.shape, Y_test.shape)

(336226, 28) (235358, 28) (100868, 28) (336226,) (235358,) (100868,)


In [7]:
model = LogisticRegression()
model.fit(X_train, Y_train)


In [8]:
print(model.score(X_test, Y_test))
Y_pred = model.predict(X_test)
print(classification_report(Y_pred,Y_test))

0.5697743585676329
              precision    recall  f1-score   support

           0       0.55      0.57      0.56     48908
           1       0.58      0.57      0.58     51960

    accuracy                           0.57    100868
   macro avg       0.57      0.57      0.57    100868
weighted avg       0.57      0.57      0.57    100868



### Grid Search CV

In [11]:
classifier = LogisticRegression()

In [12]:
parameter = {'penalty': ['l2'],
             'C': [1,2,3,4,5,6,10,20,30,40,50],
             'max_iter': [100,200,300]}

In [13]:
classifier_regressor = GridSearchCV(classifier, param_grid =parameter, scoring = 'accuracy', error_score='raise',cv = 5, verbose = 2)
classifier_regressor.fit(X_train, Y_train)

Fitting 5 folds for each of 33 candidates, totalling 165 fits
[CV] END ......................C=1, max_iter=100, penalty=l2; total time=   1.2s
[CV] END ......................C=1, max_iter=100, penalty=l2; total time=   1.1s
[CV] END ......................C=1, max_iter=100, penalty=l2; total time=   1.0s
[CV] END ......................C=1, max_iter=100, penalty=l2; total time=   0.9s
[CV] END ......................C=1, max_iter=100, penalty=l2; total time=   1.0s
[CV] END ......................C=1, max_iter=200, penalty=l2; total time=   0.9s
[CV] END ......................C=1, max_iter=200, penalty=l2; total time=   1.0s
[CV] END ......................C=1, max_iter=200, penalty=l2; total time=   0.7s
[CV] END ......................C=1, max_iter=200, penalty=l2; total time=   1.0s
[CV] END ......................C=1, max_iter=200, penalty=l2; total time=   0.9s
[CV] END ......................C=1, max_iter=300, penalty=l2; total time=   1.0s
[CV] END ......................C=1, max_iter=30

In [14]:
print(classifier_regressor.best_params_)

{'C': 1, 'max_iter': 100, 'penalty': 'l2'}


In [15]:
print(classifier_regressor.best_score_)

0.5669363277332717


In [16]:
##prediction
y_pred = classifier_regressor.predict(X_test)

In [17]:
score = accuracy_score(y_pred, Y_test)
print(score)

0.5697743585676329


In [18]:
report = classification_report(y_pred,Y_test)
print(report)

              precision    recall  f1-score   support

           0       0.55      0.57      0.56     48908
           1       0.58      0.57      0.58     51960

    accuracy                           0.57    100868
   macro avg       0.57      0.57      0.57    100868
weighted avg       0.57      0.57      0.57    100868

