In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression

In [6]:
# Load the raw credit-scoring workbook into a DataFrame.
# NOTE(review): hard-coded absolute path; consider a configurable data directory.
dataset_path = '/content/a_Dataset_CreditScoring.xlsx'
dataset = pd.read_excel(dataset_path)

Data Preparation

In [7]:
# Sanity-check the dimensions of the loaded frame as (rows, columns).
dataset.shape

(3000, 30)

In [8]:
# Preview the first rows to eyeball the column names and typical values.
dataset.head()

Unnamed: 0,TARGET,ID,DerogCnt,CollectCnt,BanruptcyInd,InqCnt06,InqTimeLast,InqFinanceCnt24,TLTimeFirst,TLTimeLast,...,TL50UtilCnt,TLBalHCPct,TLSatPct,TLDel3060Cnt24,TLDel90Cnt24,TLDel60CntAll,TLOpenPct,TLBadDerogCnt,TLDel60Cnt24,TLOpen24Pct
0,1,582,3,3,0,4,0.0,5,117,27,...,3.0,0.9179,0.2083,2,3,7,0.2083,4,4,0.0
1,1,662,15,9,0,3,1.0,3,14,14,...,1.0,0.8,0.0,0,0,0,1.0,12,0,1.0
2,1,805,0,0,0,1,5.0,1,354,7,...,5.0,0.3552,0.6538,0,1,1,0.7308,1,1,0.5263
3,1,1175,8,5,0,6,1.0,10,16,4,...,3.0,0.9127,0.25,1,1,1,0.75,7,1,1.3333
4,1,1373,3,1,0,9,0.0,8,130,52,...,1.0,1.2511,0.0,0,1,4,0.1429,3,1,0.0


In [9]:
# Remove the 'ID' column (presumably a record identifier rather than a
# feature) so it is not passed to the model.
# errors='ignore' keeps this cell idempotent: re-running it after 'ID' has
# already been dropped returns the frame unchanged instead of raising KeyError.
dataset = dataset.drop(columns='ID', errors='ignore')
dataset.shape

(3000, 29)

In [10]:
# Count missing values per column to see which columns need imputation.
dataset.isna().sum()

Unnamed: 0,0
TARGET,0
DerogCnt,0
CollectCnt,0
BanruptcyInd,0
InqCnt06,0
InqTimeLast,188
InqFinanceCnt24,0
TLTimeFirst,0
TLTimeLast,0
TLCnt03,0


In [11]:
# Mean-impute missing values, column by column.
# NOTE(review): the means are computed over the full dataset, before the
# train/test split further down, so test rows influence the fill values.
# Consider imputing with statistics from the training split only.
column_means = dataset.mean()
dataset = dataset.fillna(column_means)

In [12]:
# Re-check the per-column missing counts to confirm the imputation left no NaNs.
dataset.isna().sum()

Unnamed: 0,0
TARGET,0
DerogCnt,0
CollectCnt,0
BanruptcyInd,0
InqCnt06,0
InqTimeLast,0
InqFinanceCnt24,0
TLTimeFirst,0
TLTimeLast,0
TLCnt03,0


Train Test Split

In [18]:
# Separate the label from the features by column name rather than by
# hard-coded positions, so the selection stays correct if columns are added,
# removed or reordered. On the current frame ('TARGET' first, 28 feature
# columns after it) this yields the same arrays as positional slicing.
TARGET_COLUMN = 'TARGET'
y = dataset[TARGET_COLUMN].values
X = dataset.drop(columns=TARGET_COLUMN).values

In [19]:
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# ratio the same in both partitions; the fixed seed makes the split repeatable.
split_options = dict(test_size=0.2, random_state=0, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [21]:
# Standardise the features: learn the scaling statistics on the training
# split only, then apply those same statistics to both splits so no
# information from the test rows enters the scaler.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

Risk Model building

In [22]:
# Fit a logistic-regression classifier with default settings on the
# standardised training data, then predict hard class labels for the
# held-out rows.
classifier = LogisticRegression().fit(X_train, y_train)
y_pred = classifier.predict(X_test)

Model Performance

In [23]:
# Confusion matrix on the test split: rows are actual classes, columns are
# predicted classes.
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[487  13]
 [ 87  13]]


In [24]:
# Overall fraction of test rows classified correctly.
# NOTE(review): with the class imbalance visible in the confusion matrix
# (500 vs 100 test rows), accuracy alone is a weak signal. Always predicting
# class 0 would also score 500/600; inspect per-class recall as well.
test_accuracy = accuracy_score(y_test, y_pred)
print(test_accuracy)

0.8333333333333334


Writing output file

In [25]:
# Per-row class probabilities for the test split, one column per class
# (labelled prob_0 / prob_1 when written to the output file below).
predictions = classifier.predict_proba(X_test)
# Bare expression so the notebook displays the array.
predictions

array([[0.61644691, 0.38355309],
       [0.9885656 , 0.0114344 ],
       [0.87069686, 0.12930314],
       ...,
       [0.94450568, 0.05549432],
       [0.46756903, 0.53243097],
       [0.94014209, 0.05985791]])

In [28]:
# Exporting Logistic Regression Classifier for later use in prediction

import joblib

# The classifier was trained on standardised features, so any later
# prediction on raw data must first apply the same fitted scaler. Persist it
# next to the model; without it the saved classifier cannot be used correctly.
joblib.dump(sc, '/content/f2_Normalisation_CreditScoring')

# Dump the classifier last so the cell still displays its file path.
joblib.dump(classifier, '/content/f1_Classifier_CreditScoring')

['/content/f1_Classifier_CreditScoring']

In [29]:
# writing model output file

# Assemble one row per test observation: the true label, both class
# probabilities and the predicted label. The three frames are built from
# arrays derived from the same test split and share the default integer
# index, so the column-wise concat lines them up row by row.
df_test_dataset = pd.DataFrame(y_test, columns=['Actual Outcome'])
df_prediction_prob = pd.DataFrame(predictions, columns=['prob_0', 'prob_1'])
df_prediction_target = pd.DataFrame(classifier.predict(X_test), columns=['predicted_TARGET'])

dfx = pd.concat([df_test_dataset, df_prediction_prob, df_prediction_target], axis=1)

# Write the predictions to their own CSV file. The path must differ from
# '/content/f1_Classifier_CreditScoring', where the trained classifier is
# saved with joblib: writing the CSV there would overwrite the saved model.
dfx.to_csv('/content/c1_Model_Prediction.csv', sep=',', encoding='UTF-8')

dfx.head()

Unnamed: 0,Actual Outcome,prob_0,prob_1,predicted_TARGET
0,1,0.616447,0.383553,0
1,0,0.988566,0.011434,0
2,1,0.870697,0.129303,0
3,0,0.953963,0.046037,0
4,1,0.726633,0.273367,0
