In [1]:
#!pip install scikit-learn==0.23.1

# Import necessary libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
%matplotlib inline
from sklearn import metrics
from sklearn.metrics import jaccard_score
from sklearn.metrics import log_loss
from sklearn.metrics import f1_score

# download dataset
# dataset: historical dataset from previous loan applications

!wget -O loan_train.csv https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/ML0101ENv3/labs/loan_train.csv

--2021-04-16 08:15:15--  https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/ML0101ENv3/labs/loan_train.csv
Resolving s3-api.us-geo.objectstorage.softlayer.net (s3-api.us-geo.objectstorage.softlayer.net)... 67.228.254.196
Connecting to s3-api.us-geo.objectstorage.softlayer.net (s3-api.us-geo.objectstorage.softlayer.net)|67.228.254.196|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 23101 (23K) [text/csv]
Saving to: ‘loan_train.csv’


2021-04-16 08:15:16 (249 KB/s) - ‘loan_train.csv’ saved [23101/23101]



In [2]:
df = pd.read_csv('loan_train.csv')
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,loan_status,Principal,terms,effective_date,due_date,age,education,Gender
0,0,0,PAIDOFF,1000,30,9/8/2016,10/7/2016,45,High School or Below,male
1,2,2,PAIDOFF,1000,30,9/8/2016,10/7/2016,33,Bechalor,female
2,3,3,PAIDOFF,1000,15,9/8/2016,9/22/2016,27,college,male
3,4,4,PAIDOFF,1000,30,9/9/2016,10/8/2016,28,college,female
4,6,6,PAIDOFF,1000,30,9/9/2016,10/8/2016,29,college,male


In [3]:
# convert pandas data frame to numpy array

df['Gender'].replace(to_replace=['male','female'], value=[0,1],inplace=True)

Feature = df[['Principal','terms','age','Gender']]
Feature = pd.concat([Feature,pd.get_dummies(df['education'])], axis=1)
Feature.drop(['Master or Above'], axis = 1,inplace=True)

X = Feature 
# labels
y = df['loan_status'].values
# normalize data
X= preprocessing.StandardScaler().fit(X).transform(X)

In [4]:
# train test split 
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=4)

### Building model using KNN, finding the best k and accuracy evaluation

In [5]:
from sklearn.neighbors import KNeighborsClassifier

accur = []
for k in range(1,100):
    neighbours = KNeighborsClassifier(n_neighbors=k)
    neighbours.fit(X_train,y_train)
    y_k = neighbours.predict(X_test)
    accur.append(metrics.accuracy_score(y_k,y_test))
    
max_k = accur.index(max(accur))
knn = KNeighborsClassifier(n_neighbors=max_k).fit(X_train,y_train)
y_pred_KNN = knn.predict(X_test)
y_KNN_prob = knn.predict_proba(X_test)

print("Using K-Nearest-Neighbors, maximum accuracy was achieved for k = ",max_k, ", and accuracy score ",max(accur) ,".")

Using K-Nearest-Neighbors, maximum accuracy was achieved for k =  12 , and accuracy score  0.8 .


In [6]:
# Jaccard score
print("The Jaccard score is : ",jaccard_score(y_test,y_pred_KNN,pos_label = "PAIDOFF"), ". ")

#F1-score
print("The F1-score is : ",f1_score(y_test, y_pred_KNN, pos_label = 'PAIDOFF', labels =['COLLECTION','PAIDOFF']), ". ")

# LogLoss: measures the performance of a classifier
print("The Log-Loss is : ",log_loss(y_test, y_KNN_prob, labels =['COLLECTION','PAIDOFF']), ". ")

The Jaccard score is :  0.7391304347826086 . 
The F1-score is :  0.8500000000000001 . 
The Log-Loss is :  0.576418964684553 . 


### Building model using Decision Tree and find the accuracy evaluation

In [7]:
from sklearn.tree import DecisionTreeClassifier

In [8]:
loanTree = DecisionTreeClassifier(criterion = "entropy", max_depth = 4)
loanTree.fit(X_train,y_train)
y_pred_DT = loanTree.predict(X_test)
y_DT_prob = loanTree.predict_proba(X_test)
print("Using DecisionTree, an accuracy of", metrics.accuracy_score(y_test, y_pred_DT), " was achieved.")

Using DecisionTree, an accuracy of 0.7857142857142857  was achieved.


In [9]:
# Jaccard score
print("The Jaccard score is : ",jaccard_score(y_test,y_pred_DT,pos_label = "PAIDOFF"), ". ")

#F1-score
print("The F1-score is : ",f1_score(y_test, y_pred_DT, pos_label = 'PAIDOFF', labels =['COLLECTION','PAIDOFF']), ". ")

# LogLoss: measures the performance of a classifier
print("The Log-Loss is : ",log_loss(y_test, y_DT_prob, labels =['COLLECTION','PAIDOFF']), ". ")

The Jaccard score is :  0.7857142857142857 . 
The F1-score is :  0.88 . 
The Log-Loss is :  0.9357519410468473 . 


### Building model using SVM and find the accuracy evaluation 

In [10]:
from sklearn import svm

In [11]:
# kernel functions: 1. 'linear', 2. 'poly', 3. 'rbf', 4. 'sigmoid'
kernel_f = ['linear', 'poly', 'rbf','sigmoid']

In [12]:
acc = []
for f in kernel_f:
    clf = svm.SVC(kernel = f, probability = True)
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    accur = jaccard_score(y_test,y_pred, pos_label = "PAIDOFF")
    acc.append(accur)
print("Using Support Vecor Machines, an accuracy of", max(acc), " was achieved.")

f = kernel_f[acc.index(max(acc))]
svm_model = svm.SVC(kernel = f, probability = True).fit(X_train,y_train)
y_pred_SVM = svm_model.predict(X_test)
y_SVM_prob = svm_model.predict_proba(X_test)

Using Support Vecor Machines, an accuracy of 0.7857142857142857  was achieved.


In [13]:
# Jaccard score
print("The Jaccard score is : ",jaccard_score(y_test,y_pred_SVM,pos_label = "PAIDOFF"), ". ")

#F1-score
print("The F1-score is : ",f1_score(y_test, y_pred_SVM, pos_label = 'PAIDOFF', labels =['COLLECTION','PAIDOFF']), ". ")

# LogLoss: measures the performance of a classifier
print("The Log-Loss is : ",log_loss(y_test, y_SVM_prob, labels =['COLLECTION','PAIDOFF']), ". ")

The Jaccard score is :  0.7857142857142857 . 
The F1-score is :  0.88 . 
The Log-Loss is :  0.5218223060917082 . 


### Building model using Logistic Regression and find the accuracy evaluation 

In [14]:
from sklearn.linear_model import LogisticRegression

In [15]:
LR = LogisticRegression(C=0.01,solver='liblinear')
LR.fit(X_train,y_train)
y_pred_LR = LR.predict(X_test)

y_LR_prob = LR.predict_proba(X_test)

print("Using Logistic Regression, an accuracy of", metrics.accuracy_score(y_test, y_pred_LR), " was achieved.")

# Jaccard
print("The Jaccard score is : ",jaccard_score(y_test,y_pred_LR,pos_label = "PAIDOFF"), ". ")

#f1 score
print("The F1-score is : ",f1_score(y_test, y_pred_LR, pos_label = 'PAIDOFF', labels =['COLLECTION','PAIDOFF']), ". ")

# LogLoss: measures the performance of a classifier
print("The Log-Loss is : ",log_loss(y_test, y_LR_prob, labels =['COLLECTION','PAIDOFF']), ". ")

Using Logistic Regression, an accuracy of 0.7857142857142857  was achieved.
The Jaccard score is :  0.7857142857142857 . 
The F1-score is :  0.88 . 
The Log-Loss is :  0.600866678869581 . 


In [16]:
# Thank you for reading this far :)