# Machine Learning Project

You load a historical dataset from previous loan applications, clean the data, and apply different classification algorithm on the data. You are expected to use the following algorithms to build your models:

k-Nearest Neighbour,
Decision Tree,
Support Vector Machine,
Logistic Regression.

The results is reported as the accuracy of each classifier, using the following metrics when these are applicable:

Jaccard index,
F1-score,
LogLoass

# k-Nearest Neighbour

In [59]:
!wget -O loan_train.csv https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/ML0101ENv3/labs/loan_train.csv
#inport csv file

--2019-04-26 17:00:31--  https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/ML0101ENv3/labs/loan_train.csv
Resolving s3-api.us-geo.objectstorage.softlayer.net (s3-api.us-geo.objectstorage.softlayer.net)... 67.228.254.193
Connecting to s3-api.us-geo.objectstorage.softlayer.net (s3-api.us-geo.objectstorage.softlayer.net)|67.228.254.193|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 23101 (23K) [text/csv]
Saving to: ‘loan_train.csv’


2019-04-26 17:00:31 (11.7 MB/s) - ‘loan_train.csv’ saved [23101/23101]



In [60]:
import itertools
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import NullFormatter
import pandas as pd
import numpy as np
import matplotlib.ticker as ticker
from sklearn import preprocessing
%matplotlib inline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import jaccard_similarity_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
from sklearn.metrics import classification_report
# import library 

In [61]:
df = pd.read_csv("loan_train.csv")
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,loan_status,Principal,terms,effective_date,due_date,age,education,Gender
0,0,0,PAIDOFF,1000,30,9/8/2016,10/7/2016,45,High School or Below,male
1,2,2,PAIDOFF,1000,30,9/8/2016,10/7/2016,33,Bechalor,female
2,3,3,PAIDOFF,1000,15,9/8/2016,9/22/2016,27,college,male
3,4,4,PAIDOFF,1000,30,9/9/2016,10/8/2016,28,college,female
4,6,6,PAIDOFF,1000,30,9/9/2016,10/8/2016,29,college,male


In [62]:
df.isnull().any()
# check is there any missing value

Unnamed: 0        False
Unnamed: 0.1      False
loan_status       False
Principal         False
terms             False
effective_date    False
due_date          False
age               False
education         False
Gender            False
dtype: bool

In [63]:
df.columns
# check what kind of feature the file have

Index(['Unnamed: 0', 'Unnamed: 0.1', 'loan_status', 'Principal', 'terms',
       'effective_date', 'due_date', 'age', 'education', 'Gender'],
      dtype='object')

In [64]:
df['loan_status'].value_counts()
# check the variable

PAIDOFF       260
COLLECTION     86
Name: loan_status, dtype: int64

In [65]:
x = df[['Principal','terms','age']].values  
x[0:5]
# Set variable x

array([[1000,   30,   45],
       [1000,   30,   33],
       [1000,   15,   27],
       [1000,   30,   28],
       [1000,   30,   29]])

In [66]:
df['loan_status'].replace('PAIDOFF','0', inplace = True)
df['loan_status'].replace('COLLECTION','1', inplace = True)


In [67]:
x[0:5]

array([[1000,   30,   45],
       [1000,   30,   33],
       [1000,   15,   27],
       [1000,   30,   28],
       [1000,   30,   29]])

In [68]:
y = df['loan_status'].values.astype(float)
y[0:5]
# Set variable y

array([ 0.,  0.,  0.,  0.,  0.])

In [69]:
x = preprocessing.StandardScaler().fit(x).transform(x.astype(float))
x[0:5]



array([[ 0.51578458,  0.92071769,  2.33152555],
       [ 0.51578458,  0.92071769,  0.34170148],
       [ 0.51578458, -0.95911111, -0.65321055],
       [ 0.51578458,  0.92071769, -0.48739188],
       [ 0.51578458,  0.92071769, -0.3215732 ]])

In [70]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split( x, y, test_size=0.2, random_state=4)
print ('Train set:', x_train.shape,  y_train.shape)
print ('Test set:', x_test.shape,  y_test.shape)

Train set: (276, 3) (276,)
Test set: (70, 3) (70,)


In [71]:
from sklearn.neighbors import KNeighborsClassifier
k = 6

neigh = KNeighborsClassifier(n_neighbors = k).fit(x_train,y_train)
neigh

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=6, p=2,
           weights='uniform')

In [72]:
yhat = neigh.predict(x_test)
yhat[0:5]

array([ 0.,  0.,  0.,  1.,  0.])

In [73]:
from sklearn import metrics
print("Train set Accuracy: ", metrics.accuracy_score(y_train, neigh.predict(x_train)))
print("Test set Accuracy: ", metrics.accuracy_score(y_test, yhat))
print('KNN Jaccard:', jaccard_similarity_score(y_test, yhat))
print('KNN F1-Score:', f1_score(y_test, yhat, average='weighted'))

Train set Accuracy:  0.757246376812
Test set Accuracy:  0.742857142857
KNN Jaccard: 0.742857142857
KNN F1-Score: 0.669789227166


# Decision Tree

In [26]:
from sklearn.tree import DecisionTreeClassifier

In [27]:
x1 = df[['loan_status', 'Principal','terms','age']].values  
x1[0:5]

array([['0', 1000, 30, 45],
       ['0', 1000, 30, 33],
       ['0', 1000, 15, 27],
       ['0', 1000, 30, 28],
       ['0', 1000, 30, 29]], dtype=object)

In [28]:
y1 = df['loan_status'].values
y1[0:5]

array(['0', '0', '0', '0', '0'], dtype=object)

In [29]:
x1_trainset, x1_testset, y1_trainset, y1_testset = train_test_split(x1, y1, test_size=0.3, random_state=3)
x1_trainset.shape, x1_testset.shape

((242, 4), (104, 4))

In [30]:
y1_trainset.shape, y1_testset.shape

((242,), (104,))

In [31]:
Tree = DecisionTreeClassifier(criterion="entropy", max_depth = 4)
Tree # it shows the default parameters

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [32]:
Tree.fit(x1_trainset,y1_trainset)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [33]:
pTree = Tree.predict(x1_testset)
print (pTree [0:5])
print (y1_testset [0:5])

['0' '0' '1' '1' '0']
['0' '0' '1' '1' '0']


In [34]:
from sklearn import metrics
import matplotlib.pyplot as plt
print("DecisionTrees's Accuracy: ", metrics.accuracy_score(y1_testset, pTree))

DecisionTrees's Accuracy:  1.0


# Support Vector Machine

In [35]:
from sklearn import svm

In [36]:
x2 = df[['loan_status', 'Principal','terms','age']].values  
x2[0:5]

array([['0', 1000, 30, 45],
       ['0', 1000, 30, 33],
       ['0', 1000, 15, 27],
       ['0', 1000, 30, 28],
       ['0', 1000, 30, 29]], dtype=object)

In [37]:
y2 = df['loan_status'].values
y2[0:5]

array(['0', '0', '0', '0', '0'], dtype=object)

In [38]:
x2_train, x2_test, y2_train, y2_test = train_test_split(x2, y2, test_size=0.2, random_state=4)
print ('Train set:', x2_train.shape,  y2_train.shape)
print ('Test set:', x2_test.shape,  y2_test.shape)

Train set: (276, 4) (276,)
Test set: (70, 4) (70,)


In [39]:
clf = svm.SVC(kernel='rbf')
clf.fit(x2_train, y2_train) 

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [40]:
yhat = clf.predict(x2_test)
yhat [0:5]

array(['0', '0', '0', '0', '0'], dtype=object)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import itertools
# Evaluation

In [41]:
from sklearn.metrics import f1_score
f1_score(y_test, yhat, average='weighted') 
# f1 score

0.92298136645962725

In [42]:
from sklearn.metrics import jaccard_similarity_score
jaccard_similarity_score(y_test, yhat)
# jaccard score

0.9285714285714286

# Logistic Regression

In [44]:
df = df[['loan_status', 'Principal','terms','age','loan_status']]
df['loan_status'] = df['loan_status'].astype('int')
df.head()

Unnamed: 0,loan_status,loan_status.1,Principal,terms,age,loan_status.2,loan_status.3
0,0,0,1000,30,45,0,0
1,0,0,1000,30,33,0,0
2,0,0,1000,15,27,0,0
3,0,0,1000,30,28,0,0
4,0,0,1000,30,29,0,0


In [52]:
x3 = df[['loan_status', 'Principal','terms','age']].values  
x3[0:5]

array([[   0,    0,    0,    0, 1000,   30,   45],
       [   0,    0,    0,    0, 1000,   30,   33],
       [   0,    0,    0,    0, 1000,   15,   27],
       [   0,    0,    0,    0, 1000,   30,   28],
       [   0,    0,    0,    0, 1000,   30,   29]])

In [53]:
y3 = np.asarray(df['loan_status'])
y3[0:5]

array([[0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0]])

In [54]:
from sklearn import preprocessing
x3 = preprocessing.StandardScaler().fit(x3).transform(x3)
x3[0:5]



array([[-0.5751254 , -0.5751254 , -0.5751254 , -0.5751254 ,  0.51578458,
         0.92071769,  2.33152555],
       [-0.5751254 , -0.5751254 , -0.5751254 , -0.5751254 ,  0.51578458,
         0.92071769,  0.34170148],
       [-0.5751254 , -0.5751254 , -0.5751254 , -0.5751254 ,  0.51578458,
        -0.95911111, -0.65321055],
       [-0.5751254 , -0.5751254 , -0.5751254 , -0.5751254 ,  0.51578458,
         0.92071769, -0.48739188],
       [-0.5751254 , -0.5751254 , -0.5751254 , -0.5751254 ,  0.51578458,
         0.92071769, -0.3215732 ]])

In [55]:
from sklearn.model_selection import train_test_split
x3_train, x3_test, y3_train, y3_test = train_test_split( x3, y3, test_size=0.2, random_state=4)
print ('Train set:', x3_train.shape,  y3_train.shape)
print ('Test set:', x3_test.shape,  y3_test.shape)

Train set: (276, 7) (276, 4)
Test set: (70, 7) (70, 4)


In [58]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
LR = LogisticRegression(C=0.01, solver='liblinear').fit(x3_train,y3_train)
LR

ValueError: bad input shape (276, 4)

In [57]:
yhat = LR.predict(x3_test)
yhat

NameError: name 'LR' is not defined