In [45]:
# ------------------------------------------------------------------
# Build the Support Vector Classifier Model
# Predict the loan approval status based on 
# Gender, Marital Status, Credit History, Income and Loan Amount
# ------------------------------------------------------------------

# Import Libraries and read csv file

import pandas as pd
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
import seaborn as sns
file = pd.read_csv('01Exercise1.csv')

In [14]:
#find out columns with missing values
file.head()


Unnamed: 0,gender,married,ch,income,loanamt,status
0,Male,No,1.0,5849,,Y
1,Male,Yes,1.0,4583,128.0,N
2,Male,Yes,1.0,3000,66.0,Y
3,Male,Yes,1.0,2583,120.0,Y
4,Male,No,1.0,6000,141.0,Y


In [15]:
file.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   gender   601 non-null    object 
 1   married  611 non-null    object 
 2   ch       564 non-null    float64
 3   income   614 non-null    int64  
 4   loanamt  592 non-null    float64
 5   status   614 non-null    object 
dtypes: float64(2), int64(1), object(3)
memory usage: 28.9+ KB


In [16]:
# Replace Missing Values. Drop the rows.


In [17]:
file.isnull().sum()

gender     13
married     3
ch         50
income      0
loanamt    22
status      0
dtype: int64

In [21]:
file.columns

Index(['ch', 'income', 'loanamt', 'status'], dtype='object')

In [46]:
# Drop irrelevant columns based on business sense
file.drop(['gender'], inplace=True, axis=1)
file

Unnamed: 0,married,ch,income,loanamt,status
0,No,1.0,5849,,Y
1,Yes,1.0,4583,128.0,N
2,Yes,1.0,3000,66.0,Y
3,Yes,1.0,2583,120.0,Y
4,No,1.0,6000,141.0,Y
...,...,...,...,...,...
609,No,1.0,2900,71.0,Y
610,Yes,1.0,4106,40.0,Y
611,Yes,1.0,8072,253.0,Y
612,Yes,1.0,7583,187.0,Y


In [47]:
# Create Dummy variables
file.dtypes
file = pd.get_dummies(file, drop_first=True)
file

Unnamed: 0,ch,income,loanamt,married_Yes,status_Y
0,1.0,5849,,0,1
1,1.0,4583,128.0,1,0
2,1.0,3000,66.0,1,1
3,1.0,2583,120.0,1,1
4,1.0,6000,141.0,0,1
...,...,...,...,...,...
609,1.0,2900,71.0,0,1
610,1.0,4106,40.0,1,1
611,1.0,8072,253.0,1,1
612,1.0,7583,187.0,1,1


In [85]:
#file['loanamt'].fillna((file['loanamt'].mean()), inplace=True)
file.fillna('missing', inplace=True)
file

Unnamed: 0,ch,income,loanamt,married_Yes,status_Y
0,1.0,0.072991,-1.735854e-17,0,1
1,1.0,-0.134412,-2.192733e-01,1,0
2,1.0,-0.393747,-9.576410e-01,1,1
3,1.0,-0.462062,-3.145466e-01,1,1
4,1.0,0.097728,-6.445428e-02,0,1
...,...,...,...,...,...
609,1.0,-0.410130,-8.980952e-01,0,1
610,1.0,-0.212557,-1.267279e+00,1,1
611,1.0,0.437174,1.269371e+00,1,1
612,1.0,0.357064,4.833669e-01,1,1


In [86]:
# Normalize the data (Income and Loan Amount) Using StandardScaler
from sklearn.preprocessing import StandardScaler
scalar_ = StandardScaler()

file['income'] = scalar_.fit_transform(file[['income']])
file['loanamt'] = scalar_.fit_transform(file[['loanamt']])
file

Unnamed: 0,ch,income,loanamt,married_Yes,status_Y
0,1.0,0.072991,-2.314471e-17,0,1
1,1.0,-0.134412,-2.192733e-01,1,0
2,1.0,-0.393747,-9.576410e-01,1,1
3,1.0,-0.462062,-3.145466e-01,1,1
4,1.0,0.097728,-6.445428e-02,0,1
...,...,...,...,...,...
609,1.0,-0.410130,-8.980952e-01,0,1
610,1.0,-0.212557,-1.267279e+00,1,1
611,1.0,0.437174,1.269371e+00,1,1
612,1.0,0.357064,4.833669e-01,1,1


In [87]:
file.isnull().sum()

ch             0
income         0
loanamt        0
married_Yes    0
status_Y       0
dtype: int64

In [88]:
# Create the X (Independent) and Y (Dependent) dataframes
y= file['status_Y']
X = file.drop(['status_Y'], axis=1)
X

Unnamed: 0,ch,income,loanamt,married_Yes
0,1.0,0.072991,-2.314471e-17,0
1,1.0,-0.134412,-2.192733e-01,1
2,1.0,-0.393747,-9.576410e-01,1
3,1.0,-0.462062,-3.145466e-01,1
4,1.0,0.097728,-6.445428e-02,0
...,...,...,...,...
609,1.0,-0.410130,-8.980952e-01,0
610,1.0,-0.212557,-1.267279e+00,1
611,1.0,0.437174,1.269371e+00,1
612,1.0,0.357064,4.833669e-01,1


In [89]:
# Split the X and Y dataset into training and testing set
# Split the X and Y dataset into training and testing set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 434, stratify=y)

In [90]:
# Import and build Support Vector Classifier

#Import svm model
from sklearn import svm

#Create a svm Classifier
clf = svm.SVC(kernel='linear') # Linear Kernel

#Train the model using the training sets
clf.fit(X_train, y_train)



In [91]:
# Predict the outcome using Test data
y_pred = clf.predict(X_test)

In [94]:
# Build the conufsion matrix and get the accuracy/score
from sklearn.metrics import confusion_matrix, classification_report
cm = confusion_matrix(y_test, y_pred)
cm

array([[ 28,  30],
       [  8, 119]], dtype=int64)

In [96]:
score = clf.score(X_test, y_test)
score

0.7945945945945946

cr  = classification_report(y_test, y_pred)
print(cr)

#LogisticRegression

In [83]:

from sklearn.linear_model import LogisticRegression
lr= LogisticRegression()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 656, stratify=y)

In [99]:
lr.fit(X_train, y_train)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [100]:
preds = lr.predict(X_test)
preds

array([1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 0, 1], dtype=uint8)

In [102]:
cm = confusion_matrix(y_test, preds)
cm

array([[ 28,  30],
       [  8, 119]], dtype=int64)

In [103]:

score = lr.score(X_test, y_test)
score

0.7945945945945946

In [104]:

cr  = classification_report(y_test, preds)
print(cr)

              precision    recall  f1-score   support

           0       0.78      0.48      0.60        58
           1       0.80      0.94      0.86       127

    accuracy                           0.79       185
   macro avg       0.79      0.71      0.73       185
weighted avg       0.79      0.79      0.78       185



In [105]:
# Naive Bayes

from sklearn.naive_bayes import GaussianNB
NBmodel = GaussianNB()
NBmodel.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [106]:

predicted = NBmodel.predict(X_test)
predicted

array([1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 0, 1], dtype=uint8)

In [107]:

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

In [109]:

cm2 = confusion_matrix(y_test,predicted)
score2 = accuracy_score(y_test,predicted)
cr2 = classification_report(y_test,predicted)
auc2 = roc_auc_score(y_test,predicted)
print('Confusion Matrix for Naive Bayes:''\n',cm2 )
print('Accuracy Score for Naive Bayes:',score2 )
print('Classification Report for Naive Bayes:''\n''\n',cr2)


Confusion Matrix for Naive Bayes:
 [[ 28  30]
 [  8 119]]
Accuracy Score for Naive Bayes: 0.7945945945945946
Classification Report for Naive Bayes:

               precision    recall  f1-score   support

           0       0.78      0.48      0.60        58
           1       0.80      0.94      0.86       127

    accuracy                           0.79       185
   macro avg       0.79      0.71      0.73       185
weighted avg       0.79      0.79      0.78       185



# Decision Tree

In [112]:
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier

# Create Decision Tree classifer object
clf = DecisionTreeClassifier()

# Train Decision Tree Classifer
clf = clf.fit(X_train,y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_test)

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

print("Accuracy:",accuracy_score(y_test, y_pred))
print("Classification Report for Decision Tree:""\n",classification_report(y_test, y_pred))


Accuracy: 0.6432432432432432
Classification Report for Decision Tree:
               precision    recall  f1-score   support

           0       0.42      0.34      0.38        58
           1       0.72      0.78      0.75       127

    accuracy                           0.64       185
   macro avg       0.57      0.56      0.56       185
weighted avg       0.63      0.64      0.63       185

