In [None]:
'''
                ***** Loan Eligibility Classification Use case *****
                
This notebook implements and compares several classification algorithms for the Loan Eligibility use case:
A. LogisticRegression
B. GaussianNB
C. RandomForestClassifier
D. Support Vector Machine (SVC)

'''

In [1]:
from sklearn.datasets import load_boston
import sklearn.ensemble
import numpy as np
import pickle
from sklearn.linear_model import LinearRegression, LogisticRegression
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [2]:
# Load the raw loan-eligibility records from the CSV next to the notebook.
df = pd.read_csv('loanEligibility.csv', encoding='latin1')

# Encode sex as a 0/1 flag (Male -> 0, Female -> 1).
sex_codes = {"Male": 0, "Female":1}
df["sex"] = df["sex"].map(sex_codes)

# Collapse the detailed marital statuses into Single / Married, then encode
# them as 0 / 1. The final astype(int) raises if any status was left unmapped.
single_statuses = ['Never-married','Divorced','Separated','Widowed']
married_statuses = ['Married-civ-spouse','Married-spouse-absent','Married-AF-spouse']
marital_status = (
    df["marital-status"]
    .replace(single_statuses, 'Single')
    .replace(married_statuses, 'Married')
    .map({"Married":1, "Single":0})
)
df["marital-status"] = marital_status.astype(int)

# Remove the categorical columns that are not used by the models below.
unused_columns = ["workclass","education","occupation","relationship","race","native-country"]
df = df.drop(columns=unused_columns)

# Remaining column labels (note: this still includes the label column).
feature_names = df.columns
df.head()

Unnamed: 0,age,fnlwgt,education-num,marital-status,sex,current-worth,outstanding-credit,hours-per-week,loan-eligibility
0,39,77516,13,0,0,2174,0,40,ineligible
1,50,83311,13,1,0,0,0,13,ineligible
2,38,215646,9,0,0,0,0,40,ineligible
3,53,234721,7,1,0,0,0,40,ineligible
4,28,338409,13,1,1,0,0,40,ineligible


In [5]:
# Split / model configuration shared by the cells below.
validation_size = 0.20   # fraction of rows held out for validation
seed = 7                 # random_state for the train/validation split
num_trees = 100          # n_estimators for the random forest
max_features = 3         # max_features for the random forest

# Work on the raw ndarray: the first eight columns are the model inputs and
# the column at position 8 is used as the target.
# NOTE(review): the slices are positional, so they depend on the column order
# left by the preprocessing cell -- confirm column 8 is the label column.
array = df.values
X, Y = array[:, 0:8], array[:, 8]

# Hold out `validation_size` of the rows; the fixed seed makes the split repeatable.
X_train, X_validation, Y_train, Y_validation = train_test_split(
    X,
    Y,
    test_size=validation_size,
    random_state=seed,
)

In [6]:
def _fit_and_report(label, estimator):
    """Fit ``estimator`` on the training split and print its validation accuracy.

    Parameters
    ----------
    label : str
        Name printed in front of the accuracy figure.
    estimator : scikit-learn classifier
        Unfitted estimator exposing ``fit`` and ``predict``.

    Returns
    -------
    tuple
        ``(fitted_estimator, validation_predictions)``.
    """
    fitted = estimator.fit(X_train, Y_train)
    validation_predictions = fitted.predict(X_validation)
    accuracy_pct = 100*accuracy_score(Y_validation, validation_predictions)
    print("%s Accuracy: %s%%" % (label, accuracy_pct))
    return fitted, validation_predictions


# NOTE(review): LogisticRegression and SVC are fitted on the raw, unscaled
# features; StandardScaler is imported but not applied in the cells shown.
# Both models are scale-sensitive -- consider standardising the inputs.
lrmodel, predictions = _fit_and_report("LogisticRegression", LogisticRegression())

nbmodel, predictions = _fit_and_report("GaussianNB", GaussianNB())

# random_state=seed makes the forest reproducible across kernel restarts;
# without it the reported accuracy changes on every run.
rf = RandomForestClassifier(n_estimators=num_trees, max_features=max_features,
                            random_state=seed)
rfmodel, predictions = _fit_and_report("RandomForestClassifier", rf)

sv = SVC()
svmodel, predictions = _fit_and_report("SVC", sv)



LogisticRegression Accuracy: 79.44111776447106%
GaussianNB Accuracy: 79.24151696606786%
RandomForestClassifier Accuracy: 83.95516658989712%




SVC Accuracy: 76.07861200675572%
