## 4. Building a Predictive Model

In [1]:
from sklearn.metrics import mean_squared_error, r2_score, confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier


import xgboost as xgb
from xgboost import XGBClassifier

from sklearn.svm import LinearSVC



In [2]:
df = pd.read_csv(r'C:\Users\k_mah\Documents\miniproject4-master\data\cleanloans.csv')
df.head()
df = df.drop(columns='Unnamed: 0')

In [3]:
#We need to have the Loan Status as binary when we get to the models, so let's do that first
df['Loan_Status'] = df.Loan_Status.replace(to_replace=['N', 'Y'], value=[0, 1])
y = df['Loan_Status']
df = df.drop(columns='Loan_Status')

In [4]:
#And then, let's convert the rest of our variables to dummies so we can use standard scalar
df = pd.get_dummies(df)

In [5]:
#Now we can assign the rest of the dataframe as the training variables
X = df

#And split our test and training sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=27, stratify=y)



In [6]:
#Scale and do PCA
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)

pca = PCA(n_components=3)
pca.fit(X_train_scaled)
X_train_pca = pca.transform(X_train_scaled)

# make sure to do same pre-processing to testing data as well.
X_test_scaled = scaler.transform(X_test)
X_test_pca = pca.transform(X_test_scaled)

### Random Forest

In [14]:
#Random Forest
rdf = RandomForestClassifier()
rdf.fit(X_train_pca, y_train)
rdf_pred = rdf.predict(X_test_pca)

print(accuracy_score(y_test,rdf_pred))
print(precision_score(y_test, rdf_pred))
print(recall_score(y_test, rdf_pred))
print(confusion_matrix(y_test, rdf_pred))

0.6638655462184874
0.7047619047619048
0.891566265060241
[[ 5 31]
 [ 9 74]]


Try paramater grid search to improve the results

In [15]:
params1 = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 20]
}

grid1 = GridSearchCV(estimator=rdf, param_grid=params1, verbose=1).fit(X_train, y_train)
print(f'The grid score: {grid1.score(X_test, y_test)}')

Fitting 5 folds for each of 9 candidates, totalling 45 fits
The grid score: 0.8403361344537815


### XGB Classifier

In [16]:
xgbc = XGBClassifier()
xgbc.fit(X_train_pca, y_train)
xgbc_pred = rdf.predict(X_test_pca)

print(accuracy_score(y_test,xgbc_pred))
print(precision_score(y_test, xgbc_pred))
print(recall_score(y_test, xgbc_pred))
print(confusion_matrix(y_test, xgbc_pred))

0.6638655462184874
0.7047619047619048
0.891566265060241
[[ 5 31]
 [ 9 74]]


Try paramater grid search to improve the results

In [17]:
params2 = {
    'booster': ['gbtree', 'dart'],
    'max_depth': [5, 10, 20]
}

grid2 = GridSearchCV(estimator=xgbc, param_grid=params1, verbose=1).fit(X_train, y_train)
print(f'The grid score: {grid2.score(X_test, y_test)}')

Fitting 5 folds for each of 9 candidates, totalling 45 fits
The grid score: 0.8319327731092437


### Linear SVC

In [18]:
lsvc = LinearSVC(random_state=0, C=0.01, max_iter = 6000).fit(X_train_scaled,y_train)

svc_pred = lsvc.predict(X_test_scaled)

print(accuracy_score(y_test,svc_pred))
print(precision_score(y_test, svc_pred))
print(recall_score(y_test, svc_pred))
print(confusion_matrix(y_test, svc_pred))

0.8319327731092437
0.8058252427184466
1.0
[[16 20]
 [ 0 83]]


Try paramater grid search to improve the results

In [20]:
params3 = {
                'kernel' : ['linear', 'rbf', 'poly'],
                'gamma' : [0.1, 1, 10, 100],
                'C' : [0.1, 1, 10, 100, 1000],
}

grid3 = GridSearchCV(estimator=rdf, param_grid=params1, verbose=1).fit(X_train, y_train)
print(f'The grid score: {grid3.score(X_test, y_test)}')

Fitting 5 folds for each of 9 candidates, totalling 45 fits
The grid score: 0.8403361344537815
