In [2]:
# import libraries
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

# Preprocessing and Pipeline libraries
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import svm

# import flask
from flask import Flask, request, jsonify, redirect, url_for, flash
from sklearn.externals import joblib
import traceback


In [3]:
print("\nLoading training data...")

# Training set: every column except 'is_late' is a feature; 'is_late' is the label.
train_data = pd.read_csv("data/peerLoanTraining.csv", engine='python', header=0)
X_train = train_data.drop(columns='is_late')
y_train = train_data['is_late']

# Test set: split into features and label the same way as the training set.
test_data = pd.read_csv("data/peerLoanTest.csv", engine='python', header=0)
X_test = test_data.drop(columns='is_late')
y_test = test_data['is_late']






Loading training data...


In [73]:
# Preprocessing: declare the column groups and the transformer applied to each.

# Numeric columns -- missing values are filled with the column median.
numeric_features = [
    'loan_amnt',
    'int_rate',
    'annual_inc',
    'revol_util',
    'dti',
    'delinq_2yrs',
]
numeric_transformer = Pipeline(
    steps=[('imputer', SimpleImputer(strategy='median'))]
)

# Categorical columns -- missing values are filled with the most frequent
# value, then one-hot encoded; handle_unknown='ignore' means categories not
# seen while fitting do not raise at transform time.
categorical_features = ['purpose', 'grade', 'emp_length', 'home_ownership']
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Route each column group through its transformer; numeric output comes first.
preprocess = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)


In [74]:
# Chain the preprocessing step and a 10-tree random forest into a single
# estimator; random_state is fixed so repeated fits build the same forest.
latePaymentsModel = make_pipeline(
    preprocess,
    RandomForestClassifier(
        n_estimators=10,
        bootstrap=True,
        random_state=1,
    ),
)

In [81]:
# Train the whole pipeline on the training split: a single fit() call fits the
# preprocessing transformers and then the classifier on their output.
print("\nTraining model ...")
latePaymentsModel.fit(X=X_train, y=y_train)


Training model ...


Pipeline(memory=None,
     steps=[('columntransformer', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('num', Pipeline(memory=None,
     steps=[('imputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan,
       strategy='median', ...imators=10, n_jobs=None,
            oob_score=False, random_state=1, verbose=0, warm_start=False))])

In [77]:
# Save the trained model as a pickle file.
# The context manager closes (and flushes) the handle even if pickling raises.
print("\nSaving model ...")
with open('latePaymentsModel.pkl', 'wb') as file:
    pickle.dump(latePaymentsModel, file)

# Load the pickled model back, so the predictions below exercise the saved
# artifact rather than the in-memory pipeline.
# NOTE: pickle.load can execute arbitrary code from the file it reads. That is
# acceptable here only because the file was written by this notebook a moment
# ago; never unpickle a model file obtained from an untrusted source.
print("\nLoading saved model to make example predictions...")
with open('latePaymentsModel.pkl', 'rb') as saved_model_file:
    pickledModel = pickle.load(saved_model_file)

# # Save the data columns from training
# model_columns = list(X_train.columns)
# print("\nSaving model columns ...")
# file = open('model_columns.pkl','wb')
# pickle.dump(model_columns, file)
# file.close()



Saving model ...

Loading saved model to make example predictions...


In [78]:
# Example applicant expected to pay on time: one record, one value per
# feature column the pipeline was configured with.
payOnTimePrediction = dict(
    loan_amnt=[100],
    int_rate=[0.02039],
    purpose=['credit_card'],
    grade=['A'],
    annual_inc=[80000.00],
    revol_util=[0.05],
    emp_length=['10+ years'],
    dti=[1.46],
    delinq_2yrs=[0],
    home_ownership=['RENT'],
)
# Single-row frame whose columns are the dict keys.
payOnTimePredictionDf = pd.DataFrame(payOnTimePrediction)

print("\nPredicting class probabilities for likely on-time payer:")
print(pickledModel.predict_proba(payOnTimePredictionDf))


Predicting class probabilities for likely on-time payer:
[[1. 0.]]


In [79]:
# Example applicant expected to pay late: one record, one value per
# feature column the pipeline was configured with.
payLatePrediction = dict(
    loan_amnt=[10000],
    int_rate=[0.6],
    purpose=['credit_card'],
    grade=['D'],
    annual_inc=[45000.00],
    revol_util=[0.85],
    emp_length=['1 year'],
    dti=[42.00],
    delinq_2yrs=[4],
    home_ownership=['RENT'],
)
# Single-row frame whose columns are the dict keys.
payLatePredictionDf = pd.DataFrame(payLatePrediction)

print("\nPredicting class probabilities for a likely late payer:")
print(pickledModel.predict_proba(payLatePredictionDf))


Predicting class probabilities for a likely late payer:
[[0.5 0.5]]


In [65]:
# Evaluate the reloaded model on the test set.
# Class probabilities for every test record.
print("\nPredicting class probabilities for the test data set:")
print(pickledModel.predict_proba(X_test))

# Hard class predictions are computed once and shared by both metrics below,
# instead of running the pipeline over X_test separately for each metric.
test_predictions = pickledModel.predict(X_test)

# Printing accuracy
print("Accuracy:\n%s" % accuracy_score(y_test, test_predictions))

# Printing Confusion Matrix (rows = true class, columns = predicted class)
# NOTE(review): in the output recorded with this notebook the second column is
# all zeros, i.e. no test record was predicted as the second class, so the
# accuracy figure above mostly reflects class imbalance -- confirm on re-run.
cm = confusion_matrix(y_test, test_predictions)
print("confusion matrix is %s"%cm)


Predicting class probabilities for the test data set:
[[0.9 0.1]
 [1.  0. ]
 [1.  0. ]
 ...
 [1.  0. ]
 [1.  0. ]
 [1.  0. ]]
Accuracy:
0.9861172000584539
confusion matrix is [[6748    0]
 [  95    0]]
