In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from sklearn.preprocessing import PolynomialFeatures
from imblearn.over_sampling import SMOTE
import pickle
from sklearn.metrics import roc_auc_score

In [2]:
data_train = pd.read_csv("cleaned_2013_14", low_memory = False);
data_test = pd.read_csv("cleaned_2015", low_memory = False);

In [3]:
data_test = data_test.dropna()
data_test.columns == data_train.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True])

In [4]:
def split_data(df, cols):
    x = df.drop(cols, axis = 1)
    y = df.paid
    return x, y

cols_to_drop_training = ['loan_status', 'paid', 'amnt', 'total_pymnt', 'term_adj', 'zip_code']
x_train_initial, y_train_initial = split_data(data_train, cols_to_drop_training)
x_test, y_test = split_data(data_test, cols_to_drop_training)

In [5]:
sm = SMOTE(random_state=1, ratio = 1.0)
x_train, y_train = sm.fit_sample(x_train_initial, y_train_initial)

In [6]:
x_train.shape, x_test.shape

((578050, 44), (346843, 44))

In [7]:
# Use Pipeline instead of make_pipeline to do grid search
pipe = Pipeline([('POLY', PolynomialFeatures(degree=2, include_bias=False)),
            ('LDA',LinearDiscriminantAnalysis())])

lda = pipe.fit(x_train, y_train)



In [8]:
## Predict
train_predictions = lda.predict(x_train)
test_predictions = lda.predict(x_test)

## Accuracy Score
train_score = lda.score(x_train, y_train)
test_score = lda.score(x_test, y_test)

print("train accuracy: ", train_score)
print("test accuracy: ", test_score)

## AUC
train_predictions_class = np.where(train_predictions > 0.5, 1, 0)
test_predictions_class = np.where(test_predictions > 0.5, 1, 0)

print('Train ROC AUC: ', roc_auc_score(train_predictions_class, y_train))
print('Test  ROC AUC: ', roc_auc_score(test_predictions_class, y_test))

('train accuracy: ', 0.49749156647348847)
('test accuracy: ', 0.47518617933762536)
('Train ROC AUC: ', 0.49749042243431263)
('Test  ROC AUC: ', 0.4886904936188663)


In [10]:
# Dump the trained decision tree classifier with Pickle
pkl_filename = 'LDA.pkl'

# Open the file to save as pkl file
model_pkl = open(pkl_filename, 'wb')
pickle.dump(lda, model_pkl)

# Close the pickle instances
model_pkl.close()

In [9]:
# Use Pipeline instead of make_pipeline to do grid search
pipe = Pipeline([('POLY', PolynomialFeatures(degree=2, include_bias=False)),
            ('QDA',QuadraticDiscriminantAnalysis())])

qda = pipe.fit(x_train, y_train)



In [10]:
## Predict
train_predictions = qda.predict(x_train)
test_predictions = qda.predict(x_test)

## Accuracy Score
train_score = qda.score(x_train, y_train)
test_score = qda.score(x_test, y_test)

print("train accuracy: ", train_score)
print("test accuracy: ", test_score)

## AUC
train_predictions_class = np.where(train_predictions > 0.5, 1, 0)
test_predictions_class = np.where(test_predictions > 0.5, 1, 0)

print('Train ROC AUC: ', roc_auc_score(train_predictions_class, y_train))
print('Test  ROC AUC: ', roc_auc_score(test_predictions_class, y_test))

('train accuracy: ', 0.5346873107862642)
('test accuracy: ', 0.7777207555003273)
('Train ROC AUC: ', 0.7578184100689117)
('Test  ROC AUC: ', 0.7923243598837394)


In [11]:
# Dump the trained decision tree classifier with Pickle
pkl_filename = 'QDA.pkl'

# Open the file to save as pkl file
model_pkl = open(pkl_filename, 'wb')
pickle.dump(qda, model_pkl)

# Close the pickle instances
model_pkl.close()