In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

from imblearn.over_sampling import SMOTE

In [2]:
# Training data: the cleaned 2013-2014 file.
data_train = pd.read_csv("../lipika/cleaned_2013_14", low_memory=False)

# Test data: the cleaned 2015 file, keeping only rows without any missing value.
data_test = pd.read_csv("../lipika/cleaned_2015", low_memory=False).dropna()

In [3]:
# Show the (rows, columns) shape of the training frame, then its column labels.
print(data_train.shape, data_train.columns, sep="\n")

(351757, 50)
Index(['funded_amnt', 'int_rate', 'total_pymnt', 'annual_inc', 'dti',
       'loan_status', 'revol_util', 'term', 'term_adj', 'zip_code',
       'emp_length_1 year', 'emp_length_10+ years', 'emp_length_2 years',
       'emp_length_3 years', 'emp_length_4 years', 'emp_length_5 years',
       'emp_length_6 years', 'emp_length_7 years', 'emp_length_8 years',
       'emp_length_9 years', 'emp_length_< 1 year', 'home_ownership_ANY',
       'home_ownership_MORTGAGE', 'home_ownership_OWN', 'home_ownership_RENT',
       'verification_status_Not Verified',
       'verification_status_Source Verified', 'verification_status_Verified',
       'grade_A', 'grade_B', 'grade_C', 'grade_D', 'grade_E', 'grade_F',
       'grade_G', 'purpose_car', 'purpose_credit_card',
       'purpose_debt_consolidation', 'purpose_home_improvement',
       'purpose_house', 'purpose_major_purchase', 'purpose_medical',
       'purpose_moving', 'purpose_other', 'purpose_renewable_energy',
       'purpose_small_

In [4]:
# Sanity check: compare the test and train column labels position by position.
# Every entry of the displayed boolean array should be True.
same_columns = data_test.columns == data_train.columns
same_columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True])

In [5]:
# Shuffle the training data and carve out the part used to fit the model.
# With test_size=0.0 nothing is held back: train_data_subset receives every row
# and train_data_rest is left empty. Raise test_size to train on a smaller share
# (e.g. test_size=0.9 keeps 10% of the rows for training).
# NOTE(review): newer scikit-learn releases may reject test_size=0.0 - confirm before upgrading.
train_data_subset, train_data_rest = train_test_split(
    data_train,
    test_size=0.0,
    random_state=42,
)
train_data_subset.shape

(351757, 50)

In [6]:
def split_data(df, cols, target="paid"):
    """Split a frame into a feature matrix and a target vector.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both the predictors and the target column.
    cols : list of str
        Column labels to remove from ``df`` to obtain the feature matrix.
        Include ``target`` here if it must not appear among the features.
    target : str, default "paid"
        Label of the column returned as ``y``.

    Returns
    -------
    x : pandas.DataFrame
        Copy of ``df`` without the columns listed in ``cols``.
    y : pandas.Series
        The ``target`` column of ``df``.

    Raises
    ------
    KeyError
        If a label in ``cols`` or ``target`` is not a column of ``df``.
    """
    x = df.drop(cols, axis=1)
    # Bracket access works for any column label, unlike attribute access.
    y = df[target]
    return x, y

# Columns removed from every feature matrix: the target itself (`paid`)
# plus the columns that are not used as predictors.
cols_to_drop_training = [
    'loan_status',
    'paid',
    'amnt',
    'total_pymnt',
    'term_adj',
    'zip_code',
]

# Features / target for the two parts of the training split
x_train_subset, y_train_subset = split_data(train_data_subset, cols_to_drop_training)
x_train_rest, y_train_rest = split_data(train_data_rest, cols_to_drop_training)

# Features / target for the test data
x_test, y_test = split_data(data_test, cols_to_drop_training)

In [7]:
# Oversample the training subset with SMOTE (seeded for reproducibility).
# The original note describes this as upsampling the default loans (class 1);
# presumably ratio=1.0 brings the minority class level with the majority - TODO confirm.
sm = SMOTE(ratio=1.0, random_state=42)
x_train, y_train = sm.fit_sample(
    x_train_subset,
    y_train_subset,
)
x_train.shape

(578050, 44)

In [8]:
#create a function to take advantage of sklearn make_pipeline
def logreg_pipeline(x, y, degree=None):
    """Fit a cross-validated, L2-penalised logistic regression inside a pipeline.

    Parameters
    ----------
    x : array-like
        Feature matrix passed to the pipeline's ``fit``.
    y : array-like
        Target labels passed to the pipeline's ``fit``.
    degree : int or None, default None
        ``None`` fits the main effects only. An integer puts a
        ``PolynomialFeatures(degree=degree, include_bias=False)`` step in
        front of the classifier; ``degree=2`` reproduces the earlier
        second-order / interaction-term variant of this function.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted pipeline, ending in ``LogisticRegressionCV(cv=5,
        penalty='l2', max_iter=2500)``.
    """
    steps = []
    if degree is not None:
        # Optional feature expansion; without it only the main effects are used.
        steps.append(PolynomialFeatures(degree=degree, include_bias=False))
    steps.append(LogisticRegressionCV(cv=5, penalty='l2', max_iter=2500))

    model = make_pipeline(*steps)
    model.fit(x, y)
    return model

In [9]:
from time import perf_counter

# time.clock() was deprecated in Python 3.3 and removed in 3.8, and on Unix it
# reported processor time rather than elapsed time. perf_counter() is the
# documented replacement for timing and measures elapsed (wall-clock) time.
start = perf_counter()
logReg_model = logreg_pipeline(x_train, y_train)
stop = perf_counter()

print("Time to train: {0} min".format((stop - start) / 60))

Time to train: 11.467177533333333 min


In [10]:
# Score the fitted logistic regression model on the test data.
# The validation score on the rest of the training data stays disabled:
# with test_size=0.0 in the earlier train_test_split call there are no rows left over.
# acc_score_rest_training = accuracy_score(y_train_rest, logReg_model.predict(x_train_rest))
test_predictions = logReg_model.predict(x_test)
acc_score_test = accuracy_score(y_test, test_predictions)

In [11]:
# Validation accuracy is not reported in this run (kept for reference):
# print("Validation accuracy:", acc_score_rest_training)
# Report the accuracy of the fitted model on the test set.
print("Test accuracy:", acc_score_test)
# Accuracies recorded from earlier runs, all with just main effects:
#   0.6423  when training on 3% of the training data
#   0.6527  when training on 10% of the training data
#   0.64940 when training on 20% of the training data


Test accuracy: 0.7463203812676052


After training the logistic regression model on all of the 2013-2014 training data, using only main effects and SMOTE (to upsample the default class), I achieve roughly 75% (74.6%) prediction accuracy on the 2015 test set. I was getting only about 65% accuracy when evaluating on a validation set held out from the training data, so this is an intriguing result. Whenever I add interaction terms or polynomial features, the model fails to converge, so I stuck to just the main effects.

In [None]:
import pickle

logReg_pkl_filename = 'Tuned_logReg_all_training_data.pkl'

# Save the fitted pipeline as a pkl file. pickle.dump needs both the object and
# the open binary file; the with-block closes the file even if the dump raises.
# Only unpickle this file from a trusted location: pickle.load can run arbitrary code.
with open(logReg_pkl_filename, 'wb') as logReg_model_pkl:
    pickle.dump(logReg_model, logReg_model_pkl)