In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

from imblearn.over_sampling import SMOTE

In [2]:
#training frame: loans from 2013-2014
data_train = pd.read_csv("../lipika/cleaned_2013_14", low_memory=False)

#test frame: loans from 2015, keeping only the rows that have no missing values
data_test = pd.read_csv("../lipika/cleaned_2015", low_memory=False).dropna()

In [3]:
#(rows, columns) of the training frame
data_train.shape

(351757, 49)

In [4]:
#check that train and test sets have same columns
#(element-wise, position-by-position comparison of the column labels: every entry should be True)
data_test.columns == data_train.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True])

In [31]:
#hold out 20% of the training data (train_data_rest) for validation;
#the remaining 80% (train_data_subset) is what the logistic regression model is fit on
train_data_subset, train_data_rest = train_test_split(data_train, test_size=0.2, random_state=42);
train_data_subset.shape

(281405, 49)

In [32]:
#split a data frame into a feature matrix x and a target vector y
def split_data(df, cols, target='paid'):
    """Separate a data frame into features and target.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both the feature columns and the target column.
        It is not modified.
    cols : label or list of labels
        Columns to remove from ``df`` to build the feature matrix. The target
        column is only excluded from the features if it is listed here.
    target : str, default 'paid'
        Name of the column returned as the target vector.

    Returns
    -------
    (x, y) : tuple
        ``x`` is ``df`` without ``cols``; ``y`` is the ``target`` column of ``df``.

    Raises
    ------
    KeyError
        If ``target`` or any label in ``cols`` is not a column of ``df``.
    """
    x = df.drop(cols, axis=1)
    y = df[target]
    return x, y

#columns removed from the feature matrix: the target itself plus fields that
#presumably describe the loan outcome and would leak the label -- TODO confirm
cols_to_drop_training = [
    'loan_status',
    'paid',
    'amnt',
    'total_pymnt',
    'term_adj',
]

#features/target for the subset used for fitting and for the rest of the training data
x_train_subset, y_train_subset = split_data(train_data_subset, cols_to_drop_training)
x_train_rest, y_train_rest = split_data(train_data_rest, cols_to_drop_training)

#features/target for the test data
x_test, y_test = split_data(data_test, cols_to_drop_training)

In [33]:
#upsample default loans (class 1)
#NOTE(review): the target is the `paid` column, so confirm which label actually marks the defaulted loans
#SMOTE adds synthetic minority-class rows; ratio = 1.0 presumably asks for a 1:1 minority/majority balance -- TODO confirm
#NOTE(review): newer imbalanced-learn releases replaced `ratio` with `sampling_strategy` and
#`fit_sample` with `fit_resample` -- verify against the installed version before re-running
sm = SMOTE(random_state=42, ratio = 1.0)
x_train, y_train = sm.fit_sample(x_train_subset, y_train_subset);
x_train.shape

(462496, 44)

In [34]:
#create a function to take advantage of sklearn make_pipeline
#by default the pipeline fits a Logistic Regression model on the main effects only;
#pass degree=2 to add second order terms and interaction terms before the fit

def logreg_pipeline(x, y, degree=1):
    """Fit a cross-validated, L2-penalised logistic regression inside a pipeline.

    Parameters
    ----------
    x : array-like
        Feature matrix, handed unchanged to the pipeline's ``fit``.
    y : array-like
        Target labels, handed unchanged to the pipeline's ``fit``.
    degree : int, default 1
        Degree of the polynomial feature expansion. With the default of 1 the
        pipeline holds only the classifier (main effects). With ``degree > 1`` a
        ``PolynomialFeatures(degree=degree, include_bias=False)`` step is placed
        in front of the classifier. The notebook's conclusions report that the
        model failed to converge with interaction or polynomial terms, which is
        why the default stays at main effects only.

    Returns
    -------
    The fitted pipeline.
    """
    steps = []
    if degree > 1:
        steps.append(PolynomialFeatures(degree=degree, include_bias=False))
    steps.append(LogisticRegressionCV(cv=5, penalty='l2', max_iter=2500))

    model = make_pipeline(*steps)
    model.fit(x, y)
    return model

In [35]:
#time.clock was removed in Python 3.8; perf_counter is the supported timer for elapsed
#wall-clock time (time.clock reported CPU time on some platforms, so numbers may differ slightly)
from time import perf_counter

c = perf_counter()
logReg_model = logreg_pipeline(x_train, y_train)
stop = perf_counter()

#elapsed seconds converted to minutes
print("Time to train: {0} min".format((stop - c) / 60))

Time to train: 9.163751016666671 min


In [36]:
#accuracy of the fitted Logistic Regression model on the part of the training data it was not fit on
acc_score_rest_training = accuracy_score(
    y_true=y_train_rest,
    y_pred=logReg_model.predict(x_train_rest),
)

In [37]:
print(acc_score_rest_training)
#results noted from earlier runs:
#0.6423 accuracy with just main effects and training on 3% of the training data
#0.6527 accuracy with just main effects and training on 10% of the training data
#0.64940 accuracy with just main effects and training on 20% of the training data


0.648638276097339


After training the logistic regression model on 80% of the training data, using only main effects and SMOTE (to upsample the default class), I achieve just under 65% prediction accuracy on the held-out 20% of the training data, which seems to be the upper limit for this model. When I add interaction terms or polynomial features, the model fails to converge, so I kept only the main effects.