In [None]:
#=================
# Import libraries
#=================
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.ticker as plticker
import pandas as pd
import numpy as np


from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression, LogisticRegression, logistic
from sklearn.cross_validation import train_test_split, KFold
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import roc_curve, roc_auc_score, average_precision_score, precision_score, recall_score, accuracy_score
#from sklearn.preprocessing import normalize
import sklearn.preprocessing as preprocessing
from sklearn.externals import joblib


import scipy.stats as stats

import statsmodels.api as sm

# from patsy import standardize
# import timeit
# script_start = timeit.default_timer()
# import pylab as pl

In [None]:
#=================
#User-edited fields
#=================
##--Feature columns passed to the model.  ##NOTE:  not showing all
train_cols = [
    'usd_amount_l1',
    'wkly_ins_due',
    'on_time_payments_l1',
    'max_paid_amt',
    'max_paid_perc_l1',
    'avg_msg_length_l1',
    'tot_msg',
    'usd_amount_l2',
    'second_loan_per_increase',
    'total_num_pmt_due',
    'emi',
    'loan_ratio',
    'ins_20_bin_l1',
    'ins_30_bin_l1',
    'ins_40_bin_l1',
    'ins_20_bin_late_l1',
    'ins_30_bin_late_l1',
    'ins_40_bin_late_l1',
    'ins_20_bin_ontime_l1',
    'ins_30_bin_ontime_l1',
    'ins_40_bin_ontime_l1',
    'on_time_perc_l1',
]

##--Input file names
X_in = 'Xdata.csv'   # X data
y_in = 'ydata.csv'   # y data

##--Regularization type
reg_method = 'l2'  ##CHANGE: to test more methods later

##--Class weight for the data --> more ppl pay off than default
classwgt = 'balanced'

In [None]:
#=================
#Read in data
#=================
#--source: MySQL db, already processed with Python

#--x data (loan features), read from the file named in X_in
#--the 'Unnamed: 0' column is discarded right after loading
#--(presumably a saved row index -- TODO confirm)   ###clean this up in sqldata.py later!
Xdata = pd.read_csv(X_in).drop('Unnamed: 0', axis = 1)
Xdata.head()

In [None]:
#--y data (successfully repaid = 0, default = 1)
#--read from the file configured in y_in (see "User-edited fields"), mirroring how
#--the X data is read from X_in, instead of a hard-coded, dated file name
ydata = pd.read_csv(y_in)
ydata.head()

In [None]:
##--sanity CHECK: collect every row of Xdata that has at least one null value
has_null = Xdata.isnull().any(axis=1)   # one boolean per row
null_data = Xdata[has_null]
null_data

In [None]:
#+++++
##--Pick the feature frame (X) and the target (y)
##--not super clean way to do it, but can fix later! CHANGE
#+++++
# target: the 'default' column, kept as a one-column DataFrame (list selector)
y = ydata.loc[:, ['default']]

# features: the full X frame; the model columns are chosen later via train_cols
X = Xdata

In [None]:
#=================
#Model
#=================

#+++++
##--Train and test sets
#+++++
##--67% train / 33% test on the train_cols features.
##--The seed is fixed so the split -- and every number computed from it below --
##--is the same on each Restart & Run All; change split_seed to draw another split.
split_seed = 42
X_train, X_test, y_train, y_test = train_test_split(
    X[train_cols], y, test_size=0.33, random_state=split_seed)

In [None]:
#+++++
##--Scale data for model
#+++++

##StandardScaler, per the sklearn doc:  "Standardize features by removing the mean and scaling to unit variance"

##--the scaler is fitted on the training split only; the test split reuses that fit
scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test  = scaler.transform(X_test)

##-sanity CHECK: mean over the whole scaled training array
X_train.mean()

In [None]:
##--the scaled sets are no longer dfs, so wrap them back up  ##CHANGE:  investigate to see if this is necessary!
print(X_train.dtype)

#-train: rebuild the frame with the feature names as column labels
X_train = pd.DataFrame(X_train, columns=train_cols)

#-test: same treatment
X_test = pd.DataFrame(X_test, columns=train_cols)

#-show the restored column labels
X_test.columns



In [None]:
##--give y_train the same row labels as X_train
##--X_train was rebuilt with pd.DataFrame(...) from the scaled array, so it has a
##--fresh default index; y_train still has the labels it came out of the split with
##--(presumably the original row labels -- TODO confirm), hence the mismatch printed here
print(X_train.index)
print(y_train.index)

# the rows are assumed to be in the same order, so only the labels are replaced
y_train.index = X_train.index
print(y_train.index)

In [None]:
##--quick look at training set
X_train.describe()

In [None]:
#+++++
##--Logistic Regression MODEL
#+++++
##--sm.Logit is built from (endog, exog) only.  class_weight / method are sklearn-style
##--options that statsmodels' Logit does not implement: passing them never weighted or
##--regularized the fit (newer statsmodels releases report them as unknown kwargs), so
##--they are no longer passed.  classwgt and reg_method are therefore NOT applied to
##--this model; a penalized fit would have to go through logit.fit_regularized(...).
##--NOTE(review): statsmodels does not add an intercept automatically and X_train has
##--no constant column, so this model is fitted without one -- confirm that is
##--intended, otherwise wrap the design matrix with sm.add_constant(...).
logit = sm.Logit(y_train, X_train)

In [None]:
##--fit to data
##--fit() is called with its default settings; the fitted results object is kept in
##--`result` and reused by the summary, odds-ratio and marginal-effects cells below
result = logit.fit()

In [None]:
##--summary table of the fitted model (displayed as the cell output)
result.summary()
###key items to read off the table:  coefficient
###                                  p val

In [None]:
#+++++
##--odds ratios: exponentiate the fitted coefficients
##--(the coefficients themselves are on the log-odds scale)
#+++++
fitted_coefs = result.params
odd_rat = np.exp(fitted_coefs)
print(odd_rat)

In [None]:
#+++++
##--marginal effects
#+++++
##--method='dydx' with at='mean' is requested from the fitted results;
##--the summary table of those effects is the cell output
margeeff = result.get_margeff(method='dydx', at='mean')
margeeff.summary()