In [1]:
import pandas as pd
import numpy as np
import time

from sklearn import linear_model
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix

In [2]:
#reading file
full_data = pd.read_csv('data/loanstats_2015_R.csv')

In [3]:
full_data.shape

(40053, 18)

In [4]:
full_data.head(2)

Unnamed: 0.1,Unnamed: 0,loan_amnt,term,emp_length,home_ownership,annual_inc,addr_state,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,revol_util,total_acc,pub_rec_bankruptcies,fico,is_bad
0,1,5000,3,1,RENT,24000.0,AZ,27.65,0,1,3,0,13648,83.7,9,0,737,0
1,2,2500,6,1,RENT,30000.0,GA,1.0,0,5,3,0,1687,9.4,4,0,742,1


In [5]:
#deleting unwanted columns
del full_data['Unnamed: 0']
del full_data['addr_state']

#has null, removing for now
del full_data['revol_util']


In [6]:
data = full_data.copy()

In [7]:
ownership_lookup = { own : i for i,own in enumerate(full_data.home_ownership.unique())}
ownership_lookup

{'MORTGAGE': 2, 'OTHER': 3, 'OWN': 1, 'RENT': 0}

In [8]:
data['home_ownership'].replace(ownership_lookup, inplace=True)

In [9]:
data.head(2)

Unnamed: 0,loan_amnt,term,emp_length,home_ownership,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,total_acc,pub_rec_bankruptcies,fico,is_bad
0,5000,3,1,0,24000.0,27.65,0,1,3,0,13648,9,0,737,0
1,2500,6,1,0,30000.0,1.0,0,5,3,0,1687,4,0,742,1


In [10]:
y = data['is_bad']
x = data.copy()
del x['is_bad']

In [11]:
#train test split
#train dataset A which 1/3 of total available training set, using stratify to keep classes distribution same
train_dataset,test_dataset,train_labels,test_labels = train_test_split(x,y,stratify=y,train_size=0.8,random_state=0)

In [12]:
train_dataset.shape

(32042, 14)

In [13]:
#check for null
np.max(np.isnan(train_dataset.loc[:,:]))

loan_amnt               False
term                    False
emp_length              False
home_ownership          False
annual_inc              False
dti                     False
delinq_2yrs             False
inq_last_6mths          False
open_acc                False
pub_rec                 False
revol_bal               False
total_acc               False
pub_rec_bankruptcies    False
fico                    False
dtype: bool

In [14]:
#running the classifier as is
clf_log = linear_model.LogisticRegression()
clf_log.fit(train_dataset, train_labels)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [15]:
y_predict_log = clf_log.predict(test_dataset)

In [16]:
np.mean(test_labels==y_predict_log)

0.86730745225315187

In [17]:
np.mean(test_labels==0)

0.86730745225315187

In [18]:
confusion_matrix(test_labels,y_predict_log)

array([[6948,    0],
       [1063,    0]])

In [61]:

'''

   Input
   -----
   X: Training Features
   Y_bin: Values Predicted binarized
   classifier_name: Name of classifier
   classifier: sklearn classifier init
   param_dict: dictionary of parameters to be passed to classifier
   scor_func: scoring method used
   no_cv: # of cvv
   
   Output
   ------
   training_time: training time in minutes
   best_estimator: best model after cvv
   best_params: best parameter 
   best_score: best score
   
'''

def perform_classify(X,Y_bin,classifier_name,classifier,param_dict,scor_func,no_cv):
    #training start time
    start_time = time.clock()
    clf = classifier
    parameters = param_dict
    fitmodel = GridSearchCV(clf, param_grid=parameters, cv=no_cv, scoring=scor_func)
    fitmodel.fit(X,Y_bin)
    
    best_estimator = fitmodel.best_estimator_
    best_params = fitmodel.best_params_
    best_score = fitmodel.best_score_
    
    end_time = time.clock()
    training_time = (end_time - start_time)/60.0
    print('{} Training Time : {} minutes'.format(classifier_name,training_time))
    print('{} Best Estimator \n {}'.format(classifier_name,best_estimator))
    print('{} Best Params \n {}'.format(classifier_name,best_params))
    print('{} Best AUC \n {}'.format(classifier_name,best_score))##, fitmodel.grid_scores_
    
    return training_time,best_estimator,best_params,best_score
  

In [75]:

#parameters passed for Logistic Regression
classifier_name = 'Logisitic'
classifier = linear_model.LogisticRegression()
param_dict = {"C": [0.0001, 0.001, 0.1, 1, 10, 100,1000]}
no_cv = 3
scor_func = 'roc_auc'

In [76]:

#calling classifier on training set
log_training_time,log_best_estimator,log_best_params,log_best_score = perform_classify(train_dataset,train_labels,classifier_name,classifier,param_dict,scor_func,no_cv)

Logisitic Training Time : 0.21902573333333328 minutes
Logisitic Best Estimator 
 LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
Logisitic Best Params 
 {'C': 0.001}
Logisitic Best AUC 
 0.6296162061493603


In [None]:
### Decision Tree

In [67]:
clf_part = tree.DecisionTreeClassifier()
clf_part.fit(train_dataset, train_labels)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [68]:
y_predict_part = clf_part.predict(test_dataset)

In [69]:
np.mean(test_labels==y_predict_log)

0.86730745225315187

In [81]:

#parameters passed for Logistic Regression
classifier_name = 'Decision Tree'
classifier = tree.DecisionTreeClassifier()
param_dict = {"criterion": ['gini','entropy'],'min_samples_split':[2,10,50,100],'splitter':['best','random']}
no_cv = 3
scor_func = 'roc_auc'

In [82]:

#calling classifier on training set
part_training_time,part_best_estimator,part_best_params,part_best_score = perform_classify(train_dataset,train_labels,classifier_name,classifier,param_dict,scor_func,no_cv)

Decision Tree Training Time : 0.1182915666666665 minutes
Decision Tree Best Estimator 
 DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=100, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='random')
Decision Tree Best Params 
 {'criterion': 'entropy', 'min_samples_split': 100, 'splitter': 'random'}
Decision Tree Best AUC 
 0.6351700264231531


In [None]:
### Random Forest

In [None]:
clf_rf= RandomForestClassifier()
clf_rf.fit(X_train, Y_train)

In [78]:
clf_rf= RandomForestClassifier()
clf_rf.fit(train_dataset, train_labels)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [88]:
y_predict_rf = clf_rf.predict(test_dataset)

In [89]:
np.mean(test_labels==y_predict_rf)

0.86218948945200347

In [83]:
#parameters passed for Logistic Regression
classifier_name = 'Random Forests'
classifier = RandomForestClassifier()
param_dict = {"criterion": ['gini','entropy'],'min_samples_split':[2,10,50,100],"n_estimators": [5,10,20,25,50,100]}
no_cv = 3
scor_func = 'roc_auc'

In [84]:
rf_training_time,rf_best_estimator,rf_best_params,rf_best_score = perform_classify(train_dataset,train_labels,classifier_name,classifier,param_dict,scor_func,no_cv)

Random Forests Training Time : 3.154307616666667 minutes
Random Forests Best Estimator 
 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=100, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
Random Forests Best Params 
 {'n_estimators': 100, 'criterion': 'entropy', 'min_samples_split': 100}
Random Forests Best AUC 
 0.691323869467005


In [85]:
clf_rf_cvv = RandomForestClassifier(criterion=rf_best_params['criterion'],min_samples_split=rf_best_params['min_samples_split'],n_estimators=rf_best_params['n_estimators'])

In [90]:
#training whole data
clf_rf_cvv.fit(train_dataset, train_labels)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=100, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [91]:
y_predict_rf = clf_rf_cvv.predict(test_dataset)

In [92]:
np.mean(test_labels==y_predict_rf)

0.86730745225315187

In [94]:
confusion_matrix(test_labels,y_predict_rf)

array([[6948,    0],
       [1063,    0]])