# Hyperparameter Tuning

* Random Forest
* LightGBM (LGBM)


## I. Import libs and data

In [28]:
import pickle

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn import preprocessing
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

%matplotlib inline
sns.set()

### Import Data

In [3]:
# Read the training file ("cleaned_2013_14") and the test file ("cleaned_2015").
# low_memory=False makes pandas parse each file in one pass instead of in
# chunks, which avoids mixed-dtype column inference.
data_train, data_test = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in ("cleaned_2013_14", "cleaned_2015")
)

In [30]:
# Display the column labels of the training frame.
data_train.columns

Index(['funded_amnt', 'int_rate', 'total_pymnt', 'annual_inc', 'dti',
       'loan_status', 'revol_util', 'term', 'term_adj', 'emp_length_1 year',
       'emp_length_10+ years', 'emp_length_2 years', 'emp_length_3 years',
       'emp_length_4 years', 'emp_length_5 years', 'emp_length_6 years',
       'emp_length_7 years', 'emp_length_8 years', 'emp_length_9 years',
       'emp_length_< 1 year', 'home_ownership_ANY', 'home_ownership_MORTGAGE',
       'home_ownership_OWN', 'home_ownership_RENT',
       'verification_status_Not Verified',
       'verification_status_Source Verified', 'verification_status_Verified',
       'grade_A', 'grade_B', 'grade_C', 'grade_D', 'grade_E', 'grade_F',
       'grade_G', 'purpose_car', 'purpose_credit_card',
       'purpose_debt_consolidation', 'purpose_home_improvement',
       'purpose_house', 'purpose_major_purchase', 'purpose_medical',
       'purpose_moving', 'purpose_other', 'purpose_renewable_energy',
       'purpose_small_business', 'purpose_vacat

In [5]:
# Discard every test row that contains at least one missing value
# (axis=0 / how="any" are the dropna defaults, spelled out here).
data_test = data_test.dropna(axis=0, how="any")

# Element-wise comparison of the two frames' column labels: one boolean per
# column position, True where the labels agree.
data_test.columns == data_train.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True])

In [6]:
def split_data(df, cols, target="paid"):
    """Separate a frame into a feature matrix and a target vector.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both the predictors and the target column.
    cols : list of str
        Column labels removed from ``df`` to obtain the predictors. Nothing
        is removed implicitly, so ``target`` must be listed here if it should
        not appear among the features.
    target : str, default "paid"
        Label of the column returned as ``y``.

    Returns
    -------
    x : pandas.DataFrame
        Copy of ``df`` without the columns listed in ``cols``.
    y : pandas.Series
        The ``target`` column of ``df``.

    Raises
    ------
    KeyError
        If ``target`` or any label in ``cols`` is not a column of ``df``.
    """
    x = df.drop(cols, axis=1)
    # Subscript access works for any column label; attribute access
    # (df.paid) only works for labels that are valid identifiers and do not
    # collide with DataFrame attributes.
    y = df[target]
    return x, y

# Columns removed before modelling; the list includes the target ('paid')
# so that it does not end up among the features.
cols_to_drop_training = [
    'loan_status',
    'paid',
    'amnt',
    'total_pymnt',
    'term_adj',
]

# Apply the same split to the training frame and the test frame.
(x_train_initial, y_train_initial) = split_data(data_train, cols_to_drop_training)
(x_test, y_test) = split_data(data_test, cols_to_drop_training)

In [7]:
        # Oversample the minority class with SMOTE. sampling_strategy=1.0
        # requests a 1:1 minority-to-majority ratio after resampling.
        # `ratio=` and `fit_sample` are the older spellings of
        # `sampling_strategy=` and `fit_resample`; they were removed from
        # later imbalanced-learn releases.
        # NOTE(review): resampling happens before the cross-validated grid
        # search, so synthetic points can land in validation folds; consider
        # resampling inside each fold (imblearn.pipeline.Pipeline).
        sm = SMOTE(random_state=1, sampling_strategy=1.0)
        x_train, y_train = sm.fit_resample(x_train_initial, y_train_initial)

In [8]:
# Shapes of the SMOTE-resampled training features and of the test features.
x_train.shape, x_test.shape

((578050, 44), (346843, 44))

### Downsample

In [12]:
# Downsample the training data to 30% of its rows, stratified by class.
# Only the "test" side of the split is kept as the sample; the 70% side is
# bound to the dont_use_* names and ignored.
# random_state is fixed so the sample, and every score computed from it,
# is the same on each run.
dont_use_x, x_train_sample, dont_use_y, y_train_sample = train_test_split(
    x_train,
    y_train,
    test_size=0.3,
    stratify=y_train,
    random_state=1,
)

In [13]:
# Shapes of the downsampled training features and labels.
x_train_sample.shape, y_train_sample.shape

((173415, 44), (173415,))

## Random Forest Tuning

In [None]:
# Use Pipeline (explicit step names) instead of make_pipeline so the grid
# keys below can address the forest as 'RFC__<parameter>'.
# random_state is fixed so repeated runs build the same forests and the
# grid search selects the same parameters.
pipe = Pipeline([
    ('POLY', PolynomialFeatures(degree=2, include_bias=False)),
    ('RFC', RandomForestClassifier(max_features=100, random_state=1)),
])

# Candidate values, keyed by '<step name>__<parameter name>'
param_grid = {'RFC__n_estimators': [50, 100, 120, 140],
              'RFC__max_depth': [8, 10, 12]}

# 4-fold cross-validated grid search over every combination above.
# NOTE(review): with no `scoring` argument the estimator's default score
# (accuracy for classifiers) drives selection, while the evaluation cells
# report ROC AUC; pass scoring='roc_auc' if selection should match.
gs = GridSearchCV(pipe, param_grid, verbose=10, cv=4)

In [None]:
# Run the grid search on the downsampled training data.
gs.fit(x_train_sample, y_train_sample)

In [None]:
# Summarise the grid search: the winning parameter set first, then the mean
# cross-validated score (with +/- two standard deviations) for every
# candidate.
print("Best parameters set found on training set:")
print()
print(gs.best_params_)
print()

print("Grid scores on training set:")
print()
means = gs.cv_results_['mean_test_score']
stds = gs.cv_results_['std_test_score']
candidates = gs.cv_results_['params']
for idx in range(len(candidates)):
    spread = stds[idx] * 2
    print("%0.3f (+/-%0.03f) for %r"
          % (means[idx], spread, candidates[idx]))
print()

In [None]:
# For each data set, pair the second column of predict_proba (probability of
# the class listed second in classes_) with the true labels.
best_model = gs.best_estimator_

samp_train_preds_true = {
    'proba': best_model.predict_proba(x_train_sample)[:, 1],
    'true_val': y_train_sample,
}
train_preds_true = {
    'proba': best_model.predict_proba(x_train)[:, 1],
    'true_val': y_train,
}
test_preds_true = {
    'proba': best_model.predict_proba(x_test)[:, 1],
    'true_val': y_test,
}

In [25]:
# ROC AUC on the downsampled training set, the resampled training set and
# the test set. The prediction containers are plain dicts, so their entries
# are read with [] — attribute access (d.true_val) raises AttributeError.
AUC_sample = roc_auc_score(samp_train_preds_true['true_val'], samp_train_preds_true['proba'])
AUC_train = roc_auc_score(train_preds_true['true_val'], train_preds_true['proba'])
AUC_test = roc_auc_score(test_preds_true['true_val'], test_preds_true['proba'])

In [None]:
# Report the three ROC AUC scores computed above; no variable named `AUC`
# exists, so the original print raised NameError.
print("AUC_sample: %0.3f" % AUC_sample)
print("AUC_train:  %0.3f" % AUC_train)
print("AUC_test:   %0.3f" % AUC_test)

In [None]:
# Persist the tuned pipeline (the grid search's best estimator) with pickle.
decision_tree_pkl_filename = 'Tuned_RF.pkl'
# The context manager closes the file even if pickle.dump raises, so the
# handle is never leaked.
with open(decision_tree_pkl_filename, 'wb') as decision_tree_model_pkl:
    pickle.dump(gs.best_estimator_, decision_tree_model_pkl)

## LGBM Tuning

In [None]:
# Use Pipeline (explicit step names) instead of make_pipeline so the grid
# keys below can address the booster as 'LGBM__<parameter>'.
# NOTE(review): LGBMClassifier is not imported in the visible part of the
# notebook; it needs `from lightgbm import LGBMClassifier` in the import cell.
pipe = Pipeline([
    ('POLY', PolynomialFeatures(degree=2, include_bias=False)),
    ('LGBM', LGBMClassifier(n_estimators=150)),
])

# Grid keys must carry the step name used in the pipeline ('LGBM'); keys with
# another prefix are rejected as invalid parameters when the search is fitted.
# The n_estimators values here replace the constructor's 150 for each
# candidate.
param_grid = {'LGBM__n_estimators': [50, 100, 150],
              'LGBM__max_depth': [8, 10, 12]}

# 4-fold cross-validated grid search over every combination above
gs = GridSearchCV(pipe, param_grid, verbose=10, cv=4)