# Set up

In [8]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import AdaBoostClassifier

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV

import pickle

In [2]:
train = pd.read_csv('../lipika/cleaned_2013_14')
test = pd.read_csv('../lipika/cleaned_2015')

# Run model

In [5]:
def split_data(df, cols):
    x = df.drop(cols, axis = 1)
    y = df.paid
    return x, y

cols_to_drop_training = ['loan_status', 'paid', 'amnt', 'total_pymnt', 'term_adj', 'zip_code']
x_train_initial, y_train_initial = split_data(train, cols_to_drop_training)

# drop NAs from test
test_noNAs = test.dropna()
x_test, y_test = split_data(test_noNAs, cols_to_drop_training)

In [6]:
sm = SMOTE(random_state=1, ratio = 1.0)
x_train, y_train = sm.fit_sample(x_train_initial, y_train_initial)

In [20]:
def adaboost_model(x, y, dtree_depth=3, n_est=50, lr=0.1):
    model = make_pipeline(
    PolynomialFeatures(degree=2, include_bias=False),
    AdaBoostClassifier(DecisionTreeClassifier(max_depth=dtree_depth), n_estimators=n_est, learning_rate=lr))
    model.fit(x, y)
    return model

In [21]:
ada1 = adaboost_model(x_train, y_train)

In [31]:
ada1_class_acc_train = ada1.score(x_train, y_train)
ada1_class_acc_test = ada1.score(x_test, y_test)
ada1_auc_train = roc_auc_score(y_train, ada1.predict(x_train))
ada1_auc_test = roc_auc_score(y_test, ada1.predict(x_test))

In [32]:
print('Classification accuracy rate (train):', ada1_class_acc_train)
print('Classification accuracy rate (test):', ada1_class_acc_test)
print('AUC (train):', ada1_auc_train)
print('AUC (test):', ada1_auc_test)

Classification accuracy rate (train): 0.7942669319263039
Classification accuracy rate (test): 0.6575107469373751
AUC (train): 0.7942669319263038
AUC (test): 0.6092821309181438


In [35]:
# Open the file to save as pkl file
adaboost_pkl = open('Adaboost1.pkl', 'wb')
pickle.dump(ada1, adaboost_pkl)
# Close the pickle instances
adaboost_pkl.close()

# Adaboost and Grid Search

In [13]:
param_grid = {"base_estimator__max_depth": [3, 5, 10],
              "n_estimators": [50, 100],
              "learning_rate": [0.001, 0.01, 0.1]
             }
adaboost_clf = AdaBoostClassifier(DecisionTreeClassifier())
ada_gs = GridSearchCV(adaboost_clf, param_grid, scoring = 'roc_auc', cv=5, verbose=2)

In [14]:
ada_gs.fit(x_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 18 candidates, totalling 54 fits
[CV] base_estimator__max_depth=3, learning_rate=0.001, n_estimators=50 
[CV]  base_estimator__max_depth=3, learning_rate=0.001, n_estimators=50, total= 1.6min
[CV] base_estimator__max_depth=3, learning_rate=0.001, n_estimators=50 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.6min remaining:    0.0s


[CV]  base_estimator__max_depth=3, learning_rate=0.001, n_estimators=50, total= 1.5min
[CV] base_estimator__max_depth=3, learning_rate=0.001, n_estimators=50 
[CV]  base_estimator__max_depth=3, learning_rate=0.001, n_estimators=50, total= 1.9min
[CV] base_estimator__max_depth=3, learning_rate=0.001, n_estimators=100 
[CV]  base_estimator__max_depth=3, learning_rate=0.001, n_estimators=100, total= 3.1min
[CV] base_estimator__max_depth=3, learning_rate=0.001, n_estimators=100 
[CV]  base_estimator__max_depth=3, learning_rate=0.001, n_estimators=100, total= 2.9min
[CV] base_estimator__max_depth=3, learning_rate=0.001, n_estimators=100 
[CV]  base_estimator__max_depth=3, learning_rate=0.001, n_estimators=100, total=11.5min
[CV] base_estimator__max_depth=3, learning_rate=0.01, n_estimators=50 
[CV]  base_estimator__max_depth=3, learning_rate=0.01, n_estimators=50, total= 1.5min
[CV] base_estimator__max_depth=3, learning_rate=0.01, n_estimators=50 
[CV]  base_estimator__max_depth=3, learning

[CV]  base_estimator__max_depth=10, learning_rate=0.1, n_estimators=100, total= 9.2min


[Parallel(n_jobs=1)]: Done  54 out of  54 | elapsed: 536.6min finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
          learning_rate=1.0, n_estimators=50, random_state=None),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'base_estimator__max_depth': [3, 5, 10], 'n_estimators': [50, 100], 'learning_rate': [0.001, 0.01, 0.1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=2)

In [20]:
adags_class_acc_train = ada_gs.score(x_train, y_train)
adags_class_acc_test = ada_gs.score(x_test, y_test)
adags_auc_train = roc_auc_score(y_train, ada_gs.predict(x_train))
adags_auc_test = roc_auc_score(y_test, ada_gs.predict(x_test))

In [21]:
print('Grid Search Adaboost')
print('Classification accuracy rate (train):', adags_class_acc_train)
print('Classification accuracy rate (test):', adags_class_acc_test)
print('AUC (train):', adags_auc_train)
print('AUC (test):', adags_auc_test)

Grid Search Adaboost
Classification accuracy rate (train): 0.9604228159210939
Classification accuracy rate (test): 0.6220354836307184
AUC (train): 0.896216590260358
AUC (test): 0.56200382881067


In [18]:
# Open the file to save as pkl file
adaboost_gs_pkl = open('AdaboostGS.pkl', 'wb')
pickle.dump(ada_gs, adaboost_gs_pkl)
# Close the pickle instances
adaboost_gs_pkl.close()