In [3]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import PolynomialFeatures
from imblearn.over_sampling import SMOTE
import pickle
from lightgbm.sklearn import LGBMClassifier

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [4]:
# Read the two cleaned CSV exports: one frame for training, one for testing.
# low_memory=False disables pandas' chunked type inference for these files.
data_train = pd.read_csv("cleaned_2013_14", low_memory=False)
data_test = pd.read_csv("cleaned_2015", low_memory=False)

In [5]:
# Discard every test row that contains at least one missing value.
# NOTE(review): the training frame is not filtered the same way here -
# presumably it has no missing values already; confirm.
data_test = data_test.dropna(axis=0, how="any")

# Element-wise comparison of the column labels of both frames (same order expected).
data_test.columns == data_train.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True])

In [6]:
def split_data(df, cols, target="paid"):
    """Split a frame into a feature matrix and a target vector.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified.
    cols : list of str
        Column labels to remove from the feature matrix (the caller is
        responsible for including the target column here if it must not
        appear among the features).
    target : str, default "paid"
        Name of the column returned as the target vector.

    Returns
    -------
    (x, y) : tuple
        ``x`` is ``df`` without ``cols``; ``y`` is ``df[target]``.

    Raises
    ------
    KeyError
        If any label in ``cols`` or ``target`` is not a column of ``df``.
    """
    x = df.drop(columns=cols)
    # Bracket indexing instead of attribute access: works for any column
    # name and cannot be shadowed by a DataFrame attribute or method.
    y = df[target]
    return x, y

# Columns removed from the feature matrix before modelling; the list
# includes the target column 'paid' itself.
# NOTE(review): the remaining entries look like outcome-derived or
# identifier fields - confirm against the data dictionary.
cols_to_drop_training = [
    'loan_status',
    'paid',
    'amnt',
    'total_pymnt',
    'term_adj',
    'zip_code',
]

# Apply the identical split to the training and the test frame.
x_train_initial, y_train_initial = split_data(df=data_train, cols=cols_to_drop_training)
x_test, y_test = split_data(df=data_test, cols=cols_to_drop_training)

In [7]:
# Oversample the minority class with SMOTE until it matches the majority
# class in size (sampling_strategy=1.0 is the minority/majority ratio after
# resampling). `sampling_strategy` / `fit_resample` replace the deprecated
# `ratio` / `fit_sample` names, which newer imbalanced-learn releases removed.
# NOTE(review): the resampled data is later fed to cross-validation, so
# synthetic points derived from a row can end up in a different fold than
# that row; consider resampling inside the CV loop (imblearn Pipeline).
sm = SMOTE(random_state=1, sampling_strategy=1.0)
x_train, y_train = sm.fit_resample(x_train_initial, y_train_initial)

In [8]:
# Display (rows, columns) of the resampled training features and the test features.
(x_train.shape, x_test.shape)

((578050, 44), (346843, 44))

In [9]:
# Hyper-parameter grid explored for the LightGBM classifier.
params = {
    'learning_rate': [0.05, 0.01, 0.005, 0.001],
    'boosting_type': ['gbdt', 'dart'],
    # Use the canonical `num_leaves` name: `max_leaves` is only an alias, and
    # the scikit-learn wrapper always passes its own `num_leaves` value, which
    # takes priority over an alias - so a grid over `max_leaves` would be ignored.
    'num_leaves': [2500, 5000, 7500, 10000, 12500, 14000, 16300],
}

In [10]:
# Use Pipeline instead of make_pipeline
# Two-step pipeline: degree-2 polynomial feature expansion followed by LightGBM.
# The search grid is NOT passed to the classifier constructor: its first
# positional argument is `boosting_type`, so a dict there is a bug.
# NOTE(review): LightGBM documents that bagging is only enabled when
# `bagging_freq` is non-zero; with the default of 0 the `bagging_fraction`
# below presumably has no effect - confirm and set `bagging_freq` if intended.
pipe = Pipeline([
    ('POLY', PolynomialFeatures(degree=2, include_bias=False)),
    ('LGBM', LGBMClassifier(objective='binary', metric='binary_logloss',
                            n_jobs=5, bagging_fraction=0.7)),
])


# lgbm = pipe.fit(x_train, y_train)

# GridSearchCV addresses parameters of a pipeline step as '<step>__<param>'.
# Prefix every bare key with the LightGBM step name; keys that already carry
# a step prefix are left untouched.
param_grid = {
    (name if '__' in name else 'LGBM__' + name): values
    for name, values in params.items()
}

# Instantiate the grid search (4-fold CV).
# NOTE(review): no `scoring` is given, so candidates are ranked by the
# estimator's default score (accuracy) while the notebook later reports
# ROC AUC; consider scoring='roc_auc'.
gs = GridSearchCV(pipe, param_grid, verbose=10, cv=4)

In [13]:
# Run the cross-validated grid search on the resampled training data.
gs.fit(X=x_train, y=y_train)

NameError: name 'x_train' is not defined

In [None]:
# Report the winning parameter combination, then mean +/- 2*std of the CV
# score for every candidate in the grid.
print("Best parameters set found on training set:")
print()
print(gs.best_params_)
print()
print("Grid scores on training set:")
print()
means = gs.cv_results_['mean_test_score']
stds = gs.cv_results_['std_test_score']
# The loop variable is deliberately NOT called `params`: that name holds the
# notebook-level hyper-parameter grid, and rebinding it here would silently
# corrupt any later re-run of the grid-search cells.
for mean, std, candidate in zip(means, stds, gs.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, candidate))
print()

In [None]:
# Pair the best estimator's probabilities from column 1 of predict_proba
# with the true labels, for the training set and for the test set.
train_preds_true = dict(
    proba=gs.best_estimator_.predict_proba(x_train)[:, 1],
    true_val=y_train,
)
test_preds_true = dict(
    proba=gs.best_estimator_.predict_proba(x_test)[:, 1],
    true_val=y_test,
)

In [None]:
# ROC AUC from true labels and predicted probabilities, per data set.
AUC_train = roc_auc_score(
    y_true=train_preds_true['true_val'],
    y_score=train_preds_true['proba'],
)
AUC_test = roc_auc_score(
    y_true=test_preds_true['true_val'],
    y_score=test_preds_true['proba'],
)

In [None]:
# Show both AUC values side by side to compare train and test performance.
print('AUC Train: ', AUC_train)
print('AUC Test: ', AUC_test)

In [None]:
# NOTE(review): disabled legacy evaluation, kept for reference only. It relies
# on `lgbm`, which is assigned only in commented-out code in this notebook.
# If revived: roc_auc_score expects (y_true, y_score), whereas the calls below
# pass the predictions first; AUC should also be computed from probabilities
# rather than thresholded class labels.
# ## Predict
# train_predictions = lgbm.predict(x_train)
# test_predictions = lgbm.predict(x_test)

# ## Accuracy Score
# train_score = lgbm.score(x_train, y_train)
# test_score = lgbm.score(x_test, y_test)

# print("train accuracy: ", train_score)
# print("test accuracy: ", test_score)

# ## AUC
# train_predictions_class = np.where(train_predictions > 0.5, 1, 0)
# test_predictions_class = np.where(test_predictions > 0.5, 1, 0)

# print('Train ROC AUC: ', roc_auc_score(train_predictions_class, y_train))
# print('Test  ROC AUC: ', roc_auc_score(test_predictions_class, y_test))

In [None]:
# NOTE(review): disabled ROC-curve plot, kept for reference only. It relies on
# `lgbm`, which is assigned only in commented-out code in this notebook.
# If revived: `predict` on a classifier presumably yields class labels, which
# gives a degenerate curve - feed predict_proba(...)[:, 1] to roc_curve instead,
# and move the imports to the import cell at the top.
# from sklearn.metrics import roc_curve
# y_pred = lgbm.predict(x_test).ravel()
# fpr, tpr, thresholds = roc_curve(y_test, y_pred)

# from sklearn.metrics import auc
# auc_lgbm = auc(fpr, tpr)

# plt.figure(1)
# plt.plot([0, 1], [0, 1], 'k--')
# plt.plot(fpr, tpr, label='LGBM (area = {:.3f})'.format(auc_lgbm))
# plt.xlabel('False positive rate')
# plt.ylabel('True positive rate')
# plt.title('ROC curve')
# plt.legend(loc='best')
# plt.show()

In [None]:
# Dump the tuned classifier with Pickle
pkl_filename = 'Tuned_LGBM.pkl'

# Write the best pipeline found by the grid search to disk. The `with` block
# guarantees the file handle is closed even if pickling raises.
# NOTE: only load this file back with pickle from a trusted location -
# unpickling executes arbitrary code.
with open(pkl_filename, 'wb') as model_pkl:
    pickle.dump(gs.best_estimator_, model_pkl)