In [1]:
import os
import datetime
import numpy as np
import pandas as pd

import seaborn as sns
sns.set()

import matplotlib.pyplot as plt
plt.style.use("ggplot")
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['font.sans-serif'] = ['SimHei']
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV

In [3]:
data = pd.read_csv('data/train_modified.csv')

target = 'Disbursed'
index = 'ID'
data.set_index(index, inplace=True)

x_train, x_test, y_train, y_test = train_test_split(data.drop(columns=[target]), data[target],
                                                    test_size=0.3, random_state=0)

In [4]:
from sklearn.ensemble import GradientBoostingClassifier

In [6]:
gbm = GradientBoostingClassifier()
gbm.fit(x_train, y_train)
roc_auc_score(y_test, gbm.predict_proba(x_test)[:, 1])

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

0.8296610665879788

In [9]:
params = {'n_estimators': range(10, 300, 10),
         'learning_rate': [0.0001, 0.0005, 0.001, 0.0015, 0.002, 0.005, 0.01, 0.05, 0.1, 0.2, 0.3, 0.5]}

gsearch = GridSearchCV(estimator=GradientBoostingClassifier(subsample=0.6,
                                                            max_features='sqrt',
                                                            max_depth=8,
                                                            min_samples_split=20,
                                                            min_samples_leaf=8,
                                                            max_leaf_nodes=20,
                                                            min_weight_fraction_leaf=0,
                                                            random_state=1),
                       param_grid=params,
                       scoring='roc_auc',
                       cv=5,
                       n_jobs=-1)
gsearch.fit(x_train, y_train)
gsearch.best_params_

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=8,
              max_features='sqrt', max_leaf_nodes=20,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=8, min_sampl...      subsample=0.6, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'n_estimators': range(10, 300, 10), 'learning_rate': [0.0001, 0.0005, 0.001, 0.0015, 0.002, 0.005, 0.01, 0.05, 0.1, 0.2, 0.3, 0.5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

{'learning_rate': 0.05, 'n_estimators': 190}

In [16]:
params = {'min_samples_split': range(2, 20, 2),
         'min_samples_leaf': range(1, 10, 1)}

gsearch = GridSearchCV(estimator=GradientBoostingClassifier(n_estimators=190,
                                                            learning_rate=0.05,
                                                            subsample=0.6,
                                                            max_features='sqrt',
                                                            max_depth=8,
                                                            max_leaf_nodes=20,
                                                            min_weight_fraction_leaf=0,
                                                            random_state=1),
                       param_grid=params,
                       scoring='roc_auc',
                       cv=5,
                       n_jobs=-1)
gsearch.fit(x_train, y_train)
gsearch.best_params_

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.05, loss='deviance', max_depth=8,
              max_features='sqrt', max_leaf_nodes=20,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samp...      subsample=0.6, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'min_samples_split': range(2, 20, 2), 'min_samples_leaf': range(1, 10)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

{'min_samples_leaf': 5, 'min_samples_split': 16}

In [17]:
params = {'max_depth': range(2, 100, 2)}

gsearch = GridSearchCV(estimator=GradientBoostingClassifier(n_estimators=190,
                                                            learning_rate=0.05,
                                                            subsample=0.6,
                                                            max_features='sqrt',
                                                            min_samples_split=16,
                                                            min_samples_leaf=5,
                                                            max_leaf_nodes=20,
                                                            min_weight_fraction_leaf=0,
                                                            random_state=1),
                       param_grid=params,
                       scoring='roc_auc',
                       cv=5,
                       n_jobs=-1)
gsearch.fit(x_train, y_train)
gsearch.best_params_

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.05, loss='deviance', max_depth=3,
              max_features='sqrt', max_leaf_nodes=20,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=5, min_samp...      subsample=0.6, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'max_depth': range(2, 100, 2)}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring='roc_auc', verbose=0)

{'max_depth': 16}

In [22]:
params = {'max_features': range(12, 20, 1)}

gsearch = GridSearchCV(estimator=GradientBoostingClassifier(n_estimators=190,
                                                            learning_rate=0.05,
                                                            subsample=0.6,
                                                            min_samples_split=16,
                                                            min_samples_leaf=5,
                                                            max_depth=16,
                                                            max_leaf_nodes=20,
                                                            min_weight_fraction_leaf=0,
                                                            random_state=1),
                       param_grid=params,
                       scoring='roc_auc',
                       cv=5,
                       n_jobs=-1)
gsearch.fit(x_train, y_train)
gsearch.best_params_

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.05, loss='deviance', max_depth=16,
              max_features=None, max_leaf_nodes=20,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=5, min_sampl...      subsample=0.6, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'max_features': range(12, 20)}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring='roc_auc', verbose=0)

{'max_features': 16}

In [23]:
params = {'subsample': [i/10 for i in range(5, 11, 1)]}

gsearch = GridSearchCV(estimator=GradientBoostingClassifier(n_estimators=190,
                                                            learning_rate=0.05,
                                                            max_features=16,
                                                            min_samples_split=16,
                                                            min_samples_leaf=5,
                                                            max_depth=16,
                                                            max_leaf_nodes=20,
                                                            min_weight_fraction_leaf=0,
                                                            random_state=1),
                       param_grid=params,
                       scoring='roc_auc',
                       cv=5,
                       n_jobs=-1)
gsearch.fit(x_train, y_train)
gsearch.best_params_

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.05, loss='deviance', max_depth=16,
              max_features=16, max_leaf_nodes=20,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=5, min_samples...      subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

{'subsample': 0.7}

In [24]:
params = {'n_estimators': range(150, 400, 10),
         'learning_rate': [0.0001, 0.0005, 0.001, 0.0015, 0.002, 0.005, 0.01, 0.05, 0.1, 0.2, 0.3, 0.5]}

gsearch = GridSearchCV(estimator=GradientBoostingClassifier(subsample=0.7,
                                                            max_features=16,
                                                            min_samples_split=16,
                                                            min_samples_leaf=5,
                                                            max_depth=16,
                                                            max_leaf_nodes=20,
                                                            min_weight_fraction_leaf=0,
                                                            random_state=1),
                       param_grid=params,
                       scoring='roc_auc',
                       cv=5,
                       n_jobs=-1)
gsearch.fit(x_train, y_train)
gsearch.best_params_

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=16,
              max_features=16, max_leaf_nodes=20,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=5, min_samples_...      subsample=0.7, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'n_estimators': range(150, 400, 10), 'learning_rate': [0.0001, 0.0005, 0.001, 0.0015, 0.002, 0.005, 0.01, 0.05, 0.1, 0.2, 0.3, 0.5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

{'learning_rate': 0.05, 'n_estimators': 220}

In [25]:
gbm_best = GradientBoostingClassifier(n_estimators=220, 
                                      learning_rate=0.05,
                                      subsample=0.7,
                                      max_features=16,
                                      min_samples_split=16,
                                      min_samples_leaf=5,
                                      max_depth=16,
                                      max_leaf_nodes=20,
                                      min_weight_fraction_leaf=0,
                                      random_state=1)
gbm_best.fit(x_train, y_train)
roc_auc_score(y_test, gbm_best.predict_proba(x_test)[:, 1])

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.05, loss='deviance', max_depth=16,
              max_features=16, max_leaf_nodes=20,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=5, min_samples_split=16,
              min_weight_fraction_leaf=0, n_estimators=220,
              n_iter_no_change=None, presort='auto', random_state=1,
              subsample=0.7, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

0.8350659829504987