In [1]:
import os
import datetime
import numpy as np
import pandas as pd

import seaborn as sns
sns.set()

import matplotlib.pyplot as plt
plt.style.use("ggplot")
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['font.sans-serif'] = ['SimHei']
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV

In [3]:
data = pd.read_csv('data/train_modified.csv')

In [4]:
target = 'Disbursed'
index = 'ID'
data.set_index(index, inplace=True)

x_train, x_test, y_train, y_test = train_test_split(data.drop(columns=[target]), data[target],
                                                    test_size=0.3, random_state=0)

In [6]:
rf = RandomForestClassifier()
rf.fit(x_train, y_train)
roc_auc_score(y_test, rf.predict_proba(x_test)[:, 1])



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

0.6086747217216322

In [35]:
params = {'n_estimators': range(261, 280, 1)}

gsearch = GridSearchCV(estimator=RandomForestClassifier(max_features=None,
                                                        max_depth=5,
                                                        min_samples_split=8,
                                                        min_samples_leaf=2,
                                                        max_leaf_nodes=20,
                                                        random_state=1),
                       param_grid=params,
                       scoring='roc_auc',
                       cv=5,
                       n_jobs=-1)
gsearch.fit(x_train, y_train)
gsearch.best_params_

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features=None, max_leaf_nodes=20,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=2, min_samples_split=8,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=1, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'n_estimators': range(261, 280)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

{'n_estimators': 273}

In [36]:
params = {'max_depth': range(1, 20, 1)}

gsearch = GridSearchCV(estimator=RandomForestClassifier(n_estimators=273,
                                                        max_features=None,
                                                        min_samples_split=8,
                                                        min_samples_leaf=2,
                                                        max_leaf_nodes=20,
                                                        random_state=1),
                       param_grid=params,
                       scoring='roc_auc',
                       cv=5,
                       n_jobs=-1)
gsearch.fit(x_train, y_train)
gsearch.best_params_

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=None, max_leaf_nodes=20,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=2, min_samples_split=8,
            min_weight_fraction_leaf=0.0, n_estimators=273, n_jobs=None,
            oob_score=False, random_state=1, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'max_depth': range(1, 20)}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring='roc_auc', verbose=0)

{'max_depth': 4}

In [8]:
params = {'min_samples_split': range(2, 20, 1), 
         'min_samples_leaf': range(1, 5, 1)}

gsearch = GridSearchCV(estimator=RandomForestClassifier(n_estimators=273,
                                                        max_features=None,
                                                        max_depth=4,
                                                        max_leaf_nodes=20,
                                                        random_state=1),
                       param_grid=params,
                       scoring='roc_auc',
                       cv=5,
                       n_jobs=-1)
gsearch.fit(x_train, y_train)
gsearch.best_params_

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=4, max_features=None, max_leaf_nodes=20,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=273, n_jobs=None,
            oob_score=False, random_state=1, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'min_samples_split': range(2, 20), 'min_samples_leaf': range(1, 5)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

{'min_samples_leaf': 1, 'min_samples_split': 14}

In [11]:
params = {'max_leaf_nodes': range(2, 40, 2)}

gsearch = GridSearchCV(estimator=RandomForestClassifier(n_estimators=273,
                                                        max_features=None,
                                                        max_depth=4,
                                                        min_samples_split=14,
                                                        min_samples_leaf=1,
                                                        random_state=1),
                       param_grid=params,
                       scoring='roc_auc',
                       cv=5,
                       n_jobs=-1)
gsearch.fit(x_train, y_train)
gsearch.best_params_

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=4, max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=14,
            min_weight_fraction_leaf=0.0, n_estimators=273, n_jobs=None,
            oob_score=False, random_state=1, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'max_leaf_nodes': range(2, 40, 2)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

{'max_leaf_nodes': 24}

In [12]:
x_train.shape

(60914, 49)

In [17]:
params = {'max_features': range(20, 25, 1)}

gsearch = GridSearchCV(estimator=RandomForestClassifier(n_estimators=273,
                                                        max_depth=4,
                                                        min_samples_split=14,
                                                        min_samples_leaf=1,
                                                        max_leaf_nodes=24,
                                                        random_state=1),
                       param_grid=params,
                       scoring='roc_auc',
                       cv=5,
                       n_jobs=-1)
gsearch.fit(x_train, y_train)
gsearch.best_params_

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=4, max_features='auto', max_leaf_nodes=24,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=14,
            min_weight_fraction_leaf=0.0, n_estimators=273, n_jobs=None,
            oob_score=False, random_state=1, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'max_features': range(20, 25)}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring='roc_auc', verbose=0)

{'max_features': 22}

In [19]:
rf_best = RandomForestClassifier(n_estimators=273,
                                 max_features=22,
                                 max_depth=4,
                                 min_samples_split=14,
                                 min_samples_leaf=1,
                                 max_leaf_nodes=24,
                                 random_state=1)
rf_best.fit(x_train, y_train)
roc_auc_score(y_test, rf_best.predict_proba(x_test)[:, 1])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=4, max_features=22, max_leaf_nodes=24,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=14,
            min_weight_fraction_leaf=0.0, n_estimators=273, n_jobs=None,
            oob_score=False, random_state=1, verbose=0, warm_start=False)

0.82303850811388