# Comparison between custom Random Forest and sklearn Random Forest

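In this notebook we compare the performance of a custom Random Forest implementation (`RandomForestClassifierCustom`) with scikit-learn's `RandomForestClassifier` on the <a href='https://www.kaggle.com/c/GiveMeSomeCredit/data#_=_'>Give Me Some Credit</a> credit scoring dataset from Kaggle. Both models are tuned with the same grid search and scored by cross-validated ROC AUC.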

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from ensemble import RandomForestClassifierCustom
from sklearn.model_selection import GridSearchCV, StratifiedKFold

In [2]:
def impute_nan_with_median(table):
    """Return a copy of ``table`` with missing numeric values median-imputed.

    Each numeric column has its NaN entries replaced by that column's median
    (computed over the non-missing values). Non-numeric columns are passed
    through unchanged instead of raising when a median cannot be computed.

    The input frame is not modified, so re-running the calling cell always
    starts from the same raw data.

    Parameters
    ----------
    table : pandas.DataFrame
        Frame that may contain missing values.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns and index as ``table``.
    """
    # Per-column medians, indexed by column name; non-numeric columns are skipped.
    medians = table.median(numeric_only=True)
    # fillna with a Series fills each column with the value under its name
    # and returns a new DataFrame rather than writing into ``table``.
    return table.fillna(medians)

In [3]:
# Read the credit-scoring CSV and fill its missing values with column medians.
data = pd.read_csv('credit_scoring_data.csv', sep=',')
table = impute_nan_with_median(data)

# Split into the target column and the remaining feature columns,
# converting both to NumPy arrays for the estimators below.
y = np.array(table['SeriousDlqin2yrs'])
X = np.array(table.drop(columns='SeriousDlqin2yrs'))

In [6]:
%time
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=48)
rd=RandomForestClassifier(random_state=48)
gridrd=GridSearchCV(estimator=rd,param_grid={'max_depth':range(3,10),
                                             'max_features':range(3,10)},
                  scoring='roc_auc',cv=skf,verbose=1)

gridrd.fit(X,y)

Fitting 5 folds for each of 49 candidates, totalling 245 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.














[Parallel(n_jobs=1)]: Done 245 out of 245 | elapsed: 12.7min finished


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=48, shuffle=True),
       error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=48, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_depth': range(3, 10), 'max_features': range(3, 10)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=1)

In [7]:
# A cell only displays its last expression, so show the winning parameters
# and their mean cross-validated ROC AUC together as one tuple.
gridrd.best_params_, gridrd.best_score_

0.8608288575576464

In [8]:
%time
rd_custom=RandomForestClassifierCustom(random_state=48)
gridrd_custom=GridSearchCV(estimator=rd_custom,param_grid={'max_depth':range(3,10),
                                             'max_features':range(3,10)},
                  scoring='roc_auc',cv=skf,verbose=0)

gridrd_custom.fit(X,y)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=48, shuffle=True),
       error_score='raise-deprecating',
       estimator=RandomForestClassifierCustom(max_depth=10, max_features=10, n_estimators=10,
               random_state=48),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_depth': range(3, 10), 'max_features': range(3, 10)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

In [9]:
# A cell only displays its last expression, so show the winning parameters
# and their mean cross-validated ROC AUC together as one tuple.
gridrd_custom.best_params_, gridrd_custom.best_score_

0.861610760179296