In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [7]:
import seaborn as sns
import matplotlib.pyplot as plt 

from sklearn.ensemble import RandomForestClassifier


In [5]:
# Load the training CSV (path is relative to the notebook's working directory)
# and preview a single randomly chosen row as the cell output.
train = pd.read_csv('./data/train.csv')
train.sample()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
31727,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Preprocessing

In [8]:
# Split the frame into the feature matrix (every column except 'label')
# and the target vector (the 'label' column).
# NOTE(review): the preview of `train` shows an 'Unnamed: 0' header. If that is a
# real column (e.g. a saved index) rather than a rendering artifact, it remains
# in `x` as a feature -- confirm against the CSV.
x = train.drop(columns='label')
y = train['label']

## Grid Search

In [9]:
# Base estimator shared by both searches below; the fixed random_state seeds
# the forest's own randomness so repeated fits are comparable.
forest = RandomForestClassifier(random_state=42)

In [10]:


# Candidate hyper-parameters for the exhaustive grid search.
# One sub-grid: 1 x 2 x 2 = 4 combinations.
param_grid = [
    dict(
        n_estimators=[5],
        # Fraction of features considered per split; the smaller the value,
        # the more randomness is injected into each tree.
        max_features=[0.3, 0.4],
        bootstrap=[True, False],
    ),
]

In [11]:
# Exhaustive search over `param_grid` using 2-fold cross-validation,
# scoring each candidate by accuracy.
grid_search = GridSearchCV(
    estimator=forest,
    param_grid=param_grid,
    cv=2,
    scoring='accuracy',
    # Whether to also record scores on the training folds. Usually not very
    # important, because the training score is high anyway. It becomes useful
    # when train and test scores differ a lot: a high score on train only means
    # the model was fitted to the training data alone (overfitting).
    return_train_score=True,
    n_jobs=-1,  # -1 means: use every available CPU core
    verbose=1,
)

In [13]:
# Run the cross-validated grid search on the full feature matrix and labels.
# NOTE(review): the recorded output below (8 candidates, n_estimators=10) does not
# match the current `param_grid` (4 candidates, n_estimators=5) -- it appears to come
# from an earlier version of the grid; re-run to refresh the outputs.
grid_search.fit(x,y)

Fitting 2 folds for each of 8 candidates, totalling 16 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 out of  16 | elapsed:  7.4min finished


GridSearchCV(cv=2, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False, random_state=42,
                                              verbose=0, warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid=[{'b

In [15]:
# Show the parameter combination that achieved the best mean cross-validated score.
grid_search.best_params_

{'bootstrap': False, 'max_features': 0.5, 'n_estimators': 10}

In [17]:
# Show the raw cross-validation results dictionary (timings, parameters and
# scores per candidate). The output is long; the next cell prints a compact view.
grid_search.cv_results_

{'mean_fit_time': array([ 54.59549248,  75.81872749,  90.19173241,  97.45577705,
         78.03102732, 104.69799519, 139.43310249, 150.95578575]),
 'std_fit_time': array([0.50913918, 1.02077341, 1.88044333, 0.94746959, 0.03021145,
        0.54055405, 0.77891815, 0.21292758]),
 'mean_score_time': array([0.44730294, 0.48869514, 0.37657213, 0.3744992 , 0.55351591,
        0.35953748, 0.39394665, 0.33659887]),
 'std_score_time': array([0.01246727, 0.02992082, 0.03647995, 0.0174526 , 0.14262223,
        0.00548589, 0.07978678, 0.00348878]),
 'param_bootstrap': masked_array(data=[True, True, True, True, False, False, False, False],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_max_features': masked_array(data=[0.5, 0.7, 0.9, None, 0.5, 0.7, 0.9, None],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_n_estimators': 

In [19]:
# Compact summary: one line per candidate with its mean test score and parameters.
cvres = grid_search.cv_results_
scores_with_params = zip(cvres["mean_test_score"], cvres["params"])

for mean_score, params in scores_with_params:
    print(mean_score, params)

0.9305 {'bootstrap': True, 'max_features': 0.5, 'n_estimators': 10}
0.9290952380952381 {'bootstrap': True, 'max_features': 0.7, 'n_estimators': 10}
0.9268571428571428 {'bootstrap': True, 'max_features': 0.9, 'n_estimators': 10}
0.9256190476190476 {'bootstrap': True, 'max_features': None, 'n_estimators': 10}
0.9388571428571428 {'bootstrap': False, 'max_features': 0.5, 'n_estimators': 10}
0.9302142857142857 {'bootstrap': False, 'max_features': 0.7, 'n_estimators': 10}
0.8856428571428572 {'bootstrap': False, 'max_features': 0.9, 'n_estimators': 10}
0.8488095238095238 {'bootstrap': False, 'max_features': None, 'n_estimators': 10}


## Random Search

In [18]:
from scipy.stats import uniform as sp_uniform # float uniform dist
from scipy.stats import randint as sp_randint # int uniform dist

In [26]:
# Sampling space for the randomized search.
# Lists are sampled uniformly; scipy frozen distributions are sampled via .rvs().
param_dist = dict(
    # scipy's uniform(loc, scale) covers [loc, loc + scale], i.e. [0.3, 0.8] here
    # (the second argument is a width, not an upper bound).
    max_features=sp_uniform(0.3, 0.5),
    bootstrap=[True, False],
    criterion=['gini', 'entropy'],
    n_estimators=[5],
    # scipy's randint(low, high) excludes `high`, so depths are drawn from 5..24.
    max_depth=sp_randint(5, 25),
)

In [27]:
# Randomized search: draw 5 candidates from `param_dist` and evaluate each with
# 2-fold cross-validation.
# - random_state pins the candidate sampling so the search is reproducible
#   (without it, a different set of 5 candidates is drawn on every run).
# - scoring is spelled out to match the grid search above; accuracy is also the
#   classifier's default score, so the ranking is unchanged.
random_search = RandomizedSearchCV(
    forest,
    param_dist,
    n_iter=5,
    cv=2,
    scoring='accuracy',
    random_state=42,
    verbose=1,
    n_jobs=-1,  # use every available CPU core
)

In [28]:
# Run the randomized search on the full feature matrix and labels.
random_search.fit(x,y)

Fitting 2 folds for each of 5 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  1.2min finished


RandomizedSearchCV(cv=2, error_score='raise-deprecating',
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    n_estimators='warn',
                                                    n_jobs=None

In [29]:
# Print the fitted depth of every tree in the best forest found by the
# randomized search, one depth per line.
best_forest = random_search.best_estimator_
for tree in best_forest.estimators_:
    depth = tree.get_depth()
    print(depth)

20
20
19
20
20
