<a href="https://colab.research.google.com/github/aluqbnle/ml-sandbox/blob/master/17_Hyper_Parameter__Tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### グリッドサーチ

In [0]:
# Candidate values for two toy hyper-parameters used to illustrate grid search.
# param_a: 0.0, 0.2, ..., 1.0 (six evenly spaced values).
param_a = [step / 5 for step in range(6)]
# param_b: 1, 10, 100, 1000 (powers of ten).
param_b = [10 ** exponent for exponent in range(4)]

In [30]:
# Grid search visits every (a, b) pair: each value of param_a combined with each value of param_b.
grid_points = [(a, b) for a in param_a for b in param_b]
for a, b in grid_points:
  print(f"a={a},b={b}")

a=0.0,b=1
a=0.0,b=10
a=0.0,b=100
a=0.0,b=1000
a=0.2,b=1
a=0.2,b=10
a=0.2,b=100
a=0.2,b=1000
a=0.4,b=1
a=0.4,b=10
a=0.4,b=100
a=0.4,b=1000
a=0.6,b=1
a=0.6,b=10
a=0.6,b=100
a=0.6,b=1000
a=0.8,b=1
a=0.8,b=10
a=0.8,b=100
a=0.8,b=1000
a=1.0,b=1
a=1.0,b=10
a=1.0,b=100
a=1.0,b=1000


### ランダムサーチ

In [0]:
import numpy
numpy.random.seed(0) # fix the global NumPy random seed so the samples drawn below are reproducible

In [32]:
# Random search: instead of enumerating a grid, draw each candidate at random.
# a is a float from numpy.random.random(); b is an integer from numpy.random.randint(1, 1001).
n_trials = 20
for _ in range(n_trials):
  # Draw a before b so the sequence of random numbers consumed stays the same.
  a, b = numpy.random.random(), numpy.random.randint(1, 1001)
  print(f"a={a},b={b}")

a=0.5488135039273248,b=630
a=0.8442657485810173,b=764
a=0.5448831829968969,b=10
a=0.6235636967859723,b=755
a=0.4375872112626925,b=71
a=0.05671297731744318,b=397
a=0.3834415188257777,b=487
a=0.8121687287754932,b=175
a=0.5680445610939323,b=678
a=0.8360787635373775,b=73
a=0.08712929970154071,b=116
a=0.36824153984054797,b=710
a=0.7781567509498505,b=432
a=0.8700872583584364,b=100
a=0.7991585642167236,b=756
a=0.5204774795512048,b=148
a=0.11827442586893322,b=289
a=0.5820197920751071,b=698
a=0.9446689170495839,b=544
a=0.10590760718779213,b=152


## パラメータ探索の実験

In [0]:
from pandas import DataFrame
from sklearn.datasets import load_breast_cancer

In [0]:
# Load the scikit-learn breast-cancer dataset and keep only its first 10 feature columns.
breast_cancer = load_breast_cancer()
X = breast_cancer.data[:, :10]
y = breast_cancer.target

# Hand-written labels for those 10 columns.
# NOTE(review): 'Circumference', 'dent' and 'number_of_dent' presumably stand for the
# dataset's perimeter, concavity and concave-points features - confirm against
# breast_cancer.feature_names before renaming; a later cell selects 'area' and 'dent' by name.
columns = [
    'radius', 'texture', 'Circumference', 'area', 'smoothness',
    'compactness', 'dent', 'number_of_dent', 'Symmetry', 'fractal dimension',
]

# X already holds exactly these 10 columns, so it is passed through unchanged.
df = DataFrame(X, columns=columns)
# Append the class labels as the target column.
df['objective_values'] = y

In [0]:
# Restrict the model inputs to two features; X and y are rebound to NumPy arrays taken from df.
feature_columns = ['area', 'dent']
X = df[feature_columns].values
y = df['objective_values'].values

In [0]:
from sklearn.model_selection import train_test_split

In [0]:
# Hold out 30% of the samples as a test set; random_state pins the split so re-runs match.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

In [0]:
import numpy
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold

### グリッドサーチ

In [0]:
from sklearn.model_selection import GridSearchCV

In [0]:
# Search space for the grid search: 2 depths x 5 forest sizes = 10 candidates.
param_grid = dict(
    max_depth=[1, 2],
    # 10, 15, 20, 25, 30
    n_estimators=list(range(10, 31, 5)),
)

In [0]:
# Base model: a random forest using the Gini criterion, seeded for reproducibility.
grid_estimator = RandomForestClassifier(criterion='gini', random_state=42)
# Cross-validation splitter: 10 shuffled, stratified folds with a fixed seed.
grid_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Exhaustive search over every combination in param_grid, scored by accuracy.
gs = GridSearchCV(
    estimator=grid_estimator,
    # the parameter ranges defined above
    param_grid=param_grid,
    scoring='accuracy',
    cv=grid_cv,
    # also keep training-fold scores so they can be compared with the validation scores
    return_train_score=True,
)

In [42]:
# Run the grid search on the training split only; the test split stays untouched until the final evaluation.
gs.fit(X_train,y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=42, shuffle=True),
       error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_depth': [1, 2], 'n_estimators': [10, 15, 20, 25, 30]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=0)

In [43]:
# Parameter combination selected by the grid search.
gs.best_params_

{'max_depth': 2, 'n_estimators': 20}

In [44]:
# Cross-validation score (scoring='accuracy') achieved by the selected combination.
gs.best_score_

0.9120603015075377

In [47]:
# Tabulate the per-candidate cross-validation results and show only the searched
# parameters next to their mean train / validation scores.
df_grid_result = DataFrame(gs.cv_results_)
grid_summary_columns = [
    'param_max_depth',
    'param_n_estimators',
    'mean_train_score',
    'mean_test_score',
]
df_grid_result[grid_summary_columns]

Unnamed: 0,param_max_depth,param_n_estimators,mean_train_score,mean_test_score
0,1,10,0.886656,0.884422
1,1,15,0.888043,0.88191
2,1,20,0.883304,0.879397
3,1,25,0.888602,0.879397
4,1,30,0.888327,0.879397
5,2,10,0.916248,0.904523
6,2,15,0.918757,0.907035
7,2,20,0.923504,0.91206
8,2,25,0.924623,0.91206
9,2,30,0.926297,0.91206


In [48]:
# Take the best-performing model found by the grid search
clf = gs.best_estimator_
# and compute its accuracy on the held-out test split with the estimator's score method.
clf.score(X=X_test, y=y_test)

0.9298245614035088

### ランダムサーチ

In [0]:
from sklearn.model_selection import RandomizedSearchCV

In [0]:
from scipy.stats import randint

In [0]:
# Sampling distributions for the randomized search (scipy.stats.randint, imported above).
# The upper bound is exclusive, so these cover max_depth in {1, 2} and n_estimators in 10..30,
# the same ranges spanned by the grid search.
param_dist = dict(
    max_depth=randint(1, 3),
    n_estimators=randint(10, 31),
)

In [0]:
# Base model: a random forest using the Gini criterion, seeded for reproducibility.
random_estimator = RandomForestClassifier(criterion='gini', random_state=42)
# Cross-validation splitter: 10 shuffled, stratified folds with a fixed seed.
random_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Randomized search: sample candidates from param_dist instead of enumerating a grid.
rs = RandomizedSearchCV(
    estimator=random_estimator,
    # the parameter distributions defined above
    param_distributions=param_dist,
    scoring='accuracy',
    cv=random_cv,
    # number of sampled candidates: 10
    n_iter=10,
    # also keep training-fold scores so they can be compared with the validation scores
    return_train_score=True,
    # seed for the candidate sampling, so the same 10 candidates are drawn on re-runs
    random_state=42,
)

In [56]:
# Run the randomized search on the training split only; the test split stays untouched until the final evaluation.
rs.fit(X_train,y_train)



RandomizedSearchCV(cv=StratifiedKFold(n_splits=10, random_state=42, shuffle=True),
          error_score='raise-deprecating',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=10, n_jobs=None,
          param_distributions={'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fe3463547f0>, 'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fe3463542e8>},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score=True, scoring='accuracy', verbose=0)

In [57]:
# Parameter combination selected by the randomized search.
rs.best_params_

{'max_depth': 2, 'n_estimators': 30}

In [58]:
# Cross-validation score (scoring='accuracy') achieved by the selected combination.
rs.best_score_

0.9120603015075377

In [59]:
# Tabulate the per-candidate cross-validation results of the randomized search and show
# only the sampled parameters next to their mean train / validation scores.
df_random_result = DataFrame(rs.cv_results_)
random_summary_columns = [
    'param_max_depth',
    'param_n_estimators',
    'mean_train_score',
    'mean_test_score',
]
df_random_result[random_summary_columns]

Unnamed: 0,param_max_depth,param_n_estimators,mean_train_score,mean_test_score
0,1,29,0.891116,0.879397
1,1,24,0.88665,0.88191
2,1,17,0.888043,0.88191
3,1,30,0.888327,0.879397
4,1,28,0.888048,0.879397
5,1,20,0.883304,0.879397
6,1,30,0.888327,0.879397
7,2,17,0.918478,0.909548
8,2,12,0.917922,0.907035
9,2,30,0.926297,0.91206


In [60]:
# Take the best-performing model found by the randomized search
clf2 = rs.best_estimator_
# and compute its score on the held-out test split, for comparison with the grid-search model.
clf2.score(X=X_test, y=y_test)

0.9181286549707602