# GridSearchCV
#### GridSearchCV 란?
###### 사이킷런의 GridSearchCV는 분류 알고리즘이나 회귀 알고리즘에 사용되는 하이퍼파라미터를 순차적으로 입력해 학습하고 성능을 측정하면서 가장 좋은 파라미터 조합을 알려준다.
###### GridSearchCV가 없다면 max_depth가 3일 때 최적의 스코어가 나오는지, 1일 때 최적의 스코어가 나오는지 일일이 학습해서 확인해야 한다.
###### 하지만 param_grid에 후보 값들의 집합을 만들어 적용하면 그중 최적의 파라미터를 자동으로 뽑아낼 수 있다.

#### GridSearchCV 클래스의 생성자 정리

###### -estimator : classifier, regressor, pipeline 등 가능

###### -param_grid : 튜닝에 사용할 파라미터 이름과 후보 값들을 dictionary 형태로 만들어서 넣는다.

###### -scoring : 예측 성능을 측정할 평가 방법을 넣는다. 보통 accuracy 로 지정하여서 정확도로 성능 평가를 한다.

###### -cv : 교차 검증에서 몇개로 분할되는지 지정한다.

###### -refit : 디폴트는 True이며, True이면 최적의 하이퍼파라미터를 찾은 뒤 그 파라미터로 estimator를 재학습시킨다.

In [3]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score,cross_validate
import numpy as np

# Baseline: score a single decision tree on iris with 3-fold cross-validation.
iris = load_iris()
features = iris.data
label = iris.target

# random_state pins the estimator's random seed.
dt_clf = DecisionTreeClassifier(random_state=156)

scores = cross_val_score(dt_clf, features, label, scoring='accuracy', cv=3)
print('교차 검증별 정확도:', scores)  # accuracy of each fold
print('평균 검증 정확도:', np.round(scores.mean(), 4))  # mean accuracy over the folds

교차 검증별 정확도: [0.98 0.94 0.98]
평균 검증 정확도: 0.9667


In [10]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score

# Hold out 20% of iris as a test set; random_state pins the split.
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.2,
    random_state=121,
)

# NOTE(review): no random_state is passed here, unlike the earlier
# DecisionTreeClassifier(random_state=156) — confirm whether run-to-run
# reproducibility matters for this example.
dt_clf = DecisionTreeClassifier()

# Hyperparameter candidates are given as a dictionary:
#   key   : name of a decision-tree hyperparameter
#   value : list of values to try for that hyperparameter
parameters = {
    'max_depth': [1, 2, 3],
    'min_samples_split': [2, 3],
}

In [12]:
import pandas as pd
# Try every combination in `parameters` with 3-fold cross-validation.
# refit=True asks for a final refit with the best combination found, and
# return_train_score=True adds the train-score columns to cv_results_.
grid_tree = GridSearchCV(
    estimator=dt_clf,
    param_grid=parameters,
    cv=3,
    refit=True,
    return_train_score=True,
)
grid_tree.fit(X_train, y_train)

# Tabulate the per-combination results for easier inspection.
scores_df = pd.DataFrame(grid_tree.cv_results_)
scores_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,0.0,0.0,0.0,0.0,1,2,"{'max_depth': 1, 'min_samples_split': 2}",0.7,0.7,0.7,0.7,1.110223e-16,5,0.7,0.7,0.7,0.7,1.110223e-16
1,0.000334,0.000472,0.000333,0.000471,1,3,"{'max_depth': 1, 'min_samples_split': 3}",0.7,0.7,0.7,0.7,1.110223e-16,5,0.7,0.7,0.7,0.7,1.110223e-16
2,0.0,0.0,0.0,0.0,2,2,"{'max_depth': 2, 'min_samples_split': 2}",0.925,1.0,0.95,0.958333,0.03118048,3,0.975,0.9375,0.9625,0.958333,0.01559024
3,0.000333,0.000471,0.0,0.0,2,3,"{'max_depth': 2, 'min_samples_split': 3}",0.925,1.0,0.95,0.958333,0.03118048,3,0.975,0.9375,0.9625,0.958333,0.01559024
4,0.000667,0.000472,0.000332,0.000469,3,2,"{'max_depth': 3, 'min_samples_split': 2}",0.975,1.0,0.95,0.975,0.02041241,1,0.9875,0.9625,0.9875,0.979167,0.01178511
5,0.0,0.0,0.0,0.0,3,3,"{'max_depth': 3, 'min_samples_split': 3}",0.975,1.0,0.95,0.975,0.02041241,1,0.9875,0.9625,0.9875,0.979167,0.01178511


In [13]:
# Display the raw cv_results_ object that scores_df was built from.
grid_tree.cv_results_

{'mean_fit_time': array([0.        , 0.00033355, 0.        , 0.00033339, 0.00066694,
        0.        ]),
 'std_fit_time': array([0.        , 0.00047171, 0.        , 0.00047148, 0.00047161,
        0.        ]),
 'mean_score_time': array([0.        , 0.00033323, 0.        , 0.        , 0.00033156,
        0.        ]),
 'std_score_time': array([0.        , 0.00047126, 0.        , 0.        , 0.0004689 ,
        0.        ]),
 'param_max_depth': masked_array(data=[1, 1, 2, 2, 3, 3],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_min_samples_split': masked_array(data=[2, 3, 2, 3, 2, 3],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'max_depth': 1, 'min_samples_split': 2},
  {'max_depth': 1, 'min_samples_split': 3},
  {'max_depth': 2, 'min_samples_split': 2},
  {'max_depth': 2, 'min_samples_split': 3},
  {'max_depth': 3, 'min_sample

In [14]:
# Show only the parameter combination, its mean test score, and its rank.
scores_df[['params','mean_test_score','rank_test_score']]

Unnamed: 0,params,mean_test_score,rank_test_score
0,"{'max_depth': 1, 'min_samples_split': 2}",0.7,5
1,"{'max_depth': 1, 'min_samples_split': 3}",0.7,5
2,"{'max_depth': 2, 'min_samples_split': 2}",0.958333,3
3,"{'max_depth': 2, 'min_samples_split': 3}",0.958333,3
4,"{'max_depth': 3, 'min_samples_split': 2}",0.975,1
5,"{'max_depth': 3, 'min_samples_split': 3}",0.975,1


In [16]:
# Print the best-performing parameter combination and its
# top-ranked score from the grid search.
print('최적 파라미터: ', grid_tree.best_params_)
print('최고 정확도:', grid_tree.best_score_)

최적 파라미터:  {'max_depth': 3, 'min_samples_split': 2}
최고 정확도: 0.975


In [18]:
# Applies when the GridSearchCV object was created with refit=True
# (the default): take the best estimator it exposes, predict on the
# held-out test set, and display the resulting accuracy.
best_dt = grid_tree.best_estimator_
pred = best_dt.predict(X_test)
accuracy_score(y_test, pred)

0.9666666666666667

#### 참고
###### fit : 학습(train)할 때 사용
###### predict : 예측할 때 사용