## 하이퍼파라미터

### 교차 검증
- cross validation
- 모델의 성능을 정확히 표현하기 위래서 사용

In [2]:
import numpy as np
from sklearn.model_selection import KFold

x = np.array([[1,2], [3, 4], [5,6], [7,8]])
y = np.array([1, 2, 3, 4])
kf = KFold(n_splits=3)

print(kf.get_n_splits(x))
print(kf)

for train_idx, test_idx in kf.split(x):
    print('-------idx---------')
    print(train_idx, test_idx)
    print('------train ----------')
    print(x[train_idx])
    print('--------val-------')
    print(x[test_idx])

3
KFold(n_splits=3, random_state=None, shuffle=False)
-------idx---------
[2 3] [0 1]
------train ----------
[[5 6]
 [7 8]]
--------val-------
[[1 2]
 [3 4]]
-------idx---------
[0 1 3] [2]
------train ----------
[[1 2]
 [3 4]
 [7 8]]
--------val-------
[[5 6]]
-------idx---------
[0 1 2] [3]
------train ----------
[[1 2]
 [3 4]
 [5 6]]
--------val-------
[[7 8]]


### 다시 와인

In [3]:
import pandas as pd

red_url = "https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/winequality-red.csv"
white_url = "https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/winequality-white.csv"

red_wine = pd.read_csv(red_url, sep=';')
white_wine = pd.read_csv(white_url, sep=';') ## ;을 기준으로 분류해서 읽어라

red_wine['color'] = 1.
white_wine['color'] = 0.

wine = pd.concat([red_wine, white_wine]) ## 아래로 이어 붙이기

x = wine.drop('color', axis= 1)
y = wine['color']

wine = pd.concat([red_wine, white_wine])

In [4]:
wine['taste'] = [1. if grade > 5 else 0. for grade in wine['quality']]

x = wine.drop(['taste', 'quality'], axis= 1)
y = wine['taste']

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

tr_dt, te_dt, tr_lb, te_lb = train_test_split(x, y, test_size=0.2,
                                              random_state=13)

wine_tree = DecisionTreeClassifier(max_depth=2, random_state=13)
wine_tree.fit(tr_dt, tr_lb)

pred_tr = wine_tree.predict(tr_dt)
pred_te = wine_tree.predict(te_dt)

print('train acc : ', accuracy_score(tr_lb, pred_tr))
print('te acxc : ', accuracy_score(te_lb, pred_te))

train acc :  0.7294593034442948
te acxc :  0.7161538461538461


### KFOLD

In [6]:
from sklearn.model_selection import KFold

kfold = KFold(n_splits=5)
wine_tree_cv = DecisionTreeClassifier(max_depth=2, random_state=13)

In [10]:
for train_idx, test_idx in kfold.split(x):
    print(len(train_idx), len(test_idx))

5197 1300
5197 1300
5198 1299
5198 1299
5198 1299


### 각각의 fold에 대한 학습후 accuracy 확인

In [11]:
cv_accuracy = []

for train_idx, test_idx in kfold.split(x):
    tr_dt, te_dt = x.iloc[train_idx], x.iloc[test_idx]
    tr_lb, te_lb = y.iloc[train_idx], y.iloc[test_idx]
    wine_tree_cv.fit(tr_dt, tr_lb)
    pred = wine_tree_cv.predict(tr_dt)
    cv_accuracy.append(accuracy_score(tr_lb, pred))
cv_accuracy

[0.7442755435828362,
 0.748316336347893,
 0.7445171219699884,
 0.7314351673720662,
 0.7250865717583687]

In [12]:
np.mean(cv_accuracy)

0.7387261482062305

### stratified kfold

In [14]:
from sklearn.model_selection import StratifiedKFold

skfold = StratifiedKFold(n_splits=5)
wine_tree_cv = DecisionTreeClassifier(max_depth=2, random_state=13)

cv_accuracy = []

for train_idx, test_idx in skfold.split(x, y):
    tr_dt, te_dt = x.iloc[train_idx], x.iloc[test_idx]
    tr_lb, te_lb = y.iloc[train_idx], y.iloc[test_idx]
    wine_tree_cv.fit(tr_dt, tr_lb)
    predict = wine_tree_cv.predict(tr_dt)
    cv_accuracy.append(accuracy_score(tr_lb, predict))

cv_accuracy

[0.7477390802385991,
 0.7469694054262074,
 0.7431704501731435,
 0.7350904193920739,
 0.7325894574836476]

In [15]:
np.mean(cv_accuracy)

0.7411117625427343

### cross validation을 보다 간단히

In [16]:
from sklearn.model_selection import cross_val_score

skfold = StratifiedKFold(n_splits=5)
wine_tree_cv = DecisionTreeClassifier(max_depth=2, random_state=13)

cross_val_score(wine_tree_cv, x, y, scoring=None, cv = skfold)

array([0.55230769, 0.68846154, 0.71439569, 0.73210162, 0.75673595])

In [20]:
wine_tree_cv = DecisionTreeClassifier(max_depth=5, random_state=13)

cross_val_score(wine_tree_cv, x, y, scoring=None, cv=skfold )

TypeError: cross_val_score() got an unexpected keyword argument 'return_train_score'

In [21]:
from sklearn.model_selection import cross_validate

cross_validate(wine_tree_cv, x, y, scoring= None, cv = skfold, return_train_score=True)

{'fit_time': array([0.01403236, 0.01763463, 0.01654601, 0.01629591, 0.01749635]),
 'score_time': array([0.00187755, 0.00269842, 0.00190687, 0.00186682, 0.00194979]),
 'test_score': array([0.50076923, 0.62615385, 0.69745958, 0.7582756 , 0.74903772]),
 'train_score': array([0.78795459, 0.78045026, 0.77568295, 0.76356291, 0.76279338])}

### 하이퍼 파라미터 튜닝

- 튜닝대상: max_depth

### 또 다시 와인

In [22]:
import pandas as pd

red_url = "https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/winequality-red.csv"
white_url = "https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/winequality-white.csv"

red_wine = pd.read_csv(red_url, sep=';')
white_wine = pd.read_csv(white_url, sep=';') ## ;을 기준으로 분류해서 읽어라

red_wine['color'] = 1.
white_wine['color'] = 0.

wine = pd.concat([red_wine, white_wine]) ## 아래로 이어 붙이기

x = wine.drop('color', axis= 1)
y = wine['color']

wine = pd.concat([red_wine, white_wine])

wine['taste'] = [1. if grade > 5 else 0. for grade in wine['quality']]

x = wine.drop(['taste', 'quality'], axis= 1)
y = wine['taste']

### gridsearchcv

In [23]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

params = {'max_depth' : [2, 4, 7, 10]}
wine_tree = DecisionTreeClassifier(max_depth=2, random_state=13)

gridsearch = GridSearchCV(  estimator=wine_tree, 
                            param_grid=params, 
                            cv=5,
                            n_jobs=4) ## core 4개 사용
gridsearch.fit(x,y)

In [24]:
import pprint

pp = pprint.PrettyPrinter(indent=4)
pp.pprint(gridsearch.cv_results_)

{   'mean_fit_time': array([0.01151304, 0.01659937, 0.02780747, 0.03712826]),
    'mean_score_time': array([0.0027966 , 0.00321975, 0.00293417, 0.0026166 ]),
    'mean_test_score': array([0.6888005 , 0.66356523, 0.65340854, 0.64401587]),
    'param_max_depth': masked_array(data=[2, 4, 7, 10],
             mask=[False, False, False, False],
       fill_value='?',
            dtype=object),
    'params': [   {'max_depth': 2},
                  {'max_depth': 4},
                  {'max_depth': 7},
                  {'max_depth': 10}],
    'rank_test_score': array([1, 2, 3, 4], dtype=int32),
    'split0_test_score': array([0.55230769, 0.51230769, 0.50846154, 0.51615385]),
    'split1_test_score': array([0.68846154, 0.63153846, 0.60307692, 0.60076923]),
    'split2_test_score': array([0.71439569, 0.72363356, 0.68360277, 0.66743649]),
    'split3_test_score': array([0.73210162, 0.73210162, 0.73672055, 0.71054657]),
    'split4_test_score': array([0.75673595, 0.7182448 , 0.73518091, 0.7251732

- 최적의 성능을 가진 모델

In [25]:
gridsearch.best_estimator_

In [26]:
gridsearch.best_score_

0.6888004974240539

In [27]:
gridsearch.best_params_

{'max_depth': 2}

### 만약 pipeline을 적용한 모델에 grid search를 적용하고 싶다면

In [28]:
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler

estimators = [('scaler', StandardScaler()),
              ('clf', DecisionTreeClassifier(random_state=13))]

pipe = Pipeline(estimators)

In [29]:
param_grid = [{'clf__max_depth' : [2, 4, 7, 10]}]

gridsearch = GridSearchCV(estimator= pipe,
                          param_grid = param_grid,
                          cv=5,
                          )
gridsearch.fit(x,y)

In [30]:
gridsearch.best_estimator_

In [31]:
gridsearch.best_score_

0.6888004974240539

In [33]:
pp.pprint(gridsearch.cv_results_)

{   'mean_fit_time': array([0.01047659, 0.01556158, 0.02201986, 0.02970238]),
    'mean_score_time': array([0.00222597, 0.0023325 , 0.00213094, 0.00204773]),
    'mean_test_score': array([0.6888005 , 0.66356523, 0.6534083 , 0.64401563]),
    'param_clf__max_depth': masked_array(data=[2, 4, 7, 10],
             mask=[False, False, False, False],
       fill_value='?',
            dtype=object),
    'params': [   {'clf__max_depth': 2},
                  {'clf__max_depth': 4},
                  {'clf__max_depth': 7},
                  {'clf__max_depth': 10}],
    'rank_test_score': array([1, 2, 3, 4], dtype=int32),
    'split0_test_score': array([0.55230769, 0.51230769, 0.50846154, 0.51615385]),
    'split1_test_score': array([0.68846154, 0.63153846, 0.60461538, 0.60230769]),
    'split2_test_score': array([0.71439569, 0.72363356, 0.68206313, 0.66589684]),
    'split3_test_score': array([0.73210162, 0.73210162, 0.73672055, 0.71054657]),
    'split4_test_score': array([0.75673595, 0.718244

### 표로 성능 결과 정리하기

In [34]:
import pandas as pd

score_df = pd.DataFrame(gridsearch.cv_results_)
score_df[['params', 'rank_test_score', 'mean_test_score', 'std_test_score']] 

## accuracy의 평균과 표준편차 확인

Unnamed: 0,params,rank_test_score,mean_test_score,std_test_score
0,{'clf__max_depth': 2},1,0.6888,0.071799
1,{'clf__max_depth': 4},2,0.663565,0.083905
2,{'clf__max_depth': 7},3,0.653408,0.086993
3,{'clf__max_depth': 10},4,0.644016,0.076915
