In [5]:
import numpy as np
from sklearn.model_selection import KFold

In [13]:
X = np.array([
    [1,2], [3,4], [5,6], [7,8]
])
y = np.array([1,2,3,4])

In [14]:
X

array([[1, 2],
       [3, 4],
       [5, 6],
       [7, 8]])

In [8]:
y

array([1, 2, 3, 4])

In [9]:
kf = KFold(n_splits=2) # 데이터를 총 2등분으로 나눔
print(kf.get_n_splits)

<bound method _BaseKFold.get_n_splits of KFold(n_splits=2, random_state=None, shuffle=False)>


In [15]:
# kfold는 인덱스를 반환하므로 데이터 값을 얻기 위해서는 아래와 같이 인덱스 지정으로 해주어야함
for train_idx, val_idx in kf.split(X):
    print("==============================")
    print('train data : ', X[train_idx])
    print('val data : ', X[val_idx])

train data :  [[5 6]
 [7 8]]
val data :  [[1 2]
 [3 4]]
train data :  [[1 2]
 [3 4]]
val data :  [[5 6]
 [7 8]]


와인 데이터에서 한 번 실습해보자.

In [16]:
# 데이터 읽기

import pandas as pd

red_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/winequality-red.csv'
white_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/winequality-white.csv'

red_wine = pd.read_csv(red_url, sep = ';')
white_wine = pd.read_csv(white_url, sep = ';')

# 와인 색상 구분 컬럼
red_wine['color'] = 1
white_wine['color'] = 0

# 두 데이터 합치기
wine = pd.concat([red_wine, white_wine])

X = wine.drop(['color'], axis=1)
y = wine['color'] # 타겟 데이터

In [17]:
wine['taste'] = [1. if grade > 5 else 0. for grade in wine['quality']]

X = wine.drop(['taste', 'quality'], axis = 1)
y = wine['taste']

In [26]:
# 이전 방법

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(X,y, 
                                                   test_size = 0.2, 
                                                   random_state=13,
                                                   stratify= y
                                                   )

# 학습
wine_tree = DecisionTreeClassifier(max_depth=2, random_state=13)
wine_tree.fit(X_train, y_train)

# 예측
y_pred_test = wine_tree.predict(X_test) 

# 성능
accuracy_score(y_test, y_pred_test)

0.7276923076923076

이번에는 교차검증을 한 번 해보자.

## 교차검증

In [27]:
from sklearn.model_selection import KFold

kfold = KFold(n_splits=5)
wine_tree_cv = DecisionTreeClassifier(max_depth=2, random_state=13)

cv_acc = []

for train_idx, test_idx in kfold.split(X):
    X_train = X.iloc[train_idx]
    X_test = X.iloc[test_idx]
    y_train = y.iloc[train_idx]
    y_test = y.iloc[test_idx]

    wine_tree_cv.fit(X_train, y_train)

    pred = wine_tree_cv.predict(X_test)
    cv_acc.append(accuracy_score(y_test, pred))

cv_acc

[0.6007692307692307,
 0.6884615384615385,
 0.7090069284064665,
 0.7628945342571208,
 0.7867590454195535]

In [28]:
np.mean(cv_acc) # 각 acc의 분산이 크지 않다면 평균을 대표값으로 함

0.709578255462782

In [22]:
# StratifiedKFold
from sklearn.model_selection import StratifiedKFold

skfold = StratifiedKFold(n_splits=5)
wine_tree_cv = DecisionTreeClassifier(max_depth=2, random_state=13)

cv_acc = []

for train_idx, test_idx in skfold.split(X,y):
    X_train = X.iloc[train_idx]
    X_test = X.iloc[test_idx]
    y_train = y.iloc[train_idx]
    y_test = y.iloc[test_idx]

    wine_tree_cv.fit(X_train, y_train)

    pred = wine_tree_cv.predict(X_test)
    cv_acc.append(accuracy_score(y_test, pred))

np.mean(cv_acc) # 각 acc의 분산이 크지 않다면 평균을 대표값으로 함

0.6888004974240539

성능이 점점 떨어진다면 내 모델이 그리 좋은 성능만을 가지고 있진 않구나 라고 생각할 수 있음

위와 같이 데이터를 인덱스로 분리하고 성능을 구할 수도 있지만 보다 간편하게 구할 수도 있다.

In [23]:
from sklearn.model_selection import cross_val_score

skfold = StratifiedKFold(n_splits=5)
wine_tree_cv = DecisionTreeClassifier(max_depth=2, random_state=13)

cross_val_score(wine_tree_cv, X, y, scoring=None, cv = skfold)

array([0.55230769, 0.68846154, 0.71439569, 0.73210162, 0.75673595])

In [24]:
np.mean(cross_val_score(wine_tree_cv, X, y, scoring=None, cv = skfold))

0.6888004974240539

In [25]:
# train score와 함께 보고 싶다면

from sklearn.model_selection import cross_validate
cross_validate(wine_tree_cv, X, y, scoring=None, cv = skfold, return_train_score=True)

{'fit_time': array([0.00602007, 0.00698066, 0.00798678, 0.00698185, 0.00610924]),
 'score_time': array([0.00199342, 0.00227308, 0.0009973 , 0.00098848, 0.0019908 ]),
 'test_score': array([0.55230769, 0.68846154, 0.71439569, 0.73210162, 0.75673595]),
 'train_score': array([0.74773908, 0.74696941, 0.74317045, 0.73509042, 0.73258946])}

아무래도 train 성능이 좋은것으로 보아 과적합 현상이 있는 것 같다.

## 하이퍼파라마미터 튜닝(GridSearch)

In [29]:
# 데이터 읽기

import pandas as pd

red_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/winequality-red.csv'
white_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/winequality-white.csv'

red_wine = pd.read_csv(red_url, sep = ';')
white_wine = pd.read_csv(white_url, sep = ';')

# 와인 색상 구분 컬럼
red_wine['color'] = 1
white_wine['color'] = 0

# 두 데이터 합치기
wine = pd.concat([red_wine, white_wine])

X = wine.drop(['color'], axis=1)
y = wine['color'] # 타겟 데이터

In [30]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

params = {'max_depth' : [2,4,7,10]} # {'파라미터명' : 파라미터 값}
wine_tree = DecisionTreeClassifier(max_depth=2, random_state=13)

gridsearch = GridSearchCV(estimator= wine_tree, param_grid=params, cv = 5)
gridsearch.fit(X,y)

In [31]:
# 결과

import pprint

pp = pprint.PrettyPrinter(indent=4)
pp.pprint(gridsearch.cv_results_)

{   'mean_fit_time': array([0.00649247, 0.01440992, 0.01378846, 0.01736345]),
    'mean_score_time': array([0.00179648, 0.00197778, 0.00161223, 0.00166764]),
    'mean_test_score': array([0.96044401, 0.97752685, 0.98014331, 0.98168366]),
    'param_max_depth': masked_array(data=[2, 4, 7, 10],
             mask=[False, False, False, False],
       fill_value='?',
            dtype=object),
    'params': [   {'max_depth': 2},
                  {'max_depth': 4},
                  {'max_depth': 7},
                  {'max_depth': 10}],
    'rank_test_score': array([4, 3, 2, 1]),
    'split0_test_score': array([0.95615385, 0.98461538, 0.98846154, 0.98769231]),
    'split1_test_score': array([0.96      , 0.97846154, 0.98076923, 0.97692308]),
    'split2_test_score': array([0.96920708, 0.98460354, 0.98614319, 0.98229407]),
    'split3_test_score': array([0.95842956, 0.97151655, 0.9799846 , 0.9830639 ]),
    'split4_test_score': array([0.95842956, 0.96843726, 0.96535797, 0.97844496]),
    'std

In [32]:
# 최적의 성능을 가진 모델

gridsearch.best_estimator_

In [33]:
# 최적의 성능을 가진 모델의 정확도
gridsearch.best_score_

0.981683661988512

In [34]:
# 최적의 성능을 가진 모델의 파라미터 값
gridsearch.best_params_

{'max_depth': 10}

만약, pipeline에 gridsearch를 적용하고 싶다면

In [35]:
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler

estimators = [
    ('scaler', StandardScaler()), # 첫번째 단계의 (객체 이름, 객체)
    ('clf', DecisionTreeClassifier()) # 두번째 단계의 (객체 이름, 객체)
]

pipe = Pipeline(estimators)

In [37]:
param_grid = [
    {'clf__max_depth' : [2,4,7,10]}
]

gridsearch = GridSearchCV(estimator= pipe, param_grid=param_grid, cv = 5)
gridsearch.fit(X,y)

In [38]:
# 똑같이 결과도 확인할 수 있다
gridsearch.best_estimator_, gridsearch.best_score_, gridsearch.best_params_

(Pipeline(steps=[('scaler', StandardScaler()),
                 ('clf', DecisionTreeClassifier(max_depth=7))]),
 0.9807590454195534,
 {'clf__max_depth': 7})

정확도를 좀 더 깔끔하게 보려면 아래와 같은 방법을 통해 볼 수 있다.

In [40]:
import pandas as pd

score_df = pd.DataFrame(gridsearch.cv_results_)
score_df = score_df[['params', 'rank_test_score', 'mean_test_score', 'std_test_score']]
score_df

Unnamed: 0,params,rank_test_score,mean_test_score,std_test_score
0,{'clf__max_depth': 2},4,0.960444,0.00455
1,{'clf__max_depth': 4},3,0.978605,0.005973
2,{'clf__max_depth': 7},1,0.980759,0.007273
3,{'clf__max_depth': 10},2,0.980452,0.002367
