In [1]:
import numpy as np
from sklearn.model_selection import KFold

In [2]:
# Toy feature matrix: the rows [1, 2] and [3, 4] repeated twice (4 samples, 2 features).
X = np.tile([[1, 2], [3, 4]], (2, 1))

In [3]:
# Toy labels 1..4, one per row of X.
y = np.arange(1, 5)

In [4]:
# Two-fold splitter; no shuffle or seed is passed, so the defaults apply.
kf = KFold(n_splits=2)

# Show the number of folds, then the splitter's repr, one per line.
print(kf.get_n_splits(X), kf, sep='\n')

2
KFold(n_splits=2, random_state=None, shuffle=False)


In [7]:
# Walk through every fold and show which row positions go to training vs. testing.
for train_idx, test_idx in kf.split(X):
    for label, indices in (('Train idx : ', train_idx), ('Test idx : ', test_idx)):
        print(label, indices)

Train idx :  [2 3]
Test idx :  [0 1]
Train idx :  [0 1]
Test idx :  [2 3]


In [8]:
# For each fold, print the index arrays and then the rows of X they select.
for train_idx, test_idx in kf.split(X):
    print('---idx---')
    print(train_idx, test_idx)
    for header, positions in (('---train_data---', train_idx), ('---test_data---', test_idx)):
        print(header)
        print(X[positions])

---idx---
[2 3] [0 1]
---train_data---
[[1 2]
 [3 4]]
---test_data---
[[1 2]
 [3 4]]
---idx---
[0 1] [2 3]
---train_data---
[[1 2]
 [3 4]]
---test_data---
[[1 2]
 [3 4]]


In [9]:
import pandas as pd

# Red and white wine-quality CSVs; both files are semicolon-delimited.
red_url = 'https://github.com/PinkWink/ML_tutorial/raw/master/dataset/winequality-red.csv'
white_url = 'https://github.com/PinkWink/ML_tutorial/raw/master/dataset/winequality-white.csv'

red_wine = pd.read_csv(red_url, sep=';')
white_wine = pd.read_csv(white_url, sep=';')

# Tag the origin of every row before stacking: 1 = red, 0 = white.
red_wine['color'] = 1
white_wine['color'] = 0

# ignore_index=True gives the stacked frame a fresh 0..n-1 index. Without it each
# source frame keeps its own labels starting at 0, so labels would repeat and any
# label-based lookup (wine.loc[...]) could match one red and one white row at once.
# Row order (red first, then white) is unchanged.
wine = pd.concat([red_wine, white_wine], ignore_index=True)

In [10]:
# Binary target: 1.0 when quality is above 5, otherwise 0.0 (vectorised comparison
# instead of a Python-level loop). to_numpy() keeps the assignment positional, so it
# does not depend on wine's index labels.
wine['taste'] = wine['quality'].gt(5).astype(float).to_numpy()

# Features exclude both 'taste' and 'quality': the target is derived directly from
# 'quality', so keeping either column would hand the label to the model.
X = wine.drop(columns=['taste', 'quality'])
y = wine['taste']

In [11]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Single hold-out baseline: 20% of the rows are held out for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=13
)

# Depth-2 tree; fit() returns the fitted estimator, so construction and fitting are chained.
wine_tree = DecisionTreeClassifier(max_depth=2, random_state=13).fit(X_train, y_train)

y_pred_tr = wine_tree.predict(X_train)
y_pred_test = wine_tree.predict(X_test)

# Report accuracy on the rows the tree was fitted on and on the held-out rows.
for label, truth, predicted in (
    ('Train Acc : ', y_train, y_pred_tr),
    ('Test Acc : ', y_test, y_pred_test),
):
    print(label, accuracy_score(truth, predicted))

Train Acc :  0.7294593034442948
Test Acc :  0.7161538461538461


## KFold

In [12]:
# Depth-2 tree with the same hyper-parameters as the hold-out baseline; it is re-fitted on every fold.
wine_tree_cv = DecisionTreeClassifier(max_depth=2, random_state=13)

# Five-fold splitter with default settings (no shuffle or seed passed).
# NOTE(review): `wine` stacks all red rows before all white rows, so unshuffled folds
# are contiguous blocks that may be dominated by one colour. Consider
# KFold(n_splits=5, shuffle=True, random_state=...) — confirm whether ordered folds
# are intended for this demonstration.
kfold = KFold(n_splits=5)

- `kfold.split(X)`는 각 폴드의 훈련셋과 검증셋 인덱스를 차례로 반환하는 제너레이터를 만든다.

In [17]:
# split() hands back a generator object; the index pairs are produced as it is iterated.
kfold.split(X)

<generator object _BaseKFold.split at 0x000001F3AECC4C80>

In [13]:
# Show how many rows each fold assigns to training and to testing.
for train_idx, test_idx in kfold.split(X):
    print(train_idx.size, test_idx.size)

5197 1300
5197 1300
5198 1299
5198 1299
5198 1299


In [14]:
# Manual cross-validation: re-fit the tree on each fold's training rows and
# collect the accuracy on that fold's held-out rows.
cv_accuracy = []
for train_idx, test_idx in kfold.split(X):
    # The splitter yields positions, hence .iloc rather than label-based access.
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]
    pred = wine_tree_cv.fit(X_train, y_train).predict(X_test)
    cv_accuracy.append(accuracy_score(y_test, pred))

cv_accuracy

[0.6007692307692307,
 0.6884615384615385,
 0.7090069284064665,
 0.7628945342571208,
 0.7867590454195535]

In [18]:
import numpy as np

# Mean and standard deviation of the per-fold accuracies, shown as a (mean, std) tuple.
tuple(stat(cv_accuracy) for stat in (np.mean, np.std))

(0.709578255462782, 0.06493865709899564)

## StratifiedKFold

In [21]:
from sklearn.model_selection import StratifiedKFold

wine_tree_cv = DecisionTreeClassifier(max_depth=2, random_state=13)
# Unlike the plain KFold loop, this splitter is given the labels as well as the features.
skfold = StratifiedKFold(n_splits=5)

# Same manual loop as for KFold: fit on the fold's training rows, score on its held-out rows.
cv_accuracy = []
for train_idx, test_idx in skfold.split(X, y):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

    pred = wine_tree_cv.fit(X_train, y_train).predict(X_test)
    cv_accuracy.append(accuracy_score(y_test, pred))

cv_accuracy

[0.5523076923076923,
 0.6884615384615385,
 0.7143956889915319,
 0.7321016166281755,
 0.7567359507313318]

In [22]:
# (mean, std) of the stratified per-fold accuracies.
tuple(stat(cv_accuracy) for stat in (np.mean, np.std))

(0.6888004974240539, 0.07179934165921319)

## cross_val_score

In [23]:
from sklearn.model_selection import cross_val_score

wine_tree_cv = DecisionTreeClassifier(max_depth=2, random_state=13)
skfold = StratifiedKFold(n_splits=5)

# One call replaces the manual split / fit / predict / score loop and returns the
# per-fold scores as an array. scoring=None leaves the choice of metric to the estimator.
cross_val_score(estimator=wine_tree_cv, X=X, y=y, scoring=None, cv=skfold)

array([0.55230769, 0.68846154, 0.71439569, 0.73210162, 0.75673595])

In [24]:
# Collapse the per-fold scores into a single average.
cross_val_score(wine_tree_cv, X, y, scoring=None, cv=skfold).mean()

0.6888004974240539

- 하이퍼 파라미터를 조정해서 결과를 비교하고자 할 땐, 함수로 만들어서 인자를 조절하는 방식으로 하면 코드의 반복을 줄이고 가독성을 높일 수 있다. 

In [26]:
def skfold_dt(depth):
    """Cross-validate a decision tree of the given depth on the wine data.

    Runs 5-fold stratified cross-validation of
    ``DecisionTreeClassifier(max_depth=depth, random_state=13)`` on the
    notebook-level ``X`` and ``y``.

    Parameters
    ----------
    depth
        Value passed to the tree as ``max_depth``.

    Returns
    -------
    The per-fold scores returned by ``cross_val_score``. They are also printed,
    so calling the function for its output alone still works.
    """
    skfold = StratifiedKFold(n_splits=5)
    wine_tree_cv = DecisionTreeClassifier(max_depth=depth, random_state=13)

    scores = cross_val_score(wine_tree_cv, X, y, scoring=None, cv=skfold)
    print(scores)
    # Hand the scores back so callers can aggregate or compare depths,
    # instead of only being able to read them off the printed output.
    return scores


# Call in a plain loop rather than as a tuple expression, so the cell shows just the
# printed scores and does not echo the return values a second time.
for depth in (5, 3):
    skfold_dt(depth)

[0.50076923 0.62615385 0.69745958 0.7582756  0.74903772]
[0.56846154 0.68846154 0.71439569 0.73210162 0.75673595]


(None, None)

In [28]:
# When the training-set scores are wanted alongside the test scores
from sklearn.model_selection import cross_validate

# Uses the notebook-level wine_tree_cv estimator and skfold splitter defined in an earlier cell.
cross_validate(
    estimator=wine_tree_cv,
    X=X,
    y=y,
    scoring=None,
    cv=skfold,
    return_train_score=True,
)

{'fit_time': array([0.01499915, 0.00998473, 0.00997186, 0.01197267, 0.01794434]),
 'score_time': array([0.00395632, 0.00298023, 0.0039916 , 0.0039854 , 0.00398922]),
 'test_score': array([0.55230769, 0.68846154, 0.71439569, 0.73210162, 0.75673595]),
 'train_score': array([0.74773908, 0.74696941, 0.74317045, 0.73509042, 0.73258946])}