#### 【 교차검증 (Cross Validation) 】

- 적은 데이터셋으로 안정적이고 신뢰성있는 모델 평가를 위한 방법
- 학습 데이터셋을 K개 폴드로 분할한 뒤, 매번 다른 1개 폴드로 검증하고 나머지 K-1개 폴드로 학습 진행
- K번의 검증 점수 평균을 모델의 일반화 성능으로 여김

[1] 모듈 로딩 및 데이터 준비<hr>

In [30]:
## [1-1] 모듈 로딩
##-> 기본 모듈
import numpy as np
import pandas as pd

##-> ML 관련 모듈
from sklearn.model_selection import KFold, StratifiedKFold   ## 교차검증용
from sklearn.neighbors import KNeighborsClassifier                ## 학습 알고리즘

In [2]:
## [1-2] Load the data
##-> Location of the iris CSV, relative to this notebook
DATA_FILE = '../Data/iris.csv'

##-> Read the CSV into a DataFrame and preview its first three rows
irisDF = pd.read_csv(filepath_or_buffer=DATA_FILE)
irisDF.head(n=3)

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa


[2] 데이터 전처리 및 학습 준비 <hr>

In [4]:
## [2-1] Convert the species column to a categorical dtype
##-> Switch pandas to copy-on-write mode for the rest of the session
pd.set_option('mode.copy_on_write', True)

##-> Re-type 'variety' as category, then print the frame summary
irisDF['variety'] = irisDF['variety'].astype('category')
irisDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   sepal.length  150 non-null    float64 
 1   sepal.width   150 non-null    float64 
 2   petal.length  150 non-null    float64 
 3   petal.width   150 non-null    float64 
 4   variety       150 non-null    category
dtypes: category(1), float64(4)
memory usage: 5.1 KB


In [5]:
## [2-2] Split features and target
##-> Every column except the last one is a feature; the last one is the target
featureDF = irisDF.iloc[:, :-1]
targetSR  = irisDF.iloc[:, -1]

##-> Report both shapes as a quick sanity check
print(f'featureDF : {featureDF.shape},  targetSR : {targetSR.shape}')

featureDF : (150, 4),  targetSR : (150,)


[3] 교차검증 <hr>

In [26]:
## ======================================================
## [3-1] K-Fold cross-validation
## ======================================================
## Create the K-Fold splitter (n_splits is left at its default)
kfold = KFold(random_state=7, shuffle=True)
print(f'kfold => {kfold}')

## With shuffle=True and a fixed integer random_state every split() call
## yields the same folds, so build them once instead of once per
## n_neighbors value.
folds = list(kfold.split(featureDF))

## One summary row per n_neighbors value. The DataFrame is built once after
## the loop, which avoids growing it row by row and keeps 'neighbors' as an
## integer column instead of upcasting it to float.
result_rows = []

MAX_N = 31
for neighbors in range(1, MAX_N):
    ## Accuracy of each fold on its training part and on its held-out part
    ts_scores, vs_scores = [], []
    for train_index, valid_index in folds:

        ## split() yields positional indices, so rows are selected with
        ## .iloc on both the features and the target (label-based lookup
        ## would only be correct for a default RangeIndex).
        x_train, y_train = featureDF.iloc[train_index], targetSR.iloc[train_index]
        x_valid, y_valid = featureDF.iloc[valid_index], targetSR.iloc[valid_index]

        ## Fit a fresh model for this fold
        kModel = KNeighborsClassifier(n_neighbors=neighbors)
        kModel.fit(x_train, y_train)

        ## Score on the training part and on the validation part
        t_score = kModel.score(x_train, y_train)
        v_score = kModel.score(x_valid, y_valid)
        ts_scores.append(t_score)
        vs_scores.append(v_score)

    ## Average the fold scores for this n_neighbors value
    ts_mean = sum(ts_scores)/len(ts_scores)
    vs_mean = sum(vs_scores)/len(vs_scores)
    print(f'[neighbors = {neighbors}] Train 평균 : {ts_mean:.5f}, Valid 평균 : {vs_mean:.5f}')
    result_rows.append({'neighbors': neighbors,
                        'train': ts_mean,
                        'valid': vs_mean,
                        'diff': abs(ts_mean-vs_mean)})

## Evaluation results: one row per n_neighbors value
resultDF = pd.DataFrame(result_rows, columns=['neighbors', 'train', 'valid', 'diff'])


kfold => KFold(n_splits=5, random_state=7, shuffle=True)
[neighbors = 1] Train 평균 : 1.00000, Valid 평균 : 0.96000
[neighbors = 2] Train 평균 : 0.97500, Valid 평균 : 0.96667
[neighbors = 3] Train 평균 : 0.96333, Valid 평균 : 0.96000
[neighbors = 4] Train 평균 : 0.96333, Valid 평균 : 0.96667
[neighbors = 5] Train 평균 : 0.97000, Valid 평균 : 0.96000
[neighbors = 6] Train 평균 : 0.97333, Valid 평균 : 0.95333
[neighbors = 7] Train 평균 : 0.97167, Valid 평균 : 0.96667
[neighbors = 8] Train 평균 : 0.97833, Valid 평균 : 0.96000
[neighbors = 9] Train 평균 : 0.97667, Valid 평균 : 0.96000
[neighbors = 10] Train 평균 : 0.97500, Valid 평균 : 0.96000
[neighbors = 11] Train 평균 : 0.97667, Valid 평균 : 0.97333
[neighbors = 12] Train 평균 : 0.97833, Valid 평균 : 0.96667
[neighbors = 13] Train 평균 : 0.97667, Valid 평균 : 0.97333
[neighbors = 14] Train 평균 : 0.97833, Valid 평균 : 0.97333
[neighbors = 15] Train 평균 : 0.97667, Valid 평균 : 0.96667
[neighbors = 16] Train 평균 : 0.97500, Valid 평균 : 0.96000
[neighbors = 17] Train 평균 : 0.97333, Valid 평균 : 0.96667


In [28]:
## Show the five n_neighbors settings with the smallest train/valid gap
resultDF.sort_values('diff').iloc[:5]

Unnamed: 0,neighbors,train,valid,diff
27,28.0,0.953333,0.953333,0.0
2,3.0,0.963333,0.96,0.003333
28,29.0,0.956667,0.96,0.003333
10,11.0,0.976667,0.973333,0.003333
12,13.0,0.976667,0.973333,0.003333


In [33]:
## ====================================================================
## [3-2] StratifiedKFold cross-validation
##       => .split(2D features, 1D target) : the target is passed so the
##          class/label ratio can be taken into account when splitting
##       => intended for classification models
## ====================================================================
## Create the stratified K-Fold splitter (n_splits is left at its default)
kfold = StratifiedKFold(random_state=7, shuffle=True)
print(f'kfold => {kfold}')

## With shuffle=True and a fixed integer random_state every split() call
## yields the same folds, so build them once instead of once per
## n_neighbors value.
folds = list(kfold.split(featureDF, targetSR))

## The per-class ratio of a fold does not depend on n_neighbors, so it is
## reported once per fold here rather than on every model fit.
for train_index, valid_index in folds:
    y_train, y_valid = targetSR.iloc[train_index], targetSR.iloc[valid_index]
    print(f'[Train 타겟별 비율] : {round( y_train.value_counts()/y_train.shape[0], 1)}')
    print(f'[Valid 타겟별 비율] : {round( y_valid.value_counts()/y_valid.shape[0], 1)}')

## One summary row per n_neighbors value. The DataFrame is built once after
## the loop, which avoids growing it row by row and keeps 'neighbors' as an
## integer column instead of upcasting it to float.
result_rows = []

MAX_N = 31
for neighbors in range(1, MAX_N):
    ## Accuracy of each fold on its training part and on its held-out part
    ts_scores, vs_scores = [], []
    for train_index, valid_index in folds:

        ## split() yields positional indices, so rows are selected with
        ## .iloc on both the features and the target (label-based lookup
        ## would only be correct for a default RangeIndex).
        x_train, y_train = featureDF.iloc[train_index], targetSR.iloc[train_index]
        x_valid, y_valid = featureDF.iloc[valid_index], targetSR.iloc[valid_index]

        ## Fit a fresh model for this fold
        kModel = KNeighborsClassifier(n_neighbors=neighbors)
        kModel.fit(x_train, y_train)

        ## Score on the training part and on the validation part
        t_score = kModel.score(x_train, y_train)
        v_score = kModel.score(x_valid, y_valid)
        ts_scores.append(t_score)
        vs_scores.append(v_score)

    ## Average the fold scores for this n_neighbors value
    ts_mean = sum(ts_scores)/len(ts_scores)
    vs_mean = sum(vs_scores)/len(vs_scores)
    print(f'[neighbors = {neighbors}] Train 평균 : {ts_mean:.5f}, Valid 평균 : {vs_mean:.5f}')
    result_rows.append({'neighbors': neighbors,
                        'train': ts_mean,
                        'valid': vs_mean,
                        'diff': abs(ts_mean-vs_mean)})

## Evaluation results: one row per n_neighbors value
resultDF = pd.DataFrame(result_rows, columns=['neighbors', 'train', 'valid', 'diff'])


kfold => StratifiedKFold(n_splits=5, random_state=7, shuffle=True)
[Train 타겟별 비율] : variety
Setosa        0.3
Versicolor    0.3
Virginica     0.3
Name: count, dtype: float64
[Valid 타겟별 비율] : variety
Setosa        0.3
Versicolor    0.3
Virginica     0.3
Name: count, dtype: float64
[Train 타겟별 비율] : variety
Setosa        0.3
Versicolor    0.3
Virginica     0.3
Name: count, dtype: float64
[Valid 타겟별 비율] : variety
Setosa        0.3
Versicolor    0.3
Virginica     0.3
Name: count, dtype: float64
[Train 타겟별 비율] : variety
Setosa        0.3
Versicolor    0.3
Virginica     0.3
Name: count, dtype: float64
[Valid 타겟별 비율] : variety
Setosa        0.3
Versicolor    0.3
Virginica     0.3
Name: count, dtype: float64
[Train 타겟별 비율] : variety
Setosa        0.3
Versicolor    0.3
Virginica     0.3
Name: count, dtype: float64
[Valid 타겟별 비율] : variety
Setosa        0.3
Versicolor    0.3
Virginica     0.3
Name: count, dtype: float64
[Train 타겟별 비율] : variety
Setosa        0.3
Versicolor    0.3
Virginica     0.

In [34]:
## Show the five n_neighbors settings with the smallest train/valid gap
resultDF.sort_values('diff').iloc[:5]

Unnamed: 0,neighbors,train,valid,diff
6,7.0,0.975,0.973333,0.001667
4,5.0,0.975,0.973333,0.001667
11,12.0,0.975,0.973333,0.001667
10,11.0,0.976667,0.973333,0.003333
12,13.0,0.97,0.973333,0.003333
