#### 【 교차검증 (Cross Validation) 】

- 적은 데이터셋으로도 안정적이고 신뢰성 있는 모델 평가를 하기 위한 방법
- 학습 데이터셋을 K개의 폴드로 분할한 뒤, 매번 다른 폴드 1개를 검증용으로, 나머지 K-1개를 학습용으로 사용하여 K번 검증 진행
- K번의 검증 점수 평균을 모델의 일반화 성능으로 간주

[1] 모듈 로딩 및 데이터 준비<hr>

In [1]:
## [1-1] 모듈 로딩
##-> 기본 모듈
import numpy as np
import pandas as pd

##-> ML 관련 모듈
from sklearn.model_selection import KFold, StratifiedGroupKFold   ## 교차검증용
from sklearn.neighbors import KNeighborsClassifier                ## 학습 알고리즘

In [2]:
## [1-2] Load the data
## Path to the iris CSV file (relative path)
DATA_FILE = '../Data/iris.csv'

## Read the CSV into a DataFrame and preview its first three rows
irisDF = pd.read_csv(filepath_or_buffer=DATA_FILE)
irisDF.head(n=3)

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa


[2] 데이터 전처리 및 학습 준비 <hr>

In [4]:
## [2-1] Convert the variety column to the 'category' dtype
## Turn on the pandas 'mode.copy_on_write' option before modifying the frame
pd.set_option('mode.copy_on_write', True)

## Replace the column with its categorical version, then print the frame summary
irisDF['variety'] = irisDF['variety'].astype('category')
irisDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   sepal.length  150 non-null    float64 
 1   sepal.width   150 non-null    float64 
 2   petal.length  150 non-null    float64 
 3   petal.width   150 non-null    float64 
 4   variety       150 non-null    category
dtypes: category(1), float64(4)
memory usage: 5.1 KB


In [5]:
## [2-2] Split the frame into features and target
## Every column except the last one is a feature; the last column is the target
featureDF = irisDF.iloc[:, :-1]
targetSR = irisDF.iloc[:, -1]

print(f'featureDF : {featureDF.shape},  targetSR : {targetSR.shape}')

featureDF : (150, 4),  targetSR : (150,)


[3] 교차검증 <hr>

In [20]:
## ======================================================
## [3-1] K-Fold cross validation
## ======================================================
## Create the K-Fold splitter: rows are shuffled with a fixed seed so the
## folds are reproducible; n_splits is left at its default
kfold = KFold(random_state=7, shuffle=True)
print(f'kfold => {kfold}')

## Run one train/validate round per fold
## => split() yields the row positions for the training part (k-1/k of the
##    rows) and for the validation part (1/k of the rows)
## => one validation score is collected per fold
k_scores = []
for train_index, valid_index in kfold.split(featureDF):
    ## Show how many rows landed in each part
    print(f'train_index : {train_index.shape},  valid_index : {valid_index.shape}')

    ## Build the training / validation subsets.
    ## split() returns integer positions, not index labels, so both the
    ## features and the target are selected with .iloc. Plain targetSR[...]
    ## is label-based and only matches when the index is the default 0..n-1.
    x_train, y_train = featureDF.iloc[train_index], targetSR.iloc[train_index]
    x_valid, y_valid = featureDF.iloc[valid_index], targetSR.iloc[valid_index]

    ## Print the per-class share in each subset (plain KFold does not
    ## stratify, so the class ratios can differ from fold to fold)
    print(f'[Train 타겟별 비율] : {round(y_train.value_counts()/y_train.shape[0], 1)}')
    print(f'[Valid 타겟별 비율] : {round(y_valid.value_counts()/y_valid.shape[0], 1)}')

    ## Fit a fresh KNN classifier (default settings) on this fold's training rows
    kModel = KNeighborsClassifier()
    kModel.fit(x_train, y_train)

    ## Score the model on this fold's validation rows and keep the score
    v_score = kModel.score(x_valid, y_valid)
    print(f'v_score : {v_score}')
    k_scores.append(v_score)

## Average of the per-fold validation scores
print(f'v_score : {sum(k_scores)/len(k_scores):.5f}')

kfold => KFold(n_splits=5, random_state=7, shuffle=True)
train_index : (120,),  valid_index : (30,)
[Train 타겟별 비율] : variety
Setosa        0.4
Virginica     0.3
Versicolor    0.3
Name: count, dtype: float64
[Valid 타겟별 비율] : variety
Versicolor    0.4
Virginica     0.4
Setosa        0.2
Name: count, dtype: float64
v_score : 0.9
train_index : (120,),  valid_index : (30,)
[Train 타겟별 비율] : variety
Versicolor    0.3
Virginica     0.3
Setosa        0.3
Name: count, dtype: float64
[Valid 타겟별 비율] : variety
Setosa        0.4
Versicolor    0.3
Virginica     0.3
Name: count, dtype: float64
v_score : 1.0
train_index : (120,),  valid_index : (30,)
[Train 타겟별 비율] : variety
Versicolor    0.4
Setosa        0.3
Virginica     0.3
Name: count, dtype: float64
[Valid 타겟별 비율] : variety
Setosa        0.4
Virginica     0.4
Versicolor    0.3
Name: count, dtype: float64
v_score : 1.0
train_index : (120,),  valid_index : (30,)
[Train 타겟별 비율] : variety
Versicolor    0.3
Virginica     0.3
Setosa        0.3
Name: co