# KFold
- 데이터셋을 K개의 동일 크기 부분(Fold)나눠서 K번 반복하여 하나의 폴드를 검증용으로 사용을 하고 나머지 k-1개의 폴드를 학습용으로 사용
- 모든 폴드를 한번은 검증용으로 포함이 되고 k-1개를 학습용으로 사용 
- 데이터가 수가 적을때 모델의 성능을 안정적으로 평가 할 수 있는 방법 

- 매개변수 
    - n_splits
        - 기본값 : 5
        - 폴드의 개수를 지정 
        - 최소 값은 2
    - shuffle
        - 기본값 : False
        - 데이터를 분할하기 전에 섞을지 지정 
        - True로 변경하게 되면 폴드가 랜덤하게 구성 
    - random_state
        - 기본값 : None
        - shuffle이 True인 경우에 사용
        - 랜덤 시드 고정 
- 속성
    - n_splits 
        - 분할된 폴드의 개수
- 메서드 
    - split(x, y = None)
        - 학습용/검증용 인덱스를 생성
        - 반복문을 이용하여 (train_index, test_index)로 변환하여 사용

- 장점 
    - 데이터를 폴드화 해서 학습/ 검증용으로 사용하기 때문에 데이터가 낭비가 없다. 
    - 일반적으로 사용하던 train_test_split보다 성능 평가가 안정적(설명이 충분하다.)
- 단점 
    - K번의 학습 -> K번의 예측 -> K번의 평가 --> 계산이 늘어남 -> 시간 증가
    - 데이터의 크기가 크다면 시간이 증가 

- 변형 KFold 클래스 
    - StratifiedKFold : 분류 문제에서 클래스의 비율을 유지하여 분할 
    - GroupKFold : 그룹 단위로 데이터를 나눠 그룹이 학습/검증에 동시에 들어가지 않도록 보장 
    - RepeatedKFold : KFold를 여러번 반복해 평가 안정성 강화 

In [2]:
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split, GridSearchCV, \
                KFold, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC, SVR
from sklearn.metrics import classification_report, r2_score

In [3]:
# Pipeline + GridSearchCV + KFold를 사용하여 분류 문제 해결 
# iris 데이터를 로드 
iris = pd.read_csv("../data/iris.csv")
iris.head()

Unnamed: 0,sepal length,sepal width,petal length,petal width,target
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [4]:
# target 데이터를 0, 1, 2로 데이터를 변경
# iris['target'].unique()
for i, key in enumerate(iris['target'].unique()):
    # print(i)
    # print(key)
    iris['target'] = iris['target'].replace(key, i)

  iris['target'] = iris['target'].replace(key, i)


In [5]:
# train, test 데이터셋을 구분 
x = iris.drop('target', axis=1).values
y = iris['target'].values


In [6]:
X_train, X_test, Y_train, Y_test = train_test_split(
    x, y, random_state=42, test_size=0.2, stratify=y
)

In [7]:
# KFold를 이용해서 데이터 분할 
# 분류 모델 -> target의 비율 1:1:1 -> 비율을 맞춰서 폴드화
# stratifiedKFold를 이용
cv_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [8]:
cv_folds

StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

In [9]:
# Pipeline 생성 -> 
# 학습(fit()) -> Scaler 생성하고 fit()을 이용하여 범위를 지정하고 
# transform()을 이용해서 스케일링을 하고 모델에 학습을 시킨다.
# 예측(predict()) -> 생성된 Scaler를 사용하여 
# 검증 데이터를 transform()을 이용해서 스케일링하고 학습된 모델에 
# predict() 함수를 이용하여 예측값을 반환
pipe = Pipeline(
    [
        ('scaler', StandardScaler()), 
        ('svc', SVC(probability=True, random_state=42))
    ]
)

In [10]:
# GridSearchCV에서 사용할 파라미터 조합을 생성 
# pipeline을 이용하여 Grid를 사용하면 매개변수의 이름을 수정할 필요가 있다
# pipeline에서 사용할 모델에( 이름 + __ + 매개변수명) 키 값들을 변경 
params = {
    "svc__C" : [0.1, 1, 10], 
    'svc__gamma' : ['scale', 'auto'], 
    'svc__kernel' : ['linear', 'rbf']
}

In [11]:
grid_cls = GridSearchCV(
    estimator= pipe,   # gird에서 사용할 모델은 pipeline으로 만들어진 모델
    param_grid= params, # dict 형태로 각 파라미터별 사용할 값 지정
    scoring= 'accuracy',    # 검증할때마다 정확도를 이용하여 모델을 평가
    cv = cv_folds,      # 교차 검증은 횟수는 KFold의 값들을 이용
    verbose = 1,        # 진행상황을 간단한 로그로 표시 
    refit = True,       # 베스트 파라미터를 이용하여 재학습 
    return_train_score= True,   # 학습 데이터의 성능을 확인
    n_jobs=-1           # 해당 코드를 이용하여 작업시 사용할 코어는 모두
)
grid_cls.fit(X_train, Y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'svc__C': [0.1, 1, ...], 'svc__gamma': ['scale', 'auto'], 'svc__kernel': ['linear', 'rbf']}"
,scoring,'accuracy'
,n_jobs,-1
,refit,True
,cv,StratifiedKFo... shuffle=True)
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,True

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,C,0.1
,kernel,'linear'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,True
,tol,0.001
,cache_size,200
,class_weight,


In [12]:
print(grid_cls.score(X_test, Y_test))

0.9333333333333333


In [13]:
# print(grid_cls.cv_results_)
pd.DataFrame(grid_cls.cv_results_).sort_values(
    "mean_test_score", ascending=False
)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_svc__C,param_svc__gamma,param_svc__kernel,params,split0_test_score,split1_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.003187,0.001622,0.001048,0.000992,0.1,scale,linear,"{'svc__C': 0.1, 'svc__gamma': 'scale', 'svc__k...",0.958333,1.0,...,0.975,0.020412,1,0.96875,0.96875,0.979167,0.979167,0.96875,0.972917,0.005103
2,0.003463,0.001174,0.001031,0.00093,0.1,auto,linear,"{'svc__C': 0.1, 'svc__gamma': 'auto', 'svc__ke...",0.958333,1.0,...,0.975,0.020412,1,0.96875,0.96875,0.979167,0.979167,0.96875,0.972917,0.005103
6,0.002992,0.000957,0.000809,0.001128,1.0,auto,linear,"{'svc__C': 1, 'svc__gamma': 'auto', 'svc__kern...",1.0,1.0,...,0.975,0.033333,1,0.96875,0.96875,0.979167,0.979167,0.989583,0.977083,0.007795
4,0.002753,0.001028,0.000801,0.000982,1.0,scale,linear,"{'svc__C': 1, 'svc__gamma': 'scale', 'svc__ker...",1.0,1.0,...,0.975,0.033333,1,0.96875,0.96875,0.979167,0.979167,0.989583,0.977083,0.007795
8,0.002252,0.000951,0.000827,0.001014,10.0,scale,linear,"{'svc__C': 10, 'svc__gamma': 'scale', 'svc__ke...",1.0,0.958333,...,0.966667,0.03118,5,0.979167,0.958333,0.96875,0.989583,1.0,0.979167,0.014731
9,0.002817,0.000912,0.001297,0.001009,10.0,scale,rbf,"{'svc__C': 10, 'svc__gamma': 'scale', 'svc__ke...",0.958333,0.958333,...,0.966667,0.016667,5,0.989583,0.979167,0.96875,0.989583,1.0,0.985417,0.010623
5,0.002946,0.000606,0.000282,0.000564,1.0,scale,rbf,"{'svc__C': 1, 'svc__gamma': 'scale', 'svc__ker...",0.958333,1.0,...,0.966667,0.016667,5,0.979167,0.979167,0.96875,0.979167,0.979167,0.977083,0.004167
7,0.003305,0.000855,0.001075,0.000954,1.0,auto,rbf,"{'svc__C': 1, 'svc__gamma': 'auto', 'svc__kern...",0.958333,1.0,...,0.966667,0.016667,5,0.979167,0.979167,0.96875,0.979167,0.979167,0.977083,0.004167
10,0.002231,0.000755,0.001204,0.00075,10.0,auto,linear,"{'svc__C': 10, 'svc__gamma': 'auto', 'svc__ker...",1.0,0.958333,...,0.966667,0.03118,5,0.979167,0.958333,0.96875,0.989583,1.0,0.979167,0.014731
11,0.002815,0.001177,0.001626,0.000479,10.0,auto,rbf,"{'svc__C': 10, 'svc__gamma': 'auto', 'svc__ker...",0.958333,0.958333,...,0.966667,0.016667,5,0.989583,0.979167,0.96875,0.989583,1.0,0.985417,0.010623
