In [1]:
# 필요한 패키지 설치
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, LeaveOneOut, StratifiedKFold
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np

In [2]:
# 데이터 불러오기
# https://www.kaggle.com/datasets/uciml/glass
df = pd.read_csv("datasets/glass.csv")

# 데이터 샘플 확인
df.head()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
0,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,1
1,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,1
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0,1
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0,1
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0,1


In [3]:
# k-fold 방법 1

# 종속변수 문자형 변환
df["Type_str"]=df["Type"].apply(str)

# 독립변수, 종속변수 분리
df_x1 = df[['RI','Na','Mg','Al','Si','K','Ca','Ba','Fe']]
df_y1 = df[['Type_str']]

# 기본 모델 설정
rnf_model = RandomForestClassifier(n_estimators=100, max_depth=5,random_state=0)

# 학습셋과 테스트셋 분리하여 생성(6:4)
x_train, x_test, y_train, y_test = train_test_split(
    df_x1,df_y1,test_size=0.4,random_state=10)

# cross_val_score 함수로 k-fold 성능 측정
kfold_scores_1 = cross_val_score(rnf_model, x_train, y_train, cv = 7)

print('k-fold 교차 검증 SCORE : ', kfold_scores_1)
print("k-fold 교차 검증 SCORE 평균 : {:.2f}".format(kfold_scores_1.mean()))

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


k-fold 교차 검증 SCORE :  [0.89473684 0.68421053 0.61111111 0.55555556 0.94444444 0.72222222
 0.88888889]
k-fold 교차 검증 SCORE 평균 : 0.76


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


In [4]:
# k-fold 방법 2

# 종속변수 문자형 변환 및 분리
df["Type_str"]=df["Type"].apply(str)
df_x2 = np.array(df[['RI','Na','Mg','Al','Si','K','Ca','Ba','Fe']])
df_y2 = df['Type_str']

# k-fold 설정
kfold = KFold(n_splits=7, shuffle = True, random_state = 37)
kfold_scores_2_mean = []
fold_idx = 0

# k-fold 수행
for train_idx, test_idx in kfold.split(df_x2):
    
    train_x, train_y = df_x2[train_idx], df_y2[train_idx]
    test_x, test_y = df_x2[test_idx], df_y2[test_idx]
    
    # 기본 모델 설정
    rnf_model = RandomForestClassifier(n_estimators=100, max_depth=5,random_state=0)    
    rnf_model.fit(train_x, train_y)
    pred_y = rnf_model.predict(test_x)
    kfold_scores_2 = accuracy_score(test_y, pred_y)    
    fold_idx += 1    
    kfold_scores_2_mean.append(kfold_scores_2)

print(f"k-fold 교차 검증 SCORE 평균 : {np.mean(kfold_scores_2_mean)}")

k-fold 교차 검증 SCORE 평균 : 0.7526881720430108


In [5]:
# LOOCV (Leave-one-out Cross-validation)

# 기본 모델 설정
rnf_model = RandomForestClassifier(n_estimators=100, max_depth=5,random_state=0)
rnf_model.fit(train_x, train_y)
pred_y = rnf_model.predict(test_x)

loocv = LeaveOneOut()
loocv_scores = cross_val_score(rnf_model,test_x,test_y, cv = loocv)

print("테스트 셋 전체 관측치 수 : ", len(test_y))
print("LOOCV 검증 분할 횟수 : ", len(loocv_scores))
print("LOOCV 교차 검증 SCORE 평균 : {:.2f}".format(loocv_scores.mean()))

테스트 셋 전체 관측치 수 :  30
LOOCV 검증 분할 횟수 :  30
LOOCV 교차 검증 SCORE 평균 : 0.70


In [6]:
# Stratified K-fold Cross Validation

# Stratified K-fold 교차검증 설정
strati = StratifiedKFold(n_splits=3)

n_iter=0
print("전체 데이터셋 범주 별 관측치 수 :\n",df_y2.value_counts())
for train_index, test_index in strati.split(df_x2,df_y2):
    n_iter +=1
    strati_train_y = df_y2.iloc[train_index]
    strati_test_y = df_y2.iloc[test_index]
    print('분할 {0}'.format(n_iter))
    print('학습 셋 범주 별 관측치 수:\n', strati_train_y.value_counts())
    print('검증 셋 범주 별 관측치 수:\n', strati_test_y.value_counts())

strati_scores = cross_val_score(rnf_model,test_x,test_y, cv = strati)
print("Stratified K-fold 교차 검증 SCORE 평균 : {:.2f}".format(strati_scores.mean()))

전체 데이터셋 범주 별 관측치 수 :
 2    76
1    70
7    29
3    17
5    13
6     9
Name: Type_str, dtype: int64
분할 1
학습 셋 범주 별 관측치 수:
 2    51
1    46
7    19
3    11
5     9
6     6
Name: Type_str, dtype: int64
검증 셋 범주 별 관측치 수:
 2    25
1    24
7    10
3     6
5     4
6     3
Name: Type_str, dtype: int64
분할 2
학습 셋 범주 별 관측치 수:
 2    50
1    47
7    20
3    12
5     8
6     6
Name: Type_str, dtype: int64
검증 셋 범주 별 관측치 수:
 2    26
1    23
7     9
3     5
5     5
6     3
Name: Type_str, dtype: int64
분할 3
학습 셋 범주 별 관측치 수:
 2    51
1    47
7    19
3    11
5     9
6     6
Name: Type_str, dtype: int64
검증 셋 범주 별 관측치 수:
 2    25
1    23
7    10
3     6
5     4
6     3
Name: Type_str, dtype: int64




Stratified K-fold 교차 검증 SCORE 평균 : 0.67


In [7]:
# Nested Cross Validation 및 Grid Search Cross Validation

# 학습셋과 테스트셋 분리하여 생성(6:4)
x_train, x_test, y_train, y_test = train_test_split(
    df_x2,df_y2,test_size=0.4,random_state=10)

# 모델 시도 횟수 설정
NUM_TRIALS = 20

# 그리드 하이퍼 파라미터 설정
hp_para = {'max_depth':[2,3,4], 'min_samples_split':[2,3]}

# 스코어를 array로 저장
nested_scores = np.zeros(NUM_TRIALS)

# 교차 검증 반복 수행
for i in range(NUM_TRIALS):

    inner_loop = KFold(n_splits=3, shuffle=True, random_state=i)
    outer_loop = KFold(n_splits=3, shuffle=True, random_state=i)

    # Nested Cross Validation 하이퍼 파라미터 최적화
    gs_cv = GridSearchCV(rnf_model, param_grid=hp_para, cv=inner_loop)
    nested_score = cross_val_score(gs_cv, X=x_train, y=y_train, cv=outer_loop)
    nested_scores[i] = nested_score.mean()

# 테스트셋에 모델 적용
gs_cv.fit(x_test, y_test)

print("각 TRIAL 별 SCORE 평균 : \n", nested_scores)
print("전체 TRIAL SCORE 평균 : {:.2f}".format(nested_scores.mean()))
print("최대 TRIAL SCORE : {:.2f}".format(nested_scores.max()))
print("최적 하이퍼 파라미터 : \n", gs_cv.best_params_)

각 TRIAL 별 SCORE 평균 : 
 [0.71059432 0.71077889 0.67884828 0.72646733 0.71834625 0.69545958
 0.72683647 0.68752307 0.6720192  0.68014027 0.72757475 0.7113326
 0.68014027 0.7113326  0.66426726 0.68807678 0.73366556 0.73421927
 0.67109635 0.70302695]
전체 TRIAL SCORE 평균 : 0.70
최대 TRIAL SCORE : 0.73
최적 하이퍼 파라미터 : 
 {'max_depth': 4, 'min_samples_split': 2}
