#### [앙상블 학습 알고리즘 - 보팅]
- 특징: 동일한 데이터셋 + 다양한 학습 알고리즘 모델들
- 결과:
    * Hard Voting: 결론을 다수결로 결정
    * Soft Voting: 클래스별 확률값 평균 도출 후 최고 확률값의 클래스로 결정
    * 성능: Hard Voting < Soft Voting
    * 주의: Soft Voting 시 모델들은 **predict_proba()** 메서드가 존재해야 한다!

[1] 모듈 로딩 및 데이터 준비 <hr>

In [7]:
import pandas as pd
import numpy as np

## 데이터셋 및 전처리
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.preprocessing import LabelEncoder

## 교차검증
from sklearn.model_selection import StratifiedKFold, cross_val_score

## ML, CV, Pipeline 관련 => 모델 일반화/최적 하이퍼파라미터 조사 및 데이터 누수 해결
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

## ML 학습
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier


## 성능지표 관련
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score
from sklearn.metrics import classification_report

## 시각화 관련
import matplotlib.pyplot as plt
import graphviz

[2] 학습 준비 <hr>
- 피쳐/타겟 분리
- 학습용/테스트용 분리
- 수치형 피쳐 스케일링

In [8]:
FILE_NAME = '../Data/iris.csv'
irisDF = pd.read_csv(FILE_NAME)
irisDF.variety = irisDF.variety.astype('category')
irisDF.info()
display(irisDF.head(3))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   sepal.length  150 non-null    float64 
 1   sepal.width   150 non-null    float64 
 2   petal.length  150 non-null    float64 
 3   petal.width   150 non-null    float64 
 4   variety       150 non-null    category
dtypes: category(1), float64(4)
memory usage: 5.1 KB


Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa


In [9]:
## [2-1] 피쳐와 타겟 분리
featureDF = irisDF[irisDF.columns[:-1]]
targetSR = irisDF[irisDF.columns[-1]]

print(f'featureDF: {featureDF.shape}, targetSR: {targetSR.shape}')

featureDF: (150, 4), targetSR: (150,)


In [10]:
## [2-2] 학습용 테스트용 분리
x_train, x_test, y_train, y_test = train_test_split(featureDF, targetSR, test_size=0.2,
                                                    random_state=42, stratify=targetSR)

In [11]:
## ===================================================
## 범주형 타겟 전처리 => 인코딩
## 타겟은 정답지이므로 밖으로 빼서 미리 인코딩 해도 
## 데이터 누수 문제는 없다 
## ===================================================
lbEncoder = LabelEncoder()
en_y_train = lbEncoder.fit_transform(y_train)
en_y_test = lbEncoder.transform(y_test)

print(len(en_y_train))
print(en_y_train)

120
[0 2 1 0 1 2 1 2 2 2 2 1 1 1 1 0 0 2 2 0 1 0 2 0 1 2 2 0 2 0 0 1 1 0 2 2 1
 1 2 1 0 1 0 2 0 0 2 0 0 0 0 1 2 1 0 2 1 2 0 2 0 1 2 0 1 1 2 1 1 2 0 0 0 2
 1 2 1 2 2 1 0 2 1 0 2 0 2 1 1 0 1 2 0 0 2 2 2 1 2 0 2 1 2 2 0 1 1 1 1 1 0
 2 1 1 0 0 0 0 1 0]


In [16]:
## ===================================================
## 수치형 피쳐 컬럼 스케일링 처리
## ===================================================
rbScaler = RobustScaler()
rb_x_train = rbScaler.fit_transform(x_train)
rb_x_test = rbScaler.transform(x_test)

[3] 학습 진행 - 단순 <hr>

In [18]:
## 보팅 학습에 사용될 모델/학습기들 생성
knn = KNeighborsClassifier()
svc = SVC(probability=True)     # predict_proba() 메서드 활성화 설정
dt = DecisionTreeClassifier(random_state=10)

In [19]:
## 보팅 인스턴스 생성
vtModel = VotingClassifier(estimators=[('knn', knn), ('svc', svc), ('dt', dt)])

## 보팅 동일 데이터셋으로 다른 학습 알고리즘으로 학습 진행
vtModel.fit(rb_x_train, en_y_train)

0,1,2
,estimators,"[('knn', ...), ('svc', ...), ...]"
,voting,'hard'
,weights,
,n_jobs,
,flatten_transform,True
,verbose,False

0,1,2
,n_neighbors,5
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,

0,1,2
,C,1.0
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,True
,tol,0.001
,cache_size,200
,class_weight,

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,10
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [21]:
## 학습 후 모델 파라미터 확인
print(f"classes_: {vtModel.classes_}")          # 품종 라벨
print(f"estimators_: {vtModel.estimators_}")    # 추정 모델 3개
print(f'named_estimators_: {vtModel.named_estimators_}')

classes_: [0 1 2]
estimators_: [KNeighborsClassifier(), SVC(probability=True), DecisionTreeClassifier(random_state=10)]
named_estimators_: {'knn': KNeighborsClassifier(), 'svc': SVC(probability=True), 'dt': DecisionTreeClassifier(random_state=10)}


In [None]:
## 성능평가 => 하드 보팅
vtModel.score(rb_x_test, en_y_test)

0.9666666666666667

[4] 학습 진행 - 데이터 누수 방지를 위한 파이프라인 및 교차 검증 <hr>

In [None]:
## =================================================================
## => Pipeline: 튜닝 시 데이터 누수에 대한 보완
## =================================================================
## [1] Pipeline 인스턴스 생성
##          -> 수치형 피쳐: 스케일링
##          -> 모델 인스턴스
pipeline = Pipeline(steps=[('rb_scaler', RobustScaler()),
                      ('dt_cls', DecisionTreeClassifier(random_state=42))])

## [2] GridSearchCV 준비
##      -> 최적화 할 하이퍼파라미터 값 dict
##      -> 모델 인스턴스
param_dict = {"dt_cls__max_depth": [3, 5, 7],
              "dt_cls__criterion": ['gini', 'entropy', 'log_loss']}

gridCV = GridSearchCV(pipeline,
                      param_grid=param_dict,
                      cv=3,
                      return_train_score=True)

In [None]:
## =================================================================
## 교차 검증을 통한 일반화 성능 체크 + 최적의 하이퍼파라미터 찾기
## => GridSearchCV: 분류/회귀 교차검증, 최적 파라미터값, 성능
## =================================================================
gridCV.fit(x_train, en_y_train)

0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'dt_cls__criterion': ['gini', 'entropy', ...], 'dt_cls__max_depth': [3, 5, ...]}"
,scoring,
,n_jobs,
,refit,True
,cv,3
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,True

0,1,2
,with_centering,True
,with_scaling,True
,quantile_range,"(25.0, ...)"
,copy,True
,unit_variance,False

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,3
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [None]:
## 학습 후 모델 파라미터 통한 결과 확인
print(f"best_score_: {gridCV.best_score_}")
print(f"best_params_: {gridCV.best_params_}")

best_score_: 0.9583333333333334
best_params_: {'dt_cls__criterion': 'gini', 'dt_cls__max_depth': 3}


In [None]:
## 전체 교차 검증별 학습점수, 검증점수 cv_results_
pd.options.mode.copy_on_write=True

resultDF = pd.DataFrame(gridCV.cv_results_)
resultDF = resultDF[['rank_test_score', 'mean_train_score', 'mean_test_score', 'param_dt_cls__max_depth']]
resultDF['diff'] = abs(resultDF['mean_test_score'] - resultDF['mean_train_score'])

resultDF.rename(columns={'param_dt_cls__max_depth': "max_depth",
                         'param_dt_cls__criterion': "criterion",
                         'rank_test_score': "rank_test"}, inplace=True)

resultDF.sort_values(by=['rank_test'])

Unnamed: 0,rank_test,mean_train_score,mean_test_score,max_depth,diff
0,1,0.979167,0.958333,3,0.020833
3,2,0.979167,0.95,3,0.029167
6,2,0.979167,0.95,3,0.029167
1,4,1.0,0.941667,5,0.058333
2,4,1.0,0.941667,7,0.058333
4,6,0.995833,0.933333,5,0.0625
7,6,0.995833,0.933333,5,0.0625
5,8,1.0,0.925,7,0.075
8,8,1.0,0.925,7,0.075


[5] 최적의 파라미터를 가지고 학습 진행 <hr>

In [None]:
# 1) 최적 모델 가져오기
best_model = gridCV.best_estimator_

# 2) train 데이터로 최종 학습시키기
best_model.fit(x_train, en_y_train)

0,1,2
,steps,"[('rb_scaler', ...), ('dt_cls', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,with_centering,True
,with_scaling,True
,quantile_range,"(25.0, ...)"
,copy,True
,unit_variance,False

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,3
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [None]:
# 3) test 데이터로 평가하기
y_pred = best_model.predict(x_test)
print(classification_report(en_y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      0.90      0.95        10
           2       0.91      1.00      0.95        10

    accuracy                           0.97        30
   macro avg       0.97      0.97      0.97        30
weighted avg       0.97      0.97      0.97        30

