### 앙상블(Ensemble) 학습

In [1]:
from sklearn.datasets import load_breast_cancer
# Load the breast-cancer dataset bundled with scikit-learn; the cells below
# read its feature matrix (cancer.data) and labels (cancer.target).
cancer = load_breast_cancer()

In [2]:
from sklearn.preprocessing import StandardScaler
# Standardize every feature of the full dataset in one pass.
# NOTE(review): the scaler is fit on all samples before the train/test split
# that follows, so test-set statistics influence the scaling (data leakage).
# For a clean evaluation, split first and fit the scaler on the training rows
# only; doing so may shift the scores recorded in this notebook.
cancer_std = StandardScaler().fit_transform(cancer.data)

In [3]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the standardized samples as a test set, stratified on the
# labels, with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    cancer_std,
    cancer.target,
    test_size=0.2,
    stratify=cancer.target,
    random_state=2023,
)

### 1. Voting 방식
#### 1.1 Hard Voting
- 로지스틱 회귀
- 서포트 벡터 머신
- K 최근접 이웃

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [6]:
# Base learners for the voting ensemble: logistic regression, a support
# vector machine and k-nearest neighbours.
lrc, svc, knn = (
    LogisticRegression(random_state=2023),
    SVC(random_state=2023),
    KNeighborsClassifier(),
)

In [7]:
# Ensemble classifier for hard voting: the three base learners are registered
# under the names 'LRC', 'SVC' and 'KNN'.
from sklearn.ensemble import VotingClassifier
voc = VotingClassifier(
    voting='hard',
    estimators=list(zip(('LRC', 'SVC', 'KNN'), (lrc, svc, knn))),
)

In [8]:
# Train the hard-voting ensemble, then report its score on the held-out test
# split (fit() returns the estimator itself, so the calls can be chained).
voc.fit(X_train, y_train).score(X_test, y_test)

0.9298245614035088

In [9]:
# Performance of each individual classifier: fit every base learner on the
# training split and collect its test score, in the order (lrc, svc, knn).
(
    lrc.fit(X_train, y_train).score(X_test, y_test),
    svc.fit(X_train, y_train).score(X_test, y_test),
    knn.fit(X_train, y_train).score(X_test, y_test),
)

(0.9473684210526315, 0.9298245614035088, 0.9122807017543859)

#### 1.2 Soft Voting

- 앙상블에 포함된 모든 분류기가 predict_proba() 메소드를 지원하는 경우에만 사용 가능

In [12]:
# List lrc's attributes; viewing the full output in a text editor shows both
# 'predict' and 'predict_proba' among them.
dir(lrc)

# Per-class probabilities from logistic regression for the first three test
# samples.
lrc.predict_proba(X_test[:3])

array([[9.98781249e-01, 1.21875135e-03],
       [9.76075906e-04, 9.99023924e-01],
       [6.40363671e-02, 9.35963633e-01]])

In [13]:
# Per-class probabilities from k-NN for the first three test samples.
knn.predict_proba(X_test[:3])

array([[1., 0.],
       [0., 1.],
       [0., 1.]])

In [None]:
# SVC only supports predict_proba when it is constructed with
# probability=True; the default is probability=False, which is how `svc` was
# built, so this call is expected to fail.
svc.predict_proba(X_test[:3])  # not available - raises an error

In [15]:
# A second SVM built with probability estimates switched on, so that
# predict_proba is available; fit() returns the estimator itself.
svc2 = SVC(random_state=2023, probability=True).fit(X_train, y_train)
svc2.predict_proba(X_test[:3])

array([[9.99574375e-01, 4.25625266e-04],
       [5.14249474e-08, 9.99999949e-01],
       [1.65822655e-02, 9.83417734e-01]])

- Soft voting

In [17]:
# Soft-voting ensemble; it uses svc2 (built with probability=True) in place of
# svc so that every member can supply class probabilities.
voc2 = VotingClassifier(
    voting='soft',
    estimators=[
        ('LRC', lrc),
        ('SVC', svc2),
        ('KNN', knn),
    ],
)

In [18]:
# Train the soft-voting ensemble, then report its score on the held-out test
# split (fit() returns the estimator itself, so the calls can be chained).
voc2.fit(X_train, y_train).score(X_test, y_test)

0.9298245614035088

In [19]:
# Ensemble probabilities for the first three test samples; with no weights
# given, each row matches the plain average of the three members'
# predict_proba rows shown in the cells before it.
voc2.predict_proba(X_test[:3])

array([[9.99451874e-01, 5.48125537e-04],
       [3.25375777e-04, 9.99674624e-01],
       [2.68728775e-02, 9.73127122e-01]])

- GridSearchCV

In [20]:
# Neither estimator was given an explicit C, so both report the default, 1.0.
lrc.C, svc2.C

(1.0, 1.0)

In [23]:
# Candidate C values for the two tunable members of the voting ensemble.
# Each key is '<estimator name>__<parameter>' - note the double underscore.
params = dict(
    LRC__C=[0.1, 1, 10],
    SVC__C=[0.1, 1, 10],
)

In [24]:
from sklearn.model_selection import GridSearchCV
# 5-fold cross-validated grid search over `params` for the soft-voting
# ensemble, ranked by accuracy; fit() returns the search object itself.
grid_voc2 = GridSearchCV(
    estimator=voc2,
    param_grid=params,
    cv=5,
    scoring='accuracy',
).fit(X_train, y_train)
grid_voc2.best_params_

{'LRC__C': 10, 'SVC__C': 1}

In [28]:
# Second pass: a narrower grid around the values the first search selected
# (LRC C=10, SVC C=1), using the same 5-fold accuracy-ranked search.
params = dict(
    LRC__C=[5, 10, 20],
    SVC__C=[0.5, 1, 3],
)
grid_voc2 = GridSearchCV(
    estimator=voc2,
    param_grid=params,
    cv=5,
    scoring='accuracy',
).fit(X_train, y_train)
grid_voc2.best_params_

{'LRC__C': 10, 'SVC__C': 1}

In [29]:
# Score the estimator selected by the grid search on the held-out test split.
grid_voc2.best_estimator_.score(X_test, y_test)

0.9473684210526315

### 2. Bagging 방식 - Random Forest

In [30]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with a fixed seed, trained on the training split; fit()
# returns the estimator itself. The last line reports its test score.
rfc = RandomForestClassifier(random_state=2023).fit(X_train, y_train)
rfc.score(X_test, y_test)

0.9210526315789473

In [31]:
# Per-class probabilities from the random forest for the first three test
# samples.
rfc.predict_proba(X_test[:3])

array([[0.99, 0.01],
       [0.  , 1.  ],
       [0.18, 0.82]])