## 앙상블(Ensemble) 학습

In [43]:
from sklearn.datasets import load_breast_cancer
# Load the breast cancer dataset bundled with scikit-learn; its .data and
# .target attributes supply the features and labels used in later cells.
cancer = load_breast_cancer()

In [44]:

from sklearn.preprocessing import StandardScaler
# Standardise every feature column of the dataset.
# NOTE(review): the scaler is fit on all samples before the train/test split,
# so test-set statistics influence the preprocessing - consider fitting on the
# training split only.
cancer_std = StandardScaler().fit(cancer.data).transform(cancer.data)

In [45]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for testing, stratified on the class label so
# both splits keep the same class proportions; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    cancer_std,
    cancer.target,
    test_size=0.2,
    stratify=cancer.target,
    random_state=2023,
)

### 1. Voting 방식
#### 1.1 Hard voting
- 로지스틱 회귀
- 서포트 벡터 머신
- K 최근접 이웃

In [46]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [47]:
# Base estimators that are later combined in the voting ensembles.
knn = KNeighborsClassifier(n_neighbors=5)  # k-nearest neighbours with k=5
svc = SVC(random_state=2023)               # support vector classifier
lrc = LogisticRegression(random_state=2023)  # logistic regression

In [48]:
# Ensemble classifier for hard voting over the three base estimators.
from sklearn.ensemble import VotingClassifier
voc = VotingClassifier(voting='hard', estimators=[('LRC', lrc), ('SVC', svc), ('KNN', knn)])

In [49]:
# Train the ensemble and report its accuracy on the held-out test set.
# fit() returns the estimator itself, so the two steps can be chained.
voc.fit(X_train, y_train).score(X_test, y_test)

0.9298245614035088

In [50]:
# Train each base classifier on its own and collect the test accuracies
# in the order (logistic regression, SVC, k-NN).
tuple(
    clf.fit(X_train, y_train).score(X_test, y_test)
    for clf in (lrc, svc, knn)
)

(0.9473684210526315, 0.9298245614035088, 0.9122807017543859)

#### 1.2 Soft voting
- predict_proba() method를 지원하는 분류기만 가능
- 로지스틱 회귀
- 서포트 벡터 머신
- K 최근접 이웃

In [51]:
# Build an SVC with probability=True so that predict_proba() can be called,
# fit it, and show the class probabilities for the first three test samples.
svc2 = SVC(probability=True, random_state=2023).fit(X_train, y_train)
svc2.predict_proba(X_test[:3])

array([[9.99574375e-01, 4.25625266e-04],
       [5.14249474e-08, 9.99999949e-01],
       [1.65822655e-02, 9.83417734e-01]])

In [52]:
# Soft-voting ensemble: the predicted class is chosen from the averaged class
# probabilities of the members, so every member must provide predict_proba().
# The probability-enabled svc2 is therefore used instead of svc, and the
# voting mode is 'soft' (the cell previously rebuilt the hard-voting ensemble).
voc2 = VotingClassifier(
    estimators=[('LRC', lrc), ('SVC', svc2), ('KNN', knn)],
    voting='soft'
)
voc2.fit(X_train, y_train)
voc2.score(X_test, y_test)

0.9298245614035088

- GridSearchCV 적용 

In [53]:
# Show the current regularisation strength C of the logistic regression and SVC.
tuple(est.C for est in (lrc, svc))

(1.0, 1.0)

In [54]:
from sklearn.model_selection import GridSearchCV
# Coarse grid search over the regularisation strength C of the logistic
# regression ('LRC') and SVC ('SVC') members, scored by 5-fold CV accuracy.
# The search runs on voc2, the ensemble defined in this section, so that the
# result name grid_voc2 matches the estimator being tuned.
params = {
    'LRC__C': [0.1, 1, 10],
    'SVC__C': [0.1, 1, 10]
}
grid_voc2 = GridSearchCV(voc2, params, scoring='accuracy', cv=5)
grid_voc2.fit(X_train, y_train)
grid_voc2.best_params_

{'LRC__C': 1, 'SVC__C': 1}

In [55]:
# Second grid search with a different set of candidate C values for the
# logistic regression ('LRC') and SVC ('SVC') members, again scored by
# 5-fold CV accuracy. The search runs on voc2, the ensemble defined in this
# section, so that the result name grid_voc2 matches the estimator being tuned.
params = {
    'LRC__C': [5, 10, 20],
    'SVC__C': [0.5, 1, 3]
}
grid_voc2 = GridSearchCV(voc2, params, scoring='accuracy', cv=5)
grid_voc2.fit(X_train, y_train)
grid_voc2.best_params_

{'LRC__C': 5, 'SVC__C': 1}

In [56]:
# Test-set accuracy of the refit best estimator; with scoring='accuracy',
# GridSearchCV.score() evaluates best_estimator_ with that same metric.
grid_voc2.score(X_test, y_test)

0.9298245614035088

### 2. Bagging 방식 - Random Forest

In [57]:
from sklearn.ensemble import RandomForestClassifier
# Create a random forest with a fixed seed and list all of its
# hyperparameters (the remaining ones are left at their defaults).
rfc = RandomForestClassifier(random_state=2023)
rfc.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 2023,
 'verbose': 0,
 'warm_start': False}

In [58]:
# Train the random forest and report its accuracy on the test set.
rfc.fit(X_train, y_train).score(X_test, y_test)

0.9210526315789473

In [59]:
# Class probabilities predicted by the forest for the first five test samples.
rfc.predict_proba(X_test[:5])

array([[0.99, 0.01],
       [0.  , 1.  ],
       [0.18, 0.82],
       [0.  , 1.  ],
       [0.02, 0.98]])

In [60]:
# A single DecisionTreeClassifier is of little use for soft voting: with the
# default (unrestricted) settings its predict_proba() output is typically
# just 0 or 1, so it adds no graded confidence to the average.
from sklearn.tree import DecisionTreeClassifier
# Seed the tree like the other estimators in this notebook so that the
# fitted tree, and therefore this output, is reproducible across runs.
dtc = DecisionTreeClassifier(random_state=2023)
dtc.fit(X_train, y_train)
dtc.predict_proba(X_test[:5])

array([[1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.]])

### 타이타닉

In [84]:
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
# Load the Titanic dataset through seaborn and display it.
titanic = sns.load_dataset('titanic')
titanic



Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


- 데이터 준비/ 기본 설정

In [85]:
# Load the Titanic dataset into df, the frame the preprocessing steps start from.
df = sns.load_dataset('titanic')

- 데이터 탐색 전처리

In [86]:
# Drop 'deck' (mostly NaN) and 'embark_town' (duplicates the information in 'embarked').
titanic = df.drop(columns=['deck', 'embark_town'])

In [87]:
# Remove every row that has no age value (177 of the 891 'age' entries are NaN).
titanic = titanic.loc[titanic['age'].notna()]

In [90]:
# Replace missing 'embarked' values with the most frequent port of embarkation.
most_freq = titanic['embarked'].value_counts(dropna=True).idxmax()
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: the in-place form is chained assignment, which warns in
# recent pandas and leaves the frame unchanged under copy-on-write.
titanic['embarked'] = titanic['embarked'].fillna(most_freq)

- 분석에 사용할 속성을 선택

In [92]:
# Keep only the columns (attributes) used in the analysis.
titanic1 = titanic.loc[:, ['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'embarked']]

In [94]:
# One-hot encoding: turn the categorical 'sex' column into numeric indicator
# columns the model can consume, append them, and then discard the original
# categorical columns 'sex' and 'embarked'.
onehot_sex = pd.get_dummies(titanic1['sex'])
titanic1 = pd.concat([titanic1, onehot_sex], axis=1).drop(columns=['sex', 'embarked'])

- 데이터셋 구분 - 훈련용(train data)/ 검증용(test data)

In [97]:
# Select the predictors (independent variables) and the label (dependent variable).
X = titanic1.loc[:, ['pclass', 'age', 'sibsp', 'parch', 'female', 'male']]
y = titanic1.loc[:, 'survived']


In [98]:
# Standardise the explanatory variables; X becomes the scaled array.
from sklearn import preprocessing
X = preprocessing.StandardScaler().fit_transform(X)