# 7. 모형 평가

## 7.3 파이프라인

In [2]:
# 파이프 라인 사용 전 전체 코드
from sklearn import datasets
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Load the Boston housing data.
# NOTE(review): this call appears to be what emits the warning captured in
# the cell output, which discourages use of this dataset.
raw_boston = datasets.load_boston()
X, y = raw_boston.data, raw_boston.target

# Train / test split with a fixed seed.
X_tn, X_te, y_tn, y_te = train_test_split(X, y, random_state=7)

# Standardization: the scaler is fitted on the training split only and the
# same fitted scaler is then applied to both splits.
std_scale = StandardScaler()
std_scale.fit(X_tn)
X_tn_std = std_scale.transform(X_tn)
X_te_std = std_scale.transform(X_te)

# Training
clf_linear = LinearRegression().fit(X_tn_std, y_tn)

# Prediction
pred_linear = clf_linear.predict(X_te_std)

# Evaluation: left as the final expression so the notebook displays the MSE.
mean_squared_error(y_te, pred_linear)


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_h

29.515137790197596

In [4]:
# 파이프라인 사용 후 전체 코드
# Train / test split with seed 7.
X_tn, X_te, y_tn, y_te = train_test_split(X, y, random_state=7)

# Pipeline: a scaling step followed by a linear regression step.
pipeline_steps = [
    ('scaler', StandardScaler()),
    ('linear_regression', LinearRegression()),
]
linear_pipeline = Pipeline(pipeline_steps)

# Training: the pipeline receives the raw (unscaled) training features.
linear_pipeline.fit(X_tn, y_tn)

# Prediction: the raw test features are passed as well.
pred_linear = linear_pipeline.predict(X_te)

# Evaluation: left as the final expression so the notebook displays the MSE.
mean_squared_error(y_te, pred_linear)

29.515137790197596

## 7.4 그리드 서치(grid search)

- 머신러닝 과정에서 관심 있는 매개 변수들을 대상으로 학습 가능하도록 만드는 방식
- 다음 코드는 k-nearest neighbor 알고리즘을 적용할 때 1부터 10까지의 k값 후보 중 가장 높은 성능을 보이는 k값을 정하는 과정

In [8]:
# 그리드 서치 전체 코드
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Load the iris data.
raw_iris = datasets.load_iris()

# Features / target
X = raw_iris.data
y = raw_iris.target

# Train / test split with a fixed seed.
X_tn, X_te, y_tn, y_te = train_test_split(X, y, random_state=0)

# Standardization. The scaler must be bound to `std_scale`, the name used by
# the fit/transform calls below; otherwise those calls pick up whatever
# `std_scale` an earlier cell left behind (or fail on a fresh kernel).
std_scale = StandardScaler()
std_scale.fit(X_tn)
X_tn_std = std_scale.transform(X_tn)
X_te_std = std_scale.transform(X_te)

# Manual grid search over k = 1..10, keeping the first k that reaches the
# highest accuracy (ties do not replace the current best).
# NOTE: accuracy is measured on the test split, so the selected k is tuned
# against the same data used to report the score.
best_accuracy = 0
final_k = None

for k in range(1, 11):
    clf_knn = KNeighborsClassifier(n_neighbors=k)
    clf_knn.fit(X_tn_std, y_tn)
    knn_pred = clf_knn.predict(X_te_std)
    accuracy = accuracy_score(y_te, knn_pred)
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        final_k = {'k': k}

print(final_k)
# Report the score that belongs to final_k, not the score of the last k tried.
print(best_accuracy)

{'k': 3}
0.9736842105263158


### 7.6.2 분류 문제에서의 성능 평가

- **정확도(Accuracy)**

In [9]:
from sklearn.metrics import accuracy_score
# Toy labels: positions 0 and 3 agree, positions 1 and 2 are swapped.
y_true = [0, 1, 2, 3]
y_pred = [0, 2, 1, 3]

# Default call: fraction of matching labels.
print(accuracy_score(y_true, y_pred))
# normalize=False: number of matching labels instead of the fraction.
print(accuracy_score(y_true, y_pred, normalize=False))

0.5
2


- **F1 score**

- **Confusion Matrix**

In [10]:
from sklearn.metrics import confusion_matrix
# Toy three-class example (labels 0, 1, 2).
y_pred = [0, 0, 2, 2, 0, 2]
y_true = [2, 0, 2, 2, 0, 1]

# Left as the final expression so the notebook displays the matrix.
confusion_matrix(y_true, y_pred)

array([[2, 0, 0],
       [0, 0, 1],
       [1, 0, 2]], dtype=int64)

주의: 행(row)은 실제값(True), 열(column)은 예측값(pred)

- **Classification report**

In [11]:
from sklearn.metrics import classification_report
# Display names for labels 0, 1 and 2, in that order.
target_names = ['class 0', 'class 1', 'class 2']

# Toy labels.
y_true = [0, 1, 2, 2, 0]
y_pred = [0, 0, 2, 1, 0]

# Per-class precision / recall / f1 / support table as text.
report_text = classification_report(y_true, y_pred, target_names=target_names)
print(report_text)

              precision    recall  f1-score   support

     class 0       0.67      1.00      0.80         2
     class 1       0.00      0.00      0.00         1
     class 2       1.00      0.50      0.67         2

    accuracy                           0.60         5
   macro avg       0.56      0.50      0.49         5
weighted avg       0.67      0.60      0.59         5



### 7.6.3 회귀 문제에서의 성능 평가

- **Mean Absolute Error**

In [12]:
from sklearn.metrics import mean_absolute_error
# Toy regression targets and predictions; absolute errors are 0.5, 0.5, 0, 1.
y_pred = [2.5, 0.0, 2, 8]
y_true = [3, -0.5, 2, 7]

mae = mean_absolute_error(y_true, y_pred)
print(mae)

0.5


- **Mean Squared Error(MSE)**

In [13]:
from sklearn.metrics import mean_squared_error
# Toy regression targets and predictions; squared errors are 0.25, 0.25, 0, 1.
y_pred = [2.5, 0.0, 2, 8]
y_true = [3, -0.5, 2, 7]

mse = mean_squared_error(y_true, y_pred)
print(mse)

0.375


- **r2 score**

In [14]:
from sklearn.metrics import r2_score
# Toy regression targets and predictions.
y_pred = [2.5, 0.0, 2, 8]
y_true = [3, -0.5, 2, 7]

# Coefficient of determination of the predictions against the targets.
r2 = r2_score(y_true, y_pred)
print(r2)

0.9486081370449679


### 7.6.4 군집 문제에서의 성능 평가

- **실루엣 스코어(Silhouette score)**

In [15]:
from sklearn.metrics import silhouette_score
# Five 2-D points and the cluster label assigned to each of them.
labels = [0, 1, 0, 1, 0]
X = [[1, 2], [4, 5], [2, 1], [6, 7], [2, 3]]

# Silhouette score of this labelling.
sil_score = silhouette_score(X, labels)
print(sil_score)

0.5789497702625118


# 8. 지도학습

## 8.3 k-nearest neighbor algorithm

In [16]:
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Load the iris data.
raw_iris = datasets.load_iris()

# Features / target
X, y = raw_iris.data, raw_iris.target

# Train / test split with a fixed seed.
X_tn, X_te, y_tn, y_te = train_test_split(X, y, random_state=0)

# Standardization: fit on the training split, apply to both splits.
std_scale = StandardScaler()
X_tn_std = std_scale.fit_transform(X_tn)
X_te_std = std_scale.transform(X_te)

# Training: k-nearest neighbours with k = 2.
clf_knn = KNeighborsClassifier(n_neighbors=2).fit(X_tn_std, y_tn)

# Prediction on the test split.
knn_pred = clf_knn.predict(X_te_std)
print(knn_pred)

# Accuracy
accuracy = accuracy_score(y_te, knn_pred)
print(accuracy)

# Confusion matrix
conf_matrix = confusion_matrix(y_te, knn_pred)
print(conf_matrix)

# Classification report
class_report = classification_report(y_te, knn_pred)
print(class_report)

[2 1 0 2 0 2 0 1 1 1 1 1 1 1 1 0 1 1 0 0 2 1 0 0 2 0 0 1 1 0 2 1 0 2 2 1 0
 2]
0.9473684210526315
[[13  0  0]
 [ 0 15  1]
 [ 0  1  8]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       0.94      0.94      0.94        16
           2       0.89      0.89      0.89         9

    accuracy                           0.95        38
   macro avg       0.94      0.94      0.94        38
weighted avg       0.95      0.95      0.95        38



## 8.4 선형 회귀 분석

In [17]:
# 전체 코드
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# Load the data.
# NOTE(review): this call appears to be what emits the warning captured in
# the cell output, which discourages use of this dataset.
raw_boston = datasets.load_boston()

# Features / target
X, y = raw_boston.data, raw_boston.target

# Train / test split with a fixed seed.
X_tn, X_te, y_tn, y_te = train_test_split(X, y, random_state=1)

# Standardization: fit on the training split, apply to both splits.
std_scale = StandardScaler()
X_tn_std = std_scale.fit_transform(X_tn)
X_te_std = std_scale.transform(X_te)

# The four models compared in this cell.
clf_lr = LinearRegression()                            # plain least squares
clf_ridge = Ridge(alpha=1)                             # L2 penalty
clf_lasso = Lasso(alpha=0.01)                          # L1 penalty
clf_elastic = ElasticNet(alpha=0.01, l1_ratio=0.01)    # elastic net

# Fit each model in turn and print its coefficients, then its intercept.
for regressor in (clf_lr, clf_ridge, clf_lasso, clf_elastic):
    regressor.fit(X_tn_std, y_tn)
    print(regressor.coef_)
    print(regressor.intercept_)

# Predictions on the test split.
pred_lr = clf_lr.predict(X_te_std)
pred_ridge = clf_ridge.predict(X_te_std)
pred_lasso = clf_lasso.predict(X_te_std)
pred_elastic = clf_elastic.predict(X_te_std)
test_predictions = (pred_lr, pred_ridge, pred_lasso, pred_elastic)

# Evaluation: R-squared for every model, in the order above.
for test_pred in test_predictions:
    print(r2_score(y_te, test_pred))

# Evaluation: MSE for every model, in the same order.
for test_pred in test_predictions:
    print(mean_squared_error(y_te, test_pred))

[-1.07145146  1.34036243  0.26298069  0.66554537 -2.49842551  1.97524314
  0.19516605 -3.14274974  2.66736136 -1.80685572 -2.13034748  0.56172933
 -4.03223518]
22.344591029023768
[-1.05933451  1.31050717  0.23022789  0.66955241 -2.45607567  1.99086611
  0.18119169 -3.09919804  2.56480813 -1.71116799 -2.12002592  0.56264409
 -4.00942448]
22.344591029023768
[-1.04326518  1.27752711  0.1674367   0.66758228 -2.41559964  1.99244179
  0.14733958 -3.09473711  2.46431135 -1.60552274 -2.11046422  0.55200229
 -4.00809905]
22.344591029023768
[-1.02916603  1.23681955  0.15236504  0.67859622 -2.34646781  2.02965524
  0.14575132 -2.98592423  2.32013379 -1.48829485 -2.09271972  0.56506801
 -3.9495281 ]
22.344591029023768
0.7789410172622858
0.7789704562726604
0.7787621490259894
0.7787876079239252
21.89776539604949
21.894849212618755
21.91548381050483
21.912961890936874



    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_h

## 8.5 로지스틱 회귀

In [18]:
# 전체 코드
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Load the breast cancer data.
raw_cancer = datasets.load_breast_cancer()

# Features / target
X, y = raw_cancer.data, raw_cancer.target

# Train / test split with a fixed seed.
X_tn, X_te, y_tn, y_te = train_test_split(X, y, random_state=0)

# Standardization: fit on the training split, apply to both splits.
std_scale = StandardScaler()
X_tn_std = std_scale.fit_transform(X_tn)
X_te_std = std_scale.transform(X_te)

# Logistic regression with an L2 penalty.
clf_logi_l2 = LogisticRegression(penalty='l2').fit(X_tn_std, y_tn)

# Estimated coefficients and intercept of the fitted model.
print(clf_logi_l2.coef_)
print(clf_logi_l2.intercept_)

# Predicted class labels for the test split.
pred_logistic = clf_logi_l2.predict(X_te_std)
print(pred_logistic)

# Predicted class probabilities for the test split.
pred_proba = clf_logi_l2.predict_proba(X_te_std)
print(pred_proba)

# Precision
precision = precision_score(y_te, pred_logistic)
print(precision)

# Confusion matrix
conf_matrix = confusion_matrix(y_te, pred_logistic)
print(conf_matrix)

# Classification report
class_report = classification_report(y_te, pred_logistic)
print(class_report)

[[-0.29792942 -0.58056355 -0.3109406  -0.377129   -0.11984232  0.42855478
  -0.71131106 -0.85371164 -0.46688191  0.11762548 -1.38262136  0.0899184
  -0.94778563 -0.94686238  0.18575731  0.99305313  0.11090349 -0.3458275
   0.20290919  0.80470317 -0.91626377 -0.91726667 -0.8159834  -0.86539197
  -0.45539191  0.10347391 -0.83009341 -0.98445173 -0.5920036  -0.61086989]]
[0.02713751]
[0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 0 0 0 0 0 1 1 0 1 1 0 1 0 1 0 1 0 1 0 1
 0 1 0 0 1 0 1 1 0 1 1 1 0 0 0 0 1 1 1 1 1 1 0 0 0 1 1 0 1 0 0 0 1 1 0 1 0
 0 1 1 1 1 1 0 0 0 1 0 1 1 1 0 0 1 0 0 0 1 1 0 1 1 1 1 1 1 1 0 1 0 1 1 1 1
 0 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 1 1 0 0 0 1 1 1 0]
[[9.98638613e-01 1.36138656e-03]
 [3.95544804e-02 9.60445520e-01]
 [1.30896362e-03 9.98691036e-01]
 [1.24473354e-02 9.87552665e-01]
 [2.44132101e-04 9.99755868e-01]
 [4.50491513e-03 9.95495085e-01]
 [1.13985968e-04 9.99886014e-01]
 [1.82475894e-03 9.98175241e-01]
 [9.67965506e-05 9.99903203e-01]
 [1.75222878e-06 9.99998248e-

## 8.6 나이브 베이즈

In [22]:
# 전체 코드
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Load the wine data.
raw_wine = datasets.load_wine()

# Features / target
X, y = raw_wine.data, raw_wine.target

# Train / test split with a fixed seed.
X_tn, X_te, y_tn, y_te = train_test_split(X, y, random_state=0)

# Standardization: fit on the training split, apply to both splits.
std_scale = StandardScaler()
X_tn_std = std_scale.fit_transform(X_tn)
X_te_std = std_scale.transform(X_te)

# Train Gaussian naive Bayes.
clf_gnb = GaussianNB().fit(X_tn_std, y_tn)

# Prediction on the test split.
pred_gnb = clf_gnb.predict(X_te_std)
print(pred_gnb)

# 리콜
# average 인수 지정하지 않으면 아래의 에러가 발생 
'''
ValueError: Target is multiclass but average='binary'. 
Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].

# average 인수 
average{‘micro’, ‘macro’, ‘samples’, ‘weighted’, ‘binary’} or None, default=’binary’
This parameter is required for multiclass/multilabel targets. If None, the scores 
for each class are returned. Otherwise, this determines the type of averaging performed on the data:

'binary':
Only report results for the class specified by pos_label. 
This is applicable only if targets (y_{true,pred}) are binary.

'micro':
Calculate metrics globally by counting the total true positives, 
false negatives and false positives.

'macro':
Calculate metrics for each label, and find their unweighted mean. 
This does not take label imbalance into account.

'weighted':
Calculate metrics for each label, and find their average weighted by support 
(the number of true instances for each label). 
This alters ‘macro’ to account for label imbalance; 
it can result in an F-score that is not between precision and recall.
Weighted recall is equal to accuracy.

'samples':
Calculate metrics for each instance, and find their average 
(only meaningful for multilabel classification where this differs from accuracy_score).
'''
# Recall with micro averaging (an explicit `average` is needed because the
# target is multiclass).
recall = recall_score(y_te, pred_gnb, average='micro')
print(recall)

# Confusion matrix
conf_matrix = confusion_matrix(y_te, pred_gnb)
print(conf_matrix)

# Classification report
class_report = classification_report(y_te, pred_gnb)
print(class_report)

[0 2 1 0 1 1 0 2 1 1 2 2 0 0 2 1 0 0 2 0 0 0 0 1 1 1 1 1 1 2 0 0 1 0 0 0 2
 1 1 2 0 0 1 1 1]
0.9333333333333333
[[16  0  0]
 [ 2 18  1]
 [ 0  0  8]]
              precision    recall  f1-score   support

           0       0.89      1.00      0.94        16
           1       1.00      0.86      0.92        21
           2       0.89      1.00      0.94         8

    accuracy                           0.93        45
   macro avg       0.93      0.95      0.94        45
weighted avg       0.94      0.93      0.93        45



## 8.7 의사결정 나무

In [25]:
# 전체 코드
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn import tree

from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Load the wine data.
raw_wine = datasets.load_wine()

# Features / target
X, y = raw_wine.data, raw_wine.target

# Train / test split with a fixed seed.
X_tn, X_te, y_tn, y_te = train_test_split(X, y, random_state=0)

# Standardization: fit on the training split, apply to both splits.
std_scale = StandardScaler()
X_tn_std = std_scale.fit_transform(X_tn)
X_te_std = std_scale.transform(X_te)

# Train a decision tree (seeded).
clf_tree = tree.DecisionTreeClassifier(random_state=0).fit(X_tn_std, y_tn)

# Prediction on the test split.
pred_tree = clf_tree.predict(X_te_std)
print(pred_tree)

# F1 score with micro averaging.
f1 = f1_score(y_te, pred_tree, average='micro')
print(f1)

# Confusion matrix
conf_matrix = confusion_matrix(y_te, pred_tree)
print(conf_matrix)

# Classification report
class_report = classification_report(y_te, pred_tree)
print(class_report)

[0 2 1 0 1 1 0 2 1 1 2 2 0 1 2 1 0 0 2 0 1 0 1 1 1 1 1 1 1 2 0 0 1 0 0 0 2
 1 1 2 1 0 1 1 1]
0.9333333333333333
[[14  2  0]
 [ 0 20  1]
 [ 0  0  8]]
              precision    recall  f1-score   support

           0       1.00      0.88      0.93        16
           1       0.91      0.95      0.93        21
           2       0.89      1.00      0.94         8

    accuracy                           0.93        45
   macro avg       0.93      0.94      0.93        45
weighted avg       0.94      0.93      0.93        45



## 8.8 서포트 벡터 머신(Support Vector Machine)

In [26]:
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn import svm

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Load the wine data.
raw_wine = datasets.load_wine()

# Features / target
X, y = raw_wine.data, raw_wine.target

# Train / test split with a fixed seed.
X_tn, X_te, y_tn, y_te = train_test_split(X, y, random_state=0)

# Standardization: fit on the training split, apply to both splits.
std_scale = StandardScaler()
X_tn_std = std_scale.fit_transform(X_tn)
X_te_std = std_scale.transform(X_te)

# Train a support vector classifier with a linear kernel.
clf_svm_lr = svm.SVC(kernel='linear', random_state=0).fit(X_tn_std, y_tn)

# Prediction on the test split.
pred_svm = clf_svm_lr.predict(X_te_std)
print(pred_svm)

# Accuracy
accuracy = accuracy_score(y_te, pred_svm)
print(accuracy)

# Confusion matrix
conf_matrix = confusion_matrix(y_te, pred_svm)
print(conf_matrix)

# Classification report
class_report = classification_report(y_te, pred_svm)
print(class_report)

[0 2 1 0 1 1 0 2 1 1 2 2 0 1 2 1 0 0 1 0 1 0 0 1 1 1 1 1 1 2 0 0 1 0 0 0 2
 1 1 2 0 0 1 1 1]
1.0
[[16  0  0]
 [ 0 21  0]
 [ 0  0  8]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        16
           1       1.00      1.00      1.00        21
           2       1.00      1.00      1.00         8

    accuracy                           1.00        45
   macro avg       1.00      1.00      1.00        45
weighted avg       1.00      1.00      1.00        45



## 8.9 크로스 밸리데이션(Cross validation)

In [33]:
# 전체 코드
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn import svm
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

import numpy as np
import pandas as pd

from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Load the wine data.
raw_wine = datasets.load_wine()

# Features / target
X = raw_wine.data
y = raw_wine.target

# Train / test split. The seed is fixed so that the split, and therefore
# every number printed below, is the same on each run.
X_tn, X_te, y_tn, y_te = train_test_split(X, y, random_state=0)

# Standardization: fit on the training split, apply to both splits.
std_scale = StandardScaler()
std_scale.fit(X_tn)
X_tn_std = std_scale.transform(X_tn)
X_te_std = std_scale.transform(X_te)

# Grid search over kernel and C, scored by accuracy on a seeded, shuffled
# 5-fold stratified split of the training data.
param_grid = {'kernel': ('linear', 'rbf'), 'C': [0.5, 1, 10, 100]}
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
svc = svm.SVC(random_state=0)
grid_cv = GridSearchCV(svc, param_grid, cv=kfold, scoring='accuracy')
grid_cv.fit(X_tn_std, y_tn)

# Grid search results: the raw dict, then the same data as a transposed table.
# Intermediate results are printed explicitly because a bare expression is
# only displayed by the notebook when it is the last statement of the cell.
print(grid_cv.cv_results_)
print(np.transpose(pd.DataFrame(grid_cv.cv_results_)))

# Best score
print(grid_cv.best_score_)

# Best hyperparameters
print(grid_cv.best_params_)

# Final model
clf = grid_cv.best_estimator_
print(clf)

# Cross-validation scores (1): several metrics at once.
metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
cv_scores = cross_validate(clf, X_tn_std, y_tn, cv=kfold, scoring=metrics)
print(cv_scores)

# Cross-validation scores (2): accuracy only, with mean and spread.
from sklearn.model_selection import cross_val_score
cv_score = cross_val_score(clf, X_tn_std, y_tn, cv=kfold, scoring='accuracy')

print(cv_score)
print(cv_score.mean())
print(cv_score.std())

# Prediction on the held-out test split.
pred_svm = clf.predict(X_te_std)
print(pred_svm)

# Accuracy
accuracy = accuracy_score(y_te, pred_svm)
print(accuracy)

# Confusion matrix
conf_matrix = confusion_matrix(y_te, pred_svm)
print(conf_matrix)

# Classification report
class_report = classification_report(y_te, pred_svm)
print(class_report)

{'mean_fit_time': array([0.00019984, 0.00060463, 0.00080023, 0.00019999, 0.0004003 ,
       0.00100021, 0.00079699, 0.00100007]), 'std_fit_time': array([3.99684906e-04, 4.93745087e-04, 4.00115551e-04, 3.99971008e-04,
       4.90271296e-04, 1.97412887e-06, 3.98527204e-04, 1.90734863e-07]), 'mean_score_time': array([0.        , 0.00039945, 0.        , 0.0001996 , 0.00019979,
       0.        , 0.00019994, 0.        ]), 'std_score_time': array([0.        , 0.00048922, 0.        , 0.00039921, 0.00039959,
       0.        , 0.00039988, 0.        ]), 'param_C': masked_array(data=[0.5, 0.5, 1, 1, 10, 10, 100, 100],
             mask=[False, False, False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_kernel': masked_array(data=['linear', 'rbf', 'linear', 'rbf', 'linear', 'rbf',
                   'linear', 'rbf'],
             mask=[False, False, False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params':

# 9. 앙상블 학습

## 9.2 보팅

In [34]:
# 전체 코드
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Load the iris data.
raw_iris = datasets.load_iris()

# Features / target
X, y = raw_iris.data, raw_iris.target

# Train / test split with a fixed seed.
X_tn, X_te, y_tn, y_te = train_test_split(X, y, random_state=0)

# Standardization: fit on the training split, apply to both splits.
std_scale = StandardScaler()
X_tn_std = std_scale.fit_transform(X_tn)
X_te_std = std_scale.transform(X_te)

# Base learners for the voting ensemble.
clf1 = LogisticRegression(multi_class='multinomial', random_state=1)
clf2 = svm.SVC(kernel='linear', random_state=1)
clf3 = GaussianNB()

# Hard voting over the three learners, each with weight 1.
voting_members = [('lr', clf1), ('svm', clf2), ('gnb', clf3)]
clf_voting = VotingClassifier(
    estimators=voting_members,
    voting='hard',
    weights=[1, 1, 1],
)
clf_voting.fit(X_tn_std, y_tn)

# Prediction on the test split.
pred_voting = clf_voting.predict(X_te_std)
print(pred_voting)

# Accuracy
accuracy = accuracy_score(y_te, pred_voting)
print(accuracy)

# Confusion matrix
conf_matrix = confusion_matrix(y_te, pred_voting)
print(conf_matrix)

# Classification report
class_report = classification_report(y_te, pred_voting)
print(class_report)

[2 1 0 2 0 2 0 1 1 1 2 1 1 1 1 0 1 1 0 0 2 1 0 0 2 0 0 1 1 0 2 1 0 2 2 1 0
 2]
0.9736842105263158
[[13  0  0]
 [ 0 15  1]
 [ 0  0  9]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       1.00      0.94      0.97        16
           2       0.90      1.00      0.95         9

    accuracy                           0.97        38
   macro avg       0.97      0.98      0.97        38
weighted avg       0.98      0.97      0.97        38



## 9.3 배깅과 랜덤 포레스트

In [None]:
# 전체 코드
from sklearn import datasets
from sklearn.model_selection import train_test_split
# StandardScaler is a preprocessing transformer; it is imported from
# sklearn.preprocessing, as in the other cells of this notebook.
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Load the wine data.
raw_wine = datasets.load_wine()

