# 評估模型

In [1]:
# 導入所需套件
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder  #非數值欄位轉數值
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split  #model_selection 做模型評估
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score  #交叉驗證

from sklearn.decomposition import PCA  #主成分分析
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC  #利用支持向量機進行分類

In [5]:
# Load the breast-cancer dataset. The file has no header row (header=None),
# so pandas labels the columns with integers.
df = pd.read_csv('wdbc.data', header=None)

# Preview the first rows to confirm the file parsed as expected.
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


乳癌資料集有下列屬性：
1）ID 編號（樣本識別碼）
2）診斷（M =惡性，B =良性）

為每個細胞核計算十個實值特徵：
a）半徑（從中心到周長上的點的距離的平均值）
b）紋理（灰度值的標準偏差）
c）周長
d）面積
e）平滑度（半徑長度的局部變化）
f）緊湊度（$\text{周長}^2 / \text{面積} - 1.0$）
g）凹度（輪廓凹部的嚴重程度）
h）凹點（輪廓的凹部分的數量）
i）對稱
j）分形維數（“海岸線近似” − 1）

為每個圖像計算這些特徵的平均值、標準誤（standard error）和“最差”或最大值（三個最大值的平均值），從而得到30個特徵。 例如，欄位3是平均半徑，欄位13是半徑標準誤，欄位23是最差半徑。

所有值都編碼成四位有效數字。

In [7]:
# Separate features from labels and encode the diagnosis as integers:
#   - columns 2 onward are the numeric features,
#   - column 1 holds the diagnosis strings, which LabelEncoder maps to integers.
le = LabelEncoder()
X = df.loc[:, 2:].to_numpy()
y = le.fit_transform(df.loc[:, 1].to_numpy())

# Display the classes the encoder learned; position in this array is the code.
le.classes_

array(['B', 'M'], dtype=object)

In [10]:
# Sanity check of the encoding: 'M' (malignant) is encoded as 1 and
# 'B' (benign) as 0.
le.transform(['M', 'B'])

array([1, 0], dtype=int64)

In [11]:
# Hold out 20% of the samples as the test set. stratify=y makes the split
# stratified on the class labels, and the fixed random_state makes the
# split reproducible.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y,
                                     test_size=0.20,
                                     stratify=y,
                                     random_state=1)

### 建立管道

![](images/06_01.png)

In [14]:
# Chain preprocessing and the classifier into one estimator:
#   1. StandardScaler     - standardize every feature,
#   2. PCA                - keep the first two principal components,
#   3. LogisticRegression - binary classifier (the diagnosis is either 0 or 1).
pipe_lr = make_pipeline(
    StandardScaler(),
    PCA(n_components=2),
    LogisticRegression(random_state=1, solver='lbfgs'),
)

# Train on the training split. The call is the last expression of the cell,
# so the fitted pipeline is displayed as the cell output.
pipe_lr.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('pca', PCA(n_components=2)),
                ('logisticregression', LogisticRegression(random_state=1))])

# 評估模型效能方式有下列：

## 留出法(The holdout method)

留出一定比例的資料作為測試集。在剩餘的資料上訓練模型，然後在測試集上評估模型。如前所述，為了防止資訊洩露，你不能基於測試集來調節模型，所以還應該保留一個驗證集。留出驗證（hold-out validation）的示意圖如下：

![](images/06_02.png)

In [15]:
# Hold-out evaluation: predict on the test split and report the accuracy
# returned by the pipeline's score() method.
y_pred = pipe_lr.predict(X_test)
test_accuracy = pipe_lr.score(X_test, y_test)
print('Test Accuracy: %.3f' % test_accuracy)

Test Accuracy: 0.956


## K-折交叉驗證

將訓練集分割成k個子樣本，一個單獨的子樣本被保留作為驗證模型的數據，其他k − 1個樣本用來訓練。交叉驗證重複k次，每個子樣本驗證一次，平均k次的結果或者使用其它結合方式，最終得到一個單一估測。這個方法的優勢在於，同時重複運用隨機產生的子樣本進行訓練和驗證，且每個子樣本恰好被驗證一次。10折交叉驗證是最常用的。

![](images/06_03.png)

In [16]:
# Manual 10-fold cross-validation on the training split.
# NOTE: pipe_lr is refit in place on every fold, so after this cell it holds
# the model trained on the last fold's training rows.
kfold = KFold(n_splits=10).split(X_train, y_train)

scores = []
for k, (train_idx, val_idx) in enumerate(kfold):
    # Rows used to fit the pipeline in this fold.
    X_fit, y_fit = X_train[train_idx], y_train[train_idx]
    pipe_lr.fit(X_fit, y_fit)

    # Accuracy on the held-out rows of this fold.
    score = pipe_lr.score(X_train[val_idx], y_train[val_idx])
    scores.append(score)

    # Per-class sample counts of the rows the model was fitted on.
    class_dist = np.bincount(y_fit)
    print('Fold: %2d, Class dist.: %s, Acc: %.3f' % (k+1, class_dist, score))

# Mean and standard deviation of the per-fold accuracies.
print('\nCV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

Fold:  1, Class dist.: [255 154], Acc: 0.935
Fold:  2, Class dist.: [252 157], Acc: 0.935
Fold:  3, Class dist.: [256 153], Acc: 0.978
Fold:  4, Class dist.: [257 152], Acc: 0.957
Fold:  5, Class dist.: [258 151], Acc: 0.935
Fold:  6, Class dist.: [255 155], Acc: 0.933
Fold:  7, Class dist.: [257 153], Acc: 0.978
Fold:  8, Class dist.: [262 148], Acc: 0.933
Fold:  9, Class dist.: [255 155], Acc: 0.956
Fold: 10, Class dist.: [258 152], Acc: 0.956

CV accuracy: 0.949 +/- 0.017


## 帶有打亂數據的重複K 折驗證

多次使用K 折驗證，在每次將資料劃分為K 個分區之前都先將資料打亂。
最終分數是每次K 折驗證分數的平均值。
這種方法一共要訓練和評估 P×K 個模型（P 為重複次數），計算代價很大。

In [17]:
# Repeated 10-fold cross-validation: the 10-fold split is repeated 10 times
# with different randomization in each repetition, giving
# 10 x 10 = 100 train/validation splits in total.
# random_state pins the shuffling so the splits (and therefore the scores)
# are identical on every run; without it each run gives different numbers.
# n_repeats=10 is the library default, spelled out so the cost of this cell
# is visible.
# NOTE: pipe_lr is refit in place on every split, so after this cell it holds
# the model trained on the last split's training rows.
kfold = RepeatedKFold(n_splits=10,
                      n_repeats=10,
                      random_state=1).split(X_train, y_train)

scores = []
for k, (train, test) in enumerate(kfold):
    # Fit on this split's training rows, score on its held-out rows.
    pipe_lr.fit(X_train[train], y_train[train])
    score = pipe_lr.score(X_train[test], y_train[test])
    scores.append(score)
    print('Fold: %2d, Class dist.: %s, Acc: %.3f' % (k+1,
          np.bincount(y_train[train]), score))

# Mean and standard deviation over all repeated-fold accuracies.
print('\nCV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

Fold:  1, Class dist.: [254 155], Acc: 0.935
Fold:  2, Class dist.: [257 152], Acc: 0.957
Fold:  3, Class dist.: [252 157], Acc: 0.978
Fold:  4, Class dist.: [253 156], Acc: 0.978
Fold:  5, Class dist.: [260 149], Acc: 1.000
Fold:  6, Class dist.: [257 153], Acc: 0.933
Fold:  7, Class dist.: [258 152], Acc: 0.911
Fold:  8, Class dist.: [259 151], Acc: 0.911
Fold:  9, Class dist.: [255 155], Acc: 0.911
Fold: 10, Class dist.: [260 150], Acc: 0.911
Fold: 11, Class dist.: [258 151], Acc: 0.957
Fold: 12, Class dist.: [253 156], Acc: 0.957
Fold: 13, Class dist.: [250 159], Acc: 0.957
Fold: 14, Class dist.: [253 156], Acc: 0.957
Fold: 15, Class dist.: [258 151], Acc: 0.848
Fold: 16, Class dist.: [261 149], Acc: 0.933
Fold: 17, Class dist.: [259 151], Acc: 0.889
Fold: 18, Class dist.: [259 151], Acc: 0.978
Fold: 19, Class dist.: [256 154], Acc: 0.956
Fold: 20, Class dist.: [258 152], Acc: 0.978
Fold: 21, Class dist.: [257 152], Acc: 0.891
Fold: 22, Class dist.: [259 150], Acc: 0.978
Fold: 23, 

## 巢狀交叉驗證的演算法選擇

巢狀交叉驗證分為外部迴圈和內部迴圈。在外部迴圈中，我們將資料分為訓練塊和測試塊；在內部迴圈中，再將訓練塊分為訓練塊和驗證塊，並以k折交叉驗證進行模型選擇（超參數調校）。外部迴圈的測試塊則只用來評估內部迴圈所選出的模型。

![](images/06_07.png)

In [18]:
# Nested cross-validation for an SVM.
# Inner loop: GridSearchCV with 2-fold CV picks the hyperparameters.
# Outer loop: cross_val_score with 5-fold CV scores the tuned estimator.
pipe_svc = make_pipeline(StandardScaler(), SVC(random_state=1))

# Candidate values used for both C and gamma.
param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]

# Two sub-grids: the linear kernel only tunes C, the RBF kernel tunes C and gamma.
linear_grid = {'svc__C': param_range,
               'svc__kernel': ['linear']}
rbf_grid = {'svc__C': param_range,
            'svc__gamma': param_range,
            'svc__kernel': ['rbf']}
param_grid = [linear_grid, rbf_grid]

gs = GridSearchCV(estimator=pipe_svc,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=2)

# Passing the grid search itself as the estimator makes the outer loop
# re-run the inner search on each outer training fold.
scores = cross_val_score(gs, X_train, y_train, scoring='accuracy', cv=5)

cv_mean = np.mean(scores)
cv_std = np.std(scores)
print('CV accuracy: %.3f +/- %.3f' % (cv_mean, cv_std))

CV accuracy: 0.974 +/- 0.015


In [19]:
# Nested cross-validation for a decision tree, using the same 2-fold inner /
# 5-fold outer layout as the SVM cell, so the two scores can be compared.
tree_grid = [{'max_depth': [1, 2, 3, 4, 5, 6, 7, None]}]

gs = GridSearchCV(estimator=DecisionTreeClassifier(random_state=0),
                  param_grid=tree_grid,
                  scoring='accuracy',
                  cv=2)

# Outer loop: score the tuned tree on each of the 5 outer folds.
scores = cross_val_score(gs, X_train, y_train, scoring='accuracy', cv=5)

cv_mean = np.mean(scores)
cv_std = np.std(scores)
print('CV accuracy: %.3f +/- %.3f' % (cv_mean, cv_std))

CV accuracy: 0.934 +/- 0.016
