In [None]:
# --- 第 1 部分 ---
# 載入函式庫與資料
from sklearn.datasets import load_digits
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn import metrics
import numpy as np
digits = load_digits()

# Seed NumPy's global RNG; presumably this is what keeps the randomized
# estimators in later cells repeatable, since they set no random_state.
np.random.seed(1)
train_size = 1500

# Ordered hold-out split (no shuffling): the first `train_size` rows are
# used for training and the remaining rows for testing.
features, labels = digits.data, digits.target
train_x, test_x = features[:train_size], features[train_size:]
train_y, test_y = labels[:train_size], labels[train_size:]


In [None]:
# --- Part 2 ---
# Build the ensemble model
ensemble_size = 10
# The base learner is passed positionally rather than by keyword: the
# keyword was renamed from `base_estimator` to `estimator` in
# scikit-learn 1.2 and the old name was removed in 1.4, whereas the first
# positional parameter is the base learner in every release.
# oob_score = True asks the ensemble to also compute an out-of-bag score
# during fit, exposed afterwards as `ensemble.oob_score_`.
ensemble = BaggingClassifier(DecisionTreeClassifier(),
                             n_estimators = ensemble_size,
                             oob_score = True)


In [None]:
# --- Part 3 ---
# Train the bagging ensemble on the training split.
# Left as a bare expression so the notebook still displays the fitted
# estimator as the cell's output.
ensemble.fit(X = train_x, y = train_y)


In [None]:
# --- Part 4 ---
# Evaluate the model: predict labels for the held-out test split, then
# score those predictions against the true test labels.
ensemble_predictions = ensemble.predict(test_x)

ensemble_acc = metrics.accuracy_score(y_true = test_y,
                                      y_pred = ensemble_predictions)


In [None]:
# --- Part 5 ---
# Report the test-set accuracy next to the out-of-bag score computed
# during fit; both are printed with two decimals.
print('Bagging: %.2f' % ensemble_acc)
# Label corrected from 'Out-of_bag' (mixed hyphen/underscore typo).
print('Out-of-bag: %.2f' % ensemble.oob_score_)

In [None]:
from sklearn.model_selection import validation_curve
import matplotlib.pyplot as plt

# Compute training and validation accuracy for each candidate ensemble
# size (odd values 1, 3, ..., 37) using 10-fold cross-validation.
param_range = list(range(1, 39, 2))
train_scores, test_scores = validation_curve(
    ensemble, train_x, train_y,
    param_name = 'n_estimators',
    param_range = param_range,
    cv = 10,
    scoring = "accuracy")

# For every hyperparameter value, summarise the scores along axis 1 with
# their mean and standard deviation.
train_scores_mean = train_scores.mean(axis = 1)
train_scores_std = train_scores.std(axis = 1)
test_scores_mean = test_scores.mean(axis = 1)
test_scores_std = test_scores.std(axis = 1)

fig, ax = plt.subplots(figsize = (8, 8))
ax.set_title('Validation curves')

# Shaded bands: one standard deviation on either side of each mean
ax.fill_between(param_range,
                train_scores_mean - train_scores_std,
                train_scores_mean + train_scores_std,
                alpha = 0.1, color = "C1")
ax.fill_between(param_range,
                test_scores_mean - test_scores_std,
                test_scores_mean + test_scores_std,
                alpha = 0.1, color = "C0")

# Mean curves
ax.plot(param_range, train_scores_mean, 'o-', color = "C1",
        label = "Training score")
ax.plot(param_range, test_scores_mean, 'o-', color = "C0",
        label = "Cross-validation score")

# One tick per evaluated ensemble size
ax.set_xticks(param_range)
ax.set_xlabel('Number of base learner')
ax.set_ylabel('Accuracy')
ax.legend(loc = "best")
plt.show()
