# 作業
請使用不同的資料集，並使用 hyper-parameter search 的方式，看能不能找出最佳的超參數組合

In [1]:
from sklearn import datasets, metrics
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
from pprint import pprint
import numpy as np
import warnings
warnings.simplefilter('ignore')

In [3]:
# 讀取資料集
diabetes = datasets.load_breast_cancer()

# 切分訓練集/測試集
x_train, x_test, y_train, y_test = train_test_split(diabetes.data, diabetes.target,
                                                    test_size=0.25, random_state=42)

# 建立模型
clf = GradientBoostingClassifier(random_state = 42)

In [4]:
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print(metrics.accuracy_score(y_test, y_pred))

0.958041958041958


In [5]:
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 50, stop = 200, num = 10)]
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(1, 15, num = 11)]
max_depth.append(None)

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_depth': max_depth}
pprint(random_grid)

{'max_depth': [1, 2, 3, 5, 6, 8, 9, 10, 12, 13, 15, None],
 'n_estimators': [50, 66, 83, 100, 116, 133, 150, 166, 183, 200]}


Random search

In [6]:
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = clf, param_distributions = random_grid, n_iter = 100, 
                               cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(x_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    8.2s finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_sampl...      subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=100, n_jobs=-1,
          param_distributions={'n_estimators': [50, 66, 83, 100, 116, 133, 150, 166, 183, 200], 'max_depth': [1, 2, 3, 5, 6, 8, 9, 10, 12, 13, 15, None]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=2)

In [7]:
rf_random.best_params_

{'n_estimators': 150, 'max_depth': 1}

Random search 評估

In [8]:
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    # 回歸
    # errors = abs(predictions - test_labels)
    # mape = 100 * np.mean(errors / test_labels)
    # accuracy = 100 - mape
    # 分類
    accuracy = metrics.accuracy_score(predictions, test_labels)
    print('Model Performance')
    # 回歸
    # print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    
    return accuracy

In [9]:
base_model = GradientBoostingClassifier(random_state = 42)
base_model.fit(x_train, y_train)
base_accuracy = evaluate(base_model, x_test, y_test)

Model Performance
Accuracy = 0.96%.


In [10]:
best_random = rf_random.best_estimator_
random_accuracy = evaluate(best_random, x_test, y_test)

Model Performance
Accuracy = 0.97%.
