## [作業重點]
了解如何使用 Sklearn 中的 hyper-parameter search 找出最佳的超參數

### 作業
請使用不同的資料集，並使用 hyper-parameter search 的方式，看能不能找出最佳的超參數組合

In [2]:
from sklearn import datasets, metrics
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
import numpy as np 
import pandas as pd

In [34]:
train_df = pd.read_csv('../data/data-science-london-scikit-learn/train.csv',header = None)
train_label = pd.read_csv('../data/data-science-london-scikit-learn/trainLabels.csv',header = None)
test_df = pd.read_csv('../data/data-science-london-scikit-learn/test.csv',header = None)

In [23]:
print("shape of training set:{}".format(train_df.shape))
print("shape of training label set:{}".format(train_label.shape))
print("shape of testing set:{}".format(test_df.shape))

shape of training set:(1000, 40)
shape of training label set:(1000, 1)
shape of testing set:(9000, 40)


In [19]:
# get column data types, seems all are number
train_df.dtypes

0     float64
1     float64
2     float64
3     float64
4     float64
5     float64
6     float64
7     float64
8     float64
9     float64
10    float64
11    float64
12    float64
13    float64
14    float64
15    float64
16    float64
17    float64
18    float64
19    float64
20    float64
21    float64
22    float64
23    float64
24    float64
25    float64
26    float64
27    float64
28    float64
29    float64
30    float64
31    float64
32    float64
33    float64
34    float64
35    float64
36    float64
37    float64
38    float64
39    float64
dtype: object

In [20]:
# check if there is any missing value, no missing value
train_df.isnull().sum()

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
22    0
23    0
24    0
25    0
26    0
27    0
28    0
29    0
30    0
31    0
32    0
33    0
34    0
35    0
36    0
37    0
38    0
39    0
dtype: int64

In [44]:
# 建立模型
clf = GradientBoostingClassifier(random_state=7)
lable = train_label.iloc[:,0]
x_train, x_test, y_train, y_test = train_test_split(train_df, lable, test_size = 0.2, random_state = 2019)
print("type of train_label={}, shape of={}".format(type(train_label), train_label.shape))
print("type of lable={}, shape of={}".format(type(lable), lable.shape))
print("type of x_train={}, shape of={}".format(type(x_train), x_train.shape))
print("type of y_train={}, shape of={}".format(type(y_train), y_train.shape))
print("type of x_test={}, shape of={}".format(type(x_test), x_test.shape))
print("type of y_test={}, shape of={}".format(type(y_test), y_test.shape))



clf.fit(x_train, y_train)
pred = clf.predict(x_test)

type of train_label=<class 'pandas.core.frame.DataFrame'>, shape of=(1000, 1)
type of lable=<class 'pandas.core.series.Series'>, shape of=(1000,)
type of x_train=<class 'pandas.core.frame.DataFrame'>, shape of=(800, 40)
type of y_train=<class 'pandas.core.series.Series'>, shape of=(800,)
type of x_test=<class 'pandas.core.frame.DataFrame'>, shape of=(200, 40)
type of y_test=<class 'pandas.core.series.Series'>, shape of=(200,)


In [46]:
acc = metrics.accuracy_score(y_test, pred)
print("Acuuracy: ", acc)

Acuuracy:  0.865


In [None]:
train_x = total_df[:len(train_df)]
test_x = total_df[len(train_df):]

clf = GradientBoostingClassifier()

x_train, x_test, y_train, y_test = train_test_split(train_x,train_label, test_size = 0.2, random_state = 2019)
clf.fit(x_train, y_train)
pred = clf.predict(x_test)

print(f' Accuracy with non-tuned classifier : {metrics.accuracy_score(pred,y_test)}')

In [47]:
# 設定要訓練的超參數組合
n_estimators = [50, 100, 150]
max_depth = [1, 3, 5]
param_grid = dict(n_estimators=n_estimators, max_depth=max_depth)

## 建立搜尋物件，放入模型及參數組合字典 (n_jobs=-1 會使用全部 cpu 平行運算)
grid_search = GridSearchCV(reg, param_grid, scoring="neg_mean_squared_error", n_jobs=-1, verbose=1)

# 開始搜尋最佳參數
grid_result = grid_search.fit(x_train, y_train)

# 預設會跑 3-fold cross-validadtion，總共 9 種參數組合，總共要 train 27 次模型

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:   11.2s finished


In [48]:
# 印出最佳結果與最佳參數
print("Best Accuracy: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Best Accuracy: -0.135000 using {'max_depth': 5, 'n_estimators': 150}


In [49]:
# 使用最佳參數重新建立模型
reg_bestparam = GradientBoostingClassifier(max_depth=grid_result.best_params_['max_depth'],
                                           n_estimators=grid_result.best_params_['n_estimators'])

# 訓練模型
reg_bestparam.fit(x_train, y_train)

# 預測測試集
y_pred = reg_bestparam.predict(x_test)

In [50]:
# 調整參數後accu可提升一些些
acc = metrics.accuracy_score(y_test, y_pred)
print("Acuuracy: ", acc)

Acuuracy:  0.88


In [62]:
y_pred.shape

(200,)

In [59]:
type(test_df)

pandas.core.frame.DataFrame

In [82]:
# predict original test set
result = reg_bestparam.predict(test_df)
result.shape
#output = pd.DataFrame({'Id' : np.arange(1,len(test_df)+1,1), 'Solution' : result}).drop([0], axis=0)
output = pd.DataFrame({'Id' : np.arange(1,len(test_df)+1,1), 'Solution' : result})
output.head(10)
output.to_csv("submission20190712001.csv", index=False)
