In [69]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split, KFold, GridSearchCV 
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier

In [74]:
# 讀取資料集
data_path = 'data/'
train = pd.read_csv(data_path + 'train.csv', header=None)
trainLabels = pd.read_csv(data_path + 'trainLabels.csv', header=None)
test = pd.read_csv(data_path + 'test.csv', header=None)
MMEconder = MinMaxScaler()
train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
0,0.299403,-1.226624,1.498425,-1.17615,5.289853,0.208297,2.404498,1.594506,-0.051608,0.663234,...,-0.850465,-0.62299,-1.833057,0.293024,3.552681,0.717611,3.305972,-2.715559,-2.682409,0.10105
1,-1.174176,0.332157,0.949919,-1.285328,2.199061,-0.151268,-0.427039,2.619246,-0.765884,-0.09378,...,-0.81975,0.012037,2.038836,0.468579,-0.517657,0.422326,0.803699,1.213219,1.382932,-1.817761
2,1.192222,-0.414371,0.067054,-2.233568,3.658881,0.089007,0.203439,-4.219054,-1.184919,-1.24031,...,-0.604501,0.750054,-3.360521,0.856988,-2.751451,-1.582735,1.672246,0.656438,-0.932473,2.987436
3,1.57327,-0.580318,-0.866332,-0.603812,3.125716,0.870321,-0.161992,4.499666,1.038741,-1.092716,...,1.022959,1.275598,-3.48011,-1.065252,2.153133,1.563539,2.767117,0.215748,0.619645,1.883397
4,-0.613071,-0.644204,1.112558,-0.032397,3.490142,-0.011935,1.443521,-4.290282,-1.761308,0.807652,...,0.513906,-1.803473,0.518579,-0.205029,-4.744566,-1.520015,1.830651,0.870772,-1.894609,0.408332


In [75]:
train.isna().values.any()

False

In [76]:
#切分訓練集測試集
x_train, x_test, y_train, y_test = train_test_split(train, trainLabels, test_size = 0.25, random_state = 4)
MMEconder = MinMaxScaler()
x_train.shape

(750, 40)

In [77]:
#feature scaling
x_train = MMEconder.fit_transform(x_train)

#設定要訓練的超參數組合
n_estimators = [100, 200, 300]
max_depth = [1, 3, 5]
para_grid = dict(n_estimators = n_estimators, max_depth = max_depth)

#建立模型
clf = GradientBoostingClassifier()

#建立搜尋物件,放入模型與超參數組合字典
grid_search = GridSearchCV(clf, para_grid, scoring = 'accuracy', n_jobs = -1, verbose= 1)

#開始搜尋最佳參數
grid_result = grid_search.fit(x_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:    9.5s finished
  y = column_or_1d(y, warn=True)


In [78]:
#印出最佳結果,最佳參數
print("Best accuracy : %f using %s" %(grid_result.best_score_, grid_result.best_params_))

Best accuracy : 0.870667 using {'max_depth': 5, 'n_estimators': 300}


In [79]:
#使用最佳參數重新建立模型
clf_bestparam = GradientBoostingClassifier(max_depth= grid_result.best_params_['max_depth'], 
                                 n_estimators = grid_result.best_params_['n_estimators'] )
#再訓練一次模型
clf_bestparam.fit(x_train, y_train)

#再預測一次
y_pred = clf_bestparam.predict(x_test)

  y = column_or_1d(y, warn=True)


In [80]:
acc = metrics.accuracy_score(y_pred, y_test)
print("Accuracy: ", acc)

Accuracy:  0.628


In [81]:
test = MMEconder.fit_transform(test)
test_pred = clf_bestparam.predict(test)

submission = pd.DataFrame(test_pred)
print(test_pred.shape)

(9000,)


In [82]:
submission.columns = ['Solution']
submission['Id'] = np.arange(1,submission.shape[0]+1)
submission = submission[['Id', 'Solution']]
submission.to_csv('DataScienceLondon_Scikit-learn_submission.csv', index=False)

In [83]:
submission

Unnamed: 0,Id,Solution
0,1,1
1,2,0
2,3,0
3,4,0
4,5,0
5,6,0
6,7,0
7,8,1
8,9,0
9,10,0
