In [1]:
# 將需要的都import進來
import os
import copy
import time
import math
import numpy             as np
import pandas            as pd
import seaborn           as sns
import datetime          as dt
import warnings
import matplotlib.pyplot as plt
from scipy                   import stats
from itertools               import compress
from sklearn.tree            import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.metrics         import roc_curve,mean_squared_error,r2_score,accuracy_score,precision_score,recall_score,fbeta_score
from sklearn.ensemble        import GradientBoostingRegressor,GradientBoostingClassifier,RandomForestClassifier,RandomForestRegressor
from sklearn.datasets        import load_boston, load_wine
from sklearn.linear_model    import LogisticRegression,LinearRegression,Lasso,Ridge
from sklearn.preprocessing   import LabelEncoder, MinMaxScaler, StandardScaler,OneHotEncoder
from sklearn.model_selection import cross_val_score,train_test_split,KFold,GridSearchCV
from sklearn.decomposition   import PCA
from IPython.display         import YouTubeVideo

# 將較長的函式改名一下
MSE  = mean_squared_error
ACC  = accuracy_score
MME  = MinMaxScaler()
LE   = LabelEncoder()
OHE  = OneHotEncoder()
PCA  = PCA()

# 一些必要的設定
warnings.filterwarnings('ignore')
%matplotlib inline

# 設定【data的資料夾路徑】，命名為【data_folder】
data_folder = 'C:/Users/Ynitsed/Documents/GitHub/2nd-ML100Days/data'

1. train：1000筆資料，40個x
2. label：1000筆資料，1個y
3. test：9000筆資料，40個x，沒有y

這意思是，我們利用train+label去作出一個【我做的模型】，  
然後去跑test，會得到一個【我做的模型】所得到的y，  
Kaggle會再拿這個y去比對他們手上真正的y，看正確率如何？  

其中，這9000個y又會再被Kaggle內部分為【Public set】和【Private set】，  
【Public set】會被公開，但最後的排名會以【Private set】當排名，  
這個做法考量大家無法拿到真正的y做模型調參。  

詳細可參考[社群問答](https://www.cupoy.com/qa/kwassist/ai_tw/0000016B2D579A9D000000176375706F795F72656C656173655155455354)

In [2]:
# 將資料讀入
t001_train = os.path.join(data_folder, 'scikit-learn-practice_train.csv')
t001_label = os.path.join(data_folder, 'scikit-learn-practice_trainLabels.csv')
t001_test  = os.path.join(data_folder, 'scikit-learn-practice_test.csv')

### 沒有欄位名稱，header必須指定為None，否則第一筆資料會變成欄位名稱。

In [3]:
# 讀入train
train_X_t1 = pd.read_csv(t001_train,header=None)
train_X_t1 = train_X_t1[[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
                         10,11,12,13,14,15,16,17,18,19,
                         20,21,22,23,24,25,26,27,28,29,
                         30,31,32,33,34,35,36,37,38,39]]
print('Path of read in data: %s' %t001_train)
print(train_X_t1.shape)
train_X_t1.head()

Path of read in data: C:/Users/Ynitsed/Documents/GitHub/2nd-ML100Days/data\scikit-learn-practice_train.csv
(1000, 40)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
0,0.299403,-1.226624,1.498425,-1.17615,5.289853,0.208297,2.404498,1.594506,-0.051608,0.663234,...,-0.850465,-0.62299,-1.833057,0.293024,3.552681,0.717611,3.305972,-2.715559,-2.682409,0.10105
1,-1.174176,0.332157,0.949919,-1.285328,2.199061,-0.151268,-0.427039,2.619246,-0.765884,-0.09378,...,-0.81975,0.012037,2.038836,0.468579,-0.517657,0.422326,0.803699,1.213219,1.382932,-1.817761
2,1.192222,-0.414371,0.067054,-2.233568,3.658881,0.089007,0.203439,-4.219054,-1.184919,-1.24031,...,-0.604501,0.750054,-3.360521,0.856988,-2.751451,-1.582735,1.672246,0.656438,-0.932473,2.987436
3,1.57327,-0.580318,-0.866332,-0.603812,3.125716,0.870321,-0.161992,4.499666,1.038741,-1.092716,...,1.022959,1.275598,-3.48011,-1.065252,2.153133,1.563539,2.767117,0.215748,0.619645,1.883397
4,-0.613071,-0.644204,1.112558,-0.032397,3.490142,-0.011935,1.443521,-4.290282,-1.761308,0.807652,...,0.513906,-1.803473,0.518579,-0.205029,-4.744566,-1.520015,1.830651,0.870772,-1.894609,0.408332


In [4]:
# 讀入label
train_Y_t1 = pd.read_csv(t001_label,header=None)
print('Path of read in data: %s' %t001_label)
print(train_Y_t1.shape)
train_Y_t1.head()

Path of read in data: C:/Users/Ynitsed/Documents/GitHub/2nd-ML100Days/data\scikit-learn-practice_trainLabels.csv
(1000, 1)


Unnamed: 0,0
0,1
1,0
2,0
3,1
4,0


In [5]:
# 讀入test，但目前暫時用不到，最後要交作業時才會用
x_test_hw = pd.read_csv(t001_test,header=None)
x_test_hw = x_test_hw[[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
                       10,11,12,13,14,15,16,17,18,19,
                       20,21,22,23,24,25,26,27,28,29,
                       30,31,32,33,34,35,36,37,38,39]]
print('Path of read in data: %s' %t001_test)
print(x_test_hw.shape)
x_test_hw.head()

Path of read in data: C:/Users/Ynitsed/Documents/GitHub/2nd-ML100Days/data\scikit-learn-practice_test.csv
(9000, 40)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
0,2.808909,-0.242894,-0.546421,0.255162,1.749736,-0.030458,-1.322071,3.578071,-0.667578,-0.884257,...,-0.261688,-0.224375,-1.675606,-0.479584,-0.244388,-0.672355,0.51786,0.010665,-0.419214,2.818387
1,-0.374101,0.537669,0.081063,0.756773,0.915231,2.557282,3.703187,1.673835,-0.764122,-1.22804,...,-0.969463,0.574154,-2.200519,-1.61224,0.179031,-2.924596,0.64361,-1.470939,-0.067408,-0.976265
2,-0.08837,0.154743,0.380716,-1.176126,1.699867,-0.258627,-1.384999,1.093584,1.596633,0.230631,...,-0.769885,-0.005143,1.46749,0.483803,-3.542981,0.814561,-1.652948,1.265866,-1.749248,1.773784
3,-0.685635,0.501283,1.873375,0.215224,-3.983468,-0.103637,4.136113,-0.225431,-1.515015,-1.071763,...,0.968609,2.386412,-0.131219,0.285646,2.302069,1.255588,-1.56309,-0.125258,-1.030761,-2.945329
4,0.350867,0.721897,-0.477104,-1.748776,-2.627405,1.075433,4.954253,-3.293501,-0.760369,0.20436,...,0.260553,-2.04565,-2.173227,0.372992,0.4507,-0.211657,1.301359,-0.522164,2.484883,0.039213


# 本次的y是離散型

In [6]:
# 切分訓練集/測試集
x_train, x_test, y_train, y_test = train_test_split(train_X_t1, train_Y_t1, test_size=0.2, random_state=4)
# 看切完長怎樣
print(x_train.shape)
print(x_train.head())
print(y_train.shape)
print(y_train.head())
print(x_test.shape)
print(x_test.head())
print(y_test.shape)
print(y_test.head())

(800, 40)
           0         1         2         3         4         5         6   \
608 -2.548167  0.404039 -0.601543  1.824213 -2.818650 -0.944685 -2.730364   
511  0.995867  0.971880  0.211951  0.157973 -3.717024 -0.115513 -0.947025   
641 -0.454988 -1.204417  1.116421  0.322063  2.313806 -1.173895 -2.032064   
112 -0.666400  0.371497  1.514663  0.229668 -3.647657 -0.442661  2.270981   
535  0.446133  1.684642 -0.962372 -0.089929 -4.388425  0.036134  1.221663   

           7         8         9   ...        30        31        32  \
608  1.955640 -0.014587  1.489978  ...  0.411372  0.532103  4.175994   
511 -0.338361 -0.675010 -0.641413  ...  0.047100 -1.594257 -2.368145   
641  1.760241 -1.740474  0.585253  ...  0.104236 -0.321704 -3.270759   
112 -1.282171 -1.954917  0.616799  ... -0.594698  0.790001  1.207777   
535 -1.858487 -0.311780  1.154022  ...  0.475814 -0.378512  1.980913   

           33        34        35        36        37        38        39  
608  0.952334 -1.8

# 非連續型、分類(classification)，有4種做法

# 1.使用 LogisticRegression

In [7]:
# LGR
LGR = LogisticRegression()
LGR.fit(x_train, y_train)
# 印出coef
print(LGR.coef_)
print(LGR.intercept_)
print(LGR.score(x_train, y_train))
# 將x_test丟進上面跑好的回歸模型裡，得到y_pred，也就是預測出來的y_pred。
y_pred = LGR.predict(x_test)
print(y_pred.shape)
print(pd.DataFrame(y_pred).head())
# 看一下預測出來的y_pred和實際的y_test差多少？
print("Mean squared error: %.2f"% MSE(y_test, y_pred))
# 本例中直接分成了0,1,2，屬於classification tasks，可以使用ACC。
print("Accuracy: ", ACC(y_test, y_pred))

[[-0.1017302   0.01597184 -0.10109519  0.13955032  0.14769906 -0.03517952
  -0.06679527  0.05504298  0.04371259  0.19396811  0.0174138  -0.04821463
   0.26333821  0.10486345  0.3557994  -0.19406753 -0.03636938 -0.13069166
   0.05749699  0.08098114 -0.11330471 -0.09802794  0.08380933 -0.08396527
   0.00959759 -0.18483526  0.18834853 -0.16470784 -0.09847777 -0.09228342
  -0.13387867  0.10418527 -0.18321309 -0.1361636   0.30306451  0.03632906
  -0.19508855  0.1663002  -0.12137058  0.40023851]]
[0.36665452]
0.83125
(200,)
   0
0  0
1  1
2  0
3  0
4  1
Mean squared error: 0.17
Accuracy:  0.83


# 2.使用 DecisionTreeClassifier

In [8]:
# DTC
DTC = DecisionTreeClassifier()
DTC.fit(x_train, y_train)
# 印出coef
# print(DTC.coef_)
# print(DTC.intercept_)
print(DTC.score(x_train, y_train))
# 將x_test丟進上面跑好的回歸模型裡，得到y_pred，也就是預測出來的y_pred。
y_pred = DTC.predict(x_test)
print(y_pred.shape)
print(pd.DataFrame(y_pred).head())
# 看一下預測出來的y_pred和實際的y_test差多少？
print("Mean squared error: %.2f"% MSE(y_test, y_pred))
# 本例中直接分成了0,1,2，屬於classification tasks，可以使用ACC。
print("Accuracy: ", ACC(y_test, y_pred))

1.0
(200,)
   0
0  0
1  0
2  0
3  0
4  0
Mean squared error: 0.20
Accuracy:  0.795


# 3.使用 RandomForestClassifier

In [9]:
# RFC：使用預設值
RFC1 = RandomForestClassifier(n_estimators='warn',
                              criterion='gini',
                              max_depth=None,
                              min_samples_split=2,
                              min_samples_leaf=1,
                              min_weight_fraction_leaf=0.0,
                              max_features='auto',
                              max_leaf_nodes=None,
                              min_impurity_decrease=0.0,
                              min_impurity_split=None,
                              bootstrap=True, oob_score=False,
                              n_jobs=None, random_state=None,
                              verbose=0,
                              warm_start=False,
                              class_weight=None)
RFC1.fit(x_train, y_train)
print(RFC1.score(x_train, y_train))
# 將x_test丟進上面跑好的回歸模型裡，得到y_pred，也就是預測出來的y_pred。
y_pred = RFC1.predict(x_test)
print(y_pred.shape)
print(pd.DataFrame(y_pred).head())
# 看一下預測出來的y_pred和實際的y_test差多少？
print("Mean squared error: %.2f"% MSE(y_test, y_pred))
print("Accuracy: ", ACC(y_test, y_pred))

0.99625
(200,)
   0
0  0
1  1
2  0
3  0
4  1
Mean squared error: 0.15
Accuracy:  0.85


In [10]:
# 設定要訓練的超參數組合
n = [50, 100, 150, 200]
d = [1, 3, 5, 7]
grid_param = dict(n_estimators=n,max_depth=d)

## 建立搜尋物件，放入模型及參數組合字典 (n_jobs=-1 會使用全部 cpu 平行運算)
grid_search = GridSearchCV(RFC1,grid_param,scoring="accuracy",n_jobs=-1, verbose=1)

# 開始搜尋最佳參數
grid_result = grid_search.fit(x_train, y_train)

# 預設會跑 3-fold cross-validadtion，總共 9 種參數組合，總共要 train 27 次模型

Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:    9.6s finished


In [11]:
# 印出最佳結果與最佳參數
print("Best Accuracy: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Best Accuracy: 0.853750 using {'max_depth': 7, 'n_estimators': 200}


In [12]:
# 使用最佳參數重新建立模型
RFC2 = RandomForestClassifier(max_depth=grid_result.best_params_['max_depth'],
                                  n_estimators=grid_result.best_params_['n_estimators'])
# 預測測試集
RFC2.fit(x_train, y_train)
print(RFC2.score(x_train, y_train))
# 將x_test丟進上面跑好的回歸模型裡，得到y_pred，也就是預測出來的y_pred。
y_pred = RFC2.predict(x_test)
print(y_pred.shape)
print(pd.DataFrame(y_pred).head())
# 看一下預測出來的y_pred和實際的y_test差多少？
print("Mean squared error: %.2f"% MSE(y_test, y_pred))
print("Accuracy: ", ACC(y_test, y_pred))

0.98625
(200,)
   0
0  0
1  1
2  0
3  0
4  1
Mean squared error: 0.14
Accuracy:  0.865


# 4.使用 GradientBoostingClassifier

In [13]:
# GBC：使用預設值
GBC1 = GradientBoostingClassifier(loss='deviance',
                                  learning_rate=0.1,
                                  n_estimators=100,
                                  subsample=1.0,
                                  criterion='friedman_mse',
                                  min_samples_split=2,
                                  min_samples_leaf=1,
                                  min_weight_fraction_leaf=0.0,
                                  max_depth=3,
                                  min_impurity_decrease=0.0,
                                  min_impurity_split=None,
                                  init=None,
                                  random_state=None,
                                  max_features=None,
                                  verbose=0,
                                  max_leaf_nodes=None,
                                  warm_start=False,
                                  presort='auto',
                                  validation_fraction=0.1,
                                  n_iter_no_change=None,
                                  tol=0.0001)
GBC1.fit(x_train, y_train)
print(GBC1.score(x_train, y_train))
# 將x_test丟進上面跑好的回歸模型裡，得到y_pred，也就是預測出來的y_pred。
y_pred = GBC1.predict(x_test)
print(y_pred.shape)
print(pd.DataFrame(y_pred).head())
# 看一下預測出來的y_pred和實際的y_test差多少？
print("Mean squared error: %.2f"% MSE(y_test, y_pred))
print("Accuracy: ", ACC(y_test, y_pred))

0.99375
(200,)
   0
0  0
1  1
2  0
3  0
4  1
Mean squared error: 0.14
Accuracy:  0.86


In [14]:
# 設定要訓練的超參數組合
n = [50, 100, 150, 200]
d = [1, 3, 5, 7]
grid_param = dict(n_estimators=n,max_depth=d)

## 建立搜尋物件，放入模型及參數組合字典 (n_jobs=-1 會使用全部 cpu 平行運算)
grid_search = GridSearchCV(GBC1,grid_param,scoring="accuracy",n_jobs=-1, verbose=1)

# 開始搜尋最佳參數
grid_result = grid_search.fit(x_train, y_train)

# 預設會跑 3-fold cross-validadtion，總共 9 種參數組合，總共要 train 27 次模型

Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  41 out of  48 | elapsed:    8.4s remaining:    1.3s
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:    9.6s finished


In [15]:
# 印出最佳結果與最佳參數
print("Best Accuracy: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Best Accuracy: 0.875000 using {'max_depth': 5, 'n_estimators': 150}


In [16]:
# 使用最佳參數重新建立模型
GBC2 = GradientBoostingClassifier(max_depth=grid_result.best_params_['max_depth'],
                                  n_estimators=grid_result.best_params_['n_estimators'])
# 預測測試集
GBC2.fit(x_train, y_train)
print(GBC2.score(x_train, y_train))
# 將x_test丟進上面跑好的回歸模型裡，得到y_pred，也就是預測出來的y_pred。
y_pred = GBC2.predict(x_test)
print(y_pred.shape)
print(pd.DataFrame(y_pred).head())
# 看一下預測出來的y_pred和實際的y_test差多少？
print("Mean squared error: %.2f"% MSE(y_test, y_pred))
print("Accuracy: ", ACC(y_test, y_pred))

1.0
(200,)
   0
0  0
1  1
2  0
3  0
4  1
Mean squared error: 0.12
Accuracy:  0.88


# 看來GBC2的結果最好

In [17]:
# 將x_test丟進上面跑好的回歸模型裡，得到y_pred，也就是預測出來的y_pred。
y_pred_hw_t1 = GBC2.predict(x_test_hw)

# 把結果命名為'Solution'
y_pred_hw    = pd.DataFrame({'Solution': y_pred_hw_t1})

# 把index命名為Id'
y_pred_hw.index.name = 'Id'

# 看一下最後資料的長相
print(y_pred_hw.shape)
print(y_pred_hw.head())

# 將資料結果存檔 (保留index和header)
y_pred_hw.to_csv('scikit-learn-practice_testLabels_t1.csv', index=True, header=True)

(9000, 1)
    Solution
Id          
0          1
1          0
2          1
3          0
4          0


# 但上傳需要從1~9000，必須把index+1
### 把已經存的scikit-learn-practice_testLabels_t1.csv再讀入，讓它index+1之後再匯出

In [18]:
data_folder2 = 'C:/Users/Ynitsed/Documents/GitHub/2nd-ML100Days/homework'
t001  = os.path.join(data_folder2, 'scikit-learn-practice_testLabels_t1.csv')

In [19]:
# 讀入label
t002 = pd.read_csv(t001,header=0)
print('Path of read in data: %s' %t001)
print(t002.shape)
t002.head()

Path of read in data: C:/Users/Ynitsed/Documents/GitHub/2nd-ML100Days/homework\scikit-learn-practice_testLabels_t1.csv
(9000, 2)


Unnamed: 0,Id,Solution
0,0,1
1,1,0
2,2,1
3,3,0
4,4,0


In [20]:
t002['Id']= t002['Id']+1

In [21]:
print(t002.shape)
t002.head()

(9000, 2)


Unnamed: 0,Id,Solution
0,1,1
1,2,0
2,3,1
3,4,0
4,5,0


In [22]:
t002.to_csv('scikit-learn-practice_testLabels.csv', index=False, header=True)