# Package

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer
from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import pickle

import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor 
from sklearn.linear_model import Lasso,Ridge
from sklearn.svm import LinearSVR

# Data

In [3]:
file_path = r'C:\Users\user\Desktop\git_hub\finished\ml\kaggle房價預測\data\train_processing.csv'
data = pd.read_csv(file_path, encoding="Big5", low_memory=False, index_col=0)

In [4]:
data.shape

(1460, 180)

# Train Test Spliting

In [5]:
data_v1 = data.copy()

In [6]:
X = data_v1.drop("SalePrice", axis=1).values
y = data_v1["SalePrice"].values.reshape(-1,1)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Hypertune

### Decision Tree

In [39]:
pipeline = Pipeline([
                    ('scaler', Normalizer()),
                    ('pca', PCA(n_components = 50)),
                    ('regressor', DecisionTreeRegressor())
                     ])

parameters = { 
              'scaler': [Normalizer(), MinMaxScaler(), StandardScaler(), None],
              'pca': [PCA(n_components = 50), PCA(n_components = 100), PCA(n_components = 150)
                      , None, SelectFromModel(ExtraTreesRegressor(random_state=111))],
              'regressor__max_depth': [4, 6, 8, 10]
              }

scoring = 'r2'

n_splits = 5
cv = KFold(n_splits = n_splits, shuffle = True, random_state = 123457)

SearchCV = GridSearchCV(estimator = pipeline,
                        param_grid = parameters,
                        scoring = scoring, 
                        cv = cv,
                        return_train_score = True,
                        verbose = 1, 
                        n_jobs = -1)

In [40]:
SearchCV.fit(X_train,y_train)

Fitting 5 folds for each of 80 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    8.5s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:   29.3s
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:  1.1min finished


GridSearchCV(cv=KFold(n_splits=5, random_state=123457, shuffle=True),
       error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('scaler', Normalizer(copy=True, norm='l2')), ('pca', PCA(copy=True, iterated_power='auto', n_components=50, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('regressor', DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None,...lit=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best'))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'scaler': [Normalizer(copy=True, norm='l2'), MinMaxScaler(copy=True, feature_range=(0, 1)), StandardScaler(copy=True, with_mean=True, with_std=True), None], 'pca': [PCA(copy=True, iterated_power='auto', n_components=50, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), PCA(..._features=None, norm_order=1, prefit=False, threshold=None)], 'regressor__max_depth

In [41]:
best_estimator = SearchCV.best_estimator_
best_params = SearchCV.best_params_
print(best_params)
print(SearchCV.best_score_)

{'pca': PCA(copy=True, iterated_power='auto', n_components=50, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), 'regressor__max_depth': 8, 'scaler': StandardScaler(copy=True, with_mean=True, with_std=True)}
0.8028921834181211


In [42]:
y_test_pred = SearchCV.predict(X_test)
test_score = SearchCV.score(X_test, y_test) 
print(" test score ("+scoring+"):", test_score)

 test score (r2): 0.7756916826135817


In [43]:
r_dt = SearchCV.cv_results_
print(r_dt)

{'mean_fit_time': array([0.12520709, 0.17460999, 0.16320934, 0.13040743, 0.12420712,
       0.11920686, 0.12260704, 0.1276073 , 0.1264071 , 0.13600783,
       0.22681298, 0.16780968, 0.19121099, 0.17861023, 0.20401163,
       0.23061314, 0.19401112, 0.25381451, 0.21481237, 0.24821424,
       0.22621298, 0.33101897, 0.24881415, 0.25401454, 0.33981943,
       0.3618206 , 0.29561691, 0.27961607, 0.30161729, 0.29221673,
       0.33881946, 0.27641587, 0.21701245, 0.20581183, 0.25301442,
       0.20681171, 0.24681411, 0.28701644, 0.36522098, 0.30921779,
       0.29261661, 0.3256186 , 0.32881885, 0.42262416, 0.36382074,
       0.36682105, 0.34621992, 0.34501977, 0.09000516, 0.0476027 ,
       0.04720273, 0.03880229, 0.09160528, 0.04520264, 0.05600314,
       0.05380311, 0.20861192, 0.06760378, 0.07400422, 0.09120522,
       0.16040931, 0.09540553, 0.11500659, 0.07320418, 0.66803827,
       0.51742969, 0.61743526, 0.54483109, 0.68683929, 0.54743118,
       0.71464086, 0.53923087, 0.63583627, 0

In [44]:
print('parans:\n',r_dt['params'],'\n')
print('mean_test_score:\n',r_dt['mean_test_score'],'\n')
print('std_test_score:\n',r_dt['std_test_score'],'\n')
print('mean_train_score:\n',r_dt['mean_train_score'],'\n')
print('std_train_score:\n',r_dt['std_train_score'],'\n')

parans:
 [{'pca': PCA(copy=True, iterated_power='auto', n_components=50, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), 'regressor__max_depth': 4, 'scaler': Normalizer(copy=True, norm='l2')}, {'pca': PCA(copy=True, iterated_power='auto', n_components=50, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), 'regressor__max_depth': 4, 'scaler': MinMaxScaler(copy=True, feature_range=(0, 1))}, {'pca': PCA(copy=True, iterated_power='auto', n_components=50, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), 'regressor__max_depth': 4, 'scaler': StandardScaler(copy=True, with_mean=True, with_std=True)}, {'pca': PCA(copy=True, iterated_power='auto', n_components=50, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), 'regressor__max_depth': 4, 'scaler': None}, {'pca': PCA(copy=True, iterated_power='auto', n_components=50, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), 'regressor__max_depth': 6, 'scaler': Normalizer(copy=True,

 [0.70576412 0.70730678 0.8892524  0.8336695  0.8444767  0.8525748
 0.94473509 0.92217929 0.93021816 0.92661382 0.96538442 0.96819249
 0.97186738 0.970279   0.98011511 0.98773851 0.70605714 0.7111521
 0.89322856 0.83648263 0.84015094 0.85722049 0.94973611 0.91967872
 0.93572066 0.94055343 0.9816615  0.96554571 0.98055038 0.97091008
 0.99140622 0.98517304 0.70614337 0.71325868 0.89449985 0.83489883
 0.84215164 0.85719626 0.95549206 0.92656065 0.9358722  0.94346385
 0.98386551 0.97272367 0.98070311 0.98014159 0.99541621 0.99313236
 0.79137914 0.86476679 0.86476679 0.86476679 0.91909306 0.93736733
 0.93736733 0.93736733 0.97294959 0.97472292 0.97472292 0.97472292
 0.99209714 0.99100357 0.99100357 0.99100357 0.78864165 0.86009323
 0.86009323 0.86009323 0.91195672 0.92917144 0.92917144 0.92917144
 0.96790388 0.9671912  0.96719131 0.9671912  0.98930676 0.98618253
 0.98618253 0.98618245] 

std_train_score:
 [0.0175041  0.01156401 0.0051444  0.00280928 0.00893279 0.00784008
 0.00673023 0.00855

In [45]:
filename = r'C:\Users\user\Desktop\git_hub\finished\ml\kaggle房價預測\data\SearchCV_DecisionTree'
pickle.dump(SearchCV, open(filename, 'wb'))

### Random Forest

In [46]:
pipeline = Pipeline([
                      ('scaler', Normalizer()),
                      ('pca', PCA(n_components = 50)),
                      ('regressor', RandomForestRegressor())
                     ])

parameters = {    
              'scaler': [Normalizer(), MinMaxScaler(), StandardScaler(), None],
              'pca': [PCA(n_components = 50), PCA(n_components = 100), PCA(n_components = 150)
                      , None, SelectFromModel(ExtraTreesRegressor(random_state=111))],
              'regressor__max_depth': [4, 6, 8, 10]
              }

scoring = 'r2'

n_splits = 5
cv = KFold(n_splits = n_splits, shuffle = True, random_state = 123457)

SearchCV = GridSearchCV(estimator = pipeline,
                        param_grid = parameters,
                        scoring = scoring, 
                        cv = cv,
                        return_train_score = True,
                        verbose = 1, 
                        n_jobs = -1)

In [47]:
SearchCV.fit(X_train,y_train)

Fitting 5 folds for each of 80 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   12.7s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:  2.9min finished
  self._final_estimator.fit(Xt, y, **fit_params)


GridSearchCV(cv=KFold(n_splits=5, random_state=123457, shuffle=True),
       error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('scaler', Normalizer(copy=True, norm='l2')), ('pca', PCA(copy=True, iterated_power='auto', n_components=50, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('regressor', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', ma...s='warn', n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'scaler': [Normalizer(copy=True, norm='l2'), MinMaxScaler(copy=True, feature_range=(0, 1)), StandardScaler(copy=True, with_mean=True, with_std=True), None], 'pca': [PCA(copy=True, iterated_power='auto', n_components=50, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), PCA(..._features=None, norm_order=1, prefit=False, threshold=None)], 'regressor__max_depth

In [48]:
best_estimator = SearchCV.best_estimator_
best_params = SearchCV.best_params_
print(best_params)
print(SearchCV.best_score_)

{'pca': PCA(copy=True, iterated_power='auto', n_components=100, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), 'regressor__max_depth': 8, 'scaler': StandardScaler(copy=True, with_mean=True, with_std=True)}
0.8384176072002368


In [49]:
y_test_pred = SearchCV.predict(X_test)
test_score = SearchCV.score(X_test, y_test) 
print(" test score ("+scoring+"):", test_score)

 test score (r2): 0.8821864911411412


In [50]:
r_rf = SearchCV.cv_results_
print(r_rf)

{'mean_fit_time': array([0.57243271, 0.62103543, 0.34941998, 0.37302132, 0.42442431,
       0.4646265 , 0.62723589, 0.50342884, 0.69263964, 0.59163384,
       0.59403405, 0.55723195, 0.75864348, 0.71624103, 0.91005201,
       0.58163342, 0.56283226, 0.78884506, 0.81104646, 0.73904228,
       0.9260529 , 0.97545581, 0.80664616, 0.9394537 , 1.30087442,
       1.17346716, 1.03425922, 1.05726042, 1.02245851, 1.11766376,
       1.08826213, 1.11226358, 0.6860393 , 0.82064695, 0.95285449,
       0.91365232, 1.0428597 , 1.11446381, 1.26127214, 1.45268307,
       1.50468607, 1.58949099, 1.99871435, 1.52808743, 1.51348653,
       2.17452426, 1.98811364, 2.09952011, 0.38842216, 0.24521408,
       0.25841475, 0.30101714, 0.62983594, 0.38522196, 0.39362249,
       0.42582431, 0.75664339, 0.62183557, 0.40402308, 0.38142185,
       1.00445738, 0.55203156, 0.56683235, 0.58223319, 0.74364257,
       0.80004563, 1.0212585 , 0.68363895, 0.87465005, 0.77784443,
       0.61743526, 0.68723931, 0.86244931, 0

In [51]:
print('parans:\n',r_rf['params'],'\n')
print('mean_test_score:\n',r_rf['mean_test_score'],'\n')
print('std_test_score:\n',r_rf['std_test_score'],'\n')
print('mean_train_score:\n',r_rf['mean_train_score'],'\n')
print('std_train_score:\n',r_rf['std_train_score'],'\n')

parans:
 [{'pca': PCA(copy=True, iterated_power='auto', n_components=50, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), 'regressor__max_depth': 4, 'scaler': Normalizer(copy=True, norm='l2')}, {'pca': PCA(copy=True, iterated_power='auto', n_components=50, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), 'regressor__max_depth': 4, 'scaler': MinMaxScaler(copy=True, feature_range=(0, 1))}, {'pca': PCA(copy=True, iterated_power='auto', n_components=50, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), 'regressor__max_depth': 4, 'scaler': StandardScaler(copy=True, with_mean=True, with_std=True)}, {'pca': PCA(copy=True, iterated_power='auto', n_components=50, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), 'regressor__max_depth': 4, 'scaler': None}, {'pca': PCA(copy=True, iterated_power='auto', n_components=50, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), 'regressor__max_depth': 6, 'scaler': Normalizer(copy=True,

In [52]:
filename = r'C:\Users\user\Desktop\git_hub\finished\ml\kaggle房價預測\data\SearchCV_RandomForest'
pickle.dump(SearchCV, open(filename, 'wb'))

### AdaBoost

In [53]:
pipeline = Pipeline([
                    ('scaler', Normalizer()), 
                    ('pca', PCA(n_components = 50)),
                    ('regressor', AdaBoostRegressor())
                     ])

parameters = {    
                'scaler': [ Normalizer(), MinMaxScaler(), StandardScaler(), None], 
                'pca': [PCA(n_components = 50), PCA(n_components = 100), PCA(n_components = 150)
                      , None, SelectFromModel(ExtraTreesRegressor(random_state=111))],
                'regressor__n_estimators': [100, 200, 300, 400],
                'regressor__learning_rate': [0.1, 0.01, 0.001]
              }

scoring = 'r2'

n_splits = 5
cv = KFold(n_splits=n_splits, shuffle=True, random_state=123457)

SearchCV = GridSearchCV(estimator=pipeline,
                        param_grid=parameters,
                        scoring=scoring, 
                        cv=cv,
                        return_train_score=True,
                        verbose=1, 
                        n_jobs=-1)


In [54]:
SearchCV.fit(X_train, y_train)

Fitting 5 folds for each of 240 candidates, totalling 1200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed: 11.2min
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed: 37.7min
[Parallel(n_jobs=-1)]: Done 796 tasks      | elapsed: 86.6min
[Parallel(n_jobs=-1)]: Done 1200 out of 1200 | elapsed: 123.9min finished
  y = column_or_1d(y, warn=True)


GridSearchCV(cv=KFold(n_splits=5, random_state=123457, shuffle=True),
       error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('scaler', Normalizer(copy=True, norm='l2')), ('pca', PCA(copy=True, iterated_power='auto', n_components=50, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('regressor', AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',
         n_estimators=50, random_state=None))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'scaler': [Normalizer(copy=True, norm='l2'), MinMaxScaler(copy=True, feature_range=(0, 1)), StandardScaler(copy=True, with_mean=True, with_std=True), None], 'pca': [PCA(copy=True, iterated_power='auto', n_components=50, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), PCA(...)], 'regressor__n_estimators': [100, 200, 300, 400], 'regressor__learning_rate': [0.1, 0.01, 0.001]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
 

In [55]:
best_estimator = SearchCV.best_estimator_
best_params = SearchCV.best_params_
print(best_params)
print(SearchCV.best_score_)

{'pca': PCA(copy=True, iterated_power='auto', n_components=50, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), 'regressor__learning_rate': 0.1, 'regressor__n_estimators': 300, 'scaler': StandardScaler(copy=True, with_mean=True, with_std=True)}
0.8211956827991539


In [56]:
y_test_pred = SearchCV.predict(X_test)
test_score = SearchCV.score(X_test, y_test) 
print(" test score ("+scoring+"):", test_score)

 test score (r2): 0.8604880889997789


In [57]:
r_dt=SearchCV.cv_results_
print(r_dt)

{'mean_fit_time': array([  3.14157958,   3.05517473,   2.74475694,   2.97497015,
         4.49105687,   4.85707779,   4.57506161,   4.65246615,
         6.39396567,   7.18561106,   6.40656643,   6.68718233,
         8.11726427,   9.05391788,  11.24964342,   9.73315678,
         2.87936468,   3.22038431,   3.53300209,   3.54660282,
         6.56677561,   6.36236396,   5.90053749,   6.08594813,
         9.62715058,  11.28904562,  10.27638769,  11.21744151,
        12.41771026,  12.19789772,  13.41636739,  13.05474672,
         3.01137228,   3.81581821,   3.49960022,   4.00162888,
         6.29176011,   5.73252788,   7.09020553,   7.62703629,
         9.25212917,  10.28278813,  10.24778614,  10.00537229,
        12.87933664,  14.82924824,  12.55991836,  13.67798228,
         5.42331023,   5.29650288,   5.51631556,   5.25810075,
        10.11197834,  12.30090365,  10.93542552,  11.35484943,
        12.85293517,  16.38693738,  13.82719078,  12.97074189,
        15.6498951 ,  21.99905829,  1

In [58]:
print('parans:\n',r_dt['params'],'\n')
print('mean_test_score:\n',r_dt['mean_test_score'],'\n')
print('std_test_score:\n',r_dt['std_test_score'],'\n')
print('mean_train_score:\n',r_dt['mean_train_score'],'\n')
print('std_train_score:\n',r_dt['std_train_score'],'\n')

parans:
 [{'pca': PCA(copy=True, iterated_power='auto', n_components=50, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), 'regressor__learning_rate': 0.1, 'regressor__n_estimators': 100, 'scaler': Normalizer(copy=True, norm='l2')}, {'pca': PCA(copy=True, iterated_power='auto', n_components=50, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), 'regressor__learning_rate': 0.1, 'regressor__n_estimators': 100, 'scaler': MinMaxScaler(copy=True, feature_range=(0, 1))}, {'pca': PCA(copy=True, iterated_power='auto', n_components=50, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), 'regressor__learning_rate': 0.1, 'regressor__n_estimators': 100, 'scaler': StandardScaler(copy=True, with_mean=True, with_std=True)}, {'pca': PCA(copy=True, iterated_power='auto', n_components=50, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), 'regressor__learning_rate': 0.1, 'regressor__n_estimators': 100, 'scaler': None}, {'pca': PCA(copy=True, iterated_p

In [59]:
filename = r'C:\Users\user\Desktop\git_hub\finished\ml\kaggle房價預測\data\SearchCV_AdaBoost'
pickle.dump(SearchCV, open(filename, 'wb'))

### Lasso Regression

In [25]:
pipeline = Pipeline([
                    ('scaler', Normalizer()), 
                    ('pca', PCA(n_components = 50)),                 
                    ('regressor', Lasso())
                     ])

parameters = {    
                'scaler': [Normalizer(), StandardScaler(), MinMaxScaler(), None], 
                'pca': [PCA(n_components = 50), PCA(n_components = 100), PCA(n_components = 150)
                      , None, SelectFromModel(ExtraTreesRegressor(random_state=111))],
                'regressor__normalize': [False, True],
                'regressor__alpha': np.logspace(-4, 0, 5)
              }

scoring = 'r2'

n_splits = 3
cv = KFold(n_splits=n_splits, shuffle=True, random_state=123457)

SearchCV = GridSearchCV(estimator=pipeline,
                        param_grid=parameters,
                        scoring=scoring, 
                        cv=cv,
                        return_train_score=True,
                        verbose=1, 
                        n_jobs=-1)

In [26]:
SearchCV.fit(X_train,y_train)

Fitting 3 folds for each of 200 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done 388 tasks      | elapsed:   30.9s
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:  1.4min finished


GridSearchCV(cv=KFold(n_splits=3, random_state=123457, shuffle=True),
       error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('scaler', Normalizer(copy=True, norm='l2')), ('pca', PCA(copy=True, iterated_power='auto', n_components=50, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('regressor', Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'scaler': [Normalizer(copy=True, norm='l2'), StandardScaler(copy=True, with_mean=True, with_std=True), MinMaxScaler(copy=True, feature_range=(0, 1)), None], 'pca': [PCA(copy=True, iterated_power='auto', n_components=50, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), PCA(...sor__normalize': [False, True], 'regressor__alpha': array([1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00])

In [27]:
best_estimator = SearchCV.best_estimator_
best_params = SearchCV.best_params_
print(best_params)
print(SearchCV.best_score_)

{'pca': PCA(copy=True, iterated_power='auto', n_components=100, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), 'regressor__alpha': 0.1, 'regressor__normalize': False, 'scaler': MinMaxScaler(copy=True, feature_range=(0, 1))}
0.7723002020763438


In [28]:
y_test_pred = SearchCV.predict(X_test)
test_score = SearchCV.score(X_test, y_test) 
print(" test score ("+scoring+"):", test_score)

 test score (r2): 0.8547164832145242


In [29]:
r_ls = SearchCV.cv_results_
print(r_ls)

{'mean_fit_time': array([0.13034074, 0.21467892, 0.05400308, 0.05000289, 0.05533648,
       0.07567112, 0.28834987, 0.05867004, 0.05900343, 0.0636704 ,
       0.26368181, 0.06367032, 0.05600309, 0.05933666, 0.16767629,
       0.04933612, 0.15067538, 0.18167702, 0.05700318, 0.04833619,
       0.05266968, 0.06167014, 0.14434147, 0.05133629, 0.06000336,
       0.22267954, 0.06233684, 0.0553364 , 0.12967412, 0.13634117,
       0.05466986, 0.23168   , 0.05233614, 0.05967013, 0.05200307,
       0.16500934, 0.0546697 , 0.06800397, 0.1200068 , 0.0846715 ,
       0.1146733 , 0.13167421, 0.13034081, 0.10733938, 0.10767285,
       0.19434436, 0.15134192, 0.14067491, 0.11767348, 0.12167358,
       0.11300627, 0.10733938, 0.11500661, 0.14067467, 0.15167538,
       0.12167366, 0.22701295, 0.17300995, 0.13467439, 0.15500887,
       0.12634055, 0.42869123, 0.16767629, 0.25434796, 0.13600771,
       0.12900742, 0.12234044, 0.12434053, 0.13000743, 0.14067483,
       0.12467376, 0.13167397, 0.11833994, 0

In [30]:
print('parans:\n', r_ls['params'], '\n')
print('mean_test_score:\n', r_ls['mean_test_score'], '\n')
print('std_test_score:\n', r_ls['std_test_score'], '\n')
print('mean_train_score:\n', r_ls['mean_train_score'], '\n')
print('std_train_score:\n', r_ls['std_train_score'], '\n')

parans:
 [{'pca': PCA(copy=True, iterated_power='auto', n_components=50, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), 'regressor__alpha': 0.0001, 'regressor__normalize': False, 'scaler': Normalizer(copy=True, norm='l2')}, {'pca': PCA(copy=True, iterated_power='auto', n_components=50, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), 'regressor__alpha': 0.0001, 'regressor__normalize': False, 'scaler': StandardScaler(copy=True, with_mean=True, with_std=True)}, {'pca': PCA(copy=True, iterated_power='auto', n_components=50, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), 'regressor__alpha': 0.0001, 'regressor__normalize': False, 'scaler': MinMaxScaler(copy=True, feature_range=(0, 1))}, {'pca': PCA(copy=True, iterated_power='auto', n_components=50, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), 'regressor__alpha': 0.0001, 'regressor__normalize': False, 'scaler': None}, {'pca': PCA(copy=True, iterated_power='auto', n_component

In [31]:
filename = r'C:\Users\user\Desktop\git_hub\finished\ml\kaggle房價預測\data\SearchCV_Lasso'
pickle.dump(SearchCV, open(filename, 'wb'))

### MLP Regressor

In [39]:
pipeline = Pipeline([
                    ('scaler', Normalizer()),
                    ('pca', PCA(n_components = 50)),
                    ('regressor', MLPRegressor())
                     ])

parameters = {    
                'scaler': [Normalizer(), StandardScaler(), MinMaxScaler(), None],
                'pca': [PCA(n_components = 50), PCA(n_components = 100), None],
                'regressor__hidden_layer_sizes': [(64), (128), (64, 64)],
                'regressor__activation': ['relu'],
                'regressor__solver': ['lbfgs', 'adam'],
                'regressor__alpha': [0.01, 0.1],
                'regressor__batch_size': ['auto'], 
                'regressor__learning_rate': ['constant'],
                'regressor__learning_rate_init': [0.01, 0.1],
                'regressor__max_iter': [200]
              }

scoring = 'r2'

n_splits = 3
cv = KFold(n_splits=n_splits, shuffle=True, random_state=123457)

SearchCV = GridSearchCV(estimator=pipeline,
                        param_grid=parameters,
                        scoring=scoring, 
                        cv=cv,
                        return_train_score=True,
                        verbose=1, 
                        n_jobs=-1)

In [40]:
SearchCV.fit(X_train, y_train)

Fitting 3 folds for each of 288 candidates, totalling 864 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   47.4s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed: 11.0min
[Parallel(n_jobs=-1)]: Done 796 tasks      | elapsed: 22.2min
[Parallel(n_jobs=-1)]: Done 864 out of 864 | elapsed: 24.6min finished
  y = column_or_1d(y, warn=True)


GridSearchCV(cv=KFold(n_splits=3, random_state=123457, shuffle=True),
       error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('scaler', Normalizer(copy=True, norm='l2')), ('pca', PCA(copy=True, iterated_power='auto', n_components=50, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('regressor', MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_s...=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'scaler': [Normalizer(copy=True, norm='l2'), StandardScaler(copy=True, with_mean=True, with_std=True), MinMaxScaler(copy=True, feature_range=(0, 1)), None], 'pca': [PCA(copy=True, iterated_power='auto', n_components=50, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), PCA(...ing_rate': ['constant'], 'regressor__learning_rate_init': [0.01, 0.1], 'regressor__

In [41]:
best_estimator = SearchCV.best_estimator_
best_params = SearchCV.best_params_
print(best_params)
print(SearchCV.best_score_)

{'pca': PCA(copy=True, iterated_power='auto', n_components=100, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), 'regressor__activation': 'relu', 'regressor__alpha': 0.01, 'regressor__batch_size': 'auto', 'regressor__hidden_layer_sizes': (64, 64), 'regressor__learning_rate': 'constant', 'regressor__learning_rate_init': 0.1, 'regressor__max_iter': 200, 'regressor__solver': 'adam', 'scaler': Normalizer(copy=True, norm='l2')}
0.8066225274319554


In [42]:
y_test_pred = SearchCV.predict(X_test)
test_score = SearchCV.score(X_test, y_test) 
print(" test score ("+scoring+"):", test_score)

 test score (r2): 0.8703021446178515


In [43]:
r_mlp = SearchCV.cv_results_
print(r_mlp)

{'mean_fit_time': array([2.12245464, 2.31213228, 1.76376756, 1.92044314, 2.0234491 ,
       2.1074539 , 2.45180702, 2.40447084, 1.64276052, 1.76476757,
       1.57975698, 1.48375154, 2.12445474, 2.32746657, 2.7794923 ,
       1.27040601, 2.78149263, 3.04684099, 2.7458237 , 2.70115447,
       3.28352118, 3.85055375, 3.25885304, 2.55414621, 2.38346958,
       3.28752144, 2.63381728, 2.70682144, 3.09551032, 3.20218317,
       3.19784959, 0.75971015, 2.69282071, 2.90349952, 2.88016462,
       2.8791647 , 3.53720236, 3.69787828, 3.719546  , 3.45819783,
       3.30052193, 3.60587287, 3.43452994, 3.53453565, 4.09490093,
       5.22063208, 4.25124319, 0.97338899, 1.86610667, 1.55842241,
       1.77010139, 1.47975127, 2.00278139, 2.05478422, 2.2094597 ,
       2.18979192, 1.42808175, 1.5890909 , 1.70943117, 1.9934473 ,
       2.56514684, 2.38780316, 2.05378421, 0.8523821 , 2.43313909,
       2.73915672, 2.62348342, 2.83549547, 3.84421992, 3.56887078,
       5.05295579, 2.4608074 , 2.88116471, 3

In [44]:
print('parans:\n', r_mlp['params'], '\n')
print('mean_test_score:\n', r_mlp['mean_test_score'], '\n')
print('std_test_score:\n', r_mlp['std_test_score'], '\n')
print('mean_train_score:\n', r_mlp['mean_train_score'], '\n')
print('std_train_score:\n', r_mlp['std_train_score'], '\n')

parans:
 [{'pca': PCA(copy=True, iterated_power='auto', n_components=50, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), 'regressor__activation': 'relu', 'regressor__alpha': 0.01, 'regressor__batch_size': 'auto', 'regressor__hidden_layer_sizes': 64, 'regressor__learning_rate': 'constant', 'regressor__learning_rate_init': 0.01, 'regressor__max_iter': 200, 'regressor__solver': 'lbfgs', 'scaler': Normalizer(copy=True, norm='l2')}, {'pca': PCA(copy=True, iterated_power='auto', n_components=50, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), 'regressor__activation': 'relu', 'regressor__alpha': 0.01, 'regressor__batch_size': 'auto', 'regressor__hidden_layer_sizes': 64, 'regressor__learning_rate': 'constant', 'regressor__learning_rate_init': 0.01, 'regressor__max_iter': 200, 'regressor__solver': 'lbfgs', 'scaler': StandardScaler(copy=True, with_mean=True, with_std=True)}, {'pca': PCA(copy=True, iterated_power='auto', n_components=50, random_state=None,
  svd_so

  7.69268627e-01  9.97981950e-01  9.17607433e-01  7.66381470e-01] 

std_train_score:
 [6.62301772e-03 1.44535516e-04 1.91249806e-04 1.69417055e-01
 2.75600982e-01 1.79447045e-01 2.33895237e-01 5.52786801e-03
 1.48693314e-02 4.91924960e-04 9.64307753e-05 7.80656968e-02
 6.22899526e-03 5.95717512e-03 1.01715082e-02 1.50371546e+00
 1.37993572e-02 3.50623509e-04 7.49761287e-05 2.71962095e-02
 2.35026852e-01 7.05874622e-02 1.50970789e-01 6.24219995e-02
 5.59682497e-03 4.05299019e-04 6.28649555e-05 5.19777441e-02
 2.08458032e-02 4.70097913e-03 1.15344920e-02 5.73124808e-02
 1.91478806e-03 1.48220546e-04 6.98139116e-05 7.83238543e-02
 1.46498130e-02 8.03735268e-03 9.36549817e-03 7.33404019e-02
 5.09844818e-03 3.50409432e-04 7.80324348e-05 8.72040382e-02
 7.69214417e-03 6.08076622e-04 7.56498355e-04 2.29173877e-01
 7.57716687e-03 1.73798651e-04 1.79528551e-04 2.27646497e-02
 2.40158967e-01 1.71075889e-01 2.35125217e-01 1.78135184e-02
 1.61531081e-03 1.99356636e-04 2.00743950e-04 7.75483955e-02

In [45]:
filename = r'C:\Users\user\Desktop\git_hub\finished\ml\kaggle房價預測\data\SearchCV_MLP'
pickle.dump(SearchCV, open(filename, 'wb'))

### XGBoost

In [8]:
pipeline = Pipeline([
                      ('scaler', Normalizer()),
                      ('pca', PCA(n_components = 50)),
                      ('regressor', xgb.XGBRegressor())
                     ])

parameters = {    
              'scaler': [Normalizer(), StandardScaler(), MinMaxScaler(), None],
              'pca': [PCA(n_components = 50), PCA(n_components = 100), None],
              'regressor__learning_rate': [0.1, 0.01],
              'regressor__max_depth': [4, 6],
              'regressor__min_child_weight': [1, 3],
              'regressor__gamma': [0.1, 0.2],
              'regressor__colsample_bytree': [0.3, 0.7]
              }

scoring = 'r2'

n_splits = 3
cv = KFold(n_splits=n_splits, shuffle=True, random_state=123457)

SearchCV = GridSearchCV(estimator=pipeline,
                        param_grid=parameters,
                        scoring=scoring, 
                        cv=cv,
                        return_train_score=True,
                        verbose=1, 
                        n_jobs=-1)

In [9]:
SearchCV.fit(X_train, y_train)

Fitting 3 folds for each of 384 candidates, totalling 1152 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   31.9s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed:  5.7min
[Parallel(n_jobs=-1)]: Done 796 tasks      | elapsed: 12.6min
[Parallel(n_jobs=-1)]: Done 1152 out of 1152 | elapsed: 18.7min finished


GridSearchCV(cv=KFold(n_splits=3, random_state=123457, shuffle=True),
       error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('scaler', Normalizer(copy=True, norm='l2')), ('pca', PCA(copy=True, iterated_power='auto', n_components=50, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('regressor', XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, i... reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'scaler': [Normalizer(copy=True, norm='l2'), StandardScaler(copy=True, with_mean=True, with_std=True), MinMaxScaler(copy=True, feature_range=(0, 1)), None], 'pca': [PCA(copy=True, iterated_power='auto', n_components=50, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), PCA(...in_child_weight': [1, 3], 'regressor__gamma': [0.1, 0.2], 'regressor__colsample_byt

In [10]:
best_estimator = SearchCV.best_estimator_
best_params = SearchCV.best_params_
print(best_params)
print(SearchCV.best_score_)

{'pca': PCA(copy=True, iterated_power='auto', n_components=100, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), 'regressor__colsample_bytree': 0.7, 'regressor__gamma': 0.1, 'regressor__learning_rate': 0.1, 'regressor__max_depth': 4, 'regressor__min_child_weight': 1, 'scaler': StandardScaler(copy=True, with_mean=True, with_std=True)}
0.8616861864892196


In [11]:
y_test_pred = SearchCV.predict(X_test)
test_score = SearchCV.score(X_test, y_test) 
print(" test score ("+scoring+"):", test_score)

 test score (r2): 0.9002635780801779


In [12]:
r_xgb = SearchCV.cv_results_
print(r_xgb)

{'mean_fit_time': array([0.81237984, 0.77437766, 1.16139984, 0.8910509 , 0.86204926,
       0.7210412 , 0.75137615, 0.79104535, 1.00372426, 1.4800849 ,
       1.76243416, 1.03405929, 1.87777408, 1.13273136, 1.04172611,
       1.34141008, 0.76037685, 1.03639253, 0.805046  , 1.28640676,
       0.87571685, 0.78004448, 0.75770998, 1.17073369, 0.98838989,
       0.93938708, 0.99672365, 1.34541051, 0.92838637, 1.20806909,
       0.96905557, 1.33540964, 0.72804165, 0.80604617, 0.74904291,
       1.10772999, 0.81971351, 0.8163801 , 0.74004237, 0.96638854,
       1.27107255, 1.08106192, 1.00805767, 1.03072564, 0.98038952,
       1.00739098, 0.96638854, 1.03139218, 1.13673162, 0.83571442,
       0.73304192, 0.73837558, 0.77671115, 0.8200469 , 0.74237577,
       0.8523821 , 0.9670554 , 0.98138952, 0.98205622, 0.92105269,
       0.92471957, 1.10172963, 1.04805994, 0.90671849, 1.34607681,
       1.31507516, 1.34741028, 1.75643357, 1.36574475, 1.36074448,
       1.35774422, 1.25073822, 1.8261044 , 1

In [13]:
print('parans:\n', r_xgb['params'], '\n')
print('mean_test_score:\n', r_xgb['mean_test_score'], '\n')
print('std_test_score:\n', r_xgb['std_test_score'], '\n')
print('mean_train_score:\n', r_xgb['mean_train_score'], '\n')
print('std_train_score:\n', r_xgb['std_train_score'], '\n')

parans:
 [{'pca': PCA(copy=True, iterated_power='auto', n_components=50, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), 'regressor__colsample_bytree': 0.3, 'regressor__gamma': 0.1, 'regressor__learning_rate': 0.1, 'regressor__max_depth': 4, 'regressor__min_child_weight': 1, 'scaler': Normalizer(copy=True, norm='l2')}, {'pca': PCA(copy=True, iterated_power='auto', n_components=50, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), 'regressor__colsample_bytree': 0.3, 'regressor__gamma': 0.1, 'regressor__learning_rate': 0.1, 'regressor__max_depth': 4, 'regressor__min_child_weight': 1, 'scaler': StandardScaler(copy=True, with_mean=True, with_std=True)}, {'pca': PCA(copy=True, iterated_power='auto', n_components=50, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), 'regressor__colsample_bytree': 0.3, 'regressor__gamma': 0.1, 'regressor__learning_rate': 0.1, 'regressor__max_depth': 4, 'regressor__min_child_weight': 1, 'scaler': MinMaxScaler(copy=Tr

In [14]:
filename = r'C:\Users\user\Desktop\git_hub\finished\ml\kaggle房價預測\data\SearchCV_XGBoost'
pickle.dump(SearchCV, open(filename, 'wb'))

### SVM

In [32]:
pipeline = Pipeline([
                      ('scaler', Normalizer()),
                      ('pca', PCA(n_components = 50)),
                      ('regressor', LinearSVR())
                     ])

parameters = {    
              'scaler': [Normalizer(), StandardScaler(), MinMaxScaler(), None],
              'pca': [PCA(n_components = 50), PCA(n_components = 100), None],
              'regressor__C': [1, 0.1, 0.01]
              }

scoring = 'r2'

n_splits = 3
cv = KFold(n_splits = n_splits, shuffle = True, random_state = 123457)

SearchCV = GridSearchCV(estimator = pipeline,
                        param_grid = parameters,
                        scoring = scoring, 
                        cv = cv,
                        return_train_score = True,
                        verbose = 1, 
                        n_jobs = -1)

In [33]:
SearchCV.fit(X_train, y_train)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 108 out of 108 | elapsed:   10.8s finished
  y = column_or_1d(y, warn=True)


GridSearchCV(cv=KFold(n_splits=3, random_state=123457, shuffle=True),
       error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('scaler', Normalizer(copy=True, norm='l2')), ('pca', PCA(copy=True, iterated_power='auto', n_components=50, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('regressor', LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=1000,
     random_state=None, tol=0.0001, verbose=0))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'scaler': [Normalizer(copy=True, norm='l2'), StandardScaler(copy=True, with_mean=True, with_std=True), MinMaxScaler(copy=True, feature_range=(0, 1)), None], 'pca': [PCA(copy=True, iterated_power='auto', n_components=50, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), PCA(copy=True, iterated_power='auto', n_components=100, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [34]:
best_estimator = SearchCV.best_estimator_
best_params = SearchCV.best_params_
print(best_params)
print(SearchCV.best_score_)

{'pca': None, 'regressor__C': 0.01, 'scaler': None}
0.6381252565352293


In [35]:
y_test_pred = SearchCV.predict(X_test)
test_score = SearchCV.score(X_test, y_test) 
print(" test score ("+scoring+"):", test_score)

 test score (r2): 0.8444849449142878


In [36]:
r_svm = SearchCV.cv_results_
print(r_svm)

{'mean_fit_time': array([0.09067194, 0.10333927, 0.12700717, 0.49469511, 0.13234099,
       0.09367212, 0.09500543, 0.38535555, 0.06333693, 0.06500387,
       0.06467048, 0.29901703, 0.09867231, 0.14300807, 0.13034074,
       0.73570863, 0.10533937, 0.14167492, 0.11067287, 0.62136912,
       0.16400957, 0.11967349, 0.1176734 , 0.57603304, 0.01300073,
       0.01766769, 0.0120008 , 0.46602678, 0.01300081, 0.03066842,
       0.01800108, 0.49269485, 0.01266734, 0.02033448, 0.04533609,
       0.39135575]), 'std_fit_time': array([0.03458843, 0.05704902, 0.0270691 , 0.07841097, 0.02931999,
       0.00385881, 0.01275473, 0.06422374, 0.00339935, 0.00294408,
       0.00385874, 0.01867373, 0.00309138, 0.01779634, 0.02117249,
       0.03209885, 0.00498919, 0.03582871, 0.00784615, 0.01438445,
       0.05267168, 0.01087362, 0.0089942 , 0.06558325, 0.00141428,
       0.00124717, 0.        , 0.0214177 , 0.00355937, 0.0114413 ,
       0.00163307, 0.0495766 , 0.00235725, 0.00262499, 0.03262432,
       

In [37]:
print('parans:\n', r_svm['params'], '\n')
print('mean_test_score:\n', r_svm['mean_test_score'], '\n')
print('std_test_score:\n', r_svm['std_test_score'], '\n')
print('mean_train_score:\n', r_svm['mean_train_score'], '\n')
print('std_train_score:\n', r_svm['std_train_score'], '\n')

parans:
 [{'pca': PCA(copy=True, iterated_power='auto', n_components=50, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), 'regressor__C': 1, 'scaler': Normalizer(copy=True, norm='l2')}, {'pca': PCA(copy=True, iterated_power='auto', n_components=50, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), 'regressor__C': 1, 'scaler': StandardScaler(copy=True, with_mean=True, with_std=True)}, {'pca': PCA(copy=True, iterated_power='auto', n_components=50, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), 'regressor__C': 1, 'scaler': MinMaxScaler(copy=True, feature_range=(0, 1))}, {'pca': PCA(copy=True, iterated_power='auto', n_components=50, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), 'regressor__C': 1, 'scaler': None}, {'pca': PCA(copy=True, iterated_power='auto', n_components=50, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), 'regressor__C': 0.1, 'scaler': Normalizer(copy=True, norm='l2')}, {'pca': PCA(copy=True, i

In [38]:
filename = r'C:\Users\user\Desktop\git_hub\finished\ml\kaggle房價預測\data\SearchCV_SVM'
pickle.dump(SearchCV, open(filename, 'wb'))

### LightGBM

In [18]:
pipeline = Pipeline([
                      ('scaler', Normalizer()),
                      ('pca', PCA(n_components = 50)),
                      ('regressor', lgb.LGBMRegressor())
                     ])

parameters = {    
              'scaler': [Normalizer(), StandardScaler(), MinMaxScaler(), None],
              'pca': [PCA(n_components = 50), PCA(n_components = 100), None],
              'regressor__learning_rate': [0.1, 0.01],
              'regressor__max_depth': [6, 8],
              'regressor__min_child_samples': [16, 18],
              'regressor__reg_alpha': [0.01, 0.1],
              'regressor__lambda': [0.01, 0.1]
              }

scoring = 'r2'

n_splits = 3
cv = KFold(n_splits=n_splits, shuffle=True, random_state=123457)

SearchCV = GridSearchCV(estimator=pipeline,
                        param_grid=parameters,
                        scoring=scoring, 
                        cv=cv,
                        return_train_score=True,
                        verbose=1, 
                        n_jobs=-1)

In [19]:
SearchCV.fit(X_train, y_train)

Fitting 3 folds for each of 384 candidates, totalling 1152 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   46.2s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed:  9.5min
[Parallel(n_jobs=-1)]: Done 796 tasks      | elapsed: 21.7min
[Parallel(n_jobs=-1)]: Done 1152 out of 1152 | elapsed: 25.3min finished
  y = column_or_1d(y, warn=True)


GridSearchCV(cv=KFold(n_splits=3, random_state=123457, shuffle=True),
       error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('scaler', Normalizer(copy=True, norm='l2')), ('pca', PCA(copy=True, iterated_power='auto', n_components=50, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('regressor', LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       importance_type='spli....0, reg_lambda=0.0, silent=True,
       subsample=1.0, subsample_for_bin=200000, subsample_freq=0))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'scaler': [Normalizer(copy=True, norm='l2'), StandardScaler(copy=True, with_mean=True, with_std=True), MinMaxScaler(copy=True, feature_range=(0, 1)), None], 'pca': [PCA(copy=True, iterated_power='auto', n_components=50, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), PCA(...min_child_samples': [16, 18], 'regressor__reg_alpha': [0.01, 0.1], 'regressor__lamb

In [20]:
best_estimator = SearchCV.best_estimator_
best_params = SearchCV.best_params_
print(best_params)
print(SearchCV.best_score_)

{'pca': None, 'regressor__lambda': 0.01, 'regressor__learning_rate': 0.1, 'regressor__max_depth': 6, 'regressor__min_child_samples': 16, 'regressor__reg_alpha': 0.1, 'scaler': None}
0.8506879061968194


In [21]:
y_test_pred = SearchCV.predict(X_test)
test_score = SearchCV.score(X_test, y_test) 
print(" test score ("+scoring+"):", test_score)

 test score (r2): 0.9007278096646715


In [22]:
r_gbm = SearchCV.cv_results_
print(r_gbm)

{'mean_fit_time': array([1.77643482, 1.76876799, 1.78576883, 1.73643263, 1.87777408,
       1.42608142, 1.54775524, 1.38141227, 1.5080862 , 1.39841342,
       1.69876401, 1.33507625, 1.6430939 , 1.64109389, 1.67742928,
       2.03411643, 2.13912233, 1.83477171, 1.99544748, 2.14578938,
       2.42147183, 2.58881474, 3.49353313, 2.69148707, 2.99883819,
       3.1118447 , 3.76921558, 3.2851878 , 3.0038383 , 2.26912975,
       2.29613129, 1.78276857, 2.95950262, 3.11284463, 2.39680378,
       2.39046995, 2.20579274, 2.51314378, 2.22246035, 2.16879066,
       3.27818735, 2.21579337, 2.05178412, 2.00611464, 2.01711535,
       2.55681276, 2.32013273, 2.88983194, 3.20385003, 3.90222303,
       3.15818048, 3.50586732, 2.97517014, 4.18990636, 2.63815085,
       2.86916399, 3.19351602, 3.79155008, 4.14357026, 3.94155892,
       3.03650705, 3.14084633, 2.74282352, 2.43913953, 1.61242565,
       1.49875236, 1.73909934, 1.42474826, 1.70843116, 1.49308538,
       2.66748587, 2.08811951, 2.02678251, 1

In [23]:
print('parans:\n', r_gbm['params'], '\n')
print('mean_test_score:\n', r_gbm['mean_test_score'], '\n')
print('std_test_score:\n', r_gbm['std_test_score'], '\n')
print('mean_train_score:\n', r_gbm['mean_train_score'], '\n')
print('std_train_score:\n', r_gbm['std_train_score'], '\n')

parans:
 [{'pca': PCA(copy=True, iterated_power='auto', n_components=50, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), 'regressor__lambda': 0.01, 'regressor__learning_rate': 0.1, 'regressor__max_depth': 6, 'regressor__min_child_samples': 16, 'regressor__reg_alpha': 0.01, 'scaler': Normalizer(copy=True, norm='l2')}, {'pca': PCA(copy=True, iterated_power='auto', n_components=50, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), 'regressor__lambda': 0.01, 'regressor__learning_rate': 0.1, 'regressor__max_depth': 6, 'regressor__min_child_samples': 16, 'regressor__reg_alpha': 0.01, 'scaler': StandardScaler(copy=True, with_mean=True, with_std=True)}, {'pca': PCA(copy=True, iterated_power='auto', n_components=50, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), 'regressor__lambda': 0.01, 'regressor__learning_rate': 0.1, 'regressor__max_depth': 6, 'regressor__min_child_samples': 16, 'regressor__reg_alpha': 0.01, 'scaler': MinMaxScaler(copy=True, fe

 0.70334513 0.70387887 0.64652851 0.7022942  0.70334512 0.70387885] 

std_test_score:
 [0.03789523 0.01866548 0.0394234  0.0592573  0.04452668 0.01617795
 0.03290123 0.06465623 0.04941231 0.0209825  0.03179703 0.06594742
 0.04617889 0.02112616 0.03416745 0.05843417 0.04116349 0.01782598
 0.03227228 0.06063595 0.03992741 0.02105233 0.03940999 0.0626715
 0.04647445 0.01580941 0.03618334 0.06397618 0.04247122 0.01738765
 0.03214392 0.06190858 0.06760965 0.04956218 0.05419651 0.06760532
 0.06915026 0.04577671 0.05588954 0.06710434 0.07069146 0.05179771
 0.0517673  0.06740101 0.0693587  0.04630118 0.050165   0.06681262
 0.06791878 0.0471246  0.05354225 0.06835533 0.06853046 0.04631846
 0.04986343 0.0679649  0.07051278 0.04153968 0.05131088 0.0673843
 0.06822437 0.04809593 0.05349613 0.06627977 0.04005665 0.01546518
 0.0370097  0.06642708 0.04261679 0.02099681 0.03915909 0.05789232
 0.04277931 0.02736564 0.03159671 0.06738396 0.05041587 0.02241156
 0.0296364  0.06240589 0.04332852 0.02105177

 0.75386589 0.75420547 0.73285743 0.75383895 0.75386587 0.75420546] 

std_train_score:
 [0.01031036 0.00647063 0.00887021 0.0112066  0.01060794 0.00687222
 0.00957959 0.01038141 0.01046451 0.00895281 0.01027062 0.01113255
 0.01213332 0.00829154 0.00989435 0.01057279 0.00884531 0.00607285
 0.01010305 0.00936352 0.00832938 0.00568652 0.00780799 0.00988681
 0.00929218 0.00765295 0.00912286 0.00967387 0.00962629 0.0080423
 0.0084631  0.0094828  0.01346232 0.01067682 0.00639909 0.01503251
 0.01334334 0.00994587 0.00588513 0.01497223 0.01430162 0.0137783
 0.00763501 0.01620394 0.01407419 0.01136349 0.00719025 0.01667046
 0.01547402 0.01272526 0.00958377 0.01541529 0.015393   0.01022961
 0.00723872 0.01574132 0.01530773 0.01088073 0.00864238 0.01665916
 0.01506257 0.01037867 0.00777514 0.01678771 0.01082254 0.00733627
 0.00955971 0.01071827 0.01085597 0.00820526 0.00924968 0.01111432
 0.01019667 0.009801   0.00972858 0.01067536 0.01076175 0.00627483
 0.00982933 0.01131335 0.0098339  0.0072855

In [24]:
filename = r'C:\Users\user\Desktop\git_hub\finished\ml\kaggle房價預測\data\SearchCV_GBM'
pickle.dump(SearchCV, open(filename, 'wb'))