### Path

In [1]:
import os
os.chdir(r'/Users/iflab/Desktop')

### Package

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import pickle

### Data

In [3]:
data = pd.read_csv('mix_platforms.csv', encoding='utf-8',low_memory=False)

In [4]:
data.drop(columns=['platforms','pledged_percent','status'],inplace=True)

In [5]:
data.set_index('project_id',inplace=True)

In [6]:
data.head(3)

Unnamed: 0_level_0,category,location,backers,goal_usd,pledged_usd
project_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
620302213,Art,USD,6,0.01,100.0
9572984,Film & Video,USD,0,0.15,0.0
1379346088,Art,MXN,7,0.49,16.41


In [7]:
data['category'].loc[data['category']=='art']='Art'
data['category'].loc[data['category']=='comics']='Comics'
data['category'].loc[data['category']=='music']='Music'
data['category'].loc[data['category']=='design']='Design'
data['category'].loc[data['category']=='technology']='Technology'
data['category'].loc[data['category']=='food']='Food'
data['category'].loc[data['category']=='photography']='Photography'
data['category'].loc[data['category']=='theatre']='Theater'
data['category'].loc[data['category']=='fashion']='Fashion'
data['category'].loc[data['category']=='dance']='Dance'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [8]:
data.shape

(593011, 5)

In [9]:
data=pd.get_dummies(data,drop_first=True)

In [10]:
data.head(3)

Unnamed: 0_level_0,backers,goal_usd,pledged_usd,category_Comics,category_Crafts,category_Dance,category_Design,category_Fashion,category_Film & Video,category_Food,...,location_EUR,location_GBP,location_HKD,location_JPY,location_MXN,location_NOK,location_NZD,location_SEK,location_SGD,location_USD
project_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
620302213,6,0.01,100.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
9572984,0,0.15,0.0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
1379346088,7,0.49,16.41,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [11]:
data.shape

(593011, 61)

In [None]:
data.to_csv('mix_platforms_preprocessing.csv')

## Train Test Spliting

In [12]:
data_v1 = data.copy()

In [13]:
X = data_v1.drop("pledged_usd", axis=1).values
y = data_v1["pledged_usd"].values.reshape(-1,1)

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [15]:
print(np.mean(y_train),np.std(y_train),'\n')
print(np.mean(y_test),np.std(y_test))

11818.4288975322 123761.02171133753 

12084.047838491548 119781.33735993227


## HyperTune

In [16]:
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor 
from sklearn.linear_model import Lasso,Ridge
from sklearn.svm import LinearSVR

### Decision Tree

In [20]:
pipeline = Pipeline([
                    ('scaler', Normalizer()),
                    ('regressor', DecisionTreeRegressor())
                     ])

parameters = { 
              'scaler': [Normalizer(), MinMaxScaler(), None],
              'regressor__max_depth': [4, 6, 8, 10]
              }

scoring = 'r2'

n_splits = 5
cv = KFold(n_splits=n_splits, shuffle=True, random_state=123457)

SearchCV = GridSearchCV(estimator=pipeline,
                        param_grid=parameters,
                        scoring=scoring, 
                        cv=cv,
                        return_train_score=True,
                        verbose=1, 
                        n_jobs=-1)

In [21]:
SearchCV.fit(X_train,y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:  1.3min
[Parallel(n_jobs=2)]: Done  60 out of  60 | elapsed:  1.8min finished


GridSearchCV(cv=KFold(n_splits=5, random_state=123457, shuffle=True),
             error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('scaler',
                                        Normalizer(copy=True, norm='l2')),
                                       ('regressor',
                                        DecisionTreeRegressor(criterion='mse',
                                                              max_depth=None,
                                                              max_features=None,
                                                              max_leaf_nodes=None,
                                                              min_impurity_decrease=0.0,
                                                              min_impurity_split=None,
                                                              min_samples_leaf=1,
                                                              min_samples_split=2,

In [22]:
best_estimator = SearchCV.best_estimator_
best_params = SearchCV.best_params_
print(best_params)
print(SearchCV.best_score_)

{'regressor__max_depth': 6, 'scaler': None}
0.7397881198482646


In [23]:
y_test_pred = SearchCV.predict(X_test)
test_score = SearchCV.score(X_test, y_test) 
print(" test score ("+scoring+"):", test_score)

 test score (r2): 0.6378497330816209


In [24]:
r_dt=SearchCV.cv_results_

In [25]:
print(r_dt)

{'mean_fit_time': array([2.98776255, 2.45326791, 2.11883445, 2.99934769, 3.12775245,
       3.29102993, 3.60204725, 3.91453295, 3.7185216 , 3.88194342,
       4.3942944 , 4.23350024]), 'std_fit_time': array([0.35927021, 0.13867583, 0.02021315, 0.09293019, 0.04121096,
       0.4851518 , 0.13346905, 0.10630174, 0.05567935, 0.25471042,
       0.07541504, 0.04308548]), 'mean_score_time': array([0.05380774, 0.03929586, 0.01995215, 0.04327435, 0.04332671,
       0.02034225, 0.03928151, 0.03928552, 0.0206841 , 0.04276762,
       0.04015465, 0.01844459]), 'std_score_time': array([0.01862724, 0.00381138, 0.0024549 , 0.002232  , 0.003649  ,
       0.00225314, 0.0016405 , 0.00227   , 0.00148871, 0.00249717,
       0.00215987, 0.00112156]), 'param_regressor__max_depth': masked_array(data=[4, 4, 4, 6, 6, 6, 8, 8, 8, 10, 10, 10],
             mask=[False, False, False, False, False, False, False, False,
                   False, False, False, False],
       fill_value='?',
            dtype=object),

In [26]:
print('parans:\n',r_dt['params'],'\n')
print('mean_test_score:\n',r_dt['mean_test_score'],'\n')
print('std_test_score:\n',r_dt['std_test_score'],'\n')
print('mean_train_score:\n',r_dt['mean_train_score'],'\n')
print('std_train_score:\n',r_dt['std_train_score'],'\n')

parans:
 [{'regressor__max_depth': 4, 'scaler': Normalizer(copy=True, norm='l2')}, {'regressor__max_depth': 4, 'scaler': MinMaxScaler(copy=True, feature_range=(0, 1))}, {'regressor__max_depth': 4, 'scaler': None}, {'regressor__max_depth': 6, 'scaler': Normalizer(copy=True, norm='l2')}, {'regressor__max_depth': 6, 'scaler': MinMaxScaler(copy=True, feature_range=(0, 1))}, {'regressor__max_depth': 6, 'scaler': None}, {'regressor__max_depth': 8, 'scaler': Normalizer(copy=True, norm='l2')}, {'regressor__max_depth': 8, 'scaler': MinMaxScaler(copy=True, feature_range=(0, 1))}, {'regressor__max_depth': 8, 'scaler': None}, {'regressor__max_depth': 10, 'scaler': Normalizer(copy=True, norm='l2')}, {'regressor__max_depth': 10, 'scaler': MinMaxScaler(copy=True, feature_range=(0, 1))}, {'regressor__max_depth': 10, 'scaler': None}] 

mean_test_score:
 [0.28592639 0.69023048 0.67026438 0.49225584 0.71701483 0.73978812
 0.57548952 0.73070497 0.72574268 0.57482358 0.70445892 0.71576546] 

std_test_score

In [27]:
filename = 'SearchCV_DecisionTree'
pickle.dump(SearchCV, open(filename, 'wb'))

### Random Forest

In [18]:
pipeline = Pipeline([
                      ('scaler', Normalizer()),
                      ('regressor', RandomForestRegressor())
                     ])

parameters = {    
              'scaler': [Normalizer(), MinMaxScaler(), None],
              'regressor__max_depth': [4, 6, 8]
              }

scoring = 'r2'

n_splits = 5
cv = KFold(n_splits=n_splits, shuffle=True, random_state=123457)

SearchCV = GridSearchCV(estimator=pipeline,
                        param_grid=parameters,
                        scoring=scoring, 
                        cv=cv,
                        return_train_score=True,
                        verbose=1, 
                        n_jobs=-1)

In [19]:
SearchCV.fit(X_train,y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:  6.1min finished
  self._final_estimator.fit(Xt, y, **fit_params)


GridSearchCV(cv=KFold(n_splits=5, random_state=123457, shuffle=True),
             error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('scaler',
                                        Normalizer(copy=True, norm='l2')),
                                       ('regressor',
                                        RandomForestRegressor(bootstrap=True,
                                                              criterion='mse',
                                                              max_depth=None,
                                                              max_features='auto',
                                                              max_leaf_nodes=None,
                                                              min_impurity_decrease=0.0,
                                                              min_impurity_split=None...
                                                              n_estimators='warn',

In [20]:
best_estimator = SearchCV.best_estimator_
best_params = SearchCV.best_params_
print(best_params)
print(SearchCV.best_score_)

{'regressor__max_depth': 8, 'scaler': None}
0.7587036524239468


In [21]:
y_test_pred = SearchCV.predict(X_test)
test_score = SearchCV.score(X_test, y_test) 
print(" test score ("+scoring+"):", test_score)

 test score (r2): 0.7534544913376432


In [22]:
r_dt=SearchCV.cv_results_

In [23]:
print(r_dt)

{'mean_fit_time': array([26.10589681, 21.16363716, 21.57030988, 30.45291376, 30.43098078,
       30.37619061, 41.06919284, 39.60113235, 34.37029519]), 'std_fit_time': array([1.78813062, 0.5531927 , 0.38874875, 0.66656019, 0.06727203,
       0.37085243, 0.97843173, 0.48680535, 6.41417137]), 'mean_score_time': array([0.10879788, 0.10626936, 0.06745949, 0.11311297, 0.11639848,
       0.07822523, 0.13526702, 0.13756738, 0.07273145]), 'std_score_time': array([0.00536239, 0.00668117, 0.00723919, 0.01032546, 0.00425384,
       0.00688106, 0.00796496, 0.00932761, 0.01892694]), 'param_regressor__max_depth': masked_array(data=[4, 4, 4, 6, 6, 6, 8, 8, 8],
             mask=[False, False, False, False, False, False, False, False,
                   False],
       fill_value='?',
            dtype=object), 'param_scaler': masked_array(data=[Normalizer(copy=True, norm='l2'),
                   MinMaxScaler(copy=True, feature_range=(0, 1)), None,
                   Normalizer(copy=True, norm='l2'),
 

In [24]:
print('parans:\n',r_dt['params'],'\n')
print('mean_test_score:\n',r_dt['mean_test_score'],'\n')
print('std_test_score:\n',r_dt['std_test_score'],'\n')
print('mean_train_score:\n',r_dt['mean_train_score'],'\n')
print('std_train_score:\n',r_dt['std_train_score'],'\n')

parans:
 [{'regressor__max_depth': 4, 'scaler': Normalizer(copy=True, norm='l2')}, {'regressor__max_depth': 4, 'scaler': MinMaxScaler(copy=True, feature_range=(0, 1))}, {'regressor__max_depth': 4, 'scaler': None}, {'regressor__max_depth': 6, 'scaler': Normalizer(copy=True, norm='l2')}, {'regressor__max_depth': 6, 'scaler': MinMaxScaler(copy=True, feature_range=(0, 1))}, {'regressor__max_depth': 6, 'scaler': None}, {'regressor__max_depth': 8, 'scaler': Normalizer(copy=True, norm='l2')}, {'regressor__max_depth': 8, 'scaler': MinMaxScaler(copy=True, feature_range=(0, 1))}, {'regressor__max_depth': 8, 'scaler': None}] 

mean_test_score:
 [0.28662063 0.67101106 0.71337462 0.55060827 0.72834983 0.7343493
 0.6466343  0.74823976 0.75870365] 

std_test_score:
 [0.0731086  0.09568673 0.08294244 0.09001382 0.11673942 0.09993877
 0.12522519 0.09672122 0.08951917] 

mean_train_score:
 [0.3352741  0.77808354 0.78582929 0.64434524 0.85493423 0.8547613
 0.81341707 0.88912803 0.89069499] 

std_train_sc

In [25]:
filename = 'SearchCV_RandomForest'
pickle.dump(SearchCV, open(filename, 'wb'))

### AdaBoost

In [17]:
pipeline = Pipeline([
                    ('scaler', Normalizer()),                 
                    ('regressor', AdaBoostRegressor())
                     ])

parameters = {    
                'scaler': [ Normalizer(), MinMaxScaler(), None],          
                'regressor__n_estimators': [100, 200, 300, 400],
                'regressor__learning_rate': [0.1, 0.01]
              }

scoring = 'r2'

n_splits = 3
cv = KFold(n_splits=n_splits, shuffle=True, random_state=123457)

SearchCV = GridSearchCV(estimator=pipeline,
                        param_grid=parameters,
                        scoring=scoring, 
                        cv=cv,
                        return_train_score=True,
                        verbose=1, 
                        n_jobs=-1)


In [18]:
SearchCV.fit(X_train,y_train)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 60.5min
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed: 139.9min finished
  y = column_or_1d(y, warn=True)


GridSearchCV(cv=KFold(n_splits=3, random_state=123457, shuffle=True),
             error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('scaler',
                                        Normalizer(copy=True, norm='l2')),
                                       ('regressor',
                                        AdaBoostRegressor(base_estimator=None,
                                                          learning_rate=1.0,
                                                          loss='linear',
                                                          n_estimators=50,
                                                          random_state=None))],
                                verbose=False),
             iid='warn', n_jobs=-1,
             param_grid={'regressor__learning_rate': [0.1, 0.01],
                         'regressor__n_estimators': [100, 200, 300, 400],
                         'scaler': [Normalizer(copy=T

In [19]:
best_estimator = SearchCV.best_estimator_
best_params = SearchCV.best_params_
print(best_params)
print(SearchCV.best_score_)

{'regressor__learning_rate': 0.01, 'regressor__n_estimators': 100, 'scaler': None}
0.648895431328589


In [20]:
y_test_pred = SearchCV.predict(X_test)
test_score = SearchCV.score(X_test, y_test) 
print(" test score ("+scoring+"):", test_score)

 test score (r2): 0.610495783989793


In [21]:
r_dt=SearchCV.cv_results_

In [22]:
print(r_dt)

{'mean_fit_time': array([221.58707444, 199.30616713, 193.69772037, 329.33251166,
       345.96807472, 323.63848646, 275.22223306, 421.54165999,
       433.16904283, 255.07317702, 549.87216051, 532.35666958,
       227.12427028, 205.87080264, 207.78822764, 446.28628238,
       410.82063158, 416.25650104, 672.7289513 , 634.06869499,
       624.47824907, 887.03042936, 820.32915934, 652.40309111]), 'std_fit_time': array([ 21.17907162,  10.11238873,   2.13599463, 124.42082974,
        13.33107387,  45.12661063,  76.74651542, 139.19693934,
       122.77458171,  92.67491094, 177.57516561, 204.19972785,
         0.93935486,   0.84017702,   0.90528422,   1.45183278,
         3.87313683,   3.29495453,   2.65482809,   0.92265405,
        13.21896925,   1.53815349,   1.81429779, 121.14070745]), 'mean_score_time': array([ 3.48575425,  3.6734612 ,  3.61869438,  6.23136727,  7.43980018,
        6.91250165,  5.20224722,  9.08342608,  8.85476295,  4.48959057,
       11.917418  , 11.10935911,  3.6313862

In [23]:
print('parans:\n',r_dt['params'],'\n')
print('mean_test_score:\n',r_dt['mean_test_score'],'\n')
print('std_test_score:\n',r_dt['std_test_score'],'\n')
print('mean_train_score:\n',r_dt['mean_train_score'],'\n')
print('std_train_score:\n',r_dt['std_train_score'],'\n')

parans:
 [{'regressor__learning_rate': 0.1, 'regressor__n_estimators': 100, 'scaler': Normalizer(copy=True, norm='l2')}, {'regressor__learning_rate': 0.1, 'regressor__n_estimators': 100, 'scaler': MinMaxScaler(copy=True, feature_range=(0, 1))}, {'regressor__learning_rate': 0.1, 'regressor__n_estimators': 100, 'scaler': None}, {'regressor__learning_rate': 0.1, 'regressor__n_estimators': 200, 'scaler': Normalizer(copy=True, norm='l2')}, {'regressor__learning_rate': 0.1, 'regressor__n_estimators': 200, 'scaler': MinMaxScaler(copy=True, feature_range=(0, 1))}, {'regressor__learning_rate': 0.1, 'regressor__n_estimators': 200, 'scaler': None}, {'regressor__learning_rate': 0.1, 'regressor__n_estimators': 300, 'scaler': Normalizer(copy=True, norm='l2')}, {'regressor__learning_rate': 0.1, 'regressor__n_estimators': 300, 'scaler': MinMaxScaler(copy=True, feature_range=(0, 1))}, {'regressor__learning_rate': 0.1, 'regressor__n_estimators': 300, 'scaler': None}, {'regressor__learning_rate': 0.1, 'r

In [24]:
filename = 'SearchCV_AdaBoost'
pickle.dump(SearchCV, open(filename, 'wb'))

### Lasso Regression

In [25]:
pipeline = Pipeline([
                    ('scaler', StandardScaler()),                 
                    ('regressor', Lasso())
                     ])

parameters = {    
                'scaler': [StandardScaler(), MinMaxScaler(), None],          
                'regressor__normalize': [False, True],
                'regressor__alpha': np.logspace(-4, 0, 5)
              }

scoring = 'r2'

n_splits = 5
cv = KFold(n_splits=n_splits, shuffle=True, random_state=123457)

SearchCV = GridSearchCV(estimator=pipeline,
                        param_grid=parameters,
                        scoring=scoring, 
                        cv=cv,
                        return_train_score=True,
                        verbose=1, 
                        n_jobs=-1)

In [26]:
SearchCV.fit(X_train,y_train)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  7.4min finished


GridSearchCV(cv=KFold(n_splits=5, random_state=123457, shuffle=True),
             error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('scaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('regressor',
                                        Lasso(alpha=1.0, copy_X=True,
                                              fit_intercept=True, max_iter=1000,
                                              normalize=False, positive=False,
                                              precompute=False,
                                              random_state=None,
                                              selecti...
                                              warm_start=False))],
                                verbose=Fals

In [27]:
best_estimator = SearchCV.best_estimator_
best_params = SearchCV.best_params_
print(best_params)
print(SearchCV.best_score_)

{'regressor__alpha': 1.0, 'regressor__normalize': True, 'scaler': MinMaxScaler(copy=True, feature_range=(0, 1))}
0.5093228285978368


In [28]:
y_test_pred = SearchCV.predict(X_test)
test_score = SearchCV.score(X_test, y_test) 
print(" test score ("+scoring+"):", test_score)

 test score (r2): 0.5924377219747763


In [29]:
r_dt=SearchCV.cv_results_

In [30]:
print(r_dt)

{'mean_fit_time': array([21.65586591, 20.65540919, 22.55563045, 14.47856879, 13.80966468,
       13.20497952, 19.28855743, 17.67630172, 20.29647403, 11.54968495,
       11.11240764, 10.47004194, 16.40492754, 14.74931521, 17.17800217,
        7.60365491,  6.94220891,  6.12770348, 13.56931376, 11.46197329,
       13.33061991,  3.6578475 ,  2.66451159,  2.18678279, 10.4548769 ,
        6.82462559,  8.30059342,  2.8640378 ,  2.05385656,  1.6019969 ]), 'std_fit_time': array([0.40576291, 0.15794638, 0.38558457, 0.4584532 , 0.24905994,
       0.28968038, 0.15356646, 0.18479274, 0.37505222, 0.12447201,
       0.12196851, 0.15770066, 0.24278027, 0.40525514, 0.51005581,
       0.67013899, 0.78565524, 0.68018711, 0.21283528, 0.30000254,
       0.78571796, 0.30442473, 0.04152808, 0.16140819, 0.26760084,
       0.50720592, 0.51139786, 0.17285909, 0.04536165, 0.02355806]), 'mean_score_time': array([0.07968197, 0.07758522, 0.01499419, 0.08120117, 0.08780041,
       0.01605067, 0.08812709, 0.08130293,

In [31]:
print('parans:\n',r_dt['params'],'\n')
print('mean_test_score:\n',r_dt['mean_test_score'],'\n')
print('std_test_score:\n',r_dt['std_test_score'],'\n')
print('mean_train_score:\n',r_dt['mean_train_score'],'\n')
print('std_train_score:\n',r_dt['std_train_score'],'\n')

parans:
 [{'regressor__alpha': 0.0001, 'regressor__normalize': False, 'scaler': StandardScaler(copy=True, with_mean=True, with_std=True)}, {'regressor__alpha': 0.0001, 'regressor__normalize': False, 'scaler': MinMaxScaler(copy=True, feature_range=(0, 1))}, {'regressor__alpha': 0.0001, 'regressor__normalize': False, 'scaler': None}, {'regressor__alpha': 0.0001, 'regressor__normalize': True, 'scaler': StandardScaler(copy=True, with_mean=True, with_std=True)}, {'regressor__alpha': 0.0001, 'regressor__normalize': True, 'scaler': MinMaxScaler(copy=True, feature_range=(0, 1))}, {'regressor__alpha': 0.0001, 'regressor__normalize': True, 'scaler': None}, {'regressor__alpha': 0.001, 'regressor__normalize': False, 'scaler': StandardScaler(copy=True, with_mean=True, with_std=True)}, {'regressor__alpha': 0.001, 'regressor__normalize': False, 'scaler': MinMaxScaler(copy=True, feature_range=(0, 1))}, {'regressor__alpha': 0.001, 'regressor__normalize': False, 'scaler': None}, {'regressor__alpha': 0.0

In [32]:
filename = 'SearchCV_Lasso'
pickle.dump(SearchCV, open(filename, 'wb'))

### MLP Regressor

In [33]:
pipeline = Pipeline([
                    ('scaler', Normalizer()),                 
                    ('regressor', MLPRegressor())
                     ])

parameters = {    
                'scaler': [Normalizer(), MinMaxScaler(), None],          
                'regressor__hidden_layer_sizes': [(32), (64), (128), (64, 64)],
                'regressor__activation': ['relu'],
                'regressor__solver': ['lbfgs'],
                'regressor__alpha': [0.1, 0.01],
                'regressor__batch_size': ['auto'], 
                'regressor__learning_rate': ['constant'],
                'regressor__learning_rate_init': [0.01],
                'regressor__max_iter': [200]
              }

scoring = 'r2'

n_splits = 3
cv = KFold(n_splits=n_splits, shuffle=True, random_state=123457)

SearchCV = GridSearchCV(estimator=pipeline,
                        param_grid=parameters,
                        scoring=scoring, 
                        cv=cv,
                        return_train_score=True,
                        verbose=1, 
                        n_jobs=-1)

In [34]:
SearchCV.fit(X_train,y_train)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 31.7min
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed: 60.2min finished
  y = column_or_1d(y, warn=True)


GridSearchCV(cv=KFold(n_splits=3, random_state=123457, shuffle=True),
             error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('scaler',
                                        Normalizer(copy=True, norm='l2')),
                                       ('regressor',
                                        MLPRegressor(activation='relu',
                                                     alpha=0.0001,
                                                     batch_size='auto',
                                                     beta_1=0.9, beta_2=0.999,
                                                     early_stopping=False,
                                                     epsilon=1e-08,
                                                     hidden_layer_sizes=(100,),
                                                     learning_r...
                         'regressor__hidden_layer_sizes': [32, 64, 128,
         

In [35]:
best_estimator = SearchCV.best_estimator_
best_params = SearchCV.best_params_
print(best_params)
print(SearchCV.best_score_)

{'regressor__activation': 'relu', 'regressor__alpha': 0.1, 'regressor__batch_size': 'auto', 'regressor__hidden_layer_sizes': 64, 'regressor__learning_rate': 'constant', 'regressor__learning_rate_init': 0.01, 'regressor__max_iter': 200, 'regressor__solver': 'lbfgs', 'scaler': None}
0.5473576139719951


In [36]:
y_test_pred = SearchCV.predict(X_test)
test_score = SearchCV.score(X_test, y_test) 
print(" test score ("+scoring+"):", test_score)

 test score (r2): 0.6032325083454158


In [37]:
r_dt=SearchCV.cv_results_

In [38]:
print(r_dt)

{'mean_fit_time': array([ 65.33528789,  65.31971184,  55.28994592, 147.09206899,
       146.09891454, 148.17036438, 282.5468878 , 280.83875394,
       288.30925306, 306.82461214, 301.55317465, 254.28775358,
        63.58463073,  63.74197229,  62.61407272, 145.60055971,
       144.00814017, 132.34722765, 326.26563295, 288.12685712,
       245.33574494, 306.08483934, 303.30921006, 245.80762434]), 'std_fit_time': array([ 0.48704783,  0.987063  , 15.43473104,  0.48796178,  0.69868957,
        0.88440977,  2.12793512,  0.98788218,  3.19182219,  2.08922381,
        0.16016405, 89.08992204,  0.48152097,  1.00530483,  4.89648046,
        1.01395647,  2.07950817, 35.56093476,  0.50610444, 11.74386737,
       42.71606848,  0.78840554,  1.65534025, 37.5515289 ]), 'mean_score_time': array([0.14431135, 0.15273873, 0.09388574, 0.20579958, 0.20918218,
       0.15705991, 0.38428156, 0.41411829, 0.34785565, 0.40169772,
       0.41326165, 0.33122786, 0.15415533, 0.1519533 , 0.09094628,
       0.21378398

In [39]:
print('parans:\n',r_dt['params'],'\n')
print('mean_test_score:\n',r_dt['mean_test_score'],'\n')
print('std_test_score:\n',r_dt['std_test_score'],'\n')
print('mean_train_score:\n',r_dt['mean_train_score'],'\n')
print('std_train_score:\n',r_dt['std_train_score'],'\n')

parans:
 [{'regressor__activation': 'relu', 'regressor__alpha': 0.1, 'regressor__batch_size': 'auto', 'regressor__hidden_layer_sizes': 32, 'regressor__learning_rate': 'constant', 'regressor__learning_rate_init': 0.01, 'regressor__max_iter': 200, 'regressor__solver': 'lbfgs', 'scaler': Normalizer(copy=True, norm='l2')}, {'regressor__activation': 'relu', 'regressor__alpha': 0.1, 'regressor__batch_size': 'auto', 'regressor__hidden_layer_sizes': 32, 'regressor__learning_rate': 'constant', 'regressor__learning_rate_init': 0.01, 'regressor__max_iter': 200, 'regressor__solver': 'lbfgs', 'scaler': MinMaxScaler(copy=True, feature_range=(0, 1))}, {'regressor__activation': 'relu', 'regressor__alpha': 0.1, 'regressor__batch_size': 'auto', 'regressor__hidden_layer_sizes': 32, 'regressor__learning_rate': 'constant', 'regressor__learning_rate_init': 0.01, 'regressor__max_iter': 200, 'regressor__solver': 'lbfgs', 'scaler': None}, {'regressor__activation': 'relu', 'regressor__alpha': 0.1, 'regressor__b

In [40]:
filename = 'SearchCV_MLP'
pickle.dump(SearchCV, open(filename, 'wb'))

### XGBoost

In [41]:
pipeline = Pipeline([
                      ('scaler', Normalizer()),
                      ('regressor', xgb.XGBRegressor())
                     ])

parameters = {    
              'scaler': [Normalizer(), MinMaxScaler(), None],
              'regressor__learning_rate': [0.1, 0.01],
              'regressor__max_depth': [4, 6, 8],
              'regressor__min_child_weight': [1, 3],
              'regressor__gamma': [0.0, 0.1, 0.2],
              'regressor__colsample_bytree': [0.3, 0.5, 0.7, 1]
              }

scoring = 'r2'

n_splits = 3
cv = KFold(n_splits=n_splits, shuffle=True, random_state=123457)

SearchCV = GridSearchCV(estimator=pipeline,
                        param_grid=parameters,
                        scoring=scoring, 
                        cv=cv,
                        return_train_score=True,
                        verbose=1, 
                        n_jobs=-1)

In [42]:
SearchCV.fit(X_train,y_train)

Fitting 3 folds for each of 432 candidates, totalling 1296 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 18.6min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 81.5min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 201.2min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 402.8min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed: 736.1min
[Parallel(n_jobs=-1)]: Done 1296 out of 1296 | elapsed: 779.0min finished




GridSearchCV(cv=KFold(n_splits=3, random_state=123457, shuffle=True),
             error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('scaler',
                                        Normalizer(copy=True, norm='l2')),
                                       ('regressor',
                                        XGBRegressor(base_score=0.5,
                                                     booster='gbtree',
                                                     colsample_bylevel=1,
                                                     colsample_bynode=1,
                                                     colsample_bytree=1,
                                                     gamma=0,
                                                     importance_type='gain',
                                                     learning_rate=0.1,
                                                     max_d...
             param_grid={'re

In [43]:
best_estimator = SearchCV.best_estimator_
best_params = SearchCV.best_params_
print(best_params)
print(SearchCV.best_score_)

{'regressor__colsample_bytree': 0.7, 'regressor__gamma': 0.0, 'regressor__learning_rate': 0.1, 'regressor__max_depth': 8, 'regressor__min_child_weight': 1, 'scaler': MinMaxScaler(copy=True, feature_range=(0, 1))}
0.7596090028797056


In [44]:
y_test_pred = SearchCV.predict(X_test)
test_score = SearchCV.score(X_test, y_test) 
print(" test score ("+scoring+"):", test_score)

 test score (r2): 0.7875404099265876


In [45]:
r_dt=SearchCV.cv_results_

In [46]:
print(r_dt)

{'mean_fit_time': array([ 85.03228227,  81.2793409 ,  85.88820974, 100.62872179,
        95.53083595,  85.61545857, 103.34528248,  98.18514935,
        98.26147834, 103.58297515,  97.85068162,  98.237173  ,
       125.90761201, 115.21949975, 114.98412387, 123.72440434,
       114.89533059, 113.89263741,  84.57555946,  81.81584024,
        81.85847012,  85.08403476,  82.00047461,  81.88234011,
       104.39531477,  98.39975937,  98.33510669, 103.49279952,
        98.68612663,  98.28831959, 124.95710079, 116.36088236,
       119.8023862 , 128.31408898, 115.43061757, 114.84433126,
        82.52027472,  79.6548907 ,  79.42980353,  81.80671573,
        79.77137661,  79.63494611, 100.46102317,  95.63144875,
        95.56378961, 100.21471667,  95.68352938,  95.65943352,
       122.29075027, 112.72558331, 112.51241104, 121.4545548 ,
       112.56627806, 112.71632202,  83.79697529,  82.23155125,
        81.29320971,  83.62075456,  80.60865394,  80.51200867,
       101.68993743,  97.00126576,  9

In [47]:
print('parans:\n',r_dt['params'],'\n')
print('mean_test_score:\n',r_dt['mean_test_score'],'\n')
print('std_test_score:\n',r_dt['std_test_score'],'\n')
print('mean_train_score:\n',r_dt['mean_train_score'],'\n')
print('std_train_score:\n',r_dt['std_train_score'],'\n')

parans:
 [{'regressor__colsample_bytree': 0.3, 'regressor__gamma': 0.0, 'regressor__learning_rate': 0.1, 'regressor__max_depth': 4, 'regressor__min_child_weight': 1, 'scaler': Normalizer(copy=True, norm='l2')}, {'regressor__colsample_bytree': 0.3, 'regressor__gamma': 0.0, 'regressor__learning_rate': 0.1, 'regressor__max_depth': 4, 'regressor__min_child_weight': 1, 'scaler': MinMaxScaler(copy=True, feature_range=(0, 1))}, {'regressor__colsample_bytree': 0.3, 'regressor__gamma': 0.0, 'regressor__learning_rate': 0.1, 'regressor__max_depth': 4, 'regressor__min_child_weight': 1, 'scaler': None}, {'regressor__colsample_bytree': 0.3, 'regressor__gamma': 0.0, 'regressor__learning_rate': 0.1, 'regressor__max_depth': 4, 'regressor__min_child_weight': 3, 'scaler': Normalizer(copy=True, norm='l2')}, {'regressor__colsample_bytree': 0.3, 'regressor__gamma': 0.0, 'regressor__learning_rate': 0.1, 'regressor__max_depth': 4, 'regressor__min_child_weight': 3, 'scaler': MinMaxScaler(copy=True, feature_ran

In [48]:
filename = 'SearchCV_XGBoost'
pickle.dump(SearchCV, open(filename, 'wb'))

### SVM

In [49]:
pipeline = Pipeline([
                      ('scaler', Normalizer()),
                      ('regressor', LinearSVR())
                     ])

parameters = {    
              'scaler': [Normalizer(), MinMaxScaler(), None],
              'regressor__C': [1, 0.1, 0.01]
              }

scoring = 'r2'

n_splits = 3
cv = KFold(n_splits=n_splits, shuffle=True, random_state=123457)

SearchCV = GridSearchCV(estimator=pipeline,
                        param_grid=parameters,
                        scoring=scoring, 
                        cv=cv,
                        return_train_score=True,
                        verbose=1, 
                        n_jobs=-1)

In [50]:
SearchCV.fit(X_train,y_train)

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:  3.9min finished
  y = column_or_1d(y, warn=True)


GridSearchCV(cv=KFold(n_splits=3, random_state=123457, shuffle=True),
             error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('scaler',
                                        Normalizer(copy=True, norm='l2')),
                                       ('regressor',
                                        LinearSVR(C=1.0, dual=True, epsilon=0.0,
                                                  fit_intercept=True,
                                                  intercept_scaling=1.0,
                                                  loss='epsilon_insensitive',
                                                  max_iter=1000,
                                                  random_state=None, tol=0.0001,
                                                  verbose=0))],
                                verbose=False),
             iid='warn', n_jobs=-1,
             param_grid={'regressor__C': [1, 0.1, 0.01],
        

In [51]:
best_estimator = SearchCV.best_estimator_
best_params = SearchCV.best_params_
print(best_params)
print(SearchCV.best_score_)

{'regressor__C': 0.01, 'scaler': None}
0.474239347286399


In [52]:
y_test_pred = SearchCV.predict(X_test)
test_score = SearchCV.score(X_test, y_test) 
print(" test score ("+scoring+"):", test_score)

 test score (r2): 0.541218650722493


In [53]:
r_dt=SearchCV.cv_results_

In [54]:
print(r_dt)

{'mean_fit_time': array([ 0.88728102,  1.11157211, 99.10561959,  1.12351274,  1.29156899,
       85.24658942,  1.24985329,  1.29504569, 67.77525107]), 'std_fit_time': array([3.83551904e-02, 2.48020187e-02, 2.19269865e-01, 1.00927047e-02,
       2.13526324e-03, 9.39169334e+00, 9.52747647e-02, 6.68429453e-02,
       1.08608462e+01]), 'mean_score_time': array([0.07993094, 0.07470926, 0.01687749, 0.07298207, 0.07894532,
       0.01582424, 0.07265306, 0.07351263, 0.01107637]), 'std_score_time': array([0.0047576 , 0.0144949 , 0.00055579, 0.00290023, 0.00349196,
       0.00029896, 0.00018965, 0.0018397 , 0.00308277]), 'param_regressor__C': masked_array(data=[1, 1, 1, 0.1, 0.1, 0.1, 0.01, 0.01, 0.01],
             mask=[False, False, False, False, False, False, False, False,
                   False],
       fill_value='?',
            dtype=object), 'param_scaler': masked_array(data=[Normalizer(copy=True, norm='l2'),
                   MinMaxScaler(copy=True, feature_range=(0, 1)), None,
    

In [55]:
print('parans:\n',r_dt['params'],'\n')
print('mean_test_score:\n',r_dt['mean_test_score'],'\n')
print('std_test_score:\n',r_dt['std_test_score'],'\n')
print('mean_train_score:\n',r_dt['mean_train_score'],'\n')
print('std_train_score:\n',r_dt['std_train_score'],'\n')

parans:
 [{'regressor__C': 1, 'scaler': Normalizer(copy=True, norm='l2')}, {'regressor__C': 1, 'scaler': MinMaxScaler(copy=True, feature_range=(0, 1))}, {'regressor__C': 1, 'scaler': None}, {'regressor__C': 0.1, 'scaler': Normalizer(copy=True, norm='l2')}, {'regressor__C': 0.1, 'scaler': MinMaxScaler(copy=True, feature_range=(0, 1))}, {'regressor__C': 0.1, 'scaler': None}, {'regressor__C': 0.01, 'scaler': Normalizer(copy=True, norm='l2')}, {'regressor__C': 0.01, 'scaler': MinMaxScaler(copy=True, feature_range=(0, 1))}, {'regressor__C': 0.01, 'scaler': None}] 

mean_test_score:
 [-0.00700683 -0.00697571 -0.41007798 -0.00728709 -0.00721662  0.074646
 -0.00777759 -0.00776646  0.47423935] 

std_test_score:
 [1.35929381e-04 1.52209578e-04 5.65066999e-01 1.54097612e-04
 1.56748616e-04 2.30196831e-01 1.61162566e-04 1.61003556e-04
 3.25697138e-02] 

mean_train_score:
 [-0.00700585 -0.00697539 -0.55766856 -0.00728637 -0.00721599  0.06143198
 -0.00777689 -0.00776577  0.47779272] 

std_train_scor

In [56]:
filename = 'SearchCV_SVM'
pickle.dump(SearchCV, open(filename, 'wb'))