In [25]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import KFold
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import make_scorer, mean_squared_error, r2_score

In [26]:
# Read the raw ENB2012 table, then separate the two target columns
# (Y1, Y2) from the remaining feature columns.
file = "../../data/raw/ENB2012_data.csv"
df = pd.read_csv(file)
y1, y2 = df['Y1'], df['Y2']
X = df.drop(columns=['Y1', 'Y2'])

In [27]:
# Column roles: X6 and X8 are one-hot encoded, the remaining X columns
# are min-max scaled, and Y1/Y2 are the regression targets.
cat_ftrs = ['X6','X8']
num_ftrs = ['X1','X2','X3','X4','X5','X7']
target_ftrs = ['Y1', 'Y2']

## MinMaxScaler for num_ftrs
scaler = MinMaxScaler()
num_transformer = make_pipeline(scaler)

## One-hot encoding for cat_ftrs (dense output).
## scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` and
## 1.4 removed the old name, so try the new spelling first and fall back
## to the old one on releases that do not know it yet.
try:
    ohe = OneHotEncoder(sparse_output=False, categories="auto")
except TypeError:
    ohe = OneHotEncoder(sparse=False, categories="auto")
cat_transformer = make_pipeline(ohe)

## Create a preprocessor that applies each transformer to its own columns
preprocessor = ColumnTransformer(transformers=[
    ('num', num_transformer, num_ftrs),
    ('cat', cat_transformer, cat_ftrs)
])

In [29]:
def ML_pipeline_GridSearchCV_kfold(X, y, seed, n_folds, 
                                 reg, param_grid,
                                 scoring='neg_mean_squared_error'):
    """Tune ``reg`` with a K-fold grid search and score it on a held-out test set.

    The data is split 80/20 into an "other" part and a test part. A pipeline
    made of the module-level ``preprocessor`` followed by ``reg`` is tuned
    with ``GridSearchCV`` on the "other" part, and the refitted best model is
    then scored on the test part.

    Parameters
    ----------
    X : feature table
        Must provide the columns that the module-level ``preprocessor``
        selects by name.
    y : target values aligned with ``X``.
    seed : int
        ``random_state`` of the train/test split.
    n_folds : int
        Number of (unshuffled) K-fold splits used inside the grid search.
    reg : estimator
        The regressor; it becomes the ``'regressor'`` pipeline step.
    param_grid : dict
        Hyperparameters to tune. Keys address pipeline steps, e.g.
        ``'regressor__alpha'``.
    scoring : str or callable, default ``'neg_mean_squared_error'``
        Scorer used for model selection and for the returned test score.
        The default is the negated mean squared error, so larger is better
        and the returned score is <= 0.

    Returns
    -------
    (grid, test_score)
        The fitted ``GridSearchCV`` object and its score on the test part.
    """
    import inspect

    X_other, X_test, y_other, y_test = train_test_split(
        X, y, test_size=0.2, random_state=seed)
    # train_test_split already shuffled the rows, so the folds need no shuffle.
    kf = KFold(n_splits=n_folds)
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', reg)])

    search_kwargs = dict(param_grid=param_grid, n_jobs=-1,
                         cv=kf, scoring=scoring)
    # `iid` was removed from GridSearchCV in scikit-learn 0.24 and passing it
    # there raises TypeError. Only forward it on releases that still accept
    # it, which keeps the per-fold (non-iid) averaging requested originally.
    if 'iid' in inspect.signature(GridSearchCV).parameters:
        search_kwargs['iid'] = False

    grid = GridSearchCV(pipe, **search_kwargs)
    grid.fit(X_other, y_other)
    return grid, grid.score(X_test, y_test)

### Lasso linear regression

In [5]:
from sklearn.linear_model import Lasso

In [6]:
# Tune the Lasso penalty over a log-spaced grid of alphas, repeating the
# split / search / test cycle for ten different split seeds on target y2.
lasso = Lasso(max_iter=10000)
param_grid = dict(regressor__alpha=np.logspace(-4, 2, 11))
best_scores, best_params = [], []
for i in range(10):
    grid, test_score = ML_pipeline_GridSearchCV_kfold(
        X, y2, seed=42*i, n_folds=5, reg=lasso, param_grid=param_grid)
    best_scores += [test_score]
    best_params += [grid.best_params_]

In [7]:
# Scores returned by ML_pipeline_GridSearchCV_kfold are negated MSE values,
# so the mean is sign-flipped for display; the spread needs no flip.
print("Mean of the best score is {:.3f}.".format(-np.mean(best_scores)))
print("std of the best score is {:.3f}.".format(np.std(best_scores)))

Mean of the best score is 10.044.
std of the best score is 1.183.


In [8]:
# Hyperparameters picked by the grid search in each repeated Lasso run.
best_params

[{'regressor__alpha': 0.00039810717055349735},
 {'regressor__alpha': 0.00039810717055349735},
 {'regressor__alpha': 0.00039810717055349735},
 {'regressor__alpha': 0.0001},
 {'regressor__alpha': 0.00039810717055349735},
 {'regressor__alpha': 0.00039810717055349735},
 {'regressor__alpha': 0.00039810717055349735},
 {'regressor__alpha': 0.00039810717055349735},
 {'regressor__alpha': 0.0001},
 {'regressor__alpha': 0.00039810717055349735}]

### RandomForest regression

In [30]:
from sklearn.ensemble import RandomForestRegressor

In [10]:
# Random forest with a fixed model seed; the search covers five tree depths
# between 2 and 20 and the even min_samples_split values from 2 to 10.
rfr = RandomForestRegressor(n_estimators=20, random_state=0)
max_depths = np.linspace(2, 20, num=5).astype(int).tolist()
min_samples_splits = range(2, 12, 2)
param_grid = dict(regressor__max_depth=max_depths,
                  regressor__min_samples_split=min_samples_splits)

In [11]:
# Ten repetitions with the same forest but a different train/test split
# seed each time; keep the test score and chosen hyperparameters per run.
best_scores, best_params = [], []
for i in range(10):
    grid, test_score = ML_pipeline_GridSearchCV_kfold(
        X, y2,
        seed=42*i,
        n_folds=5,
        reg=rfr,
        param_grid=param_grid,
    )
    best_scores += [test_score]
    best_params += [grid.best_params_]

In [12]:
# Scores returned by ML_pipeline_GridSearchCV_kfold are negated MSE values,
# so the mean is sign-flipped for display.
print("Mean of the best score is {:.3f}.".format(-np.mean(best_scores)))
print("std of the best score is {:.3f}.".format(np.std(best_scores)))

Mean of the best score is 3.317.
std of the best score is 0.426.


In [13]:
# Hyperparameters picked by the grid search in each repeated random-forest run.
best_params

[{'regressor__max_depth': 6, 'regressor__min_samples_split': 10},
 {'regressor__max_depth': 6, 'regressor__min_samples_split': 4},
 {'regressor__max_depth': 6, 'regressor__min_samples_split': 10},
 {'regressor__max_depth': 6, 'regressor__min_samples_split': 10},
 {'regressor__max_depth': 6, 'regressor__min_samples_split': 2},
 {'regressor__max_depth': 6, 'regressor__min_samples_split': 10},
 {'regressor__max_depth': 6, 'regressor__min_samples_split': 10},
 {'regressor__max_depth': 6, 'regressor__min_samples_split': 2},
 {'regressor__max_depth': 6, 'regressor__min_samples_split': 2},
 {'regressor__max_depth': 6, 'regressor__min_samples_split': 2}]

In [14]:
# Feature importances of the regressor step inside the refitted best
# pipeline. `grid` is whichever search was fitted last, i.e. only the
# final iteration of the preceding loop.
importance = (
    grid.best_estimator_
        .named_steps["regressor"]
        .feature_importances_
)
importance

array([5.28267437e-01, 9.18543656e-02, 3.24600817e-02, 4.49310874e-02,
       2.46003352e-01, 4.54598805e-02, 6.72733347e-04, 7.24127612e-04,
       7.77445828e-04, 1.54113150e-03, 4.17411504e-03, 6.17179969e-04,
       2.43520934e-04, 4.71319024e-04, 1.18502099e-03, 6.17200880e-04])

In [31]:
# Random-forest search space: five integer depths spread over [2, 20] and
# the even min_samples_split values from 2 up to 10.
max_depths = list(map(int, np.linspace(2, 20, num=5)))
min_samples_splits = range(2, 12, 2)
param_grid = dict(
    regressor__max_depth=max_depths,
    regressor__min_samples_split=min_samples_splits,
)

In [32]:
# Ten repetitions on one fixed train/test split (seed=42); here the
# forest's own random_state changes from run to run instead.
best_scores, best_params = [], []
for i in range(10):
    rfr = RandomForestRegressor(n_estimators=20, random_state=42*i)
    grid, test_score = ML_pipeline_GridSearchCV_kfold(
        X, y2, seed=42, n_folds=5, reg=rfr, param_grid=param_grid)
    best_scores += [test_score]
    best_params += [grid.best_params_]

In [33]:
# Scores returned by ML_pipeline_GridSearchCV_kfold are negated MSE values,
# so the mean is sign-flipped for display.
print("Mean of the best score is {:.3f}.".format(-np.mean(best_scores)))
print("std of the best score is {:.3f}.".format(np.std(best_scores)))

Mean of the best score is 3.787.
std of the best score is 0.233.


### Support vector regression

In [15]:
from sklearn.svm import SVR

In [16]:
# Support vector regressor with a fixed epsilon; the search covers four
# log-spaced gammas (1e-3 .. 1e2) and four log-spaced C values (1 .. 1e3).
# The kernel is left at the SVR default; a kernel search is not enabled.
svr = SVR(epsilon=0.2)
gammas = np.logspace(start=-3, stop=2, num=4)
Cs = np.logspace(start=0, stop=3, num=4)
param_grid = dict(regressor__gamma=gammas, regressor__C=Cs)

In [17]:
# Ten repetitions of the SVR search, each on a different train/test split.
best_scores, best_params = [], []
for i in range(10):
    grid, test_score = ML_pipeline_GridSearchCV_kfold(
        X, y2, seed=42*i, n_folds=5, reg=svr, param_grid=param_grid)
    best_scores += [test_score]
    best_params += [grid.best_params_]

In [18]:
# Scores returned by ML_pipeline_GridSearchCV_kfold are negated MSE values,
# so the mean is sign-flipped for display.
print("Mean of the best score is {:.3f}.".format(-np.mean(best_scores)))
print("std of the best score is {:.3f}.".format(np.std(best_scores)))

Mean of the best score is 10.082.
std of the best score is 1.625.


In [19]:
# Hyperparameters picked by the grid search in each repeated SVR run.
best_params

[{'regressor__C': 1000.0, 'regressor__gamma': 0.046415888336127795},
 {'regressor__C': 1000.0, 'regressor__gamma': 0.046415888336127795},
 {'regressor__C': 100.0, 'regressor__gamma': 0.046415888336127795},
 {'regressor__C': 100.0, 'regressor__gamma': 0.046415888336127795},
 {'regressor__C': 100.0, 'regressor__gamma': 0.046415888336127795},
 {'regressor__C': 1000.0, 'regressor__gamma': 0.046415888336127795},
 {'regressor__C': 1000.0, 'regressor__gamma': 0.046415888336127795},
 {'regressor__C': 1000.0, 'regressor__gamma': 0.046415888336127795},
 {'regressor__C': 1000.0, 'regressor__gamma': 0.046415888336127795},
 {'regressor__C': 1000.0, 'regressor__gamma': 0.046415888336127795}]

### Kernel ridge regression

In [20]:
# from sklearn.kernel_ridge import KernelRidge

# krr = KernelRidge()
# gammas = np.logspace(-3,2, num=5)
# alphas = np.logspace(-2,3, num=4)
# param_grid = { "regressor__gamma" : gammas, 
#                "regressor__alpha" : alphas}

# best_scores = []
# best_params = []
# for i in range(1):
#     grid, test_score = ML_pipeline_GridSearchCV_kfold(X, y1, 
#           seed=42*i, n_folds=10, reg=krr, param_grid=param_grid)
#     best_scores.append(test_score)
#     best_params.append(grid.best_params_)

# best_scores

# best_params

### Multilayer Perceptron regression

In [34]:
from sklearn.neural_network import MLPRegressor

In [21]:
# Two-hidden-layer MLP trained with SGD. The "grid" holds a single point
# (alpha=0.001, layers [10, 10]); wider grids such as
# alphas = np.logspace(-4, -2, num=3) or hls = [[10,10], [15,15], [20,20]]
# are not enabled.
nnr = MLPRegressor(
    solver='sgd',
    activation='tanh',
    learning_rate='adaptive',
    batch_size=100,
    max_iter=10000,
    hidden_layer_sizes=[10,10],
    random_state=42,
)
alphas = [0.001]
hls = [[10,10]]
param_grid = dict(regressor__alpha=alphas,
                  regressor__hidden_layer_sizes=hls)

In [22]:
# Ten repetitions with the same MLP configuration, each on a different
# train/test split.
best_scores, best_params = [], []
for i in range(10):
    grid, test_score = ML_pipeline_GridSearchCV_kfold(
        X, y2, seed=42*i, n_folds=5, reg=nnr, param_grid=param_grid)
    best_scores += [test_score]
    best_params += [grid.best_params_]

In [23]:
# Scores returned by ML_pipeline_GridSearchCV_kfold are negated MSE values,
# so the mean is sign-flipped for display.
print("Mean of the best score is {:.3f}.".format(-np.mean(best_scores)))
print("std of the best score is {:.3f}.".format(np.std(best_scores)))

Mean of the best score is 2.386.
std of the best score is 0.625.


In [24]:
# Hyperparameters picked by the grid search in each repeated MLP run.
best_params

[{'regressor__alpha': 0.001, 'regressor__hidden_layer_sizes': [10, 10]},
 {'regressor__alpha': 0.001, 'regressor__hidden_layer_sizes': [10, 10]},
 {'regressor__alpha': 0.001, 'regressor__hidden_layer_sizes': [10, 10]},
 {'regressor__alpha': 0.001, 'regressor__hidden_layer_sizes': [10, 10]},
 {'regressor__alpha': 0.001, 'regressor__hidden_layer_sizes': [10, 10]},
 {'regressor__alpha': 0.001, 'regressor__hidden_layer_sizes': [10, 10]},
 {'regressor__alpha': 0.001, 'regressor__hidden_layer_sizes': [10, 10]},
 {'regressor__alpha': 0.001, 'regressor__hidden_layer_sizes': [10, 10]},
 {'regressor__alpha': 0.001, 'regressor__hidden_layer_sizes': [10, 10]},
 {'regressor__alpha': 0.001, 'regressor__hidden_layer_sizes': [10, 10]}]

In [35]:
# Single-point MLP search space: one L2 penalty and one layer layout.
# A wider layout list such as [[10,10], [15,15], [20,20]] is not enabled.
alphas = [0.001]
hls = [[10,10]]
param_grid = dict(
    regressor__alpha=alphas,
    regressor__hidden_layer_sizes=hls,
)

In [36]:
# Five repetitions on one fixed train/test split (seed=42); the network's
# random_state changes from run to run instead.
best_scores, best_params = [], []
for i in range(5):
    nnr = MLPRegressor(
        solver='sgd',
        activation='tanh',
        learning_rate='adaptive',
        batch_size=100,
        max_iter=10000,
        hidden_layer_sizes=[10,10],
        random_state=42*i,
    )
    grid, test_score = ML_pipeline_GridSearchCV_kfold(
        X, y2, seed=42, n_folds=5, reg=nnr, param_grid=param_grid)
    best_scores += [test_score]
    best_params += [grid.best_params_]

In [38]:
# Scores returned by ML_pipeline_GridSearchCV_kfold are negated MSE values,
# so the mean is sign-flipped for display. The raw per-run scores are shown
# as the cell output.
print("Mean of the best score is {:.3f}.".format(-np.mean(best_scores)))
print("std of the best score is {:.3f}.".format(np.std(best_scores)))
best_scores

Mean of the best score is 2.898.
std of the best score is 2.836.


[-1.861239422860036,
 -2.2671724786653566,
 -0.9690150904536919,
 -8.474421994481297,
 -0.9157917990554618]