In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import KFold
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import make_scorer, mean_squared_error, r2_score

In [2]:
# Load the raw dataset, then separate the two target columns (Y1, Y2)
# from the remaining feature columns.
file = "../../data/raw/ENB2012_data.csv"
df = pd.read_csv(file)
y1, y2 = df['Y1'], df['Y2']
X = df.drop(columns=['Y1', 'Y2'])

In [3]:
# Show the distinct values that occur in column X8.
df['X8'].unique()

array([0, 1, 2, 3, 4, 5])

In [4]:
# Column groups: X6 and X8 are treated as categorical, the rest as numeric;
# Y1 and Y2 are the regression targets.
cat_ftrs = ['X6','X8']
num_ftrs = ['X1','X2','X3','X4','X5','X7']
target_ftrs = ['Y1', 'Y2']

## MinMaxScaler for num_ftrs
scaler = MinMaxScaler()
num_transformer = make_pipeline(scaler)
## One-hot encoding for cat_ftrs (dense array output)
# scikit-learn 1.2 renamed the `sparse` argument to `sparse_output` and 1.4
# removed the old name, so try the new spelling first and fall back to the
# old one on releases that do not know it yet.
try:
    ohe = OneHotEncoder(sparse_output=False, categories="auto")
except TypeError:
    ohe = OneHotEncoder(sparse=False, categories="auto")
cat_transformer = make_pipeline(ohe)

## Create a preprocessor
# Each entry is (name, transformer, columns it is applied to).
preprocessor = ColumnTransformer(transformers=[
    ('num', num_transformer, num_ftrs),
    ('cat', cat_transformer, cat_ftrs)
])

In [5]:
def ML_pipeline_GridSearchCV_kfold(X, y, seed, n_folds,
                                 reg, param_grid):
    """Tune a regressor with a K-fold grid search and score it on held-out data.

    20% of the rows are set aside as a test set (the split is controlled by
    ``seed``). The remaining rows are passed to ``GridSearchCV``, which
    evaluates every combination in ``param_grid`` with ``n_folds``-fold
    cross-validation using R^2 as the score. The estimator being tuned is a
    ``Pipeline`` made of the module-level ``preprocessor`` followed by ``reg``.

    Parameters
    ----------
    X : features, in a form the module-level ``preprocessor`` accepts
    y : target values aligned with ``X``
    seed : ``random_state`` passed to ``train_test_split``
    n_folds : number of ``KFold`` splits
    reg : the regressor; becomes the pipeline step named ``'regressor'``
    param_grid : hyperparameters to be tuned, keyed by pipeline step
        (e.g. ``'regressor__alpha'``)

    Returns
    -------
    tuple
        ``(grid, test_score)``: the fitted ``GridSearchCV`` object and its
        R^2 score on the held-out test set.
    """
    import inspect

    X_other, X_test, y_other, y_test = train_test_split(X, y,
                   test_size=0.2, random_state=seed)
    kf = KFold(n_splits=n_folds) # no need to shuffle again
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                            ('regressor', reg)])
    # R^2 scorer; for an MSE-based search use
    # make_scorer(mean_squared_error, greater_is_better=False) instead.
    search_kwargs = dict(param_grid=param_grid, cv=kf,
                         scoring=make_scorer(r2_score))
    # `iid` was removed from GridSearchCV in scikit-learn 0.24 and passing it
    # there raises TypeError. Only forward it on releases that still accept
    # it, so older installations keep the exact original behaviour.
    if 'iid' in inspect.signature(GridSearchCV).parameters:
        search_kwargs['iid'] = False
    grid = GridSearchCV(pipe, **search_kwargs)
    grid.fit(X_other, y_other)
    return grid, grid.score(X_test, y_test)

### Lasso linear regression

In [6]:
from sklearn.linear_model import Lasso

In [7]:
# Lasso with 11 log-spaced alpha candidates between 1e-4 and 1e2.
lasso = Lasso(max_iter=10000)
param_grid = {'regressor__alpha': np.logspace(-4, 2, 11)}

# Repeat the split + 5-fold grid search for 10 different seeds and record
# each run's test score and selected hyperparameters.
best_scores, best_params = [], []
for i in range(10):
    grid, test_score = ML_pipeline_GridSearchCV_kfold(
        X, y1, seed=42 * i, n_folds=5, reg=lasso, param_grid=param_grid
    )
    best_scores += [test_score]
    best_params += [grid.best_params_]

In [9]:
# Mean and (population) standard deviation of the test scores over the runs.
print("Mean of the best score is %.3f." % np.asarray(best_scores).mean())
print("std of the best score is %.3f." % np.asarray(best_scores).std())

Mean of the best score is 0.923.
std of the best score is 0.005.


In [10]:
# Display the hyperparameters selected by the grid search in each run.
best_params

[{'regressor__alpha': 0.00039810717055349735},
 {'regressor__alpha': 0.00039810717055349735},
 {'regressor__alpha': 0.001584893192461114},
 {'regressor__alpha': 0.00039810717055349735},
 {'regressor__alpha': 0.0001},
 {'regressor__alpha': 0.00039810717055349735},
 {'regressor__alpha': 0.0001},
 {'regressor__alpha': 0.00039810717055349735},
 {'regressor__alpha': 0.00039810717055349735},
 {'regressor__alpha': 0.00039810717055349735}]

### RandomForest regression

In [12]:
from sklearn.ensemble import RandomForestRegressor

In [13]:
rfr = RandomForestRegressor(n_estimators=20, random_state=42)
# Five evenly spaced depths between 2 and 20, truncated to integers.
max_depths = list(map(int, np.linspace(2, 20, num=5)))
min_samples_splits = range(2, 12, 2)
param_grid = {
    "regressor__max_depth": max_depths,
    "regressor__min_samples_split": min_samples_splits,
}
# Show the depth candidates.
max_depths

[2, 6, 11, 15, 20]

In [14]:
# Repeat the split + 5-fold grid search for 10 different seeds and record
# each run's test score and selected hyperparameters.
best_scores, best_params = [], []
for i in range(10):
    grid, test_score = ML_pipeline_GridSearchCV_kfold(
        X, y1, seed=42 * i, n_folds=5, reg=rfr, param_grid=param_grid
    )
    best_scores += [test_score]
    best_params += [grid.best_params_]

In [16]:
# Mean and (population) standard deviation of the test scores over the runs.
print("Mean of the best score is %.3f." % np.asarray(best_scores).mean())
print("std of the best score is %.5f." % np.asarray(best_scores).std())

Mean of the best score is 0.997.
std of the best score is 0.00026.


In [17]:
# Display the hyperparameters selected by the grid search in each run.
best_params

[{'regressor__max_depth': 11, 'regressor__min_samples_split': 2},
 {'regressor__max_depth': 15, 'regressor__min_samples_split': 2},
 {'regressor__max_depth': 11, 'regressor__min_samples_split': 2},
 {'regressor__max_depth': 15, 'regressor__min_samples_split': 2},
 {'regressor__max_depth': 15, 'regressor__min_samples_split': 2},
 {'regressor__max_depth': 11, 'regressor__min_samples_split': 4},
 {'regressor__max_depth': 11, 'regressor__min_samples_split': 2},
 {'regressor__max_depth': 11, 'regressor__min_samples_split': 2},
 {'regressor__max_depth': 11, 'regressor__min_samples_split': 2},
 {'regressor__max_depth': 15, 'regressor__min_samples_split': 2}]

In [18]:
# Feature importances of the refit best pipeline's regressor step.
# NOTE(review): `grid` is whatever the most recently executed search loop left
# behind -- presumably the last random-forest run; confirm cell order.
importance = (
    grid.best_estimator_
    .named_steps["regressor"]
    .feature_importances_
)
importance

array([4.22236381e-01, 1.39614371e-01, 2.78833713e-02, 1.99252465e-01,
       1.17439308e-01, 8.01092791e-02, 1.62664844e-04, 1.76732577e-04,
       1.43686687e-04, 2.74588257e-04, 1.16523270e-02, 2.06627504e-04,
       1.17146456e-04, 2.71622668e-04, 1.34411134e-04, 3.25016753e-04])

### Support vector regression

In [19]:
from sklearn.svm import SVR

In [20]:
svr = SVR(epsilon=0.2)
# Log-spaced candidates, four values each: C over [1e0, 1e3] and
# gamma over [1e-3, 1e2].
Cs = np.logspace(0, 3, num=4)
gammas = np.logspace(-3, 2, num=4)
# The kernel is not part of the search; to include it, add
# "regressor__kernel": ['rbf', 'linear'] to the grid below.
param_grid = {
    "regressor__gamma": gammas,
    "regressor__C": Cs,
}

In [21]:
# Repeat the split + 5-fold grid search for 10 different seeds and record
# each run's test score and selected hyperparameters.
best_scores, best_params = [], []
for i in range(10):
    grid, test_score = ML_pipeline_GridSearchCV_kfold(
        X, y1, seed=42 * i, n_folds=5, reg=svr, param_grid=param_grid
    )
    best_scores += [test_score]
    best_params += [grid.best_params_]

In [22]:
# Mean and (population) standard deviation of the test scores over the runs.
print("Mean of the best score is %.3f." % np.asarray(best_scores).mean())
print("std of the best score is %.3f." % np.asarray(best_scores).std())

Mean of the best score is 0.936.
std of the best score is 0.004.


In [23]:
# Display the hyperparameters selected by the grid search in each run.
best_params

[{'regressor__C': 1000.0, 'regressor__gamma': 0.046415888336127795},
 {'regressor__C': 1000.0, 'regressor__gamma': 0.046415888336127795},
 {'regressor__C': 1000.0, 'regressor__gamma': 0.046415888336127795},
 {'regressor__C': 1000.0, 'regressor__gamma': 0.046415888336127795},
 {'regressor__C': 1000.0, 'regressor__gamma': 0.046415888336127795},
 {'regressor__C': 1000.0, 'regressor__gamma': 0.046415888336127795},
 {'regressor__C': 1000.0, 'regressor__gamma': 0.046415888336127795},
 {'regressor__C': 1000.0, 'regressor__gamma': 0.046415888336127795},
 {'regressor__C': 1000.0, 'regressor__gamma': 0.046415888336127795},
 {'regressor__C': 1000.0, 'regressor__gamma': 0.046415888336127795}]

### Kernel ridge regression

In [59]:
# from sklearn.kernel_ridge import KernelRidge

# krr = KernelRidge()
# gammas = np.logspace(-3,2, num=5)
# alphas = np.logspace(-2,3, num=4)
# param_grid = { "regressor__gamma" : gammas, 
#                "regressor__alpha" : alphas}

# best_scores = []
# best_params = []
# for i in range(1):
#     grid, test_score = ML_pipeline_GridSearchCV_kfold(X, y1, 
#           seed=42*i, n_folds=10, reg=krr, param_grid=param_grid)
#     best_scores.append(test_score)
#     best_params.append(grid.best_params_)

# best_scores

# best_params

### Multilayer Perceptron regression

In [26]:
from sklearn.neural_network import MLPRegressor

In [31]:
# Two hidden layers of 10 units, tanh activation, trained with SGD using an
# adaptive learning rate and mini-batches of 100.
nnr = MLPRegressor(
    hidden_layer_sizes=[10, 10],
    activation='tanh',
    solver='sgd',
    learning_rate='adaptive',
    batch_size=100,
    max_iter=10000,
    random_state=42,
)
# The grid is pinned to a single candidate per hyperparameter. Wider
# alternatives: np.logspace(-4, -2, num=3) for alpha and
# [[10,10], [15,15]] for the hidden layer sizes.
alphas = [0.001]
hls = [[10, 10]]
param_grid = {
    "regressor__alpha": alphas,
    "regressor__hidden_layer_sizes": hls,
}

In [32]:
# Repeat the split + 5-fold grid search for 10 different seeds and record
# each run's test score and selected hyperparameters.
best_scores, best_params = [], []
for i in range(10):
    grid, test_score = ML_pipeline_GridSearchCV_kfold(
        X, y1, seed=42 * i, n_folds=5, reg=nnr, param_grid=param_grid
    )
    best_scores += [test_score]
    best_params += [grid.best_params_]

In [33]:
# Mean and (population) standard deviation of the test scores over the runs.
print("Mean of the best score is %.3f." % np.asarray(best_scores).mean())
print("std of the best score is %.3f." % np.asarray(best_scores).std())

Mean of the best score is 0.997.
std of the best score is 0.001.


In [34]:
# Display the hyperparameters selected by the grid search in each run.
best_params

[{'regressor__alpha': 0.001, 'regressor__hidden_layer_sizes': [10, 10]},
 {'regressor__alpha': 0.001, 'regressor__hidden_layer_sizes': [10, 10]},
 {'regressor__alpha': 0.001, 'regressor__hidden_layer_sizes': [10, 10]},
 {'regressor__alpha': 0.001, 'regressor__hidden_layer_sizes': [10, 10]},
 {'regressor__alpha': 0.001, 'regressor__hidden_layer_sizes': [10, 10]},
 {'regressor__alpha': 0.001, 'regressor__hidden_layer_sizes': [10, 10]},
 {'regressor__alpha': 0.001, 'regressor__hidden_layer_sizes': [10, 10]},
 {'regressor__alpha': 0.001, 'regressor__hidden_layer_sizes': [10, 10]},
 {'regressor__alpha': 0.001, 'regressor__hidden_layer_sizes': [10, 10]},
 {'regressor__alpha': 0.001, 'regressor__hidden_layer_sizes': [10, 10]}]

### Global feature importance