In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import KFold
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import make_scorer, mean_squared_error, r2_score

In [2]:
# Read the raw table, then separate the two target columns (Y1, Y2)
# from the remaining feature columns.
file = "../../data/raw/ENB2012_data.csv"
df = pd.read_csv(file)
y1, y2 = df['Y1'], df['Y2']
X = df.drop(columns=['Y1', 'Y2'])

In [3]:
# Column roles: X6 and X8 are one-hot encoded, the other X columns are
# min-max scaled, and Y1 / Y2 are the targets.
cat_ftrs = ['X6','X8']
num_ftrs = ['X1','X2','X3','X4','X5','X7']
target_ftrs = ['Y1', 'Y2']

## MinMaxScaler for num_ftrs
scaler = MinMaxScaler()
num_transformer = make_pipeline(scaler)
## One-hot encoding for cat_ftrs
try:
    # scikit-learn >= 1.2 calls the dense-output switch `sparse_output`;
    # the original `sparse` keyword was removed in 1.4.
    ohe = OneHotEncoder(sparse_output=False, categories="auto")
except TypeError:
    # Older releases only accept the original `sparse` keyword.
    ohe = OneHotEncoder(sparse=False, categories="auto")
cat_transformer = make_pipeline(ohe)

## Create a preprocessor
# Each transformer is applied only to its own column list.
preprocessor = ColumnTransformer(transformers=[
    ('num', num_transformer, num_ftrs),
    ('cat', cat_transformer, cat_ftrs)
])

In [4]:
def ML_pipeline_GridSearchCV_kfold(X, y, seed, n_folds,
                                 reg, param_grid):
    """Tune ``reg`` with a K-fold grid search and score it on a held-out set.

    The data is split 80/20 into a development set and a test set. A
    pipeline made of the notebook-level ``preprocessor`` followed by ``reg``
    is tuned on the development set with ``GridSearchCV`` and finally
    evaluated on the test set.

    Parameters
    ----------
    X : features handed to ``train_test_split`` and then to the pipeline.
    y : target values aligned with ``X``.
    seed : ``random_state`` of the train/test split.
    n_folds : number of ``KFold`` splits used by the grid search.
    reg : the regressor, installed as the pipeline step ``'regressor'``.
    param_grid : hyperparameters to be tuned; keys address pipeline steps,
        e.g. ``'regressor__alpha'``.

    Returns
    -------
    tuple
        ``(grid, test_score)`` where ``grid`` is the fitted ``GridSearchCV``
        and ``test_score`` is ``grid.score(X_test, y_test)``. The scorer is
        ``mean_squared_error`` with ``greater_is_better=False``, so the
        score is the negated mean squared error.
    """
    X_other, X_test, y_other, y_test = train_test_split(
        X, y, test_size=0.2, random_state=seed)
    # train_test_split already shuffled the rows, so no need to shuffle again.
    kf = KFold(n_splits=n_folds)
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', reg)])
    # Alternative scorer that was tried: make_scorer(r2_score)
    scorer = make_scorer(mean_squared_error, greater_is_better=False)
    search_kwargs = dict(param_grid=param_grid, cv=kf, scoring=scorer)
    try:
        # Releases that still know `iid` get the explicit iid=False.
        grid = GridSearchCV(pipe, iid=False, **search_kwargs)
    except TypeError:
        # `iid` was removed in scikit-learn 0.24; from then on the search
        # always behaves as iid=False did.
        grid = GridSearchCV(pipe, **search_kwargs)
    grid.fit(X_other, y_other)
    return grid, grid.score(X_test, y_test)

### Lasso linear regression

In [5]:
from sklearn.linear_model import Lasso

In [6]:
# Lasso with max_iter=10000; alpha is tuned over 11 log-spaced values
# between 1e-4 and 1e2.
lasso = Lasso(max_iter=10000)
param_grid = {'regressor__alpha': np.logspace(-4, 2, 11)}

# Repeat split / tune / test for 10 different split seeds and keep the
# test score and the selected hyperparameters of every run.
best_scores, best_params = [], []
for i in range(10):
    grid, test_score = ML_pipeline_GridSearchCV_kfold(
        X, y2, seed=42 * i, n_folds=5,
        reg=lasso, param_grid=param_grid)
    best_scores.append(test_score)
    best_params.append(grid.best_params_)

In [7]:
# The mean is sign-flipped for display; the standard deviation is
# unaffected by the sign.
print("Mean of the best score is %.3f." % (-np.mean(best_scores),))
print("std of the best score is %.3f." % (np.std(best_scores),))

Mean of the best score is 10.044.
std of the best score is 1.183.


In [8]:
# Hyperparameters selected by the grid search in each of the 10 runs.
best_params

[{'regressor__alpha': 0.00039810717055349735},
 {'regressor__alpha': 0.00039810717055349735},
 {'regressor__alpha': 0.00039810717055349735},
 {'regressor__alpha': 0.0001},
 {'regressor__alpha': 0.00039810717055349735},
 {'regressor__alpha': 0.00039810717055349735},
 {'regressor__alpha': 0.00039810717055349735},
 {'regressor__alpha': 0.00039810717055349735},
 {'regressor__alpha': 0.0001},
 {'regressor__alpha': 0.00039810717055349735}]

### RandomForest regression

In [9]:
from sklearn.ensemble import RandomForestRegressor

In [26]:
# Random forest with 20 trees and a fixed random_state. The grid covers
# 10 integer depths spread between 2 and 50 and min_samples_split values
# 2, 4, ..., 10.
rfr = RandomForestRegressor(n_estimators=20, random_state=0)
max_depths = np.linspace(2, 50, num=10).astype(int).tolist()
min_samples_splits = range(2, 12, 2)
param_grid = {
    "regressor__max_depth": max_depths,
    "regressor__min_samples_split": min_samples_splits,
}

In [27]:
# Repeat split / tune / test for 10 different split seeds with the random
# forest. `grid` keeps the search object of the last run after the loop.
best_scores, best_params = [], []
for i in range(10):
    grid, test_score = ML_pipeline_GridSearchCV_kfold(
        X, y2, seed=42 * i, n_folds=5,
        reg=rfr, param_grid=param_grid)
    best_scores.append(test_score)
    best_params.append(grid.best_params_)

In [28]:
# The mean is sign-flipped for display; the standard deviation is
# unaffected by the sign.
print("Mean of the best score is %.3f." % (-np.mean(best_scores),))
print("std of the best score is %.3f." % (np.std(best_scores),))

Mean of the best score is 3.340.
std of the best score is 0.442.


In [29]:
# Hyperparameters selected by the grid search in each of the 10 runs.
best_params

[{'regressor__max_depth': 7, 'regressor__min_samples_split': 10},
 {'regressor__max_depth': 7, 'regressor__min_samples_split': 2},
 {'regressor__max_depth': 7, 'regressor__min_samples_split': 10},
 {'regressor__max_depth': 7, 'regressor__min_samples_split': 10},
 {'regressor__max_depth': 7, 'regressor__min_samples_split': 2},
 {'regressor__max_depth': 7, 'regressor__min_samples_split': 10},
 {'regressor__max_depth': 7, 'regressor__min_samples_split': 10},
 {'regressor__max_depth': 7, 'regressor__min_samples_split': 10},
 {'regressor__max_depth': 7, 'regressor__min_samples_split': 10},
 {'regressor__max_depth': 7, 'regressor__min_samples_split': 4}]

In [14]:
# Feature importances of the regressor inside the refit best pipeline.
# NOTE(review): `grid` is whatever the most recently executed tuning loop
# left behind, and this cell's execution count is out of order — confirm it
# was run right after the random-forest loop.
importance = (
    grid.best_estimator_
    .named_steps["regressor"]
    .feature_importances_
)
importance

array([0.36583976, 0.32483374, 0.04165945, 0.04445223, 0.16268616,
       0.04542173, 0.00069304, 0.00170358, 0.00136328, 0.0025463 ,
       0.003376  , 0.0007576 , 0.00047545, 0.00132742, 0.00163048,
       0.00123379])

### Support vector regression

In [15]:
from sklearn.svm import SVR

In [16]:
# SVR with epsilon=0.2; gamma and C are each tuned over 4 log-spaced values
# (gamma: 1e-3 .. 1e2, C: 1e0 .. 1e3).
svr = SVR(epsilon=0.2)
gammas = np.logspace(-3, 2, num=4)
Cs = np.logspace(0, 3, num=4)
# Disabled alternative: also search the kernel, i.e.
#   kernels = ['rbf', 'linear']  ->  "regressor__kernel": kernels
param_grid = {
    "regressor__gamma": gammas,
    "regressor__C": Cs,
}

In [17]:
# Repeat split / tune / test for 10 different split seeds with the SVR.
# NOTE(review): this loop fits y1, whereas the Lasso and random-forest loops
# fit y2 — confirm the targets are meant to differ before comparing scores.
best_scores, best_params = [], []
for i in range(10):
    grid, test_score = ML_pipeline_GridSearchCV_kfold(
        X, y1, seed=42 * i, n_folds=5,
        reg=svr, param_grid=param_grid)
    best_scores.append(test_score)
    best_params.append(grid.best_params_)

In [18]:
# The mean is sign-flipped for display; the standard deviation is
# unaffected by the sign.
print("Mean of the best score is %.3f." % (-np.mean(best_scores),))
print("std of the best score is %.3f." % (np.std(best_scores),))

Mean of the best score is 6.650.
std of the best score is 0.693.


In [19]:
# Hyperparameters selected by the grid search in each of the 10 runs.
best_params

[{'regressor__C': 1000.0, 'regressor__gamma': 0.046415888336127795},
 {'regressor__C': 1000.0, 'regressor__gamma': 0.046415888336127795},
 {'regressor__C': 1000.0, 'regressor__gamma': 0.046415888336127795},
 {'regressor__C': 1000.0, 'regressor__gamma': 0.046415888336127795},
 {'regressor__C': 1000.0, 'regressor__gamma': 0.046415888336127795},
 {'regressor__C': 1000.0, 'regressor__gamma': 0.046415888336127795},
 {'regressor__C': 1000.0, 'regressor__gamma': 0.046415888336127795},
 {'regressor__C': 1000.0, 'regressor__gamma': 0.046415888336127795},
 {'regressor__C': 1000.0, 'regressor__gamma': 0.046415888336127795},
 {'regressor__C': 1000.0, 'regressor__gamma': 0.046415888336127795}]

### Kernel ridge regression

In [20]:
# from sklearn.kernel_ridge import KernelRidge

# krr = KernelRidge()
# gammas = np.logspace(-3,2, num=5)
# alphas = np.logspace(-2,3, num=4)
# param_grid = { "regressor__gamma" : gammas, 
#                "regressor__alpha" : alphas}

# best_scores = []
# best_params = []
# for i in range(1):
#     grid, test_score = ML_pipeline_GridSearchCV_kfold(X, y1, 
#           seed=42*i, n_folds=10, reg=krr, param_grid=param_grid)
#     best_scores.append(test_score)
#     best_params.append(grid.best_params_)

# best_scores

# best_params

### Multilayer Perceptron regression

In [21]:
from sklearn.neural_network import MLPRegressor

In [22]:
# MLP trained with SGD, tanh activations, mini-batches of 100 and an
# adaptive learning rate; random_state is fixed.
nnr = MLPRegressor(
    hidden_layer_sizes=[10, 10],
    activation='tanh',
    solver='sgd',
    learning_rate='adaptive',
    batch_size=100,
    max_iter=10000,
    random_state=42,
)
# The grid currently holds a single candidate. Wider searches that were
# tried and disabled:
#   alphas = np.logspace(-3, 2, num=5)
#   hls = [[10,10], [15,15], [20,20]]
alphas = [0.0001]
hls = [[10, 10]]
param_grid = {
    "regressor__alpha": alphas,
    "regressor__hidden_layer_sizes": hls,
}

In [23]:
# Repeat split / tune / test for 10 different split seeds with the MLP.
# NOTE(review): this loop fits y1, whereas the Lasso and random-forest loops
# fit y2 — confirm the targets are meant to differ before comparing scores.
best_scores, best_params = [], []
for i in range(10):
    grid, test_score = ML_pipeline_GridSearchCV_kfold(
        X, y1, seed=42 * i, n_folds=5,
        reg=nnr, param_grid=param_grid)
    best_scores.append(test_score)
    best_params.append(grid.best_params_)

In [24]:
# The mean is sign-flipped for display; the standard deviation is
# unaffected by the sign.
print("Mean of the best score is %.3f." % (-np.mean(best_scores),))
print("std of the best score is %.3f." % (np.std(best_scores),))

Mean of the best score is 0.344.
std of the best score is 0.071.


In [25]:
# Hyperparameters selected by the grid search in each of the 10 runs.
best_params

[{'regressor__alpha': 0.0001, 'regressor__hidden_layer_sizes': [10, 10]},
 {'regressor__alpha': 0.0001, 'regressor__hidden_layer_sizes': [10, 10]},
 {'regressor__alpha': 0.0001, 'regressor__hidden_layer_sizes': [10, 10]},
 {'regressor__alpha': 0.0001, 'regressor__hidden_layer_sizes': [10, 10]},
 {'regressor__alpha': 0.0001, 'regressor__hidden_layer_sizes': [10, 10]},
 {'regressor__alpha': 0.0001, 'regressor__hidden_layer_sizes': [10, 10]},
 {'regressor__alpha': 0.0001, 'regressor__hidden_layer_sizes': [10, 10]},
 {'regressor__alpha': 0.0001, 'regressor__hidden_layer_sizes': [10, 10]},
 {'regressor__alpha': 0.0001, 'regressor__hidden_layer_sizes': [10, 10]},
 {'regressor__alpha': 0.0001, 'regressor__hidden_layer_sizes': [10, 10]}]

### Global feature importance