In [21]:
import sys
import time
import pandas as pd
from sklearn import model_selection
from sklearn import ensemble, metrics, cluster, tree
from matplotlib import pyplot as plt
import scipy
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
from sklearn.feature_selection import *
from sklearn.model_selection import GridSearchCV, KFold

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

# Tab-separated benchmark files; each name is appended to "data/" when loaded.
datasets = [
    "228_elusage.tsv",
    "485_analcatdata_vehicle.tsv",
    "523_analcatdata_neavote.tsv",
    "663_rabe_266.tsv",
    "687_sleuth_ex1605.tsv",
]

# Single-dataset alternative (disabled):
#datasets = ["228_elusage.tsv"]

# Number of folds handed to KFold(n_splits=cv, ...) in the search cells.
cv = 3

In [24]:
# Exhaustive grid search over RandomForestRegressor hyper-parameters.
# For every dataset the best mean cross-validated R2 is recorded, and the
# average of those per-dataset bests is printed at the end.
start_time = time.time()

# One seed for both the CV splitter and the forest. Without seeding the
# forest, the "best" parameters and scores change on every run.
SEED = 3111696

regressor = Pipeline([
  #('preprocessing', preprocessing.RobustScaler()),
  ('regression', ensemble.RandomForestRegressor(random_state=SEED))
])

# Grid explored by GridSearchCV; keys use the "<step>__<param>" pipeline
# convention. Commented-out entries are candidates that are not searched.
ard_parameters = [{
    #'regression__criterion': ['mse', 'mae'],
    'regression__n_estimators': [1, 40, 75, 100],
    'regression__max_depth': [1, 3, 5, 7, 9],
    #'regression__min_weight_fraction_leaf': [0.0, 0.1, 0.25, 0.5, 1.0],
    #'regression__max_features': [None, 'auto', 'log2', 'sqrt'],
    #'regression__bootstrap': [True, False],
}]

print('Training started...')
dataset_accuracies = list()  # never filled in this cell; kept as a notebook global
r2_scores = list()           # best mean CV R2 per dataset, in `datasets` order
for d_set in datasets:
    print("Processing dataset: %s" % d_set)
    data_path = "data/" + d_set
    df = pd.read_csv(data_path, sep="\t")
    # The "target" column is the regression label; every other column is a feature.
    label = df["target"].copy()
    data = df.drop("target", axis=1)
    # NOTE(review): error_score=0 records a failed fit as R2 == 0 instead of
    # raising. That value can outrank genuinely negative R2 scores, and the
    # notebook-wide warnings.filterwarnings('ignore') would hide any failure
    # warning -- confirm this masking is intended.
    optimized_regressor = GridSearchCV(
        regressor,
        ard_parameters,
        cv=KFold(n_splits=cv, shuffle=True, random_state=SEED),
        error_score=0,
        scoring='r2',
    )
    optimized_regressor.fit(data, label)
    # Overwritten on every iteration: after the loop these hold the results
    # of the last dataset only.
    best_regressor = optimized_regressor.best_estimator_
    best_result = optimized_regressor.cv_results_
    print(optimized_regressor.best_params_)
    best_score = optimized_regressor.best_score_
    r2_scores.append(best_score)
    print("Best score: ", best_score)
    print("Finished dataset: %s" % d_set)
    print("------------------------------------------------------------")

print('Training finished')
print("Mean R2 square: \n", np.mean(r2_scores))
end_time = time.time()
print('Total time taken: %d seconds' % int(end_time - start_time))

Training started...
Processing dataset: 228_elusage.tsv
{'regression__max_depth': 7, 'regression__n_estimators': 100}
Best score:  0.7417197569417688
Finished dataset: 228_elusage.tsv
------------------------------------------------------------
Processing dataset: 485_analcatdata_vehicle.tsv
{'regression__max_depth': 9, 'regression__n_estimators': 75}
Best score:  0.6296412466704896
Finished dataset: 485_analcatdata_vehicle.tsv
------------------------------------------------------------
Processing dataset: 523_analcatdata_neavote.tsv
{'regression__max_depth': 3, 'regression__n_estimators': 1}
Best score:  0.8861469521611808
Finished dataset: 523_analcatdata_neavote.tsv
------------------------------------------------------------
Processing dataset: 663_rabe_266.tsv
{'regression__max_depth': 9, 'regression__n_estimators': 40}
Best score:  0.9834739552655704
Finished dataset: 663_rabe_266.tsv
------------------------------------------------------------
Processing dataset: 687_sleuth_ex1

In [43]:
# Parameter optimisation using Hyperopt (Bayesian optimisation)

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

# Number of objective evaluations fmin is allowed per dataset.
max_eval = 150

# Inclusive "low,high" bounds for each tuned hyper-parameter, as strings.
config = {
    "n_estimators": "1,100",
    "max_depth": "1,100",
}

# Parse each "low,high" string into a two-element [low, high] list of ints.
l_n_estimators = [int(bound) for bound in config["n_estimators"].split(",")]
l_max_depth = [int(bound) for bound in config["max_depth"].split(",")]

# Hyperopt search space: each parameter is sampled with hp.quniform using a
# step of 1 between its bounds.
params = {
    name: hp.quniform(name, low, high, 1)
    for name, (low, high) in (
        ("n_estimators", l_n_estimators),
        ("max_depth", l_max_depth),
    )
}

# Best (highest) mean cross-validated R2 found by hyperopt for each dataset,
# in `datasets` order.
best_acc_datasets = list()

for d_set in datasets:
    data_path = "data/" + d_set
    df = pd.read_csv(data_path, sep="\t")
    # The "target" column is the regression label; every other column is a feature.
    label = df["target"].copy()
    data = df.drop("target", axis=1)
    print("Dataset: %s" % d_set)

    def create_model(params):
        """Hyperopt objective for the current dataset.

        Builds a RandomForestRegressor from the sampled point `params`
        (this argument shadows the module-level search space of the same
        name) and scores it with shuffled K-fold cross-validation on the
        `data` / `label` of the enclosing loop iteration.

        Returns a dict with the negated mean R2 as 'loss' (fmin minimises,
        so a higher R2 means a lower loss) and 'status' set to STATUS_OK.
        """
        forest = RandomForestRegressor(
            # The recorded results show sampled values arriving as floats
            # (e.g. 95.0), while the forest expects integers.
            n_estimators=int(params["n_estimators"]),
            max_depth=int(params["max_depth"]),
            # Seeded so the same sampled point always yields the same loss.
            # The TPE sampler itself is not seeded here.
            random_state=3111696,
        )
        folds = KFold(n_splits=cv, shuffle=True, random_state=3111696)
        r2_score = cross_val_score(forest, data, label, scoring='r2', cv=folds).mean()
        return {'loss': -r2_score, 'status': STATUS_OK}

    # minimize the objective function using the set of parameters above
    trials = Trials()
    learned_params = fmin(create_model, params, trials=trials, algo=tpe.suggest, max_evals=max_eval)
    loss_val = [item["result"]["loss"] for item in trials]
    # Losses are negated R2 values, so the smallest loss is the best R2.
    best_acc = -min(loss_val)
    best_acc_datasets.append(best_acc)
    print(best_acc)
    print(learned_params)
    print("-----------------------------------")

print("Mean R2 square: \n", np.mean(best_acc_datasets))

Dataset: 228_elusage.tsv
100%|██████████| 500/500 [01:02<00:00,  6.59it/s, best loss: -0.7722333047839318]
0.7722333047839318
{'max_depth': 95.0, 'n_estimators': 11.0}
-----------------------------------
Dataset: 485_analcatdata_vehicle.tsv
100%|██████████| 500/500 [01:07<00:00,  8.75it/s, best loss: -0.6814334093402792]
0.6814334093402792
{'max_depth': 68.0, 'n_estimators': 19.0}
-----------------------------------
Dataset: 523_analcatdata_neavote.tsv
100%|██████████| 500/500 [01:04<00:00,  7.38it/s, best loss: -0.8847788022081454]
0.8847788022081454
{'max_depth': 22.0, 'n_estimators': 1.0}
-----------------------------------
Dataset: 663_rabe_266.tsv
100%|██████████| 500/500 [01:15<00:00,  7.04it/s, best loss: -0.9866993715309423]
0.9866993715309423
{'max_depth': 64.0, 'n_estimators': 66.0}
-----------------------------------
Dataset: 687_sleuth_ex1605.tsv
100%|██████████| 500/500 [02:27<00:00,  2.66it/s, best loss: -0.597096007323035] 
0.597096007323035
{'max_depth': 2.0, 'n_estimat