In [1]:
import os
import random
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import KFold, RepeatedKFold, LeaveOneOut
from sklearn.linear_model import (Ridge, BayesianRidge)

from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import StackingRegressor

from sklearn.metrics import (
    mean_squared_error,
    r2_score,
    mean_absolute_percentage_error
)

from tqdm.auto import tqdm

from copy import deepcopy

import warnings
from sklearn.exceptions import ConvergenceWarning
# Keep the (long) training log readable: hide scikit-learn convergence
# warnings and every UserWarning raised while the experiments run.
# NOTE(review): ignoring UserWarning globally can also hide useful messages.
for ignored_category in (ConvergenceWarning, UserWarning):
    warnings.filterwarnings("ignore", category=ignored_category)

def root_mean_squared_error(y_true, y_pred):
    """Return the root mean squared error (RMSE) of the predictions.

    The value equals what ``mean_squared_error(..., squared=False)`` used to
    return, but does not use the ``squared`` argument, which scikit-learn
    deprecated in 1.4 and removed in 1.6.

    Args:
        y_true: ground-truth target values.
        y_pred: predicted target values, same shape as ``y_true``.

    Returns:
        The RMSE as a float; for multi-output targets, the unweighted mean of
        the per-output RMSE values.
    """
    # "raw_values" yields one MSE per output, so the square root is taken
    # before averaging (exactly the behaviour of the old squared=False path).
    per_output_mse = mean_squared_error(y_true, y_pred, multioutput="raw_values")
    return np.mean(np.sqrt(per_output_mse))


def normalized_root_mean_squared_error(y_true, y_pred, norm_factor=None):
    """Return the RMSE expressed as a percentage of ``norm_factor``.

    Args:
        y_true: ground-truth target values.
        y_pred: predicted target values.
        norm_factor: value the RMSE is divided by (for example the average
            target value of the training set). It has a ``None`` default only
            so that it can be passed by keyword; a value is required.

    Returns:
        ``rmse / norm_factor * 100``.

    Raises:
        ValueError: if ``norm_factor`` is not provided.
    """
    if norm_factor is None:
        # A real exception instead of `assert False`: assertions are removed
        # when Python runs with -O, which would let None reach the division.
        raise ValueError(
            "Set norm_factor (for example the average target value for the training set)"
        )
    rmse = root_mean_squared_error(y_true, y_pred)
    return (rmse / norm_factor) * 100

# Maps an estimator's class name to the short label that is used in the
# result file names and stored in the "model" column of the saved tables.
model2paper = {
    "CatBoostRegressor": "CatBoost",
    "GradientBoostingRegressor": "GBDT",
    "KNeighborsRegressor": "KNN",
    "MLPRegressor": "MLP",
    "RandomForestRegressor": "RF",
    "SVR": "SVR",
    "XGBRegressor": "XGBoost",
    # ---- second group of models ----
    "GaussianProcessRegressor": "GaussProc",
    "BayesianRidge": "BayesianNN",
    "StackingRegressor": "StackEns",
    "LGBMRegressor": "LightGBM",
    "AdaBoostRegressor": "AdaBoost",
    "BaggingRegressor": "BaggedDT",
}

from configs import *
from experiments_to_run import MODELS, CONFIGS, TARGETS
# Seed Python's RNG so that the shuffles used to build the folds below are
# reproducible.
random.seed(123)

# # ============== spatial clustering 5 clusters [start]
# # (disabled alternative: one test fold per cluster of the 5-cluster file)
# cluster_df = pd.read_csv("../spatial_clusters_5.csv")

# test_folds = []
# train_folds = []

# for cluster_idx, data in cluster_df.groupby("cluster"):
#     test_folds.append(data.index.tolist())
#     train_folds.append(
#         list(set(cluster_df.index.tolist())-set(data.index.tolist()))
#     )

# [random.shuffle(x) for x in test_folds]
# [random.shuffle(x) for x in train_folds]
# # ============== spatial clustering 5 clusters [end]


def _build_spatial_folds(cluster_members, all_indices, n_folds=5, clusters_per_fold=2):
    """Group whole spatial clusters into train/test folds.

    The list of clusters is shuffled in place, then every consecutive run of
    ``clusters_per_fold`` clusters is merged into one test fold; the matching
    train fold holds every index of ``all_indices`` that is not in that test
    fold. Finally the indices inside every test fold, and then inside every
    train fold, are shuffled.

    Args:
        cluster_members: list with one list of row indices per cluster
            (reordered in place by the shuffle).
        all_indices: list with every row index of the dataset.
        n_folds: number of folds to build.
        clusters_per_fold: number of clusters merged into each test fold.

    Returns:
        ``(train_folds, test_folds)``: two lists of ``n_folds`` index lists.

    Raises:
        ValueError: if the number of clusters differs from
            ``n_folds * clusters_per_fold``. Without this check, extra clusters
            would never be part of a test fold and missing ones would surface
            as an IndexError.
    """
    expected_clusters = n_folds * clusters_per_fold
    if len(cluster_members) != expected_clusters:
        raise ValueError(
            f"Expected {expected_clusters} clusters "
            f"({n_folds} folds x {clusters_per_fold} clusters), "
            f"got {len(cluster_members)}"
        )

    random.shuffle(cluster_members)

    held_out_folds = []
    fitting_folds = []
    for fold_idx in range(n_folds):
        first = fold_idx * clusters_per_fold
        held_out = [
            sample_idx
            for members in cluster_members[first:first + clusters_per_fold]
            for sample_idx in members
        ]
        held_out_folds.append(held_out)
        fitting_folds.append(list(set(all_indices) - set(held_out)))

    # Shuffle the samples inside each fold: all test folds first, then all
    # train folds (this order fixes how the random stream is consumed).
    for fold in held_out_folds:
        random.shuffle(fold)
    for fold in fitting_folds:
        random.shuffle(fold)

    return fitting_folds, held_out_folds


# ============== spatial clustering 10 clusters: evaluation folds [start]
# NOTE(review): the row positions of this file are later used with
# X.iloc / y.iloc on ../data.csv, which assumes both files list the same
# samples in the same order - confirm.
cluster_df = pd.read_csv("../spatial_clusters_10.csv")
all_sample_indices = cluster_df.index.tolist()

# One list of row indices per spatial cluster.
tmp_test_folds = [
    members.index.tolist()
    for cluster_id, members in cluster_df.groupby("cluster")
]

train_folds, test_folds = _build_spatial_folds(tmp_test_folds, all_sample_indices)
# ============== spatial clustering 10 clusters: evaluation folds [end]

# ============== spatial clustering 10 clusters: tuning folds [start]
# Same construction after re-shuffling the clusters, which gives a second
# pairing of the clusters for the hyperparameter search.
tuning_train_folds, tuning_test_folds = _build_spatial_folds(
    tmp_test_folds, all_sample_indices
)
# ============== spatial clustering 10 clusters: tuning folds [end]


# Make sure the output folders exist before anything is written to them.
# exist_ok=True replaces the separate "exists?" check and avoids failing if
# the folder appears between the check and the creation.
for output_dir in ("../results", "../figures_and_tables"):
    os.makedirs(output_dir, exist_ok=True)

# When False, a (target, config, model) combination whose metrics pickle
# already exists in ../results is skipped, so an interrupted run can resume.
OVERWRITE = False

# Counters for the "[current / total]" banner printed for every combination.
total_iterations = len(MODELS)*len(CONFIGS)*len(TARGETS)
curr_iteration = 0

# Main experiment loop: for every model / feature configuration / target,
# optionally tune hyperparameters, evaluate on the spatial folds, and save
# predictions, fitted models and per-fold metrics.
for MODEL in MODELS:
    for CONFIG in CONFIGS:
        for TARGET in TARGETS:
            
            curr_iteration+=1
            print(f"---------------- [{curr_iteration} / {total_iterations}]")
            
            if not OVERWRITE:
                # MODEL[0] is called only to obtain the class name of the
                # estimator, which selects the label used in the file name.
                # NOTE(review): `_` is not defined in this cell; confirm where
                # it comes from (star import or interactive session).
                conf = f"{TARGET}--{CONFIG.name}--{model2paper[MODEL[0](_).__class__.__name__]}"
                save_path = f"../results/metrics--{conf}.pickle"
                if os.path.exists(save_path):
                    print("File already exists. Skipping", conf)
                    continue
                else:
                    print("Running", conf)

            # fixing random seed as soon as possible
            # for reproducibility
            np.random.seed(123)
            random.seed(123)

            # The dataset is re-read for every combination; the feature
            # columns come from the configuration, the target is one column.
            df = pd.read_csv("../data.csv")
            X = df[CONFIG.features]
            y = df[TARGET]

            # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

            # Each MODEL entry is a 3-tuple: estimator factory, search space
            # (or None for "no tuning"), extra keyword arguments for the search.
            model_class, param_distributions, search_cv_args = MODEL
            
            if param_distributions is None:
                
                # No search space: use the estimator as built by the factory.
                regressor = model_class(_)
            
            else:
                
                # Randomized search: 30 sampled candidates scored by negative
                # MSE on the tuning folds. zip() is a one-shot iterator, so a
                # new one is built here for every search.
                # NOTE(review): the search is fitted on all rows, including
                # the rows that are later used as test folds, so the test
                # metrics below are not from a nested cross-validation.
                search_cv = RandomizedSearchCV(
                    model_class(_), param_distributions=param_distributions,
                    scoring="neg_mean_squared_error", random_state=0,
                    # cv=5,
                    cv=zip(tuning_train_folds, tuning_test_folds),
                    n_jobs=-1,
                    n_iter=30,
                    return_train_score=True,
                    **search_cv_args
                )
                search_cv.fit(X.values, y.values)

                print("The best hyperparameters are ",search_cv.best_params_)
                print("The best score is ",search_cv.best_score_)
                
                # Estimator types listed here are rebuilt without a
                # random_state (presumably because they do not accept one -
                # confirm); every other type gets random_state=0.
                if type(search_cv.estimator) not in [
                    KNeighborsRegressor,
                    SVR,
                    StackingRegressor,
                    BayesianRidge,
                ]:
                    regressor = model_class(_).set_params( # fresh, unfitted estimator from the factory
                        random_state=0,           # fixed random state
                        **search_cv.best_params_, # apply all tuned hyperparameters at once
                    )
                else:
                    regressor = model_class(_).set_params( # fresh, unfitted estimator from the factory
                        **search_cv.best_params_, # apply all tuned hyperparameters at once
                    )

            # One record per evaluation fold is collected here and turned
            # into a DataFrame after the fold loop.
            data = []
            
#             # =================================================================
#             # the following code is an alternative, to split the dataset in
#             # 66/33 random train/test splits for 10 times to run the evaluation
            
#             folds = []

#             for random_state in range(10):
#                 cv = KFold(n_splits=3, random_state=random_state, shuffle=True)
#                 tmp_folds = cv.split(X)
#                 folds.append(next(tmp_folds))

            # =================================================================
            # 5-fold cross validation
            
            # cv = KFold(n_splits=5, random_state=0, shuffle=True)
            # folds = cv.split(X)
            
            # Evaluation uses the precomputed spatial folds instead of KFold.
            folds = zip(train_folds, test_folds)

            for i, (train_index, test_index) in enumerate(folds):

                # Fold indices are row positions (iloc); the selected frames
                # keep the original row labels of the dataset.
                X_train = X.iloc[train_index]
                y_train = y.iloc[train_index].values

                X_test = X.iloc[test_index]
                y_test = y.iloc[test_index].values

                # Fit on the fold's training part, then predict both parts so
                # that train and test metrics can be compared.
                regressor.fit(X_train.values, y_train)
                y_pred = regressor.predict(X_test.values)
                y_pred_train = regressor.predict(X_train.values)

                data.append({
                    "target": TARGET,
                    "config": CONFIG.name,
                    "model_name": regressor.__class__.__name__,
                    "model": model2paper[regressor.__class__.__name__],
                    "hyperparams": None if param_distributions is None else search_cv.best_params_,
                    "fold": i,
                    "X_train": X_train,
                    "y_train": y_train,
                    "X_test": X_test,
                    "y_test": y_test,
                    "y_pred": y_pred,
                    "y_pred_train": y_pred_train,
                    # Copy, because the same regressor object is fitted again
                    # in the next fold.
                    "model_obj": deepcopy(regressor),
                })

            # Full per-fold records (data, predictions and fitted models).
            data = pd.DataFrame(data)
            save_path = f"../results/predictions--{TARGET}--{CONFIG.name}--{model2paper[regressor.__class__.__name__]}.pickle"
            data.to_pickle(save_path)
            # print("predictions saved to", save_path)
            # display(data)
            
            # Flatten the per-fold arrays into one row per sample: explode()
            # repeats the fold number for every element, the groups are
            # concatenated in fold order, and the index is reset so the
            # pieces line up when they are joined column-wise below.
            tmp = data
            y_test = pd.concat(
                [x for (i,x) in tmp[["fold","y_test"]].explode(column="y_test").groupby("fold")],
                axis=0
            ).reset_index(drop=True)
            y_pred = pd.concat(
                [x for (i,x) in tmp[["fold","y_pred"]].explode(column="y_pred").groupby("fold")],
                axis=0
            ).reset_index(drop=True)
            
            y_train = pd.concat(
                [x for (i,x) in tmp[["fold","y_train"]].explode(column="y_train").groupby("fold")],
                axis=0
            ).reset_index(drop=True)
            y_pred_train = pd.concat(
                [x for (i,x) in tmp[["fold","y_pred_train"]].explode(column="y_pred_train").groupby("fold")],
                axis=0
            ).reset_index(drop=True)
            
            # Row labels of the samples of every fold, concatenated in fold
            # order, so each prediction can be traced back to its sample.
            test_index = pd.DataFrame(
                sum(tmp.X_test.apply(lambda d: d.index.tolist()).values.tolist(), []),
                columns=["sample_idx"]
            )
            train_index = pd.DataFrame(
                sum(tmp.X_train.apply(lambda d: d.index.tolist()).values.tolist(), []),
                columns=["sample_idx"]
            )
            
            # Per-sample CSV exports (columns: fold, true value, prediction,
            # sample_idx); the file names are derived from the pickle path.
            tmp2 = pd.concat(( y_test, y_pred["y_pred"], test_index), axis=1)
            tmp2.to_csv(save_path.replace("pickle", "csv").replace("predictions", "predictions_test"), index=None)
            
            tmp2 = pd.concat(( y_train, y_pred_train["y_pred_train"], train_index), axis=1)
            tmp2.to_csv(save_path.replace("pickle", "csv").replace("predictions", "predictions_train"), index=None)

            # Per-fold metrics on the test part. NRMSE is normalised by the
            # mean of the fold's training targets.
            data["MSE"] = data.apply(lambda row: mean_squared_error(row.y_test, row.y_pred), axis=1)
            data["R2"] = data.apply(lambda row: r2_score(row.y_test, row.y_pred), axis=1)
            data["MAPE"] = data.apply(lambda row: mean_absolute_percentage_error(row.y_test, row.y_pred), axis=1)
            data["RMSE"] = data.apply(lambda row: root_mean_squared_error(row.y_test, row.y_pred), axis=1)
            data["NRMSE"] = data.apply(lambda row: normalized_root_mean_squared_error(row.y_test, row.y_pred, norm_factor=row.y_train.mean()), axis=1)
            
            # The same metrics on the training part (same normalisation).
            data["MSE_train"] = data.apply(lambda row: mean_squared_error(row.y_train, row.y_pred_train), axis=1)
            data["R2_train"] = data.apply(lambda row: r2_score(row.y_train, row.y_pred_train), axis=1)
            data["MAPE_train"] = data.apply(lambda row: mean_absolute_percentage_error(row.y_train, row.y_pred_train), axis=1)
            data["RMSE_train"] = data.apply(lambda row: root_mean_squared_error(row.y_train, row.y_pred_train), axis=1)
            data["NRMSE_train"] = data.apply(lambda row: normalized_root_mean_squared_error(row.y_train, row.y_pred_train, norm_factor=row.y_train.mean()), axis=1)

            # Drop the raw data and prediction columns before saving the
            # metrics table; the fitted model objects stay in it. This is the
            # file the skip check at the top of the loop looks for.
            data = data.drop(columns=["X_train", "y_train", "X_test", "y_test", "y_pred", "y_pred_train"])
            save_path = f"../results/metrics--{TARGET}--{CONFIG.name}--{model2paper[regressor.__class__.__name__]}.pickle"
            data.to_pickle(save_path)
            print("metrics, predictions and models saved to\n", save_path)
            # display(data)

            # Summary line per metric: mean ± std over the folds, first for
            # the test part, then for the training part.
            for metric in ["R2", "MAPE", "RMSE", "NRMSE"]:

                print(f"{metric:>10} {data[metric].mean().round(2):>7} ± {data[metric].std().round(2):>5}  {data[metric+'_train'].mean().round(2):>7} ± {data[metric+'_train'].std().round(2):>5}")

---------------- [1 / 96]
Running CS--Conf1--BaggedDT
The best hyperparameters are  {'n_estimators': 200, 'base_estimator__max_depth': 10}
The best score is  -2535.625112949097
metrics, predictions and models saved to
 ../results/metrics--CS--Conf1--BaggedDT.pickle
        R2    0.31 ±  0.08      0.9 ±  0.01
      MAPE    2.51 ±  1.72     1.03 ±  0.18
      RMSE   50.11 ±  9.59    19.73 ±  0.76
     NRMSE   65.44 ± 13.82    25.69 ±  1.29
---------------- [2 / 96]
Running CSE--Conf1--BaggedDT
The best hyperparameters are  {'n_estimators': 100, 'base_estimator__max_depth': 10}
The best score is  -1.0372815097703278
metrics, predictions and models saved to
 ../results/metrics--CSE--Conf1--BaggedDT.pickle
        R2    0.23 ±  0.16     0.89 ±  0.01
      MAPE     1.6 ±  0.43     0.66 ±  0.07
      RMSE    1.03 ±  0.05      0.4 ±   0.0
     NRMSE   61.15 ±  4.61    23.51 ±  0.77
---------------- [3 / 96]
Running CS--Conf2--BaggedDT
The best hyperparameters are  {'n_estimators': 200, 'base_e



The best hyperparameters are  {'max_iter': 50, 'learning_rate_init': 0.1, 'hidden_layer_sizes': [8]}
The best score is  -1.5114867054264078
metrics, predictions and models saved to
 ../results/metrics--CSE--Conf1--MLP.pickle
        R2   -0.03 ±  0.03     -0.0 ±  0.01
      MAPE     2.3 ±   1.0     2.45 ±  0.28
      RMSE     1.2 ±  0.14      1.2 ±  0.04
     NRMSE   71.67 ±  9.62    71.59 ±  2.43
---------------- [43 / 96]
Running CS--Conf2--MLP




The best hyperparameters are  {'max_iter': 10, 'learning_rate_init': 0.01, 'hidden_layer_sizes': [8]}
The best score is  -3065.9395298592967
metrics, predictions and models saved to
 ../results/metrics--CS--Conf2--MLP.pickle
        R2    0.06 ±   0.3     0.19 ±  0.08
      MAPE    4.26 ±  3.99      4.0 ±  0.94
      RMSE   58.38 ± 15.62    55.38 ±  5.39
     NRMSE   76.39 ± 21.88    71.99 ±  5.78
---------------- [44 / 96]
Running CSE--Conf2--MLP




The best hyperparameters are  {'max_iter': 50, 'learning_rate_init': 0.1, 'hidden_layer_sizes': [8]}
The best score is  -1.5267340243364165
metrics, predictions and models saved to
 ../results/metrics--CSE--Conf2--MLP.pickle
        R2   -0.03 ±  0.03     -0.0 ±   0.0
      MAPE    2.37 ±  0.97      2.4 ±  0.28
      RMSE     1.2 ±  0.14      1.2 ±  0.04
     NRMSE   71.64 ±  9.71    71.48 ±  2.25
---------------- [45 / 96]
Running CS--Conf3--MLP




The best hyperparameters are  {'max_iter': 50, 'learning_rate_init': 0.1, 'hidden_layer_sizes': [64]}
The best score is  -1318.908793695782
metrics, predictions and models saved to
 ../results/metrics--CS--Conf3--MLP.pickle
        R2    0.63 ±  0.08     0.69 ±  0.06
      MAPE    1.36 ±  0.97     1.06 ±   0.4
      RMSE   36.81 ±  9.95    33.93 ±  4.14
     NRMSE   48.16 ± 14.06    44.17 ±  5.43
---------------- [46 / 96]
Running CSE--Conf3--MLP




The best hyperparameters are  {'max_iter': 10, 'learning_rate_init': 0.1, 'hidden_layer_sizes': [2]}
The best score is  -1.5084032261997937
metrics, predictions and models saved to
 ../results/metrics--CSE--Conf3--MLP.pickle
        R2   -0.03 ±  0.03     -0.0 ±   0.0
      MAPE    2.33 ±  0.96     2.37 ±  0.27
      RMSE     1.2 ±  0.15      1.2 ±  0.04
     NRMSE   71.56 ±  9.84     71.5 ±  2.25
---------------- [47 / 96]
Running CS--Conf4--MLP




The best hyperparameters are  {'max_iter': 50, 'learning_rate_init': 0.1, 'hidden_layer_sizes': [64]}
The best score is  -1291.7626166134114
metrics, predictions and models saved to
 ../results/metrics--CS--Conf4--MLP.pickle
        R2    0.53 ±  0.07     0.62 ±  0.09
      MAPE    0.91 ±  0.32     0.96 ±   0.4
      RMSE   41.53 ± 10.01    37.61 ±  4.24
     NRMSE   54.21 ± 13.72    48.89 ±  4.77
---------------- [48 / 96]
Running CSE--Conf4--MLP




The best hyperparameters are  {'max_iter': 50, 'learning_rate_init': 0.1, 'hidden_layer_sizes': [8]}
The best score is  -1.494832503721327
metrics, predictions and models saved to
 ../results/metrics--CSE--Conf4--MLP.pickle
        R2   -0.03 ±  0.04     -0.0 ±   0.0
      MAPE    2.25 ±   0.9     2.29 ±   0.3
      RMSE     1.2 ±  0.15      1.2 ±  0.04
     NRMSE    71.7 ± 10.35    71.55 ±  2.25
---------------- [49 / 96]
Running CS--Conf1--SVR
The best hyperparameters are  {'gamma': 1e-05, 'C': 100}
The best score is  -3202.8669065607455
metrics, predictions and models saved to
 ../results/metrics--CS--Conf1--SVR.pickle
        R2    0.08 ±  0.25     0.48 ±  0.04
      MAPE    3.46 ±  3.68     2.43 ±  0.76
      RMSE    56.5 ±  6.91    44.19 ±  1.39
     NRMSE   73.76 ± 11.13    57.51 ±   1.3
---------------- [50 / 96]
Running CSE--Conf1--SVR
The best hyperparameters are  {'gamma': 0.001, 'C': 1000}
The best score is  -1.4817557711576916
metrics, predictions and models saved to
 ../r



The best hyperparameters are  {'learning_rate': 0.03, 'l2_leaf_reg': 0.2, 'iterations': 200, 'depth': 4}
The best score is  -0.9804365070215658
metrics, predictions and models saved to
 ../results/metrics--CSE--Conf2--CatBoost.pickle
        R2    0.27 ±  0.17     0.77 ±  0.01
      MAPE    1.78 ±  0.78     1.04 ±   0.1
      RMSE     1.0 ±  0.05     0.58 ±  0.01
     NRMSE   59.45 ±  4.01    34.22 ±  0.97
---------------- [85 / 96]
Running CS--Conf3--CatBoost
The best hyperparameters are  {'learning_rate': 0.1, 'l2_leaf_reg': 0.5, 'iterations': 250, 'depth': 2}
The best score is  -941.2102981642554
metrics, predictions and models saved to
 ../results/metrics--CS--Conf3--CatBoost.pickle
        R2     0.7 ±  0.07     0.97 ±   0.0
      MAPE    0.92 ±  0.73      0.4 ±  0.08
      RMSE   33.48 ±  9.72    11.23 ±  1.02
     NRMSE   43.79 ± 13.72    14.61 ±  1.18
---------------- [86 / 96]
Running CSE--Conf3--CatBoost
The best hyperparameters are  {'learning_rate': 0.1, 'l2_leaf_reg': 0.2,



The best hyperparameters are  {'learning_rate': 0.1, 'l2_leaf_reg': 0.2, 'iterations': 200, 'depth': 4}
The best score is  -952.4073647586017
metrics, predictions and models saved to
 ../results/metrics--CS--Conf4--CatBoost.pickle
        R2    0.71 ±  0.07      1.0 ±   0.0
      MAPE    0.78 ±  0.47     0.16 ±  0.02
      RMSE   32.58 ±  9.62      3.6 ±  0.23
     NRMSE   42.58 ±  13.3     4.69 ±  0.35
---------------- [88 / 96]
Running CSE--Conf4--CatBoost




The best hyperparameters are  {'learning_rate': 0.03, 'l2_leaf_reg': 1, 'iterations': 250, 'depth': 4}
The best score is  -0.7493604693423422
metrics, predictions and models saved to
 ../results/metrics--CSE--Conf4--CatBoost.pickle
        R2    0.42 ±  0.08     0.85 ±  0.01
      MAPE    1.27 ±   0.4      0.7 ±  0.09
      RMSE     0.9 ±  0.08     0.46 ±  0.02
     NRMSE    53.4 ±  6.14    27.39 ±  1.32
---------------- [89 / 96]
Running CS--Conf1--XGBoost




The best hyperparameters are  {'n_estimators': 600, 'max_depth': 5, 'learning_rate': 0.015}
The best score is  -2504.812262269389
metrics, predictions and models saved to
 ../results/metrics--CS--Conf1--XGBoost.pickle
        R2     0.3 ±   0.1      1.0 ±   0.0
      MAPE     2.4 ±  1.47     0.16 ±  0.05
      RMSE   50.03 ±  9.24     3.15 ±  0.56
     NRMSE   65.33 ± 13.41     4.11 ±  0.86
---------------- [90 / 96]
Running CSE--Conf1--XGBoost
The best hyperparameters are  {'n_estimators': 700, 'max_depth': 5, 'learning_rate': 0.02}
The best score is  -1.0407612042465508
metrics, predictions and models saved to
 ../results/metrics--CSE--Conf1--XGBoost.pickle
        R2    0.17 ±  0.21      1.0 ±   0.0
      MAPE    1.65 ±  0.46     0.05 ±  0.01
      RMSE    1.06 ±  0.04     0.03 ±   0.0
     NRMSE   63.12 ±  3.39     1.84 ±  0.23
---------------- [91 / 96]
Running CS--Conf2--XGBoost
The best hyperparameters are  {'n_estimators': 600, 'max_depth': 4, 'learning_rate': 0.02}
The best sc

In [3]:
import shutil

# Remove the "catboost_info" folder from the working directory if it is
# there (presumably the log folder CatBoost leaves behind after training).
catboost_log_dir = "catboost_info"
if os.path.exists(catboost_log_dir):
    shutil.rmtree(catboost_log_dir)