In [2]:
import os

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVR

In [3]:
# Input locations. Edit DATA_DIR to relocate the data directory; the raw and
# processed sub-directories are derived from it.
DATA_DIR = "../data"
DATA_RAW_DIR = "/".join((DATA_DIR, "raw"))
DATA_PROCESSED_DIR = "/".join((DATA_DIR, "processed"))

# Output locations, all derived from RESULT_DIR.
RESULT_DIR = "../results"
RESULT_EXECUTION_TIME_DIR = RESULT_DIR  # same path as RESULT_DIR itself
RESULT_PREDICTIVE_PERFORMANCE_DIR = "/".join((RESULT_DIR, "predictive_performance"))

In [4]:
# Maps each dataset name (also used to build its file paths) to the name of the
# column that the evaluation loop separates out as the regression target.
DATASETS = dict(
    boston="HousValue",
    Abalone="Rings",
    bank8FM="rej",
    heat="heat",
    cpuSm="usr",
    energy="Appliances",
    superconductivity="critical_temp",
)

# One entry per sampling strategy: "name" is the human-readable label used as the
# row label in the result tables, "file_postfix" is appended to the dataset name
# to locate that strategy's training CSV.
EXPERIMENTS = {
    key: {"name": label, "file_postfix": postfix}
    for key, label, postfix in (
        ("base", "No Sampling", ""),
        ("rus", "RUS", "_rus"),
        ("ros", "ROS", "_ros"),
        ("smogn", "SMOGN", "_smogn"),
        ("dist_smogn_2", "Distributed SMOGN (k_partitions = 2)", "_dist_smogn_2"),
        ("dist_smogn_4", "Distributed SMOGN (k_partitions = 4)", "_dist_smogn_4"),
        ("dist_smogn_8", "Distributed SMOGN (k_partitions = 8)", "_dist_smogn_8"),
    )
}

# Seed shared by every stochastic estimator below. Without it the random forest
# and MLP variants produce different scores on every Restart & Run All, so the
# written result tables would not be reproducible.
RANDOM_STATE = 42

# Regressor families to benchmark. Each family has a display "name" and a list
# of "variants": unfitted estimators covering a small hyper-parameter grid. The
# evaluation loop fits every variant and averages the metrics per family.
# Variant order matches the grids below (outer parameter varies slowest).
REGRESSORS = {
    "lr": {
        "name": "Linear Regression (LR)",
        "variants": [
            LinearRegression()
        ]
    },
    "svm": {
        "name": "Support Vector Machine (SVM)",
        # SVR exposes no random_state parameter, so no seed is passed here.
        "variants": [
            SVR(C=c, gamma=gamma)
            for c in (10, 150, 300)
            for gamma in (0.01, 0.001)
        ]
    },
    "rf": {
        "name": "Random Forest (RF)",
        "variants": [
            RandomForestRegressor(min_samples_leaf=min_samples_leaf,
                                  min_samples_split=min_samples_split,
                                  random_state=RANDOM_STATE)
            for min_samples_leaf in (1, 2, 4)
            for min_samples_split in (2, 5)
        ]
    },
    "nn": {
        "name": "Neural Network (NN)",
        "variants": [
            MLPRegressor(hidden_layer_sizes=hidden_layer_sizes,
                         max_iter=max_iter,
                         random_state=RANDOM_STATE)
            for hidden_layer_sizes in (1, 5, 10)
            for max_iter in (500, 1000)
        ]
    }
}

In [None]:
# Benchmark loop: for every dataset and regressor family, fit each hyper-parameter
# variant on every sampling strategy's training file and score it on the dataset's
# test split. One CSV per (dataset, regressor family) is written, holding the mean
# weighted MAE and RMSE over the family's variants for each sampling strategy.
for dataset, label_col in DATASETS.items():
    DATA_PROCESSED_TRAIN_DIR = f"{DATA_PROCESSED_DIR}/{dataset}/train"
    DATA_PROCESSED_TEST_DIR = f"{DATA_PROCESSED_DIR}/{dataset}/test"

    # The test split and its per-sample weights depend only on the dataset, so
    # they are read once here instead of once per regressor x experiment.
    test = pd.read_csv(f"{DATA_PROCESSED_TEST_DIR}/{dataset}.csv")
    y_test = test.pop(label_col)
    x_test_encoded = pd.get_dummies(test)

    # The metric functions expect a 1-D sample_weight; collapse a single-column
    # frame to a Series.
    # NOTE(review): assumes the phi file holds exactly one column of weights,
    # row-aligned with the test file -- confirm against the preprocessing step.
    y_phi = pd.read_csv(f"{DATA_PROCESSED_TEST_DIR}/{dataset}_phi.csv").squeeze("columns")

    # to_csv does not create missing directories, so make sure the target exists.
    result_dir = f"{RESULT_PREDICTIVE_PERFORMANCE_DIR}/{dataset}"
    os.makedirs(result_dir, exist_ok=True)

    for regressor, regressor_config in REGRESSORS.items():
        results = {}

        for experiment_config in EXPERIMENTS.values():
            train = pd.read_csv(f"{DATA_PROCESSED_TRAIN_DIR}/{dataset}{experiment_config['file_postfix']}.csv")
            y_train = train.pop(label_col)
            x_train = pd.get_dummies(train)

            # Train and test are one-hot encoded separately, so a category that
            # is absent from one split would give the two frames different
            # columns. Align the test frame to the training layout: missing
            # dummies become 0, categories unseen in training are dropped.
            x_test = x_test_encoded.reindex(columns=x_train.columns, fill_value=0)

            # Fit the scaler on the training data only; including the test rows
            # would leak information about the evaluation set into preprocessing.
            scaler = MinMaxScaler().fit(x_train)

            x_train = scaler.transform(x_train)
            x_test = scaler.transform(x_test)

            mae_list = []
            rmse_list = []

            for model in regressor_config["variants"]:
                # fit() retrains the shared estimator instance from scratch for
                # the current training set.
                model.fit(x_train, y_train)

                y_pred = model.predict(x_test)

                mae_list.append(mean_absolute_error(y_true=y_test, y_pred=y_pred, sample_weight=y_phi))
                # sqrt(MSE) instead of `squared=False`, which newer scikit-learn
                # releases no longer accept.
                rmse_list.append(np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred, sample_weight=y_phi)))

            # Report the average over all hyper-parameter variants of the family.
            results[experiment_config["name"]] = {
                "mae": round(np.mean(mae_list), 3),
                "rmse": round(np.mean(rmse_list), 3),
            }

        # Rows: sampling strategies; columns: mae, rmse.
        pd.DataFrame(data=results).transpose().to_csv(f"{result_dir}/{regressor}.csv",
                                                      index=True)












