In [21]:
import numpy as np
import os
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
# When the kernel starts inside a folder named "notebooks", move one level up
# so the relative data/ and results/ paths used below resolve from its parent.
if os.path.split(os.getcwd())[1] == "notebooks":
    os.chdir(os.pardir)

In [22]:
# Relative locations of the input data.
DATA_DIR = "data"
DATA_RAW_DIR = DATA_DIR + "/raw"
DATA_PROCESSED_DIR = DATA_DIR + "/processed"

# Relative locations of the generated results.
RESULT_DIR = "results"
# NOTE(review): this resolves to RESULT_DIR itself (no sub-folder), unlike the
# predictive-performance directory below -- confirm that this is intended.
RESULT_EXECUTION_TIME_DIR = RESULT_DIR
RESULT_PREDICTIVE_PERFORMANCE_DIR = RESULT_DIR + "/predictive_performance"

In [23]:
# Dataset key -> name of the label (target) column in that dataset's CSV files.
DATASETS = dict(
    flights="ActualElapsedTime",
    power="Global_active_power",
    sales="Sale Amount",
)

# Experiment key -> display name and the postfix appended to the training CSV
# file name. The unsampled baseline comes first, followed by one Distributed
# SMOGN entry per partition count.
EXPERIMENTS = {
    "base": {"name": "No Sampling", "file_postfix": ""},
    **{
        f"dist_smogn_{k}": {
            "name": f"Distributed SMOGN (k_partitions = {k})",
            "file_postfix": f"_dist_smogn_{k}",
        }
        for k in (2, 4, 8)
    },
}

# Seed shared by every stochastic estimator below (random forest, neural
# network) so that a fresh "Restart & Run All" reproduces the same metrics.
RANDOM_STATE = 42

# Regressor key -> display name and the list of hyper-parameter variants whose
# scores are averaged by the evaluation loop. Each grid is enumerated with the
# first parameter varying slowest, i.e. in the same order as a hand-written
# list of the six combinations.
REGRESSORS = {
    "lr": {
        "name": "Linear Regression (LR)",
        "variants": [LinearRegression()],
    },
    "svm": {
        "name": "Support Vector Machine (SVM)",
        "variants": [
            SVR(C=C, gamma=gamma)
            for C in (10, 150, 300)
            for gamma in (0.01, 0.001)
        ],
    },
    "rf": {
        "name": "Random Forest (RF)",
        "variants": [
            RandomForestRegressor(
                min_samples_leaf=min_samples_leaf,
                min_samples_split=min_samples_split,
                random_state=RANDOM_STATE,
            )
            for min_samples_leaf in (1, 2, 4)
            for min_samples_split in (2, 5)
        ],
    },
    "nn": {
        "name": "Neural Network (NN)",
        "variants": [
            MLPRegressor(
                hidden_layer_sizes=hidden_layer_sizes,
                max_iter=max_iter,
                random_state=RANDOM_STATE,
            )
            for hidden_layer_sizes in (1, 5, 10)
            for max_iter in (500, 1000)
        ],
    },
}

In [24]:
def preprocess_data(train, test, dataset):
    """Split train/test frames into features and label and scale the features.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames that contain the label column of ``dataset``. They are not
        modified; the label is removed from copies.
    dataset : str
        One of ``"flights"``, ``"power"`` or ``"sales"``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``. For ``"flights"`` and
        ``"power"`` the feature matrices are whatever ``StandardScaler``
        returns for the full feature frame. For ``"sales"`` they are
        DataFrames with one-hot encoded categorical columns and standardised
        numerical columns, and ``X_test`` has exactly the columns of
        ``X_train`` in the same order.

    Raises
    ------
    ValueError
        If ``dataset`` is not one of the supported keys.
    """
    label_cols = {
        "flights": "ActualElapsedTime",
        "power": "Global_active_power",
        "sales": "Sale Amount",
    }
    if dataset not in label_cols:
        raise ValueError(
            f"Unknown dataset {dataset!r}; expected one of {sorted(label_cols)}"
        )
    label_col = label_cols[dataset]

    # drop() returns new frames, so the caller's train/test stay intact
    # (pop() would remove the label column from them in place).
    y_train = train[label_col]
    X_train = train.drop(columns=label_col)
    y_test = test[label_col]
    X_test = test.drop(columns=label_col)

    # The scaler is always fitted on the training split only.
    scaler = StandardScaler()

    if dataset == "sales":
        numerical_cols = ['Assessed Value', 'Sales Ratio']
        categorical_cols = ['Property Type', 'Residential Type', 'List Year']

        X_train = pd.get_dummies(X_train, columns=categorical_cols, drop_first=True)
        # Encode every test category and then align to the training columns.
        # Using drop_first on the test split could drop a different baseline
        # category than on train, and categories missing from either split
        # would otherwise yield mismatched feature matrices.
        X_test = pd.get_dummies(X_test, columns=categorical_cols)
        X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

        X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
        X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])
    else:
        # flights / power: every feature column is scaled.
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

    return X_train, X_test, y_train, y_test

In [None]:
# For every dataset and regressor family: train each hyper-parameter variant
# on each experiment's training file, score it on the shared test file with
# the phi values as sample weights, and write the per-experiment mean MAE and
# RMSE to one CSV per (dataset, regressor).
for dataset, label_col in DATASETS.items():
    DATA_PROCESSED_TRAIN_DIR = f"{DATA_PROCESSED_DIR}/{dataset}/train"
    DATA_PROCESSED_TEST_DIR = f"{DATA_PROCESSED_DIR}/{dataset}/test"

    # The test split and its weights do not depend on the regressor or the
    # experiment, so they are read once per dataset.
    test = pd.read_csv(f"{DATA_PROCESSED_TEST_DIR}/{dataset}.csv")
    # scikit-learn documents sample_weight as a 1-D array of shape (n_samples,);
    # squeezing turns a single-column frame into a Series.
    # NOTE(review): assumes the phi file holds exactly one column -- confirm.
    y_phi = pd.read_csv(f"{DATA_PROCESSED_TEST_DIR}/{dataset}_phi.csv").squeeze("columns")

    # to_csv does not create missing directories.
    result_dir = f"{RESULT_PREDICTIVE_PERFORMANCE_DIR}/{dataset}"
    os.makedirs(result_dir, exist_ok=True)

    for regressor, regressor_config in REGRESSORS.items():
        results = {}

        for experiment, experiment_config in EXPERIMENTS.items():
            train = pd.read_csv(f"{DATA_PROCESSED_TRAIN_DIR}/{dataset}{experiment_config['file_postfix']}.csv")

            # Pass a copy so the shared test frame survives preprocessing
            # regardless of whether preprocess_data mutates its inputs.
            X_train, X_test, y_train, y_test = preprocess_data(train, test.copy(), dataset)

            mae_list = []
            rmse_list = []

            for model in regressor_config["variants"]:
                model.fit(X_train, y_train)

                y_pred = model.predict(X_test)

                mae_list.append(mean_absolute_error(y_true=y_test, y_pred=y_pred, sample_weight=y_phi))
                # sqrt(MSE) instead of squared=False: that keyword is
                # deprecated and removed in newer scikit-learn releases.
                rmse_list.append(np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred, sample_weight=y_phi)))

            results[experiment_config["name"]] = {
                "mae": round(np.mean(mae_list), 3),
                "rmse": round(np.mean(rmse_list), 3),
            }

        # One row per experiment, columns "mae" and "rmse".
        pd.DataFrame.from_dict(results, orient="index").to_csv(f"{result_dir}/{regressor}.csv", index=True)


flights
First few rows of X_train:
[[-0.35860505 -1.70929585 -1.89122039 -1.50314531  0.          0.
  -1.57702971 -0.47847589 -0.76381136]
 [-0.35860505 -1.70929585 -1.70037857 -1.50314531  0.          0.
  -1.57702971 -0.371338   -0.96927028]
 [-0.35860505 -1.62999022 -1.7767153  -1.50314531  0.          0.
  -1.57702971 -0.79988956 -0.14743458]
 [-0.35860505 -1.59033741 -1.85305203 -1.50314531  0.          0.
  -1.57702971 -0.79988956  0.05802434]
 [-0.35860505 -1.59033741 -1.58587348 -1.3853991   0.          0.
  -1.57702971 -0.15706222 -0.14743458]]

First few rows of X_test:
[[-0.35860505 -1.78860147 -1.73854694 -1.55433931  0.          0.
  -1.57702971 -0.58561378 -0.76381136]
 [-0.35860505 -1.59033741 -1.39503166 -1.46730951  0.          0.
  -1.57702971 -1.22844112 -0.14743458]
 [-0.35860505 -1.39207334 -1.28052657 -1.55433931  0.          0.
  -1.57702971 -0.90702745  3.55082607]
 [-0.35860505 -1.11450365 -1.12785311 -0.75571286  0.          0.
  -1.57702971  0.05721356 -0.76

In [26]:
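# Legacy, disabled evaluation code for the sales dataset only. The generic
# loop above already covers "sales" through preprocess_data. Re-enabling this
# cell would also require importing `warnings` and `MinMaxScaler`, which the
# import cell does not provide.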

# warnings.filterwarnings('ignore', category=FutureWarning)

# DATA_PROCESSED_TRAIN_DIR = f"{DATA_PROCESSED_DIR}/sales/train"
# DATA_PROCESSED_TEST_DIR = f"{DATA_PROCESSED_DIR}/sales/test"

# for regressor, regressor_config in REGRESSORS.items():
#     results = {}

#     for experiment, experiment_config in EXPERIMENTS.items():
#         train = pd.read_csv(f"{DATA_PROCESSED_TRAIN_DIR}/sales{experiment_config['file_postfix']}.csv")

#         # One hot encoding
#         train = pd.get_dummies(train, columns=['Property Type', 'Residential Type'])

#         test = pd.read_csv(f"{DATA_PROCESSED_TEST_DIR}/sales.csv")

#         # One hot encoding
#         test = pd.get_dummies(test, columns=['Property Type', 'Residential Type'])
#         test = test.reindex(columns=train.columns, fill_value=0)

#         y_train = train.pop('Sale Amount')
#         x_train = train

#         y_test = test.pop('Sale Amount')
#         x_test = test

#         numeric_cols = ['Assessed Value', 'Sales Ratio']
#         one_hot_cols_train = [col for col in x_train.columns if col not in numeric_cols]
#         one_hot_cols_test = [col for col in x_test.columns if col not in numeric_cols]

#         # Apply MinMaxScaler to only the numeric columns
#         scaler = MinMaxScaler().fit(pd.concat([x_train[numeric_cols], x_test[numeric_cols]]))

#         x_train[numeric_cols] = scaler.transform(x_train[numeric_cols])
#         x_test[numeric_cols] = scaler.transform(x_test[numeric_cols])

#         x_train = pd.concat([x_train[numeric_cols], x_train[one_hot_cols_train]], axis=1)
#         x_test = pd.concat([x_test[numeric_cols], x_test[one_hot_cols_test]], axis=1)

#         y_phi = pd.read_csv(f"{DATA_PROCESSED_TEST_DIR}/sales_phi.csv")

#         mae_list = []
#         rmse_list = []

#         results[experiment_config["name"]] = {}

#         for model in regressor_config["variants"]:
#             model.fit(x_train, y_train)

#             y_pred = model.predict(x_test)

#             mae_list.append(mean_absolute_error(y_true=y_test, y_pred=y_pred, sample_weight=y_phi))
#             rmse_list.append(mean_squared_error(y_true=y_test, y_pred=y_pred, sample_weight=y_phi, squared=False))

#         results[experiment_config["name"]]["mae"] = round(np.mean(mae_list), 3)
#         results[experiment_config["name"]]["rmse"] = round(np.mean(rmse_list), 3)

#     pd.DataFrame(data=results).transpose().to_csv(f"{RESULT_PREDICTIVE_PERFORMANCE_DIR}/sales/{regressor}.csv", index=True)