In [41]:
import os
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVR
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.discriminant_analysis import StandardScaler
from sklearn.preprocessing import PowerTransformer
# Switch to the project root so the relative "data" and "results" paths used by
# the following cells resolve. The location can be overridden per machine with
# the RESAMPLING_PROJECT_ROOT environment variable; the original author's path
# is kept as the default so existing setups keep working.
PROJECT_ROOT = os.environ.get(
    "RESAMPLING_PROJECT_ROOT",
    'C:\\Users\\Owner\\Documents\\School\\University Year 4\\Semester 2\\Honours Project\\distributed-resampling-parallelization',
)
os.chdir(PROJECT_ROOT)

In [42]:
# Input locations, relative to the current working directory.
DATA_DIR = "data"
DATA_RAW_DIR = DATA_DIR + "/raw"
DATA_PROCESSED_DIR = DATA_DIR + "/processed"

# Output locations. Execution-time results share the results root itself;
# predictive-performance results get their own sub-directory.
RESULT_DIR = "results"
RESULT_EXECUTION_TIME_DIR = RESULT_DIR
RESULT_PREDICTIVE_PERFORMANCE_DIR = RESULT_DIR + "/predictive_performance"

In [43]:
# Dataset name -> label column. Every entry is currently commented out, so the
# mapping is empty; uncomment an entry to enable that dataset.
DATASETS = {
    # "boston": "HousValue",
    # "Abalone": "Rings",
    # "bank8FM": "rej",
    # "heat": "heat",
    # "cpuSm": "usr",
    # "energy": "Appliances",
    # "superconductivity": "critical_temp"
}

# (key, display name, training-file postfix) for the non-distributed experiments.
_EXPERIMENT_SPECS = (
    ("base", "No Sampling", ""),
    ("rus", "RUS", "_rus"),
    ("ros", "ROS", "_ros"),
    ("smogn", "SMOGN", "_smogn"),
)

# Partition counts for which a distributed SMOGN experiment is declared.
_DIST_SMOGN_PARTITIONS = (2, 4, 8)

# Experiment key -> {"name": display name, "file_postfix": suffix of the train CSV}.
EXPERIMENTS = {
    key: {"name": label, "file_postfix": postfix}
    for key, label, postfix in _EXPERIMENT_SPECS
}
EXPERIMENTS.update({
    f"dist_smogn_{k}": {
        "name": f"Distributed SMOGN (k_partitions = {k})",
        "file_postfix": f"_dist_smogn_{k}",
    }
    for k in _DIST_SMOGN_PARTITIONS
})

# Seed shared by every stochastic estimator below so repeated runs of the
# notebook produce the same metrics. LinearRegression and SVR take no seed.
RANDOM_STATE = 42

# Regressor key -> display name and the list of hyper-parameter variants that
# are each fitted and whose scores are averaged by the benchmark cells.
# The comprehensions enumerate the grids in the same order as the original
# hand-written lists (outer parameter varies slowest).
REGRESSORS = {
    "lr": {
        "name": "Linear Regression (LR)",
        "variants": [
            LinearRegression()
        ]
    },
    "svm": {
        "name": "Support Vector Machine (SVM)",
        "variants": [
            SVR(C=c, gamma=gamma)
            for c in (10, 150, 300)
            for gamma in (0.01, 0.001)
        ]
    },
    "rf": {
        "name": "Random Forest (RF)",
        "variants": [
            RandomForestRegressor(min_samples_leaf=leaf, min_samples_split=split, random_state=RANDOM_STATE)
            for leaf in (1, 2, 4)
            for split in (2, 5)
        ]
    },
    "nn": {
        "name": "Neural Network (NN)",
        "variants": [
            MLPRegressor(hidden_layer_sizes=units, max_iter=iterations, random_state=RANDOM_STATE)
            for units in (1, 5, 10)
            for iterations in (500, 1000)
        ]
    }
}

In [44]:
# Code for sales.csv e02d06bb28a2d42907f8898c619cf73c7aa1f335
#
# For every regressor family, fit each hyper-parameter variant on every
# resampled sales training set, score it on the shared sales test set with
# relevance-weighted MAE / RMSE, and write the per-experiment mean scores to
# one CSV per regressor.
DATA_PROCESSED_TRAIN_DIR = f"{DATA_PROCESSED_DIR}/sales/train"
DATA_PROCESSED_TEST_DIR = f"{DATA_PROCESSED_DIR}/sales/test"

SALES_TARGET_COL = 'Sale Amount'
SALES_DROPPED_COLS = ['Serial Number', 'Date Recorded', 'Address', 'Non Use Code', 'Assessor Remarks', 'OPM remarks', 'Location']
SALES_IMPUTED_COLS = ['Property Type', 'Residential Type']
SALES_CATEGORICAL_COLS = ['Town', 'Property Type', 'Residential Type']
SALES_POWER_TRANSFORMED_COLS = ['Assessed Value', 'Sale Amount', 'Sales Ratio']
SALES_NUMERIC_COLS = ['Assessed Value', 'Sales Ratio']  # List of numeric columns to scale


def _read_sales_csv(path):
    """Read a CSV, re-raising pandas' EmptyDataError with the offending path.

    pandas' own message ("No columns to parse from file") does not say which
    of the many experiment files is empty.
    """
    try:
        return pd.read_csv(path)
    except pd.errors.EmptyDataError as exc:
        raise pd.errors.EmptyDataError(f"No columns to parse from file: {path}") from exc


def _encode_sales(raw):
    """Return a cleaned, one-hot encoded copy of a raw sales frame.

    Drops the unused columns, fills missing property / residential types with
    that frame's most frequent value, and one-hot encodes the categorical
    columns. `raw` itself is not modified.
    """
    frame = raw.drop(columns=SALES_DROPPED_COLS)
    for col in SALES_IMPUTED_COLS:
        frame[col] = frame[col].fillna(frame[col].mode()[0])
    return pd.get_dummies(frame, columns=SALES_CATEGORICAL_COLS)


def _power_transform_and_clip(train, test, column):
    """Yeo-Johnson transform `column` and clip it to the 1st-99th percentile.

    The transformer and the clip bounds are learned from `train` only and then
    applied unchanged to `test`, so both splits (including the target) live on
    the same scale. Both frames are modified in place.
    """
    transformer = PowerTransformer(method='yeo-johnson')
    train[column] = transformer.fit_transform(train[[column]]).ravel()

    # Clip the values to remove outliers
    lower_bound = train[column].quantile(0.01)
    upper_bound = train[column].quantile(0.99)
    train[column] = train[column].clip(lower_bound, upper_bound)

    test[column] = transformer.transform(test[[column]]).ravel()
    test[column] = test[column].clip(lower_bound, upper_bound)


# The test split and its relevance weights are the same for every regressor and
# experiment, so load and encode them once instead of on every iteration.
test_encoded = _encode_sales(_read_sales_csv(f"{DATA_PROCESSED_TEST_DIR}/sales.csv"))
# squeeze() turns a single-column frame into the 1-D weights sklearn expects;
# assumes sales_phi.csv holds exactly one column of weights - TODO confirm.
y_phi = _read_sales_csv(f"{DATA_PROCESSED_TEST_DIR}/sales_phi.csv").squeeze("columns")

# to_csv does not create missing directories.
os.makedirs(f"{RESULT_PREDICTIVE_PERFORMANCE_DIR}/sales", exist_ok=True)

for regressor, regressor_config in REGRESSORS.items():
    results = {}

    for experiment_config in EXPERIMENTS.values():
        train = _encode_sales(
            _read_sales_csv(f"{DATA_PROCESSED_TRAIN_DIR}/sales{experiment_config['file_postfix']}.csv")
        )

        # Give the test frame exactly the training columns: categories unseen
        # in this training file are dropped, missing ones are filled with 0.
        test = test_encoded.reindex(columns=train.columns, fill_value=0)

        for column in SALES_POWER_TRANSFORMED_COLS:
            _power_transform_and_clip(train, test, column)

        y_train = train.pop(SALES_TARGET_COL)
        x_train = train

        y_test = test.pop(SALES_TARGET_COL)
        x_test = test

        other_cols = [col for col in x_train.columns if col not in SALES_NUMERIC_COLS]

        # Apply MinMaxScaler to only the numeric columns. The scaler is fitted
        # on train and test together, as in the generic benchmark cell.
        scaler = MinMaxScaler().fit(pd.concat([x_train[SALES_NUMERIC_COLS], x_test[SALES_NUMERIC_COLS]]))

        x_train[SALES_NUMERIC_COLS] = scaler.transform(x_train[SALES_NUMERIC_COLS])
        x_test[SALES_NUMERIC_COLS] = scaler.transform(x_test[SALES_NUMERIC_COLS])

        # Numeric columns first, then everything else, identically for both splits.
        x_train = x_train[SALES_NUMERIC_COLS + other_cols]
        x_test = x_test[SALES_NUMERIC_COLS + other_cols]

        mae_list = []
        rmse_list = []

        for model in regressor_config["variants"]:
            model.fit(x_train, y_train)

            y_pred = model.predict(x_test)

            mae_list.append(mean_absolute_error(y_true=y_test, y_pred=y_pred, sample_weight=y_phi))
            # sqrt(MSE) instead of squared=False, which newer scikit-learn no longer accepts.
            rmse_list.append(np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred, sample_weight=y_phi)))

        results[experiment_config["name"]] = {
            "mae": round(np.mean(mae_list), 3),
            "rmse": round(np.mean(rmse_list), 3),
        }

    pd.DataFrame(data=results).transpose().to_csv(f"{RESULT_PREDICTIVE_PERFORMANCE_DIR}/sales/{regressor}.csv", index=True)



EmptyDataError: No columns to parse from file

In [45]:
# Generic benchmark: for every enabled dataset and regressor family, fit each
# hyper-parameter variant on every resampled training set, score it on the
# dataset's test split with relevance-weighted MAE / RMSE, and write the
# per-experiment mean scores to one CSV per (dataset, regressor).
for dataset, label_col in DATASETS.items():
    DATA_PROCESSED_TRAIN_DIR = f"{DATA_PROCESSED_DIR}/{dataset}/train"
    DATA_PROCESSED_TEST_DIR = f"{DATA_PROCESSED_DIR}/{dataset}/test"

    # The test split and its relevance weights do not depend on the regressor
    # or experiment, so load and encode them once per dataset.
    test = pd.read_csv(f"{DATA_PROCESSED_TEST_DIR}/{dataset}.csv")
    y_test = test.pop(label_col)
    x_test_encoded = pd.get_dummies(test)

    # squeeze() turns a single-column frame into the 1-D weights sklearn expects;
    # assumes the phi file holds exactly one column of weights - TODO confirm.
    y_phi = pd.read_csv(f"{DATA_PROCESSED_TEST_DIR}/{dataset}_phi.csv").squeeze("columns")

    # to_csv does not create missing directories.
    os.makedirs(f"{RESULT_PREDICTIVE_PERFORMANCE_DIR}/{dataset}", exist_ok=True)

    for regressor, regressor_config in REGRESSORS.items():
        results = {}

        for experiment_config in EXPERIMENTS.values():
            train = pd.read_csv(f"{DATA_PROCESSED_TRAIN_DIR}/{dataset}{experiment_config['file_postfix']}.csv")
            y_train = train.pop(label_col)
            x_train = pd.get_dummies(train)

            # Align the test dummies with this training file's columns; a
            # category present in only one split would otherwise give the two
            # frames different feature sets and break scaler.transform.
            x_test = x_test_encoded.reindex(columns=x_train.columns, fill_value=0)

            scaler = MinMaxScaler().fit(pd.concat([x_train, x_test]))

            x_train_scaled = scaler.transform(x_train)
            x_test_scaled = scaler.transform(x_test)

            mae_list = []
            rmse_list = []

            for model in regressor_config["variants"]:
                model.fit(x_train_scaled, y_train)

                y_pred = model.predict(x_test_scaled)

                mae_list.append(mean_absolute_error(y_true=y_test, y_pred=y_pred, sample_weight=y_phi))
                # sqrt(MSE) instead of squared=False, which newer scikit-learn no longer accepts.
                rmse_list.append(np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred, sample_weight=y_phi)))

            results[experiment_config["name"]] = {
                "mae": round(np.mean(mae_list), 3),
                "rmse": round(np.mean(rmse_list), 3),
            }

        pd.DataFrame(data=results).transpose().to_csv(f"{RESULT_PREDICTIVE_PERFORMANCE_DIR}/{dataset}/{regressor}.csv",
                                                      index=True)
