In [1]:
import time
import os
import sys
import pandas as pd
from pyspark.sql import SparkSession
from smogn import smoter
# Run from the project checkout so that the relative "data"/"results" paths
# used by this notebook resolve (presumably this is also what makes the `src`
# package imported below importable -- confirm).
# The location can be overridden with the PROJECT_ROOT environment variable;
# when it is unset the original hard-coded checkout path is used.
PROJECT_ROOT = os.environ.get(
    'PROJECT_ROOT',
    'C:\\Users\\Owner\\Documents\\School\\University Year 4\\Semester 2\\Honours Project\\distributed-resampling-parallelization',
)
os.chdir(PROJECT_ROOT)
# Point both the PySpark workers and the driver at the interpreter that is
# running this kernel, so they use the same Python environment.
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
from src.relevance.phi import Phi
from src.sampling.mixed_sampling.distributed_smogn import DistributedSMOGN
from src.sampling.over_sampling.distributed_ros import DistributedROS
from src.sampling.under_sampling.distributed_rus import DistributedRUS

In [2]:
# Input locations, relative to the current working directory.
DATA_DIR = "data"
DATA_RAW_DIR = f"{DATA_DIR}/raw"              # raw CSV files, one per dataset
DATA_PROCESSED_DIR = f"{DATA_DIR}/processed"  # train/test splits written by this notebook

# Output locations, relative to the current working directory.
RESULT_DIR = "results"
# Execution-time results are written directly into the result directory.
RESULT_EXECUTION_TIME_DIR = RESULT_DIR
# The `f` prefix was missing here, which left the literal text "{RESULT_DIR}"
# in the path instead of the value of RESULT_DIR.
RESULT_PREDICTIVE_PERFORMANCE_DIR = f"{RESULT_DIR}/predictive_performance"

In [3]:
# Datasets to process, as (dataset name, label column) pairs. The dataset name
# is used to build the raw CSV path and the processed output paths.
# Entries that are commented out are disabled; uncomment a pair to include it.
DATASETS = dict([
    # ("boston", "HousValue"),
    # ("Abalone", "Rings"),
    # ("bank8FM", "rej"),
    # ("heat", "heat"),
    # ("cpuSm", "usr"),
    # ("energy", "Appliances"),
    # ("superconductivity", "critical_temp"),
    ("sales", "Sale Amount"),
])

# Registry of the resampling strategies, keyed by a short identifier.
# Each entry holds:
#   name         - human-readable label
#   type         - "dist" or "seq" (presumably distributed vs. sequential
#                  implementation -- confirm)
#   sampler      - the class or function that performs the resampling
#   k_partitions - (optional) list of partition counts to try
EXPERIMENTS = dict(
    ros=dict(name="ROS", type="dist", sampler=DistributedROS),
    rus=dict(name="RUS", type="dist", sampler=DistributedRUS),
    smogn=dict(name="SMOGN", type="seq", sampler=smoter),
    dist_smogn=dict(
        name="Distributed SMOGN",
        type="dist",
        sampler=DistributedSMOGN,
        k_partitions=[2, 4, 8],
    ),
)

In [4]:
# Create the Spark session (or reuse one that already exists), running Spark
# locally with 4 worker threads.
spark = (
    SparkSession.builder
    .master('local[4]')
    .appName('Distributed Resampling')
    .getOrCreate()
)

In [5]:
# Collected timings, keyed by dataset name; starts out empty and is populated
# by the dataset loop.
execution_times = dict()

In [6]:
# Fixed seed so that the train/test split and the training subsample are the
# same on every run (both were previously unseeded).
SPLIT_SEED = 42
# Fraction of the training split that is kept for the experiments.
TRAIN_SAMPLE_FRACTION = 0.01

# For every configured dataset: load the raw CSV, add the relevance column,
# split into train/test, and write the base train/test CSV files.
for dataset, label_col in DATASETS.items():
    DATA_PROCESSED_TRAIN_DIR = f"{DATA_PROCESSED_DIR}/{dataset}/train"
    DATA_PROCESSED_TEST_DIR = f"{DATA_PROCESSED_DIR}/{dataset}/test"
    # pandas' to_csv does not create missing directories, so make sure the
    # output locations exist before anything is written.
    os.makedirs(DATA_PROCESSED_TRAIN_DIR, exist_ok=True)
    os.makedirs(DATA_PROCESSED_TEST_DIR, exist_ok=True)

    # Read the raw data with pandas, then convert it to a Spark DataFrame.
    df = pd.read_csv(f"{DATA_RAW_DIR}/{dataset}.csv")

    df = spark.createDataFrame(df)

    # Phi writes its output into `relevance_col` (presumably a relevance score
    # derived from the label column -- confirm against src.relevance.phi).
    relevance_col = "phi"
    df = Phi(input_col=label_col, output_col=relevance_col).transform(df)

    train, test = df.randomSplit(weights=[0.8, 0.2], seed=SPLIT_SEED)
    train = train.sample(fraction=TRAIN_SAMPLE_FRACTION, seed=SPLIT_SEED)
    # The training data is stored without the relevance column.
    train = train.drop(relevance_col)
    # The test set is collected to pandas; its relevance values are split off
    # and saved to a separate file next to the test features.
    test = test.toPandas()
    phi = test.pop(relevance_col)

    test.to_csv(f"{DATA_PROCESSED_TEST_DIR}/{dataset}.csv", index=False)
    phi.to_csv(f"{DATA_PROCESSED_TEST_DIR}/{dataset}_phi.csv", index=False)

    execution_times[dataset] = {}

    train_base = train.toPandas()
    train_base.to_csv(f"{DATA_PROCESSED_TRAIN_DIR}/{dataset}.csv", index=False)

    # The timed resampling experiments below are currently disabled, so
    # execution_times[dataset] stays empty.

    # start_time = time.time()
    # train_rus = DistributedRUS(label_col=label_col, k_partitions=1).transform(train)
    # end_time = time.time()
    # execution_times[dataset]["RUS"] = round(end_time - start_time, 3)
    # train_rus.toPandas().to_csv(f"{DATA_PROCESSED_TRAIN_DIR}/{dataset}_rus.csv", index=False)

    # start_time = time.time()
    # train_ros = DistributedROS(label_col=label_col, k_partitions=1).transform(train)
    # end_time = time.time()
    # execution_times[dataset]["ROS"] = round(end_time - start_time, 3)
    # train_ros.toPandas().to_csv(f"{DATA_PROCESSED_TRAIN_DIR}/{dataset}_ros.csv", index=False)

    # start_time = time.time()
    # train_smogn = smoter(data=train.toPandas(), y=label_col)
    # end_time = time.time()
    # execution_times[dataset]["SMOGN"] = round(end_time - start_time, 3)
    # train_smogn.to_csv(f"{DATA_PROCESSED_TRAIN_DIR}/{dataset}_smogn.csv", index=False)

    # start_time = time.time()
    # train_dist_smogn_2 = DistributedSMOGN(label_col=label_col, k_partitions=2).transform(train)
    # end_time = time.time()
    # execution_times[dataset]["Distributed SMOGN (k_partitions = 2)"] = round(end_time - start_time, 3)
    # train_dist_smogn_2.toPandas().to_csv(f"{DATA_PROCESSED_TRAIN_DIR}/{dataset}_dist_smogn_2.csv", index=False)

    # start_time = time.time()
    # train_dist_smogn_4 = DistributedSMOGN(label_col=label_col, k_partitions=4).transform(train)
    # end_time = time.time()
    # execution_times[dataset]["Distributed SMOGN (k_partitions = 4)"] = round(end_time - start_time, 3)
    # train_dist_smogn_4.toPandas().to_csv(f"{DATA_PROCESSED_TRAIN_DIR}/{dataset}_dist_smogn_4.csv", index=False)

    # start_time = time.time()
    # train_dist_smogn_8 = DistributedSMOGN(label_col=label_col, k_partitions=8).transform(train)
    # end_time = time.time()
    # execution_times[dataset]["Distributed SMOGN (k_partitions = 8)"] = round(end_time - start_time, 3)
    # train_dist_smogn_8.toPandas().to_csv(f"{DATA_PROCESSED_TRAIN_DIR}/{dataset}_dist_smogn_8.csv", index=False)

  df = pd.read_csv(f"{DATA_RAW_DIR}/{dataset}.csv")


In [7]:
# Write the collected timings to CSV. Building the frame from the nested dict
# gives one column per dataset; the inner keys become the index, which is kept
# in the file (index=True).
# to_csv does not create missing directories, so create the target first.
os.makedirs(RESULT_EXECUTION_TIME_DIR, exist_ok=True)
pd.DataFrame(data=execution_times).to_csv(f"{RESULT_EXECUTION_TIME_DIR}/execution_time.csv", index=True)