In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')
# Optional: uncomment the line below and point %cd at this notebook's folder on Drive.
# NOTE(review): the `configuration` import in a later cell presumably relies on
# that folder being the working directory — confirm.
# %cd #YOUR PATH TO THE NOTEBOOK FOLDER IN GOOGLE COLAB

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from configuration import experiment_id as EXPERIMENT_ID
from configuration import data_root as DATA_ROOT
from configuration import data_path as DATA_PATH
from configuration import budget as BUDGET
from configuration import algorithms as ALGORITHMS
from configuration import n_iid as N_IID

# Echo the data path imported from `configuration` so it shows up as cell output.
DATA_PATH

In [None]:
# Lift pandas' column display limit (None = no limit) so wide frames are shown in full.
pd.set_option('display.max_columns', None)

# Echo the budget imported from `configuration` so it shows up as cell output.
BUDGET

In [None]:
# Echo the algorithm names imported from `configuration`; they are later used as
# the target columns when the dataset is split.
ALGORITHMS

In [None]:
# Echo the `n_iid` setting imported from `configuration` so it shows up as cell output.
N_IID

In [None]:
def load_data(X_path, y_path):
    """Read the feature table and the target table from CSV.

    Args:
        X_path: Path (or file-like object) of the feature CSV.
        y_path: Path (or file-like object) of the target CSV.

    Returns:
        A ``(features, targets)`` tuple of DataFrames, read in that order.
    """
    features, targets = (pd.read_csv(source) for source in (X_path, y_path))
    return features, targets

def merge_data(X, y):
    """Join features and targets on their shared keys.

    Args:
        X: Feature DataFrame containing ``f_id`` and ``i_id`` columns.
        y: Target DataFrame containing ``f_id`` and ``i_id`` columns.

    Returns:
        The merged DataFrame (pandas' default join for ``pd.merge``), indexed
        by ``(f_id, i_id)``.
    """
    key_columns = ["f_id", "i_id"]
    merged = pd.merge(X, y, on=key_columns)
    merged = merged.set_index(key_columns)
    return merged

def split_data(data, algorithms, n_instances=45, n_test=5, seed=1):
    """Split data into features/targets and hold out whole ``i_id`` groups for testing.

    The ``algorithms`` columns are the targets; every other column is a feature.
    ``n_test`` ids are drawn without replacement from ``range(n_instances)``;
    every row whose ``i_id`` index level is one of the drawn ids goes to the
    test set, all remaining rows go to the training set.

    With the default arguments the drawn ids are the same as those produced by
    ``np.random.seed(1)`` followed by ``np.random.choice(range(45), size=5,
    replace=False)``, but the draw uses a private generator so NumPy's global
    random state is left untouched.

    Args:
        data: DataFrame whose index has a level named ``i_id``.
        algorithms: List of target column names.
        n_instances: Size of the id population to sample from.
            NOTE(review): assumes ``i_id`` values lie in ``range(n_instances)``
            — confirm against the dataset.
        n_test: Number of ids to hold out for the test set.
        seed: Seed for the private random generator.

    Returns:
        ``(X_train, y_train, X_test, y_test)``.

    Raises:
        ValueError: Propagated from NumPy if ``n_test`` exceeds ``n_instances``.
    """
    y = data[algorithms]
    X = data.drop(columns=algorithms)

    # A dedicated legacy RandomState reproduces the stream of the global
    # seed()+choice() pair without reseeding the process-wide generator.
    rng = np.random.RandomState(seed)
    i_ids = rng.choice(range(n_instances), size=n_test, replace=False).tolist()
    print(f"i_ids in the test dataset: {i_ids}")

    # X and y share data's index, so one boolean mask serves both.
    in_test = data.index.get_level_values("i_id").isin(i_ids)
    X_train, y_train = X[~in_test], y[~in_test]
    X_test, y_test = X[in_test], y[in_test]
    return X_train, y_train, X_test, y_test

def scale_features(X_train, X_test):
    """Min-max scale both feature sets with a scaler fitted on the training set.

    The scaler targets the ``(0, 1)`` range and is fitted on ``X_train`` only;
    ``X_test`` is transformed with those same fitted parameters.

    Args:
        X_train: Training feature DataFrame.
        X_test: Test feature DataFrame.

    Returns:
        ``(X_train_scaled, X_test_scaled)`` as DataFrames that keep the columns
        and index of their respective inputs.
    """
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaler.fit(X_train)

    def _scaled_frame(frame):
        # transform() returns a bare array; restore the original labels.
        return pd.DataFrame(scaler.transform(frame), columns=frame.columns, index=frame.index)

    train_scaled = _scaled_frame(X_train)
    test_scaled = _scaled_frame(X_test)
    return train_scaled, test_scaled

def save_data(X_train, y_train, X_test, y_test, save_path):
    """Write the four splits as CSV files under ``save_path``.

    Each frame's index is turned back into ordinary columns before writing, so
    the files are ``X_train.csv``, ``y_train.csv``, ``X_test.csv`` and
    ``y_test.csv`` with the index levels as leading columns.

    Args:
        X_train, y_train, X_test, y_test: DataFrames to persist.
        save_path: Directory prefix the files are written into.
    """
    named_splits = (
        ("X_train", X_train),
        ("y_train", y_train),
        ("X_test", X_test),
        ("y_test", y_test),
    )
    for stem, frame in named_splits:
        flat = frame.reset_index()
        flat.to_csv(f"{save_path}/{stem}.csv", index=False)

def create_datasets_mtr(X_path, y_path, algorithms, save_path):
    """Run the full dataset preparation: load, merge, split, scale, save.

    A preview (head and shape) of the merged table is printed along the way.
    Only the features are scaled; the targets are saved as split.

    Args:
        X_path: Location of the feature CSV.
        y_path: Location of the target CSV.
        algorithms: Target column names, forwarded to ``split_data``.
        save_path: Directory prefix the resulting CSVs are written into.
    """
    merged = merge_data(*load_data(X_path, y_path))

    print("Preview dataset: ")
    print(merged.head())
    print(merged.shape)

    train_X, train_y, test_X, test_y = split_data(merged, algorithms)
    train_X_scaled, test_X_scaled = scale_features(train_X, test_X)
    save_data(train_X_scaled, train_y, test_X_scaled, test_y, save_path)

# Input CSVs sit directly under the configured data path.
X_PATH = f"{DATA_PATH}/X.csv"
Y_PATH = f"{DATA_PATH}/y.csv"

# Build the train/test splits and write them back into DATA_PATH.
create_datasets_mtr(
    X_path=X_PATH,
    y_path=Y_PATH,
    algorithms=ALGORITHMS,
    save_path=DATA_PATH,
)

### Plot target data

In [None]:
def load_and_prepare_data(file_path, id_vars):
    """Read a target CSV and reshape it to long form for plotting.

    ``f_id`` and ``i_id`` are read as strings. All columns not listed in
    ``id_vars`` are melted into two columns: ``algorithm`` (the former column
    name) and ``ground_truth`` (its value).

    Args:
        file_path: Path (or file-like object) of the CSV to read.
        id_vars: Columns to keep as identifiers during the melt.

    Returns:
        The long-form DataFrame.
    """
    wide = pd.read_csv(file_path, dtype={"f_id": str, "i_id": str})
    long_form = wide.melt(id_vars=id_vars)
    renamed = long_form.rename(columns={"variable": "algorithm", "value": "ground_truth"})
    return renamed

def plot_data(data, save_path, plot_title, ylim=(-0.3, 8)):
    """Scatter-plot the ground-truth values per ``f_id`` and save the figure.

    One marker is drawn per row of ``data``; colour and marker style both
    encode the ``algorithm`` column. The figure is written as PNG and PDF to
    ``{save_path}/ground_truth_<title>.{png,pdf}``, where ``<title>`` is
    ``plot_title`` lower-cased with spaces replaced by underscores, and is
    then shown.

    Args:
        data: Long-form DataFrame with ``f_id``, ``ground_truth`` and
            ``algorithm`` columns.
        save_path: Directory prefix the image files are written into.
        plot_title: Title of the plot; also determines the file name.
        ylim: ``(bottom, top)`` limits of the y-axis.
    """
    sns.set_theme(style='white', font_scale=1.1)
    fig, ax = plt.subplots(figsize=(10, 4))
    sns.scatterplot(
        data=data,
        x="f_id",
        y="ground_truth",
        hue="algorithm",
        style="algorithm",
        alpha=0.5,
        s=100,
        ax=ax,
    )
    # Vertical guide lines only, one per f_id tick.
    ax.xaxis.grid(True)
    ax.set_ylim(*ylim)
    ax.set_title(plot_title)
    # Lay out only after limits and title exist, so the title is accounted
    # for and not clipped in the saved files.
    fig.tight_layout()

    file_stem = f"{save_path}/ground_truth_{plot_title.lower().replace(' ', '_')}"
    for extension in ("png", "pdf"):
        fig.savefig(f"{file_stem}.{extension}")
    plt.show()

# Target CSVs written by the dataset-creation step.
train_data_path = f"{DATA_PATH}/y_train.csv"
test_data_path = f"{DATA_PATH}/y_test.csv"

# Reshape both splits to long form, keyed by (f_id, i_id).
train_data, test_data = (
    load_and_prepare_data(csv_path, id_vars=["f_id", "i_id"])
    for csv_path in (train_data_path, test_data_path)
)

# One figure per split, saved next to the data.
plot_data(data=train_data, save_path=DATA_PATH, plot_title="Train Data Ground Truth")
plot_data(data=test_data, save_path=DATA_PATH, plot_title="Test Data Ground Truth")