#### **Download the TOC-UCO repository into a temporary directory**

In [None]:
import os
import zipfile
import requests
import tempfile

# Download the TOC-UCO archive and extract it into a fresh temporary directory.
url = "https://www.uco.es/grupos/ayrna/datasets/TOC-UCO.zip"
# The timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=60)
# Fail here on an HTTP error instead of later with an opaque BadZipFile.
response.raise_for_status()

# Directory that receives the extracted archive. It is deliberately left on
# disk so that it can still be read after this cell finishes.
tmp_tocuco_path = tempfile.mkdtemp()

# Stage the downloaded bytes in a temporary file that is removed automatically
# once extraction is done. The already-open handle is passed to ZipFile (after
# flushing and rewinding) so that no buffered bytes are missing and the file
# does not have to be reopened by name.
with tempfile.NamedTemporaryFile(suffix=".zip") as temp_file:
    temp_file.write(response.content)
    temp_file.flush()
    temp_file.seek(0)
    with zipfile.ZipFile(temp_file, "r") as zip_ref:
        zip_ref.extractall(tmp_tocuco_path)
        extracted_files = zip_ref.namelist()

# From here on, the path points at the "TOC-UCO" folder inside the extraction
# directory.
tmp_tocuco_path = os.path.join(tmp_tocuco_path, "TOC-UCO")

print(f"TOC-UCO repository saved temporarily at {tmp_tocuco_path}")
print(f"Files inside the repository: {os.listdir(tmp_tocuco_path)}")

TOCUCO repository saved temporarily at /tmp/tmp37lrq1zt/TOCUCO
Files inside the repository: ['metadata.csv', 'train_masks.json', 'data', 'train_masks.pkl']


#### **Iterate over the fixed training and test partitions of the TOC-UCO datasets**

In [None]:
import joblib
import pandas as pd
import numpy as np

# Number of seeded partitions to iterate per dataset.
# Seeds should be less than 30 (presumably the number of masks shipped per
# dataset -- confirm against the repository metadata).
SEEDS = 3

# Load the training masks used to build the train/test partitions.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code. Only run this on an archive obtained from a trusted source.
with open(os.path.join(tmp_tocuco_path, "train_masks.pkl"), "rb") as train_masks_binary:
    train_masks = joblib.load(train_masks_binary)

tocuco_datasets_path = os.path.join(tmp_tocuco_path, "data")

# Sorted so that the iteration order is reproducible; os.listdir returns
# entries in arbitrary order.
for dataset_name in sorted(os.listdir(tocuco_datasets_path)):
    dataset_path = os.path.join(tocuco_datasets_path, dataset_name)

    # Skip hidden entries and sub-directories (e.g. notebook checkpoints),
    # which cannot be parsed as datasets.
    if dataset_name.startswith(".") or not os.path.isfile(dataset_path):
        continue

    dataset = pd.read_csv(dataset_path)

    # The stem does not depend on the seed, so compute it once per dataset.
    # splitext only strips the final extension, so dots inside the dataset
    # name are preserved.
    dataset_name_without_extension = os.path.splitext(dataset_name)[0]

    for seed in range(SEEDS):
        # Masks are stored under the key "<dataset>_seed_<seed>". The mask is
        # coerced to a boolean array so that it can be negated with ~ even if
        # it was stored as a plain list (expected: one flag per dataset row).
        dataset_seed_train_mask = np.asarray(
            train_masks[f"{dataset_name_without_extension}_seed_{seed}"],
            dtype=bool,
        )

        # Rows flagged by the mask form the train partition; the rest are test.
        train = dataset.loc[dataset_seed_train_mask]
        test = dataset.loc[~dataset_seed_train_mask]

        # Separate the features from the target column "y" and convert
        # everything to numpy arrays.
        X_train = train.drop(columns=["y"]).to_numpy()
        X_test = test.drop(columns=["y"]).to_numpy()
        y_train = train["y"].to_numpy()
        y_test = test["y"].to_numpy()
        del train, test

        print(f"Dataset: {dataset_name_without_extension}, Seed: {seed}")
        print(f"X_train shape: {X_train.shape}")
        print(f"X_test shape: {X_test.shape}")
        print(f"y_train counts: {np.unique(y_train, return_counts=True)}")
        print(f"y_test unique and counts: {np.unique(y_test, return_counts=True)}")
        print("")

Dataset: oc03_balanceScale, Seed: 0
X_train shape: (437, 4)
X_test shape: (188, 4)
y_train counts: (array([0, 1, 2]), array([202,  34, 201]))
y_test unique and counts: (array([0, 1, 2]), array([86, 15, 87]))

Dataset: oc03_balanceScale, Seed: 1
X_train shape: (437, 4)
X_test shape: (188, 4)
y_train counts: (array([0, 1, 2]), array([202,  34, 201]))
y_test unique and counts: (array([0, 1, 2]), array([86, 15, 87]))

Dataset: oc03_balanceScale, Seed: 2
X_train shape: (437, 4)
X_test shape: (188, 4)
y_train counts: (array([0, 1, 2]), array([201,  34, 202]))
y_test unique and counts: (array([0, 1, 2]), array([87, 15, 86]))

Dataset: dr09_computer2, Seed: 0
X_train shape: (5734, 21)
X_test shape: (2458, 21)
y_train counts: (array([0, 1, 2, 3, 4, 5, 6, 7, 8]), array([ 379,  120,  169,  324,  450,  646,  966, 1221, 1459]))
y_test unique and counts: (array([0, 1, 2, 3, 4, 5, 6, 7, 8]), array([162,  52,  72, 139, 193, 277, 415, 523, 625]))

Dataset: dr09_computer2, Seed: 1
X_train shape: (5734, 