## Setup

In [1]:
%load_ext autoreload
%autoreload 2

In [None]:
from pathlib import Path
import logging
import itertools
import warnings
from datetime import datetime
import gc

import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from datasets import Dataset, DatasetDict

import matplotlib.pyplot as plt
import seaborn as sns
import scienceplots

import useful_rdkit_utils as uru
from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator

In [3]:
plt.style.use(["science"])

In [4]:
%matplotlib inline

In [5]:
# setup tqdm
tqdm.pandas()

In [6]:
# setup logging
level = logging.DEBUG
logger = logging.getLogger(__name__)
if logger.hasHandlers():
    logger.handlers.clear()

formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
ch = logging.StreamHandler()
ch.setFormatter(formatter)
logger.addHandler(ch)
logger.setLevel(level)

logger.info("Imports successful.")

2025-11-16 20:32:49,741 - __main__ - INFO - Imports successful.


## Load Data

In [7]:
# Data input and output directories
base_data_dir = Path().cwd().parents[0] / "assets/dataset/eda/data/set"
output_dir = base_data_dir.parents[2] / "splits"
output_dir.mkdir(parents=True, exist_ok=True)

output_fig_dir = output_dir / f"figures/{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
output_fig_dir.mkdir(parents=True, exist_ok=True)

if not base_data_dir.exists():
    raise FileNotFoundError(f"Data directory not found at {base_data_dir}")

logger.info(f"Output directory set to {output_dir}")
logger.info(f"Input data directory found at {base_data_dir}")
for dataset_dir in base_data_dir.iterdir():
    logger.info(f"Dataset name: {dataset_dir.name}")

2025-11-16 20:32:49,782 - __main__ - INFO - Output directory set to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits
2025-11-16 20:32:49,783 - __main__ - INFO - Input data directory found at /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/eda/data/set
2025-11-16 20:32:49,783 - __main__ - INFO - Dataset name: cleaned_combined_datasets_low_quality_summary_table.csv
2025-11-16 20:32:49,783 - __main__ - INFO - Dataset name: cleaned_combined_datasets_medium_quality_summary_table.csv
2025-11-16 20:32:49,783 - __main__ - INFO - Dataset name: cleaned_combined_datasets_medium_quality.csv
2025-11-16 20:32:49,783 - __main__ - INFO - Dataset name: cleaned_combined_datasets_low_medium_high_quality.csv
2025-11-16 20:32:49,784 - __main__ - INFO - Dataset name: cleaned_combined_datasets_high_quality.csv
2025-11-16 20:32:49,784 - __main__ - INFO - Dataset name: cleaned

In [8]:
# Load input datasets
datasets = {
    "high": pd.read_csv(base_data_dir / "cleaned_combined_datasets_high_quality.csv"),
    "medium": pd.read_csv(
        base_data_dir / "cleaned_combined_datasets_medium_high_quality.csv", low_memory=False
    ),
    "low": pd.read_csv(
        base_data_dir / "cleaned_combined_datasets_low_medium_high_quality.csv", low_memory=False
    ),
}

for name, df in datasets.items():
    logger.info(f"Dataset: {name}, shape: {df.shape}")
    logger.info(f"Columns: {df.columns.tolist()}")
    logger.info(f"Unique Dataset Constituents: {df['Dataset'].unique()}")

2025-11-16 20:32:50,042 - __main__ - INFO - Dataset: high, shape: (5326, 12)
2025-11-16 20:32:50,043 - __main__ - INFO - Columns: ['Molecule Name', 'SMILES', 'Dataset', 'LogD', 'KSOL', 'HLM CLint', 'MLM CLint', 'Caco-2 Permeability Papp A>B', 'Caco-2 Permeability Efflux', 'MPPB', 'MBPB', 'MGMB']
2025-11-16 20:32:50,043 - __main__ - INFO - Unique Dataset Constituents: ['expansionrx']
2025-11-16 20:32:50,044 - __main__ - INFO - Dataset: medium, shape: (94708, 12)
2025-11-16 20:32:50,044 - __main__ - INFO - Columns: ['Molecule Name', 'SMILES', 'Dataset', 'LogD', 'KSOL', 'HLM CLint', 'MLM CLint', 'Caco-2 Permeability Papp A>B', 'Caco-2 Permeability Efflux', 'MPPB', 'MBPB', 'MGMB']
2025-11-16 20:32:50,047 - __main__ - INFO - Unique Dataset Constituents: ['expansionrx' 'kermt_public']
2025-11-16 20:32:50,047 - __main__ - INFO - Dataset: low, shape: (116527, 12)
2025-11-16 20:32:50,048 - __main__ - INFO - Columns: ['Molecule Name', 'SMILES', 'Dataset', 'LogD', 'KSOL', 'HLM CLint', 'MLM CLint'

In [9]:
# calculate fingerprints for all molecules in each dataset
fpgen = rdFingerprintGenerator.GetMorganGenerator(
    radius=3,
    countSimulation=False,
    includeChirality=False,
    fpSize=2048,
)

for name, df in datasets.items():
    logger.info(f"Calculating fingerprints for dataset: {name}")
    df["mol"] = df["SMILES"].progress_apply(Chem.MolFromSmiles)
    df["Fingerprint"] = df["mol"].progress_apply(fpgen.GetCountFingerprintAsNumPy)

    df.drop(columns=["mol"], inplace=True)

    # put fingerprint column after "Molecule Name,SMILES,Dataset"
    cols = df.columns.tolist()
    cols.insert(3, cols.pop(cols.index("Fingerprint")))
    df = df[cols]

    # expand fingerprint numpy arrays into separate columns
    fp_array = np.vstack(df["Fingerprint"].values)
    fp_df = pd.DataFrame(fp_array, columns=[f"Morgan_FP_{i}" for i in range(fp_array.shape[1])])
    df = pd.concat([df.reset_index(drop=True), fp_df.reset_index(drop=True)], axis=1)
    df.drop(columns=["Fingerprint"], inplace=True)
    logger.debug(f"Number of fingerprint columns added: {fp_df.shape[1]}")

    datasets[name] = df
    logger.info(f"Fingerprints calculated for dataset: {name}")
    logger.debug(f"Dataset {name} columns after fingerprint calculation: {df.columns.tolist()}")

2025-11-16 20:32:50,083 - __main__ - INFO - Calculating fingerprints for dataset: high


  0%|          | 0/5326 [00:00<?, ?it/s]

  0%|          | 0/5326 [00:00<?, ?it/s]

2025-11-16 20:32:51,454 - __main__ - DEBUG - Number of fingerprint columns added: 2048
2025-11-16 20:32:51,455 - __main__ - INFO - Fingerprints calculated for dataset: high
2025-11-16 20:32:51,455 - __main__ - DEBUG - Dataset high columns after fingerprint calculation: ['Molecule Name', 'SMILES', 'Dataset', 'LogD', 'KSOL', 'HLM CLint', 'MLM CLint', 'Caco-2 Permeability Papp A>B', 'Caco-2 Permeability Efflux', 'MPPB', 'MBPB', 'MGMB', 'Morgan_FP_0', 'Morgan_FP_1', 'Morgan_FP_2', 'Morgan_FP_3', 'Morgan_FP_4', 'Morgan_FP_5', 'Morgan_FP_6', 'Morgan_FP_7', 'Morgan_FP_8', 'Morgan_FP_9', 'Morgan_FP_10', 'Morgan_FP_11', 'Morgan_FP_12', 'Morgan_FP_13', 'Morgan_FP_14', 'Morgan_FP_15', 'Morgan_FP_16', 'Morgan_FP_17', 'Morgan_FP_18', 'Morgan_FP_19', 'Morgan_FP_20', 'Morgan_FP_21', 'Morgan_FP_22', 'Morgan_FP_23', 'Morgan_FP_24', 'Morgan_FP_25', 'Morgan_FP_26', 'Morgan_FP_27', 'Morgan_FP_28', 'Morgan_FP_29', 'Morgan_FP_30', 'Morgan_FP_31', 'Morgan_FP_32', 'Morgan_FP_33', 'Morgan_FP_34', 'Morgan_FP_35

  0%|          | 0/94708 [00:00<?, ?it/s]

  0%|          | 0/94708 [00:00<?, ?it/s]

2025-11-16 20:33:12,136 - __main__ - DEBUG - Number of fingerprint columns added: 2048
2025-11-16 20:33:12,137 - __main__ - INFO - Fingerprints calculated for dataset: medium
2025-11-16 20:33:12,137 - __main__ - DEBUG - Dataset medium columns after fingerprint calculation: ['Molecule Name', 'SMILES', 'Dataset', 'LogD', 'KSOL', 'HLM CLint', 'MLM CLint', 'Caco-2 Permeability Papp A>B', 'Caco-2 Permeability Efflux', 'MPPB', 'MBPB', 'MGMB', 'Morgan_FP_0', 'Morgan_FP_1', 'Morgan_FP_2', 'Morgan_FP_3', 'Morgan_FP_4', 'Morgan_FP_5', 'Morgan_FP_6', 'Morgan_FP_7', 'Morgan_FP_8', 'Morgan_FP_9', 'Morgan_FP_10', 'Morgan_FP_11', 'Morgan_FP_12', 'Morgan_FP_13', 'Morgan_FP_14', 'Morgan_FP_15', 'Morgan_FP_16', 'Morgan_FP_17', 'Morgan_FP_18', 'Morgan_FP_19', 'Morgan_FP_20', 'Morgan_FP_21', 'Morgan_FP_22', 'Morgan_FP_23', 'Morgan_FP_24', 'Morgan_FP_25', 'Morgan_FP_26', 'Morgan_FP_27', 'Morgan_FP_28', 'Morgan_FP_29', 'Morgan_FP_30', 'Morgan_FP_31', 'Morgan_FP_32', 'Morgan_FP_33', 'Morgan_FP_34', 'Morgan_F

  0%|          | 0/116527 [00:00<?, ?it/s]

  0%|          | 0/116527 [00:00<?, ?it/s]

2025-11-16 20:33:37,122 - __main__ - DEBUG - Number of fingerprint columns added: 2048
2025-11-16 20:33:37,122 - __main__ - INFO - Fingerprints calculated for dataset: low
2025-11-16 20:33:37,123 - __main__ - DEBUG - Dataset low columns after fingerprint calculation: ['Molecule Name', 'SMILES', 'Dataset', 'LogD', 'KSOL', 'HLM CLint', 'MLM CLint', 'Caco-2 Permeability Efflux', 'MPPB', 'Caco-2 Permeability Papp A>B', 'MBPB', 'MGMB', 'Morgan_FP_0', 'Morgan_FP_1', 'Morgan_FP_2', 'Morgan_FP_3', 'Morgan_FP_4', 'Morgan_FP_5', 'Morgan_FP_6', 'Morgan_FP_7', 'Morgan_FP_8', 'Morgan_FP_9', 'Morgan_FP_10', 'Morgan_FP_11', 'Morgan_FP_12', 'Morgan_FP_13', 'Morgan_FP_14', 'Morgan_FP_15', 'Morgan_FP_16', 'Morgan_FP_17', 'Morgan_FP_18', 'Morgan_FP_19', 'Morgan_FP_20', 'Morgan_FP_21', 'Morgan_FP_22', 'Morgan_FP_23', 'Morgan_FP_24', 'Morgan_FP_25', 'Morgan_FP_26', 'Morgan_FP_27', 'Morgan_FP_28', 'Morgan_FP_29', 'Morgan_FP_30', 'Morgan_FP_31', 'Morgan_FP_32', 'Morgan_FP_33', 'Morgan_FP_34', 'Morgan_FP_35',

In [10]:
# on high-quality dataset, sort by Molecule Name ascending and split test/train by first 90%/10%
percentage_train = 0.9
percentage_validation = 0.1

high_quality_df = datasets["high"].sort_values(by="Molecule Name").reset_index(drop=True)
n_total = high_quality_df.shape[0]
n_train = int(n_total * percentage_train)
n_test = n_total - n_train

test_df = high_quality_df.iloc[n_train:]
train_df = high_quality_df.iloc[:n_train]

# randomly split train into train/validation sets
train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)
n_validation = int(train_df.shape[0] * percentage_validation)
validation_df = train_df.iloc[:n_validation]
train_df = train_df.iloc[n_validation:]


logger.info(f"High-quality dataset total samples: {n_total}")
logger.info(f"Training samples: {train_df.shape[0]}")
logger.info(f"Validation samples: {validation_df.shape[0]}")
logger.info(f"Testing samples: {test_df.shape[0]}")

# save to temporal datasplit
temporal_dir = output_dir / "high_quality/temporal_split"
temporal_dir.mkdir(parents=True, exist_ok=True)

# convert to hf dataset and save
train_hf = Dataset.from_pandas(train_df, preserve_index=False)
validation_hf = Dataset.from_pandas(validation_df, preserve_index=False)
test_hf = Dataset.from_pandas(test_df, preserve_index=False)
temporal_hf = DatasetDict({"train": train_hf, "validation": validation_hf, "test": test_hf})
# save to disk
temporal_hf.save_to_disk(str(temporal_dir))
logger.info(f"Temporal split datasets saved to {temporal_dir}")

2025-11-16 20:33:37,307 - __main__ - INFO - High-quality dataset total samples: 5326
2025-11-16 20:33:37,308 - __main__ - INFO - Training samples: 4314
2025-11-16 20:33:37,308 - __main__ - INFO - Validation samples: 479
2025-11-16 20:33:37,308 - __main__ - INFO - Testing samples: 533


Saving the dataset (0/1 shards):   0%|          | 0/4314 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/479 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/533 [00:00<?, ? examples/s]

2025-11-16 20:33:40,209 - __main__ - INFO - Temporal split datasets saved to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/temporal_split


In [None]:
n_folds = 5
n_splits = 5
percentage_validation = 0.1
stratify_column = "Dataset"

split_dict = {
    "random_cluster": uru.get_random_clusters,
    "scaffold_cluster": uru.get_bemis_murcko_clusters,
    "kmeans_cluster": uru.get_kmeans_clusters,  # n_clusters = 10 by default
    "umap_cluster": uru.get_umap_clusters,  # n_clusters = 7 by default
    # FIXME: enable butina clustering later
    # "butina_cluster": uru.get_butina_clusters,  # cutoff = 0.65 by default
}

In [12]:
split_datasets = {}

n_iter = len(datasets) * len(split_dict) * n_splits
logger.info(f"Total iterations for dataset splits: {n_iter}")

pbar = tqdm(total=n_iter, desc="Creating dataset splits")
for dset_name, data in datasets.items():  # iterate over different datasets
    split_datasets[dset_name] = {}

    for split_name, split in split_dict.items():  # iterate over different splitting methods
        logger.info(f"Processing dataset: {dset_name} with split method: {split_name}")
        split_datasets[dset_name][split_name] = {}

        for i in range(0, n_splits):  # iterate over different splits
            split_datasets[dset_name][split_name][f"split_{i}"] = {}
            group_kfold_shuffle = uru.GroupKFoldShuffle(n_splits=n_folds, random_state=i, shuffle=True)

            for group in data[stratify_column].unique():  # iterate over different dataset groups
                # stratified group k-fold split (based on "Dataset" column)
                subdata = data[data[stratify_column] == group]
                cluster_list = split(subdata.SMILES)

                # make fictitious subdata indices to map back to original data later
                subdata_indices = subdata.index.to_numpy().copy()

                # iterate over different folds within each split
                for j, (subdata_train_idx, subdata_test_idx) in tqdm(
                    enumerate(
                        group_kfold_shuffle.split(subdata_indices, groups=cluster_list),
                    ),
                    desc=f"Dataset: {dset_name}, Split: {split_name}, Group: {group}", leave=False,
                ):

                    if f"fold_{j}" not in split_datasets[dset_name][split_name][f"split_{i}"]:
                        split_datasets[dset_name][split_name][f"split_{i}"][f"fold_{j}"] = {}

                    # map indices back to original data
                    train_idx = subdata_indices[subdata_train_idx]
                    test_idx = subdata_indices[subdata_test_idx]

                    # further split train_idx into train and validation sets
                    n_train_samples = len(train_idx)
                    n_val_samples = int(n_train_samples * percentage_validation)
                    np.random.seed(i + j)  # ensure reproducibility
                    shuffled_train_idx = np.random.permutation(train_idx)
                    val_idx = shuffled_train_idx[:n_val_samples]
                    train_idx = shuffled_train_idx[n_val_samples:]

                    # save indices for each group split
                    split_datasets[dset_name][split_name][f"split_{i}"][f"fold_{j}"][group] = {
                        "train": train_idx,
                        "validation": val_idx,
                        "test": test_idx,
                    }

                    # garbage collection
                    gc.collect()

                # garbage collection
                gc.collect()

            pbar.update(1)

            # combine group splits into final train/test sets for each fold
            # logger.debug(f"Combining group splits for dataset: {dset_name}, split: {split_name}, iteration: {i}")
            for j in range(n_folds):
                if f"fold_{j}" not in split_datasets[dset_name][split_name][f"split_{i}"]:
                    raise ValueError(
                        f"Fold {j} not found in split {i} for dataset {dset_name} and split method {split_name}"
                    )

                combined_train_indices = []
                combined_val_indices = []
                combined_test_indices = []

                for group in data[stratify_column].unique():
                    group_split = split_datasets[dset_name][split_name][f"split_{i}"][f"fold_{j}"][group]
                    combined_train_indices.extend(group_split["train"])
                    combined_val_indices.extend(group_split["validation"])
                    combined_test_indices.extend(group_split["test"])

                combined_train_indices = np.array(combined_train_indices)
                combined_val_indices = np.array(combined_val_indices)
                combined_test_indices = np.array(combined_test_indices)

                # save combined train/test sets
                split_datasets[dset_name][split_name][f"split_{i}"][f"fold_{j}"]["total"] = {
                    "train": combined_train_indices,
                    "validation": combined_val_indices,
                    "test": combined_test_indices,
                }

                # final assertions


pbar.close()

2025-11-16 20:33:40,338 - __main__ - INFO - Total iterations for dataset splits: 60


Creating dataset splits:   0%|          | 0/60 [00:00<?, ?it/s]

2025-11-16 20:33:40,341 - __main__ - INFO - Processing dataset: high with split method: random_cluster
2025-11-16 20:33:40,376 - __main__ - INFO - Processing dataset: high with split method: scaffold_cluster
2025-11-16 20:33:47,957 - __main__ - INFO - Processing dataset: high with split method: kmeans_cluster
2025-11-16 20:33:55,350 - __main__ - INFO - Processing dataset: high with split method: umap_cluster
2025-11-16 20:34:23,974 - __main__ - INFO - Processing dataset: medium with split method: random_cluster
2025-11-16 20:34:27,565 - __main__ - INFO - Processing dataset: medium with split method: scaffold_cluster
2025-11-16 20:36:05,923 - __main__ - INFO - Processing dataset: medium with split method: kmeans_cluster
2025-11-16 20:37:54,459 - __main__ - INFO - Processing dataset: medium with split method: umap_cluster
2025-11-16 21:01:32,576 - __main__ - INFO - Processing dataset: low with split method: random_cluster
2025-11-16 21:01:43,523 - __main__ - INFO - Processing dataset: lo

In [13]:
# save all datasets with name format {dataset}_quality/{split_method}/split{split_number}_fold{fold_number}.csv
for dset_name, splits in split_datasets.items():
    for split_name, split_data in splits.items():
        for split_number, folds in split_data.items():
            for fold_number, datasets_dict in folds.items():

                split_output_dir = (
                    output_dir / f"{dset_name}_quality/{split_name}/{split_number}/{fold_number}"
                )
                split_output_dir.mkdir(parents=True, exist_ok=True)
                logger.debug(f"Saving dataset to {split_output_dir}")

                train_idx = datasets_dict["total"]["train"]
                val_idx = datasets_dict["total"]["validation"]
                test_idx = datasets_dict["total"]["test"]

                # convert pandas to HF dataset
                train_hf = Dataset.from_pandas(data.loc[train_idx], preserve_index=False)
                val_hf = Dataset.from_pandas(data.loc[val_idx], preserve_index=False)
                test_hf = Dataset.from_pandas(data.loc[test_idx], preserve_index=False)
                dset = DatasetDict({"train": train_hf, "validation": val_hf, "test": test_hf})

                # Save to disk as HF dataset
                dset.save_to_disk(f"{split_output_dir}/hf_dataset")

        # print size of folder in MB after saving all splits
        folder_size = sum(f.stat().st_size for f in split_output_dir.glob("**/*") if f.is_file())
        folder_size_mb = folder_size / (1024 * 1024)
        logger.info(
            f"Saved all splits for dataset: {dset_name}, split method: {split_name}. Folder size: {folder_size_mb:.2f} MB"
        )

2025-11-16 22:26:27,802 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/random_cluster/split_0/fold_0


Saving the dataset (0/1 shards):   0%|          | 0/3834 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/426 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1066 [00:00<?, ? examples/s]

2025-11-16 22:26:33,956 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/random_cluster/split_0/fold_1


Saving the dataset (0/1 shards):   0%|          | 0/3835 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/426 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1065 [00:00<?, ? examples/s]

2025-11-16 22:26:37,258 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/random_cluster/split_0/fold_2


Saving the dataset (0/1 shards):   0%|          | 0/3835 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/426 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1065 [00:00<?, ? examples/s]

2025-11-16 22:26:40,350 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/random_cluster/split_0/fold_3


Saving the dataset (0/1 shards):   0%|          | 0/3835 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/426 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1065 [00:00<?, ? examples/s]

2025-11-16 22:26:43,578 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/random_cluster/split_0/fold_4


Saving the dataset (0/1 shards):   0%|          | 0/3835 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/426 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1065 [00:00<?, ? examples/s]

2025-11-16 22:26:46,685 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/random_cluster/split_1/fold_0


Saving the dataset (0/1 shards):   0%|          | 0/3834 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/426 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1066 [00:00<?, ? examples/s]

2025-11-16 22:26:49,793 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/random_cluster/split_1/fold_1


Saving the dataset (0/1 shards):   0%|          | 0/3835 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/426 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1065 [00:00<?, ? examples/s]

2025-11-16 22:26:52,891 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/random_cluster/split_1/fold_2


Saving the dataset (0/1 shards):   0%|          | 0/3835 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/426 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1065 [00:00<?, ? examples/s]

2025-11-16 22:26:56,177 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/random_cluster/split_1/fold_3


Saving the dataset (0/1 shards):   0%|          | 0/3835 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/426 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1065 [00:00<?, ? examples/s]

2025-11-16 22:26:59,332 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/random_cluster/split_1/fold_4


Saving the dataset (0/1 shards):   0%|          | 0/3835 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/426 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1065 [00:00<?, ? examples/s]

2025-11-16 22:27:02,525 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/random_cluster/split_2/fold_0


Saving the dataset (0/1 shards):   0%|          | 0/3834 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/426 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1066 [00:00<?, ? examples/s]

2025-11-16 22:27:05,742 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/random_cluster/split_2/fold_1


Saving the dataset (0/1 shards):   0%|          | 0/3835 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/426 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1065 [00:00<?, ? examples/s]

2025-11-16 22:27:09,043 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/random_cluster/split_2/fold_2


Saving the dataset (0/1 shards):   0%|          | 0/3835 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/426 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1065 [00:00<?, ? examples/s]

2025-11-16 22:27:12,108 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/random_cluster/split_2/fold_3


Saving the dataset (0/1 shards):   0%|          | 0/3835 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/426 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1065 [00:00<?, ? examples/s]

2025-11-16 22:27:15,218 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/random_cluster/split_2/fold_4


Saving the dataset (0/1 shards):   0%|          | 0/3835 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/426 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1065 [00:00<?, ? examples/s]

2025-11-16 22:27:18,472 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/random_cluster/split_3/fold_0


Saving the dataset (0/1 shards):   0%|          | 0/3834 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/426 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1066 [00:00<?, ? examples/s]

2025-11-16 22:27:21,573 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/random_cluster/split_3/fold_1


Saving the dataset (0/1 shards):   0%|          | 0/3835 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/426 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1065 [00:00<?, ? examples/s]

2025-11-16 22:27:24,701 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/random_cluster/split_3/fold_2


Saving the dataset (0/1 shards):   0%|          | 0/3835 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/426 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1065 [00:00<?, ? examples/s]

2025-11-16 22:27:27,827 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/random_cluster/split_3/fold_3


Saving the dataset (0/1 shards):   0%|          | 0/3835 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/426 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1065 [00:00<?, ? examples/s]

2025-11-16 22:27:31,079 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/random_cluster/split_3/fold_4


Saving the dataset (0/1 shards):   0%|          | 0/3835 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/426 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1065 [00:00<?, ? examples/s]

2025-11-16 22:27:34,216 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/random_cluster/split_4/fold_0


Saving the dataset (0/1 shards):   0%|          | 0/3834 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/426 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1066 [00:00<?, ? examples/s]

2025-11-16 22:27:37,330 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/random_cluster/split_4/fold_1


Saving the dataset (0/1 shards):   0%|          | 0/3835 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/426 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1065 [00:00<?, ? examples/s]

2025-11-16 22:27:40,493 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/random_cluster/split_4/fold_2


Saving the dataset (0/1 shards):   0%|          | 0/3835 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/426 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1065 [00:00<?, ? examples/s]

2025-11-16 22:27:43,745 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/random_cluster/split_4/fold_3


Saving the dataset (0/1 shards):   0%|          | 0/3835 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/426 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1065 [00:00<?, ? examples/s]

2025-11-16 22:27:46,891 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/random_cluster/split_4/fold_4


Saving the dataset (0/1 shards):   0%|          | 0/3835 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/426 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1065 [00:00<?, ? examples/s]

2025-11-16 22:27:49,987 - __main__ - INFO - Saved all splits for dataset: high, split method: random_cluster. Folder size: 44.14 MB
2025-11-16 22:27:49,988 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/scaffold_cluster/split_0/fold_0


Saving the dataset (0/1 shards):   0%|          | 0/3773 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/419 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1134 [00:00<?, ? examples/s]

2025-11-16 22:27:53,306 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/scaffold_cluster/split_0/fold_1


Saving the dataset (0/1 shards):   0%|          | 0/3894 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/432 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

2025-11-16 22:27:56,432 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/scaffold_cluster/split_0/fold_2


Saving the dataset (0/1 shards):   0%|          | 0/3894 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/432 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

2025-11-16 22:27:59,515 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/scaffold_cluster/split_0/fold_3


Saving the dataset (0/1 shards):   0%|          | 0/3742 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/415 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1169 [00:00<?, ? examples/s]

2025-11-16 22:28:02,625 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/scaffold_cluster/split_0/fold_4


Saving the dataset (0/1 shards):   0%|          | 0/3873 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/430 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1023 [00:00<?, ? examples/s]

2025-11-16 22:28:05,886 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/scaffold_cluster/split_1/fold_0


Saving the dataset (0/1 shards):   0%|          | 0/3751 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/416 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1159 [00:00<?, ? examples/s]

2025-11-16 22:28:09,027 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/scaffold_cluster/split_1/fold_1


Saving the dataset (0/1 shards):   0%|          | 0/4059 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/451 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/816 [00:00<?, ? examples/s]

2025-11-16 22:28:12,279 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/scaffold_cluster/split_1/fold_2


Saving the dataset (0/1 shards):   0%|          | 0/3789 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/420 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1117 [00:00<?, ? examples/s]

2025-11-16 22:28:15,431 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/scaffold_cluster/split_1/fold_3


Saving the dataset (0/1 shards):   0%|          | 0/3811 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/423 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1092 [00:00<?, ? examples/s]

2025-11-16 22:28:18,732 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/scaffold_cluster/split_1/fold_4


Saving the dataset (0/1 shards):   0%|          | 0/3766 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/418 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1142 [00:00<?, ? examples/s]

2025-11-16 22:28:21,860 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/scaffold_cluster/split_2/fold_0


Saving the dataset (0/1 shards):   0%|          | 0/3742 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/415 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1169 [00:00<?, ? examples/s]

2025-11-16 22:28:25,014 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/scaffold_cluster/split_2/fold_1


Saving the dataset (0/1 shards):   0%|          | 0/3867 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/429 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1030 [00:00<?, ? examples/s]

2025-11-16 22:28:28,138 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/scaffold_cluster/split_2/fold_2


Saving the dataset (0/1 shards):   0%|          | 0/3885 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/431 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1010 [00:00<?, ? examples/s]

2025-11-16 22:28:31,359 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/scaffold_cluster/split_2/fold_3


Saving the dataset (0/1 shards):   0%|          | 0/3816 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/424 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1086 [00:00<?, ? examples/s]

2025-11-16 22:28:34,528 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/scaffold_cluster/split_2/fold_4


Saving the dataset (0/1 shards):   0%|          | 0/3866 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/429 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1031 [00:00<?, ? examples/s]

2025-11-16 22:28:37,643 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/scaffold_cluster/split_3/fold_0


Saving the dataset (0/1 shards):   0%|          | 0/3732 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/414 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1180 [00:00<?, ? examples/s]

2025-11-16 22:28:40,847 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/scaffold_cluster/split_3/fold_1


Saving the dataset (0/1 shards):   0%|          | 0/3773 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/419 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1134 [00:00<?, ? examples/s]

2025-11-16 22:28:44,114 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/scaffold_cluster/split_3/fold_2


Saving the dataset (0/1 shards):   0%|          | 0/3934 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/437 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/955 [00:00<?, ? examples/s]

2025-11-16 22:28:47,183 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/scaffold_cluster/split_3/fold_3


Saving the dataset (0/1 shards):   0%|          | 0/3810 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/423 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1093 [00:00<?, ? examples/s]

2025-11-16 22:28:50,387 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/scaffold_cluster/split_3/fold_4


Saving the dataset (0/1 shards):   0%|          | 0/3926 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/436 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/964 [00:00<?, ? examples/s]

2025-11-16 22:28:53,534 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/scaffold_cluster/split_4/fold_0


Saving the dataset (0/1 shards):   0%|          | 0/3929 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/436 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/961 [00:00<?, ? examples/s]

2025-11-16 22:28:56,607 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/scaffold_cluster/split_4/fold_1


Saving the dataset (0/1 shards):   0%|          | 0/3871 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/430 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1025 [00:00<?, ? examples/s]

2025-11-16 22:28:59,863 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/scaffold_cluster/split_4/fold_2


Saving the dataset (0/1 shards):   0%|          | 0/3877 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/430 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1019 [00:00<?, ? examples/s]

2025-11-16 22:29:03,005 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/scaffold_cluster/split_4/fold_3


Saving the dataset (0/1 shards):   0%|          | 0/3683 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/409 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1234 [00:00<?, ? examples/s]

2025-11-16 22:29:06,134 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/scaffold_cluster/split_4/fold_4


Saving the dataset (0/1 shards):   0%|          | 0/3816 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/423 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1087 [00:00<?, ? examples/s]

2025-11-16 22:29:09,304 - __main__ - INFO - Saved all splits for dataset: high, split method: scaffold_cluster. Folder size: 44.14 MB
2025-11-16 22:29:09,306 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/kmeans_cluster/split_0/fold_0


Saving the dataset (0/1 shards):   0%|          | 0/3675 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/408 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1243 [00:00<?, ? examples/s]

2025-11-16 22:29:12,469 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/kmeans_cluster/split_0/fold_1


Saving the dataset (0/1 shards):   0%|          | 0/3728 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/414 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1184 [00:00<?, ? examples/s]

2025-11-16 22:29:15,749 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/kmeans_cluster/split_0/fold_2


Saving the dataset (0/1 shards):   0%|          | 0/3660 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/406 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1260 [00:00<?, ? examples/s]

2025-11-16 22:29:18,887 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/kmeans_cluster/split_0/fold_3


Saving the dataset (0/1 shards):   0%|          | 0/3900 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/433 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/993 [00:00<?, ? examples/s]

2025-11-16 22:29:21,936 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/kmeans_cluster/split_0/fold_4


Saving the dataset (0/1 shards):   0%|          | 0/4212 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/468 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/646 [00:00<?, ? examples/s]

2025-11-16 22:29:25,065 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/kmeans_cluster/split_1/fold_0


Saving the dataset (0/1 shards):   0%|          | 0/4410 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/489 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/427 [00:00<?, ? examples/s]

2025-11-16 22:29:28,143 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/kmeans_cluster/split_1/fold_1


Saving the dataset (0/1 shards):   0%|          | 0/3751 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/416 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1159 [00:00<?, ? examples/s]

2025-11-16 22:29:31,405 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/kmeans_cluster/split_1/fold_2


Saving the dataset (0/1 shards):   0%|          | 0/3168 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/351 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1807 [00:00<?, ? examples/s]

2025-11-16 22:29:34,521 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/kmeans_cluster/split_1/fold_3


Saving the dataset (0/1 shards):   0%|          | 0/3932 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/436 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/958 [00:00<?, ? examples/s]

2025-11-16 22:29:37,612 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/kmeans_cluster/split_1/fold_4


Saving the dataset (0/1 shards):   0%|          | 0/3916 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/435 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/975 [00:00<?, ? examples/s]

2025-11-16 22:29:40,681 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/kmeans_cluster/split_2/fold_0


Saving the dataset (0/1 shards):   0%|          | 0/3856 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/428 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1042 [00:00<?, ? examples/s]

2025-11-16 22:29:43,805 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/kmeans_cluster/split_2/fold_1


Saving the dataset (0/1 shards):   0%|          | 0/3987 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/443 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/896 [00:00<?, ? examples/s]

2025-11-16 22:29:46,940 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/kmeans_cluster/split_2/fold_2


Saving the dataset (0/1 shards):   0%|          | 0/3796 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/421 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1109 [00:00<?, ? examples/s]

2025-11-16 22:29:50,225 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/kmeans_cluster/split_2/fold_3


Saving the dataset (0/1 shards):   0%|          | 0/3227 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/358 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1741 [00:00<?, ? examples/s]

2025-11-16 22:29:53,365 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/kmeans_cluster/split_2/fold_4


Saving the dataset (0/1 shards):   0%|          | 0/4310 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/478 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/538 [00:00<?, ? examples/s]

2025-11-16 22:29:56,468 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/kmeans_cluster/split_3/fold_0


Saving the dataset (0/1 shards):   0%|          | 0/3888 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/432 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1006 [00:00<?, ? examples/s]

2025-11-16 22:29:59,595 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/kmeans_cluster/split_3/fold_1


Saving the dataset (0/1 shards):   0%|          | 0/3475 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/386 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1465 [00:00<?, ? examples/s]

2025-11-16 22:30:02,714 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/kmeans_cluster/split_3/fold_2


Saving the dataset (0/1 shards):   0%|          | 0/3691 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/410 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1225 [00:00<?, ? examples/s]

2025-11-16 22:30:05,983 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/kmeans_cluster/split_3/fold_3


Saving the dataset (0/1 shards):   0%|          | 0/4205 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/467 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/654 [00:00<?, ? examples/s]

2025-11-16 22:30:09,045 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/kmeans_cluster/split_3/fold_4


Saving the dataset (0/1 shards):   0%|          | 0/3915 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/435 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/976 [00:00<?, ? examples/s]

2025-11-16 22:30:12,203 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/kmeans_cluster/split_4/fold_0


Saving the dataset (0/1 shards):   0%|          | 0/3814 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/423 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1089 [00:00<?, ? examples/s]

2025-11-16 22:30:15,387 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/kmeans_cluster/split_4/fold_1


Saving the dataset (0/1 shards):   0%|          | 0/3959 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/439 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/928 [00:00<?, ? examples/s]

2025-11-16 22:30:18,454 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/kmeans_cluster/split_4/fold_2


Saving the dataset (0/1 shards):   0%|          | 0/3882 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/431 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1013 [00:00<?, ? examples/s]

2025-11-16 22:30:21,682 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/kmeans_cluster/split_4/fold_3


Saving the dataset (0/1 shards):   0%|          | 0/3332 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/370 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1624 [00:00<?, ? examples/s]

2025-11-16 22:30:24,819 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/kmeans_cluster/split_4/fold_4


Saving the dataset (0/1 shards):   0%|          | 0/4189 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/465 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/672 [00:00<?, ? examples/s]

2025-11-16 22:30:27,925 - __main__ - INFO - Saved all splits for dataset: high, split method: kmeans_cluster. Folder size: 44.14 MB
2025-11-16 22:30:27,926 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/umap_cluster/split_0/fold_0


Saving the dataset (0/1 shards):   0%|          | 0/3520 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/391 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1415 [00:00<?, ? examples/s]

2025-11-16 22:30:31,010 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/umap_cluster/split_0/fold_1


Saving the dataset (0/1 shards):   0%|          | 0/2934 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/325 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2067 [00:00<?, ? examples/s]

2025-11-16 22:30:34,101 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/umap_cluster/split_0/fold_2


Saving the dataset (0/1 shards):   0%|          | 0/3838 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/426 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1062 [00:00<?, ? examples/s]

2025-11-16 22:30:37,371 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/umap_cluster/split_0/fold_3


Saving the dataset (0/1 shards):   0%|          | 0/4454 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/494 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/378 [00:00<?, ? examples/s]

2025-11-16 22:30:40,512 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/umap_cluster/split_0/fold_4


Saving the dataset (0/1 shards):   0%|          | 0/4430 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/492 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/404 [00:00<?, ? examples/s]

2025-11-16 22:30:43,647 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/umap_cluster/split_1/fold_0


Saving the dataset (0/1 shards):   0%|          | 0/2682 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/298 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2346 [00:00<?, ? examples/s]

2025-11-16 22:30:46,763 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/umap_cluster/split_1/fold_1


Saving the dataset (0/1 shards):   0%|          | 0/3134 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/348 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1844 [00:00<?, ? examples/s]

2025-11-16 22:30:49,916 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/umap_cluster/split_1/fold_2


Saving the dataset (0/1 shards):   0%|          | 0/4452 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/494 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/380 [00:00<?, ? examples/s]

2025-11-16 22:30:53,140 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/umap_cluster/split_1/fold_3


Saving the dataset (0/1 shards):   0%|          | 0/4419 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/491 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/416 [00:00<?, ? examples/s]

2025-11-16 22:30:56,288 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/umap_cluster/split_1/fold_4


Saving the dataset (0/1 shards):   0%|          | 0/4488 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/498 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/340 [00:00<?, ? examples/s]

2025-11-16 22:30:59,461 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/umap_cluster/split_2/fold_0


Saving the dataset (0/1 shards):   0%|          | 0/3244 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/360 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1722 [00:00<?, ? examples/s]

2025-11-16 22:31:02,566 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/umap_cluster/split_2/fold_1


Saving the dataset (0/1 shards):   0%|          | 0/3236 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/359 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1731 [00:00<?, ? examples/s]

2025-11-16 22:31:05,829 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/umap_cluster/split_2/fold_2


Saving the dataset (0/1 shards):   0%|          | 0/4472 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/496 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/358 [00:00<?, ? examples/s]

2025-11-16 22:31:08,888 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/umap_cluster/split_2/fold_3


Saving the dataset (0/1 shards):   0%|          | 0/4234 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/470 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/622 [00:00<?, ? examples/s]

2025-11-16 22:31:11,960 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/umap_cluster/split_2/fold_4


Saving the dataset (0/1 shards):   0%|          | 0/3990 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/443 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/893 [00:00<?, ? examples/s]

2025-11-16 22:31:15,047 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/umap_cluster/split_3/fold_0


Saving the dataset (0/1 shards):   0%|          | 0/3718 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/413 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1195 [00:00<?, ? examples/s]

2025-11-16 22:31:18,234 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/umap_cluster/split_3/fold_1


Saving the dataset (0/1 shards):   0%|          | 0/4191 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/465 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/670 [00:00<?, ? examples/s]

2025-11-16 22:31:21,401 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/umap_cluster/split_3/fold_2


Saving the dataset (0/1 shards):   0%|          | 0/3438 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/382 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1506 [00:00<?, ? examples/s]

2025-11-16 22:31:24,505 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/umap_cluster/split_3/fold_3


Saving the dataset (0/1 shards):   0%|          | 0/3838 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/426 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1062 [00:00<?, ? examples/s]

2025-11-16 22:31:27,768 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/umap_cluster/split_3/fold_4


Saving the dataset (0/1 shards):   0%|          | 0/3990 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/443 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/893 [00:00<?, ? examples/s]

2025-11-16 22:31:30,831 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/umap_cluster/split_4/fold_0


Saving the dataset (0/1 shards):   0%|          | 0/3938 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/437 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/951 [00:00<?, ? examples/s]

2025-11-16 22:31:33,889 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/umap_cluster/split_4/fold_1


Saving the dataset (0/1 shards):   0%|          | 0/3505 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/389 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1432 [00:00<?, ? examples/s]

2025-11-16 22:31:37,008 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/umap_cluster/split_4/fold_2


Saving the dataset (0/1 shards):   0%|          | 0/3720 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/413 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1193 [00:00<?, ? examples/s]

2025-11-16 22:31:40,240 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/umap_cluster/split_4/fold_3


Saving the dataset (0/1 shards):   0%|          | 0/4174 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/463 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/689 [00:00<?, ? examples/s]

2025-11-16 22:31:43,323 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/umap_cluster/split_4/fold_4


Saving the dataset (0/1 shards):   0%|          | 0/3839 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/426 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1061 [00:00<?, ? examples/s]

2025-11-16 22:31:46,569 - __main__ - INFO - Saved all splits for dataset: high, split method: umap_cluster. Folder size: 44.10 MB
2025-11-16 22:31:46,569 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/random_cluster/split_0/fold_0


Saving the dataset (0/2 shards):   0%|          | 0/68189 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7576 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/18943 [00:00<?, ? examples/s]

2025-11-16 22:32:10,428 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/random_cluster/split_0/fold_1


Saving the dataset (0/2 shards):   0%|          | 0/68190 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7576 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/18942 [00:00<?, ? examples/s]

2025-11-16 22:32:22,089 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/random_cluster/split_0/fold_2


Saving the dataset (0/2 shards):   0%|          | 0/68191 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7576 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/18941 [00:00<?, ? examples/s]

2025-11-16 22:32:33,164 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/random_cluster/split_0/fold_3


Saving the dataset (0/2 shards):   0%|          | 0/68191 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7576 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/18941 [00:00<?, ? examples/s]

2025-11-16 22:32:44,818 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/random_cluster/split_0/fold_4


Saving the dataset (0/2 shards):   0%|          | 0/68191 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7576 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/18941 [00:00<?, ? examples/s]

2025-11-16 22:32:55,987 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/random_cluster/split_1/fold_0


Saving the dataset (0/2 shards):   0%|          | 0/68189 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7576 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/18943 [00:00<?, ? examples/s]

2025-11-16 22:33:07,126 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/random_cluster/split_1/fold_1


Saving the dataset (0/2 shards):   0%|          | 0/68190 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7576 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/18942 [00:00<?, ? examples/s]

2025-11-16 22:33:18,203 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/random_cluster/split_1/fold_2


Saving the dataset (0/2 shards):   0%|          | 0/68191 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7576 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/18941 [00:00<?, ? examples/s]

2025-11-16 22:33:29,350 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/random_cluster/split_1/fold_3


Saving the dataset (0/2 shards):   0%|          | 0/68191 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7576 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/18941 [00:00<?, ? examples/s]

2025-11-16 22:33:40,639 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/random_cluster/split_1/fold_4


Saving the dataset (0/2 shards):   0%|          | 0/68191 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7576 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/18941 [00:00<?, ? examples/s]

2025-11-16 22:33:51,625 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/random_cluster/split_2/fold_0


Saving the dataset (0/2 shards):   0%|          | 0/68189 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7576 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/18943 [00:00<?, ? examples/s]

2025-11-16 22:34:02,886 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/random_cluster/split_2/fold_1


Saving the dataset (0/2 shards):   0%|          | 0/68190 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7576 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/18942 [00:00<?, ? examples/s]

2025-11-16 22:34:14,392 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/random_cluster/split_2/fold_2


Saving the dataset (0/2 shards):   0%|          | 0/68191 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7576 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/18941 [00:00<?, ? examples/s]

2025-11-16 22:34:25,725 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/random_cluster/split_2/fold_3


Saving the dataset (0/2 shards):   0%|          | 0/68191 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7576 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/18941 [00:00<?, ? examples/s]

2025-11-16 22:34:36,970 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/random_cluster/split_2/fold_4


Saving the dataset (0/2 shards):   0%|          | 0/68191 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7576 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/18941 [00:00<?, ? examples/s]

2025-11-16 22:34:48,301 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/random_cluster/split_3/fold_0


Saving the dataset (0/2 shards):   0%|          | 0/68189 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7576 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/18943 [00:00<?, ? examples/s]

2025-11-16 22:34:59,715 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/random_cluster/split_3/fold_1


Saving the dataset (0/2 shards):   0%|          | 0/68190 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7576 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/18942 [00:00<?, ? examples/s]

2025-11-16 22:35:10,882 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/random_cluster/split_3/fold_2


Saving the dataset (0/2 shards):   0%|          | 0/68191 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7576 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/18941 [00:00<?, ? examples/s]

2025-11-16 22:35:22,282 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/random_cluster/split_3/fold_3


Saving the dataset (0/2 shards):   0%|          | 0/68191 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7576 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/18941 [00:00<?, ? examples/s]

2025-11-16 22:35:33,766 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/random_cluster/split_3/fold_4


Saving the dataset (0/2 shards):   0%|          | 0/68191 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7576 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/18941 [00:00<?, ? examples/s]

2025-11-16 22:35:45,025 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/random_cluster/split_4/fold_0


Saving the dataset (0/2 shards):   0%|          | 0/68189 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7576 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/18943 [00:00<?, ? examples/s]

2025-11-16 22:35:56,156 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/random_cluster/split_4/fold_1


Saving the dataset (0/2 shards):   0%|          | 0/68190 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7576 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/18942 [00:00<?, ? examples/s]

2025-11-16 22:36:07,418 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/random_cluster/split_4/fold_2


Saving the dataset (0/2 shards):   0%|          | 0/68191 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7576 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/18941 [00:00<?, ? examples/s]

2025-11-16 22:36:18,659 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/random_cluster/split_4/fold_3


Saving the dataset (0/2 shards):   0%|          | 0/68191 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7576 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/18941 [00:00<?, ? examples/s]

2025-11-16 22:36:29,692 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/random_cluster/split_4/fold_4


Saving the dataset (0/2 shards):   0%|          | 0/68191 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7576 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/18941 [00:00<?, ? examples/s]

2025-11-16 22:36:41,630 - __main__ - INFO - Saved all splits for dataset: medium, split method: random_cluster. Folder size: 763.04 MB
2025-11-16 22:36:41,631 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/scaffold_cluster/split_0/fold_0


Saving the dataset (0/2 shards):   0%|          | 0/68564 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7618 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/18526 [00:00<?, ? examples/s]

2025-11-16 22:36:53,583 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/scaffold_cluster/split_0/fold_1


Saving the dataset (0/2 shards):   0%|          | 0/67112 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7456 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/20140 [00:00<?, ? examples/s]

2025-11-16 22:37:04,689 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/scaffold_cluster/split_0/fold_2


Saving the dataset (0/2 shards):   0%|          | 0/67886 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7542 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/19280 [00:00<?, ? examples/s]

2025-11-16 22:37:16,985 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/scaffold_cluster/split_0/fold_3


Saving the dataset (0/2 shards):   0%|          | 0/68507 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7611 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/18590 [00:00<?, ? examples/s]

2025-11-16 22:37:29,056 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/scaffold_cluster/split_0/fold_4


Saving the dataset (0/2 shards):   0%|          | 0/68883 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7653 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/18172 [00:00<?, ? examples/s]

2025-11-16 22:37:40,535 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/scaffold_cluster/split_1/fold_0


Saving the dataset (0/2 shards):   0%|          | 0/67849 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7537 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/19322 [00:00<?, ? examples/s]

2025-11-16 22:37:53,569 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/scaffold_cluster/split_1/fold_1


Saving the dataset (0/2 shards):   0%|          | 0/69250 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7694 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/17764 [00:00<?, ? examples/s]

2025-11-16 22:38:04,824 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/scaffold_cluster/split_1/fold_2


Saving the dataset (0/2 shards):   0%|          | 0/69216 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7689 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/17803 [00:00<?, ? examples/s]

2025-11-16 22:38:17,863 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/scaffold_cluster/split_1/fold_3


Saving the dataset (0/2 shards):   0%|          | 0/66219 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7357 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/21132 [00:00<?, ? examples/s]

2025-11-16 22:38:29,501 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/scaffold_cluster/split_1/fold_4


Saving the dataset (0/2 shards):   0%|          | 0/68420 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7601 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/18687 [00:00<?, ? examples/s]

2025-11-16 22:38:41,060 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/scaffold_cluster/split_2/fold_0


Saving the dataset (0/2 shards):   0%|          | 0/68535 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7614 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/18559 [00:00<?, ? examples/s]

2025-11-16 22:38:52,313 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/scaffold_cluster/split_2/fold_1


Saving the dataset (0/2 shards):   0%|          | 0/69057 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7672 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/17979 [00:00<?, ? examples/s]

2025-11-16 22:39:04,261 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/scaffold_cluster/split_2/fold_2


Saving the dataset (0/2 shards):   0%|          | 0/66547 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7393 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/20768 [00:00<?, ? examples/s]

2025-11-16 22:39:15,475 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/scaffold_cluster/split_2/fold_3


Saving the dataset (0/2 shards):   0%|          | 0/68097 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7566 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/19045 [00:00<?, ? examples/s]

2025-11-16 22:39:26,652 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/scaffold_cluster/split_2/fold_4


Saving the dataset (0/2 shards):   0%|          | 0/68717 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7634 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/18357 [00:00<?, ? examples/s]

2025-11-16 22:39:37,586 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/scaffold_cluster/split_3/fold_0


Saving the dataset (0/2 shards):   0%|          | 0/68594 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7620 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/18494 [00:00<?, ? examples/s]

2025-11-16 22:39:48,516 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/scaffold_cluster/split_3/fold_1


Saving the dataset (0/2 shards):   0%|          | 0/66131 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7347 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/21230 [00:00<?, ? examples/s]

2025-11-16 22:39:59,727 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/scaffold_cluster/split_3/fold_2


Saving the dataset (0/2 shards):   0%|          | 0/68723 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7635 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/18350 [00:00<?, ? examples/s]

2025-11-16 22:40:10,882 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/scaffold_cluster/split_3/fold_3


Saving the dataset (0/2 shards):   0%|          | 0/68154 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7572 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/18982 [00:00<?, ? examples/s]

2025-11-16 22:40:21,949 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/scaffold_cluster/split_3/fold_4


Saving the dataset (0/2 shards):   0%|          | 0/69351 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7705 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/17652 [00:00<?, ? examples/s]

2025-11-16 22:40:33,134 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/scaffold_cluster/split_4/fold_0


Saving the dataset (0/2 shards):   0%|          | 0/68671 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7629 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/18408 [00:00<?, ? examples/s]

2025-11-16 22:40:44,118 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/scaffold_cluster/split_4/fold_1


Saving the dataset (0/2 shards):   0%|          | 0/68363 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7595 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/18750 [00:00<?, ? examples/s]

2025-11-16 22:40:56,354 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/scaffold_cluster/split_4/fold_2


Saving the dataset (0/2 shards):   0%|          | 0/68529 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7613 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/18566 [00:00<?, ? examples/s]

2025-11-16 22:41:07,539 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/scaffold_cluster/split_4/fold_3


Saving the dataset (0/2 shards):   0%|          | 0/66623 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7402 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/20683 [00:00<?, ? examples/s]

2025-11-16 22:41:18,727 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/scaffold_cluster/split_4/fold_4


Saving the dataset (0/2 shards):   0%|          | 0/68768 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7639 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/18301 [00:00<?, ? examples/s]

2025-11-16 22:41:29,921 - __main__ - INFO - Saved all splits for dataset: medium, split method: scaffold_cluster. Folder size: 763.04 MB
2025-11-16 22:41:29,922 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/kmeans_cluster/split_0/fold_0


Saving the dataset (0/2 shards):   0%|          | 0/75365 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/8373 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/10970 [00:00<?, ? examples/s]

2025-11-16 22:41:41,653 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/kmeans_cluster/split_0/fold_1


Saving the dataset (0/2 shards):   0%|          | 0/74963 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/8328 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/11417 [00:00<?, ? examples/s]

2025-11-16 22:41:53,044 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/kmeans_cluster/split_0/fold_2


Saving the dataset (0/2 shards):   0%|          | 0/69411 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7711 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/17586 [00:00<?, ? examples/s]

2025-11-16 22:42:03,930 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/kmeans_cluster/split_0/fold_3


Saving the dataset (0/1 shards):   0%|          | 0/54615 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/6068 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/34025 [00:00<?, ? examples/s]

2025-11-16 22:42:15,437 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/kmeans_cluster/split_0/fold_4


Saving the dataset (0/2 shards):   0%|          | 0/66599 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7399 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/20710 [00:00<?, ? examples/s]

2025-11-16 22:42:26,496 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/kmeans_cluster/split_1/fold_0


Saving the dataset (0/2 shards):   0%|          | 0/70635 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7848 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/16225 [00:00<?, ? examples/s]

2025-11-16 22:42:37,632 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/kmeans_cluster/split_1/fold_1


Saving the dataset (0/2 shards):   0%|          | 0/71502 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7944 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/15262 [00:00<?, ? examples/s]

2025-11-16 22:42:49,224 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/kmeans_cluster/split_1/fold_2


Saving the dataset (0/2 shards):   0%|          | 0/69417 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7712 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/17579 [00:00<?, ? examples/s]

2025-11-16 22:43:00,547 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/kmeans_cluster/split_1/fold_3


Saving the dataset (0/2 shards):   0%|          | 0/70544 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7838 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/16326 [00:00<?, ? examples/s]

2025-11-16 22:43:11,690 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/kmeans_cluster/split_1/fold_4


Saving the dataset (0/1 shards):   0%|          | 0/58853 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/6539 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/29316 [00:00<?, ? examples/s]

2025-11-16 22:43:22,573 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/kmeans_cluster/split_2/fold_0


Saving the dataset (0/2 shards):   0%|          | 0/64236 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7136 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/23336 [00:00<?, ? examples/s]

2025-11-16 22:43:34,166 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/kmeans_cluster/split_2/fold_1


Saving the dataset (0/2 shards):   0%|          | 0/71499 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7944 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/15265 [00:00<?, ? examples/s]

2025-11-16 22:43:45,373 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/kmeans_cluster/split_2/fold_2


Saving the dataset (0/2 shards):   0%|          | 0/69412 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7711 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/17585 [00:00<?, ? examples/s]

2025-11-16 22:43:56,532 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/kmeans_cluster/split_2/fold_3


Saving the dataset (0/2 shards):   0%|          | 0/65664 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7295 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/21749 [00:00<?, ? examples/s]

2025-11-16 22:44:07,794 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/kmeans_cluster/split_2/fold_4


Saving the dataset (0/2 shards):   0%|          | 0/70143 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7792 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/16773 [00:00<?, ? examples/s]

2025-11-16 22:44:19,546 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/kmeans_cluster/split_3/fold_0


Saving the dataset (0/2 shards):   0%|          | 0/75526 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/8391 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/10791 [00:00<?, ? examples/s]

2025-11-16 22:44:31,142 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/kmeans_cluster/split_3/fold_1


Saving the dataset (0/2 shards):   0%|          | 0/73022 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/8113 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/13573 [00:00<?, ? examples/s]

2025-11-16 22:44:42,693 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/kmeans_cluster/split_3/fold_2


Saving the dataset (0/2 shards):   0%|          | 0/60703 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/6744 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/27261 [00:00<?, ? examples/s]

2025-11-16 22:44:54,474 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/kmeans_cluster/split_3/fold_3


Saving the dataset (0/2 shards):   0%|          | 0/64285 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7141 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/23282 [00:00<?, ? examples/s]

2025-11-16 22:45:05,865 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/kmeans_cluster/split_3/fold_4


Saving the dataset (0/2 shards):   0%|          | 0/67418 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7489 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/19801 [00:00<?, ? examples/s]

2025-11-16 22:45:17,136 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/kmeans_cluster/split_4/fold_0


Saving the dataset (0/2 shards):   0%|          | 0/70711 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7855 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/16142 [00:00<?, ? examples/s]

2025-11-16 22:45:28,774 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/kmeans_cluster/split_4/fold_1


Saving the dataset (0/2 shards):   0%|          | 0/71186 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7909 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/15613 [00:00<?, ? examples/s]

2025-11-16 22:45:40,944 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/kmeans_cluster/split_4/fold_2


Saving the dataset (0/2 shards):   0%|          | 0/64953 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7216 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/22539 [00:00<?, ? examples/s]

2025-11-16 22:45:52,238 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/kmeans_cluster/split_4/fold_3


Saving the dataset (0/2 shards):   0%|          | 0/67956 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7549 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/19203 [00:00<?, ? examples/s]

2025-11-16 22:46:04,291 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/kmeans_cluster/split_4/fold_4


Saving the dataset (0/2 shards):   0%|          | 0/66148 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7349 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/21211 [00:00<?, ? examples/s]

2025-11-16 22:46:15,317 - __main__ - INFO - Saved all splits for dataset: medium, split method: kmeans_cluster. Folder size: 763.13 MB
2025-11-16 22:46:15,318 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/umap_cluster/split_0/fold_0


Saving the dataset (0/1 shards):   0%|          | 0/50476 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5607 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/38625 [00:00<?, ? examples/s]

2025-11-16 22:46:25,496 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/umap_cluster/split_0/fold_1


Saving the dataset (0/1 shards):   0%|          | 0/56317 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/6256 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/32135 [00:00<?, ? examples/s]

2025-11-16 22:46:35,481 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/umap_cluster/split_0/fold_2


Saving the dataset (0/2 shards):   0%|          | 0/74510 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/8278 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/11920 [00:00<?, ? examples/s]

2025-11-16 22:46:46,174 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/umap_cluster/split_0/fold_3


Saving the dataset (0/2 shards):   0%|          | 0/82267 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/9139 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3302 [00:00<?, ? examples/s]

2025-11-16 22:46:57,336 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/umap_cluster/split_0/fold_4


Saving the dataset (0/2 shards):   0%|          | 0/77384 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/8598 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/8726 [00:00<?, ? examples/s]

2025-11-16 22:47:07,783 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/umap_cluster/split_1/fold_0


Saving the dataset (0/2 shards):   0%|          | 0/70663 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7850 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/16195 [00:00<?, ? examples/s]

2025-11-16 22:47:18,298 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/umap_cluster/split_1/fold_1


Saving the dataset (0/1 shards):   0%|          | 0/46734 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5192 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/42782 [00:00<?, ? examples/s]

2025-11-16 22:47:28,602 - __main__ - DEBUG - Saving dataset to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/medium_quality/umap_cluster/split_1/fold_2


Saving the dataset (0/2 shards):   0%|          | 0/78999 [00:00<?, ? examples/s]

OSError: [Errno 28] No space left on device

In [None]:
# plot the number of samples for each endpoint in each split
for dset_name, splits in split_datasets.items():
    for split_name, split_data in splits.items():
        for split_number, folds in split_data.items():
            for fold_number, datasets_dict in folds.items():
                split_output_dir = (
                    output_dir / f"{dset_name}_quality/{split_name}/{split_number}/{fold_number}"
                )

                train_idx = datasets_dict["total"]["train"]
                val_idx = datasets_dict["total"]["validation"]
                test_idx = datasets_dict["total"]["test"]

                train_df = datasets[dset_name].loc[train_idx]
                val_df = datasets[dset_name].loc[val_idx]
                test_df = datasets[dset_name].loc[test_idx]

                # count number of samples for each endpoint
                endpoints = [
                    "LogD",
                    "KSOL",
                    "HLM CLint",
                    "MLM CLint",
                    "Caco-2 Permeability Papp A>B",
                    "Caco-2 Permeability Efflux",
                    "MPPB",
                    "MBPB",
                    "MGMB",
                ]

                counts = {
                    "train": [train_df[ep].notnull().sum() for ep in endpoints],
                    "validation": [val_df[ep].notnull().sum() for ep in endpoints],
                    "test": [test_df[ep].notnull().sum() for ep in endpoints],
                }

                counts_df = pd.DataFrame(counts, index=endpoints)

                # plot
                ax = counts_df.plot.bar(rot=45, figsize=(10, 6))
                ax.set_title(
                    f"Dataset: {dset_name}, Split: {split_name}, {split_number}, {fold_number} - Sample Counts per Endpoint"
                )
                ax.set_ylabel("Number of Samples")
                plt.tight_layout()
                plt_path = split_output_dir / "sample_counts_per_endpoint.png"
                plt.savefig(plt_path, dpi=600)
                plt.close()
                logger.info(f"Saved sample counts plot to {plt_path}")

In [None]:
# boxplot for number of test samples for each split method (x) on different datasets (separate plots)
for dset_name, splits in split_datasets.items():
    logger.info(f"Creating boxplot for dataset: {dset_name}")

    plot_data = []
    for split_name, split_data in splits.items():
        for split_id, folds in split_data.items():
            for fold_id, datasets in folds.items():
                n_test_samples = len(datasets["total"]["test"])
                plot_data.append({"Split Method": split_name, "Number of Test Samples": n_test_samples})
    plot_df = pd.DataFrame(plot_data)

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(x="Split Method", y="Number of Test Samples", data=plot_df, ax=ax)
    ax.set_title(f"Distribution of Test Set Sizes for Different Split Methods\nDataset: {dset_name}")
    ax.set_ylabel("Number of Test Samples")
    ax.set_xlabel("Split Method")
    ax.yaxis.get_major_locator().set_params(integer=True)
    ax.tick_params(axis="x", rotation=45)
    ax.grid(True, axis="y", linestyle="--", alpha=0.7)

    fig.tight_layout()
    fig.savefig(output_fig_dir / f"{dset_name}_test_set_size_distribution.png", dpi=600)

In [None]:
# boxplot distribution of data points for each split method and each dataset
for dset_name, splits in split_datasets.items():
    for split_name, split_data in splits.items():
        fold_sizes = []
        for split_id, folds in split_data.items():
            for fold_id, groups in folds.items():
                for group_name, datasets in groups.items():
                    # ignore "total"
                    if group_name == "total":
                        continue

                    train_size = len(datasets["train"])
                    test_size = len(datasets["test"])
                    fold_sizes.append(
                        {
                            "Split ID": split_id,
                            "Fold ID": fold_id,
                            "Group": group_name,
                            "Train Size": train_size,
                            "Test Size": test_size,
                        }
                    )
        fold_sizes_df = pd.DataFrame(fold_sizes)

        # 1 figure with 2 boxplots: train size and test size
        logger.info(
            f"Creating train/test size distribution boxplots for dataset: {dset_name}, split method: {split_name}"
        )
        fig, axs = plt.subplots(1, 2, figsize=(12, 6))

        sns.boxplot(x="Group", y="Train Size", data=fold_sizes_df, ax=axs[0])
        axs[0].set_title(
            f"Train Set Size Distribution: {dset_name.capitalize()} Quality, {split_name.replace('_', ' ').capitalize()} Split"
        )

        sns.boxplot(x="Group", y="Test Size", data=fold_sizes_df, ax=axs[1])
        axs[1].set_title(
            f"Test Set Size Distribution: {dset_name.capitalize()} Quality, {split_name.replace('_', ' ').capitalize()} Split"
        )

        for ax in axs:
            ax.grid(True, axis="y", linestyle="--", alpha=0.7)
            ax.tick_params(axis="x", rotation=45)
            ax.set_ylabel("Number of Data Points")
            ax.set_xlabel("Provenance")

        fig.tight_layout()
        fig.savefig(
            output_fig_dir
            / f"{dset_name}_quality_{split_name}_split_train_test_size_distribution_boxplot.png",
            dpi=600,
        )