## Setup

In [1]:
# Enable IPython's autoreload extension in mode 2 so that imported modules are
# re-imported before each cell runs; edits to local .py files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from pathlib import Path
import logging
import itertools
import warnings

import pandas as pd
import numpy as np
from tqdm.auto import tqdm

import matplotlib.pyplot as plt
import seaborn as sns
import scienceplots

import useful_rdkit_utils as uru
from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator


In [3]:
# Apply the "science" matplotlib style to every figure created below.
# NOTE(review): the style is presumably registered by the `scienceplots` import
# in the imports cell — confirm before removing that import as "unused".
plt.style.use("science")

In [4]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [5]:
# setup tqdm: register the pandas integration so that Series/DataFrame objects
# expose `progress_apply`, which the fingerprint cell uses to show progress bars.
tqdm.pandas()

In [6]:
# setup logging: a notebook-level logger that writes timestamped records to a
# stream handler at DEBUG verbosity.
level = logging.DEBUG
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")

logger = logging.getLogger(__name__)
logger.setLevel(level)

# Re-running this cell must not stack a second handler on the same logger
# (every record would then be printed twice), so drop any existing ones first.
if logger.hasHandlers():
    logger.handlers.clear()

ch = logging.StreamHandler()
ch.setFormatter(formatter)
logger.addHandler(ch)

logger.info("Imports successful.")

2025-11-16 15:38:25,962 - __main__ - INFO - Imports successful.


## Load Data

In [7]:
# Data input and output directories.
# All paths are derived from the parent of the current working directory, so the
# kernel is expected to run from a direct sub-directory of the project root.
base_data_dir = Path.cwd().parents[0] / "assets/dataset/eda/data/set"

# Validate the input location *before* creating anything on disk; otherwise a
# wrong working directory would leave stray output folders behind.
if not base_data_dir.is_dir():
    raise FileNotFoundError(f"Data directory not found at {base_data_dir}")

# Outputs live next to the EDA tree: <...>/assets/dataset/splits[/figures]
output_dir = base_data_dir.parents[2] / "splits"
output_fig_dir = output_dir / "figures"
for directory in (output_dir, output_fig_dir):
    directory.mkdir(parents=True, exist_ok=True)

logger.info(f"Output directory set to {output_dir}")
logger.info(f"Input data directory found at {base_data_dir}")
# iterdir() yields entries in arbitrary order; sort for a reproducible listing.
# Note that sub-directories are listed alongside the CSV files.
for dataset_dir in sorted(base_data_dir.iterdir()):
    logger.info(f"Dataset name: {dataset_dir.name}")

2025-11-16 15:38:25,993 - __main__ - INFO - Output directory set to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits
2025-11-16 15:38:25,993 - __main__ - INFO - Input data directory found at /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/eda/data/set
2025-11-16 15:38:25,994 - __main__ - INFO - Dataset name: cleaned_combined_datasets_low_quality_summary_table.csv
2025-11-16 15:38:25,994 - __main__ - INFO - Dataset name: cleaned_combined_datasets_medium_quality_summary_table.csv
2025-11-16 15:38:25,994 - __main__ - INFO - Dataset name: cleaned_combined_datasets_medium_quality.csv
2025-11-16 15:38:25,994 - __main__ - INFO - Dataset name: cleaned_combined_datasets_low_medium_high_quality.csv
2025-11-16 15:38:25,995 - __main__ - INFO - Dataset name: cleaned_combined_datasets_high_quality.csv
2025-11-16 15:38:25,995 - __main__ - INFO - Dataset name: cleaned

In [9]:
# Load input datasets.
# Each quality label maps to (file name, extra `pd.read_csv` keyword arguments).
# Judging by the file names, the "medium" and "low" entries are cumulative:
# they also contain the higher-quality tiers.
# NOTE(review): the directory listing in the previous cell's saved output shows
# `cleaned_combined_datasets_medium_quality.csv` rather than
# `cleaned_combined_datasets_medium_high_quality.csv` — confirm the file name.
dataset_sources = {
    "high": ("cleaned_combined_datasets_high_quality.csv", {}),
    "medium": ("cleaned_combined_datasets_medium_high_quality.csv", {"low_memory": False}),
    "low": ("cleaned_combined_datasets_low_medium_high_quality.csv", {"low_memory": False}),
}

datasets = {
    quality: pd.read_csv(base_data_dir / file_name, **read_kwargs)
    for quality, (file_name, read_kwargs) in dataset_sources.items()
}

# Summarise what was loaded: size, schema and which source datasets contribute.
for quality, frame in datasets.items():
    constituents = frame["Dataset"].unique()
    logger.info(f"Dataset: {quality}, shape: {frame.shape}")
    logger.info(f"Columns: {frame.columns.tolist()}")
    logger.info(f"Unique Dataset Constituents: {constituents}")

2025-11-16 15:38:26,246 - __main__ - INFO - Dataset: high, shape: (5326, 12)
2025-11-16 15:38:26,247 - __main__ - INFO - Columns: ['Molecule Name', 'SMILES', 'Dataset', 'LogD', 'KSOL', 'HLM CLint', 'MLM CLint', 'Caco-2 Permeability Papp A>B', 'Caco-2 Permeability Efflux', 'MPPB', 'MBPB', 'MGMB']
2025-11-16 15:38:26,247 - __main__ - INFO - Unique Dataset Constituents: ['expansionrx']
2025-11-16 15:38:26,248 - __main__ - INFO - Dataset: medium, shape: (94708, 12)
2025-11-16 15:38:26,248 - __main__ - INFO - Columns: ['Molecule Name', 'SMILES', 'Dataset', 'LogD', 'KSOL', 'HLM CLint', 'MLM CLint', 'Caco-2 Permeability Papp A>B', 'Caco-2 Permeability Efflux', 'MPPB', 'MBPB', 'MGMB']
2025-11-16 15:38:26,251 - __main__ - INFO - Unique Dataset Constituents: ['expansionrx' 'kermt_public']
2025-11-16 15:38:26,251 - __main__ - INFO - Dataset: low, shape: (116527, 12)
2025-11-16 15:38:26,251 - __main__ - INFO - Columns: ['Molecule Name', 'SMILES', 'Dataset', 'LogD', 'KSOL', 'HLM CLint', 'MLM CLint'

In [None]:
# calculate fingerprints for all molecules in each dataset
#
# Every dataset gets one `Morgan_FP_<i>` column per fingerprint position,
# appended after the existing columns. The loaded frames are not mutated; each
# entry of `datasets` is replaced by a new frame. Re-running the cell is safe:
# fingerprint columns from a previous run are discarded first instead of being
# duplicated.
fpgen = rdFingerprintGenerator.GetMorganGenerator(
    radius=3,
    countSimulation=False,
    includeChirality=False,
    fpSize=2048,
)
fp_prefix = "Morgan_FP_"

for name, df in datasets.items():
    logger.info(f"Calculating fingerprints for dataset: {name}")

    # Start from the non-fingerprint columns only (idempotent on re-run) and from
    # a fresh 0..n-1 index so the fingerprint block aligns row-for-row below.
    base_cols = [col for col in df.columns if not str(col).startswith(fp_prefix)]
    base_df = df.loc[:, base_cols].reset_index(drop=True)

    mols = base_df["SMILES"].progress_apply(Chem.MolFromSmiles)

    # MolFromSmiles returns None for SMILES it cannot parse; fail with a clear
    # message here rather than with an opaque error inside the generator.
    unparsable = mols.isna()
    if unparsable.any():
        examples = base_df.loc[unparsable, "SMILES"].head(5).tolist()
        raise ValueError(
            f"{int(unparsable.sum())} SMILES in dataset '{name}' could not be parsed by RDKit, "
            f"e.g. {examples}"
        )

    # One count-fingerprint vector per molecule, stacked into an (n_rows, n_bits) matrix
    fp_array = np.vstack(mols.progress_apply(fpgen.GetCountFingerprintAsNumPy).to_numpy())
    fp_df = pd.DataFrame(fp_array, columns=[f"{fp_prefix}{i}" for i in range(fp_array.shape[1])])
    df = pd.concat([base_df, fp_df], axis=1)
    logger.debug(f"Number of fingerprint columns added: {fp_df.shape[1]}")

    datasets[name] = df
    logger.info(f"Fingerprints calculated for dataset: {name}")
    # Log only the non-fingerprint columns; dumping every fingerprint column name
    # floods the cell output.
    logger.debug(
        f"Dataset {name} shape after fingerprint calculation: {df.shape}; "
        f"non-fingerprint columns: {base_cols}"
    )

2025-11-16 15:38:26,288 - __main__ - INFO - Calculating fingerprints for dataset: high


  0%|          | 0/5326 [00:00<?, ?it/s]

  0%|          | 0/5326 [00:00<?, ?it/s]

2025-11-16 15:38:27,672 - __main__ - DEBUG - Number of fingerprint columns added: 2048
2025-11-16 15:38:27,673 - __main__ - INFO - Fingerprints calculated for dataset: high
2025-11-16 15:38:27,673 - __main__ - DEBUG - Dataset high columns after fingerprint calculation: ['Molecule Name', 'SMILES', 'Dataset', 'LogD', 'KSOL', 'HLM CLint', 'MLM CLint', 'Caco-2 Permeability Papp A>B', 'Caco-2 Permeability Efflux', 'MPPB', 'MBPB', 'MGMB', 'Morgan_FP_0', 'Morgan_FP_1', 'Morgan_FP_2', 'Morgan_FP_3', 'Morgan_FP_4', 'Morgan_FP_5', 'Morgan_FP_6', 'Morgan_FP_7', 'Morgan_FP_8', 'Morgan_FP_9', 'Morgan_FP_10', 'Morgan_FP_11', 'Morgan_FP_12', 'Morgan_FP_13', 'Morgan_FP_14', 'Morgan_FP_15', 'Morgan_FP_16', 'Morgan_FP_17', 'Morgan_FP_18', 'Morgan_FP_19', 'Morgan_FP_20', 'Morgan_FP_21', 'Morgan_FP_22', 'Morgan_FP_23', 'Morgan_FP_24', 'Morgan_FP_25', 'Morgan_FP_26', 'Morgan_FP_27', 'Morgan_FP_28', 'Morgan_FP_29', 'Morgan_FP_30', 'Morgan_FP_31', 'Morgan_FP_32', 'Morgan_FP_33', 'Morgan_FP_34', 'Morgan_FP_35

  0%|          | 0/94708 [00:00<?, ?it/s]

In [None]:
# "Temporal" split of the high-quality dataset: order the rows by Molecule Name
# (ascending), then use the first 90% as the training set and the remaining 10%
# as the test set.
# NOTE(review): this only approximates time order if Molecule Name sorts
# chronologically; string names sort lexicographically (e.g. "X-10" before
# "X-2") — confirm the naming scheme.
percentage_train = 0.9

high_quality_df = datasets["high"].sort_values(by="Molecule Name").reset_index(drop=True)
n_total = len(high_quality_df)
n_train = int(n_total * percentage_train)
n_test = n_total - n_train

# positional cut: rows [0, n_train) -> train, rows [n_train, n_total) -> test
train_df, test_df = high_quality_df.iloc[:n_train], high_quality_df.iloc[n_train:]

logger.info(f"High-quality dataset total samples: {n_total}")
logger.info(f"Training samples: {len(train_df)}")
logger.info(f"Testing samples: {len(test_df)}")

# save to temporal datasplit
temporal_dir = output_dir / "high_quality/temporal_split"
temporal_dir.mkdir(parents=True, exist_ok=True)
for file_name, subset in (("train.csv", train_df), ("test.csv", test_df)):
    subset.to_csv(temporal_dir / file_name, index=False)
logger.info(f"Temporal split datasets saved to {temporal_dir}")

2025-11-16 15:36:45,569 - __main__ - INFO - High-quality dataset total samples: 5326
2025-11-16 15:36:45,570 - __main__ - INFO - Training samples: 4793
2025-11-16 15:36:45,570 - __main__ - INFO - Testing samples: 533
2025-11-16 15:36:45,704 - __main__ - INFO - Temporal split datasets saved to /media/aglisman/Linux_Overflow/home/aglisman/VSCodeProjects/OpenADMET-ExpansionRx-Blind-Challenge/assets/dataset/splits/high_quality/temporal_split


In [None]:
# Cross-validation layout: `n_splits` repeated shuffles (the repeat index is used
# as the random seed in the split cell), each partitioned into `n_folds` folds.
n_splits = 5
n_folds = 5

# Rows are split separately for each distinct value of this column and the
# per-value pieces are concatenated again afterwards.
stratify_column = "Dataset"

# Split-method name -> function that assigns a cluster id to every SMILES.
# (uru.get_umap_clusters is available as "umap_cluster" but currently disabled.)
split_dict = dict(
    random_cluster=uru.get_random_clusters,
    scaffold_cluster=uru.get_bemis_murcko_clusters,
    kmeans_cluster=uru.get_kmeans_clusters,  # n_clusters = 10 by default
    butina_cluster=uru.get_butina_clusters,  # cutoff = 0.65 by default
)

In [None]:
# Build every train/test split.
#
# Resulting structure:
#   split_datasets[dataset][split_method]["split_<i>"]["fold_<j>"][<group>|"total"]
#       -> {"train": DataFrame, "test": DataFrame}
# where <group> is a value of `stratify_column` and "total" is the concatenation
# of all per-group pieces for that fold.
#
# NOTE(review): every fold stores full DataFrame copies (per group and again in
# "total"), including all fingerprint columns, so memory grows with
# datasets x methods x splits x folds — consider storing row indices instead.
split_datasets = {}

n_iter = len(datasets) * len(split_dict) * n_splits
logger.info(f"Total iterations for dataset splits: {n_iter}")

# The context manager closes the bar even if the loop is interrupted.
with tqdm(total=n_iter, desc="Creating dataset splits") as pbar:
    for dset_name, data in datasets.items():  # iterate over different datasets
        split_datasets[dset_name] = {}

        # Loop-invariant per dataset: the group labels and each group's rows,
        # re-indexed 0..n-1 so positional fold indices address them directly.
        groups = data[stratify_column].unique()
        group_frames = {
            group: data[data[stratify_column] == group].reset_index(drop=True) for group in groups
        }

        for split_name, split in split_dict.items():  # iterate over different splitting methods
            logger.info(f"Processing dataset: {dset_name} with split method: {split_name}")
            split_datasets[dset_name][split_name] = {}

            for i in range(n_splits):  # iterate over different splits (i doubles as the seed)
                group_kfold_shuffle = uru.GroupKFoldShuffle(n_splits=n_folds, random_state=i, shuffle=True)
                folds = {}

                # stratified group k-fold split (based on "Dataset" column)
                for group, subdata in group_frames.items():
                    # NOTE(review): clusters are recomputed for every split repeat, as in
                    # the original; hoist this out of the `i` loop if the clustering
                    # functions are confirmed to be deterministic.
                    cluster_list = split(subdata.SMILES)

                    # iterate over different folds within each split
                    for j, (train_idx, test_idx) in enumerate(
                        group_kfold_shuffle.split(subdata, groups=cluster_list)
                    ):
                        folds.setdefault(f"fold_{j}", {})[group] = {
                            "train": subdata.iloc[train_idx].reset_index(drop=True),
                            "test": subdata.iloc[test_idx].reset_index(drop=True),
                        }

                # combine group splits into final train/test sets for each fold
                logger.debug(
                    f"Combining group splits for dataset: {dset_name}, split: {split_name}, iteration: {i}"
                )
                for j in range(n_folds):
                    fold = folds[f"fold_{j}"]
                    fold["total"] = {
                        subset: pd.concat([fold[group][subset] for group in groups], ignore_index=True)
                        for subset in ("train", "test")
                    }

                split_datasets[dset_name][split_name][f"split_{i}"] = folds
                pbar.update(1)  # one tick per (dataset, method, split), matching n_iter

2025-11-16 15:36:45,801 - __main__ - INFO - Total iterations for dataset splits: 60


Creating dataset splits:   0%|          | 0/60 [00:00<?, ?it/s]

2025-11-16 15:36:45,805 - __main__ - INFO - Processing dataset: high with split method: random_cluster
2025-11-16 15:36:45,812 - __main__ - DEBUG - Combining group splits for dataset: high, split: random_cluster, iteration: 0
2025-11-16 15:36:45,820 - __main__ - DEBUG - Combining group splits for dataset: high, split: random_cluster, iteration: 1
2025-11-16 15:36:45,828 - __main__ - DEBUG - Combining group splits for dataset: high, split: random_cluster, iteration: 2
2025-11-16 15:36:45,836 - __main__ - DEBUG - Combining group splits for dataset: high, split: random_cluster, iteration: 3
2025-11-16 15:36:45,844 - __main__ - DEBUG - Combining group splits for dataset: high, split: random_cluster, iteration: 4
2025-11-16 15:36:45,846 - __main__ - INFO - Processing dataset: high with split method: scaffold_cluster
2025-11-16 15:36:47,432 - __main__ - DEBUG - Combining group splits for dataset: high, split: scaffold_cluster, iteration: 0
2025-11-16 15:36:49,017 - __main__ - DEBUG - Combini

KeyboardInterrupt: 

In [None]:
# Save the combined ("total") train/test set of every fold as
#   {dataset}_quality/{split_method}/{split_id}_{fold_id}_{train|test}.csv
# e.g. high_quality/random_cluster/split_0_fold_0_train.csv
for dset_name, splits in split_datasets.items():
    for split_name, split_data in splits.items():
        # One directory per (dataset, split method); create it once, not per fold.
        split_output_dir = output_dir / f"{dset_name}_quality" / split_name
        split_output_dir.mkdir(parents=True, exist_ok=True)

        for split_number, folds in split_data.items():
            for fold_number, fold_data in folds.items():
                # Local names only: do not rebind the notebook-level
                # train_df/test_df, which hold the temporal split.
                for subset_name in ("train", "test"):
                    subset_df = fold_data["total"][subset_name]
                    subset_file = split_output_dir / f"{split_number}_{fold_number}_{subset_name}.csv"
                    subset_df.to_csv(subset_file, index=False)
                    logger.info(f"Saved {subset_name} set to {subset_file} with shape {subset_df.shape}")

In [None]:
# boxplot for number of test samples for each split method (x) on different datasets (separate plots)
# One figure per dataset, built from the combined ("total") test set of every
# split/fold and saved to `output_fig_dir`.
for dset_name, splits in split_datasets.items():
    logger.info(f"Creating boxplot for dataset: {dset_name}")

    # The per-fold dict is named `fold_data` so that the notebook-level
    # `datasets` dict is not overwritten by this loop.
    plot_df = pd.DataFrame(
        [
            {
                "Split Method": split_name,
                "Number of Test Samples": len(fold_data["total"]["test"]),
            }
            for split_name, split_data in splits.items()
            for folds in split_data.values()
            for fold_data in folds.values()
        ]
    )

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(x="Split Method", y="Number of Test Samples", data=plot_df, ax=ax)
    ax.set_title(f"Distribution of Test Set Sizes for Different Split Methods\nDataset: {dset_name}")
    ax.set_ylabel("Number of Test Samples")
    ax.set_xlabel("Split Method")
    ax.yaxis.get_major_locator().set_params(integer=True)  # sample counts are whole numbers
    ax.tick_params(axis="x", rotation=45)
    ax.grid(True, axis="y", linestyle="--", alpha=0.7)

    fig.tight_layout()
    fig.savefig(output_fig_dir / f"{dset_name}_test_set_size_distribution.png", dpi=600)

    # Display now, then release the figure so figures do not pile up in memory.
    plt.show()
    plt.close(fig)

In [None]:
# boxplot distribution of data points for each split method and each dataset
# One figure per (dataset, split method) with two panels: train-set sizes and
# test-set sizes, grouped by the keys stored in each fold.
# NOTE(review): those keys include the combined "total" entry next to the
# per-provenance groups, so "total" appears as its own box — confirm intended.
for dset_name, splits in split_datasets.items():
    for split_name, split_data in splits.items():
        # The innermost dict is named `group_data` so that the notebook-level
        # `datasets` dict is not overwritten by this loop.
        fold_sizes_df = pd.DataFrame(
            [
                {
                    "Split ID": split_id,
                    "Fold ID": fold_id,
                    "Group": group_name,
                    "Train Size": len(group_data["train"]),
                    "Test Size": len(group_data["test"]),
                }
                for split_id, folds in split_data.items()
                for fold_id, groups in folds.items()
                for group_name, group_data in groups.items()
            ]
        )

        dset_label = dset_name.capitalize()
        split_label = split_name.replace("_", " ").capitalize()

        # 1 figure with 2 boxplots: train size and test size
        logger.info(f"Creating train/test size distribution boxplots for dataset: {dset_name}, split method: {split_name}")
        fig, axs = plt.subplots(1, 2, figsize=(12, 6))

        for ax, subset in zip(axs, ("Train", "Test")):
            sns.boxplot(x="Group", y=f"{subset} Size", data=fold_sizes_df, ax=ax)
            ax.set_title(f"{subset} Set Size Distribution: {dset_label} Quality, {split_label} Split")
            ax.grid(True, axis="y", linestyle="--", alpha=0.7)
            ax.tick_params(axis="x", rotation=45)
            ax.set_ylabel("Number of Data Points")
            ax.set_xlabel("Provenance")

        fig.tight_layout()
        fig.savefig(
            output_fig_dir
            / f"{dset_name}_quality_{split_name}_split_train_test_size_distribution_boxplot.png",
            dpi=600,
        )

        # Display now, then release the figure so figures do not pile up in memory.
        plt.show()
        plt.close(fig)