In [None]:
# Move the working directory up one level so the relative `./data/...` paths and
# the `src` package import below resolve (presumably the notebook is stored one
# level below the project root — confirm). Not idempotent: re-running this cell
# in the same kernel moves up another directory.
%cd ../

In [None]:
import os  # noqa: E402
from pathlib import Path  # noqa: E402
from typing import List  # noqa: E402

import h5py  # noqa: E402
import numpy as np  # noqa: E402
import pandas as pd  # noqa: E402
from pandarallel import pandarallel  # noqa: E402
from rdkit import RDLogger  # noqa: E402
from rdkit.Chem import Descriptors  # noqa: E402
from rdkit.Chem.rdmolfiles import (  # noqa: E402
    MolFromSmarts,
    MolFromSmiles,
    MolToSmiles,
    SDMolSupplier,
)
from tdc.utils import create_fold, create_scaffold_split  # noqa: E402
from tokenizers import Tokenizer  # noqa: E402
from tqdm.auto import tqdm, trange  # noqa: E402

from src.data.components.utils import (  # noqa: E402
    create_butina_split,
    smiles2vector_fg,
    smiles2vector_mfg,
    std_smarts,
    std_smiles,
)

# Silence RDKit log output for every channel matching "rdApp.*".
RDLogger.DisableLog("rdApp.*")
# Initialize pandarallel (with progress bars); the `parallel_apply` calls used
# throughout this notebook depend on it.
pandarallel.initialize(progress_bar=True)

# Utility Functions

In [None]:
def preprocess_df(
    task: str,
    data_path: str,
    smiles_col: str,
    target_cols: List[str],
    classification: bool,
    write: bool,
) -> pd.DataFrame:
    """Load a raw CSV, standardize its SMILES and return a de-duplicated frame.

    The returned frame holds the target columns followed by a ``SMILES`` column
    containing the output of ``std_smiles``. Rows with any null value (including
    rows for which ``std_smiles`` yields a null) are dropped, and only the first
    row per standardized SMILES is kept.

    :param task: Task name; used for the output directory and file name
    :param data_path: Path of the CSV file to read
    :param smiles_col: Name of the column holding the raw SMILES (may itself be
        ``"SMILES"``)
    :param target_cols: Target column names, or ``["multi_target"]`` to use every
        column except the first one
    :param classification: If True, nulls are filled with 0 before any filtering
    :param write: If True, write the result to
        ``./data/processed/tasks/{task}/{task}.parquet``
    :return: Preprocessed dataframe
    """
    df = pd.read_csv(data_path)
    if classification:
        # Missing values become 0 instead of causing the row to be dropped below.
        df = df.fillna(0)
    if target_cols == ["multi_target"]:
        # Sentinel: take every column but the first (assumed to be the SMILES one).
        target_cols = df.columns[1:].to_list()
    df = df.loc[:, [smiles_col] + target_cols]

    # Standardize into a separate Series *before* dropping the raw column, so the
    # result is not lost when the raw column is itself named "SMILES".
    standardized = df[smiles_col].parallel_apply(std_smiles)
    df = df.drop(columns=[smiles_col])
    df["SMILES"] = standardized  # appended last, after the target columns

    df = df.dropna().drop_duplicates(subset=["SMILES"]).reset_index(drop=True)

    if write:
        write_path = Path(f"./data/processed/tasks/{task}")
        write_path.mkdir(parents=True, exist_ok=True)  # create directory if missing
        df.to_parquet(write_path / f"{task}.parquet", index=False)
    return df


def create_summary(data_path: str) -> pd.DataFrame:
    """Build and persist an overview table of all processed tasks.

    Every sub-directory ``<task>`` of ``data_path`` is expected to contain
    ``<task>.parquet``. For each one, the row count and the column count minus
    one are recorded. The table is sorted by task name, written to
    ``summary.parquet`` inside ``data_path`` and returned.

    :param data_path: Directory holding one sub-directory per task
    :return: Summary dataframe with columns ``Task``, ``Datapoints``, ``Num_tasks``
    """
    base_dir = Path(data_path)

    rows = []  # (task name, number of rows, number of columns - 1)
    for entry in base_dir.iterdir():
        if entry.is_dir():  # plain files (e.g. an earlier summary) are ignored
            n_rows, n_cols = pd.read_parquet(entry / f"{entry.name}.parquet").shape
            rows.append((entry.name, n_rows, n_cols - 1))

    summary = pd.DataFrame(
        {
            "Task": [name for name, _, _ in rows],
            "Datapoints": [count for _, count, _ in rows],
            "Num_tasks": [width for _, _, width in rows],
        }
    )
    summary = summary.sort_values("Task").reset_index(drop=True)
    summary.to_parquet(base_dir / "summary.parquet", index=False)
    return summary

# Functional Groups Prep

In [None]:
# Build the functional-group SMARTS table: read the raw list, remove the
# blacklisted entries, standardize what is left and write the result to disk.
df = pd.read_csv("./data/raw/training/fg.csv")  # read dataframe
# Entries removed by exact string match against the raw "SMARTS" column.
drop_values = [
    "NOT [!#1!#6!#7!#8!#16!#15!#9!#17!#35!#53] AND [#6] >0",
    "[!#1;!#6;!#7;!#8;!#9;!#11;!#12;!#15;!#16;!#17;!#19;!#20;!#35;!#53]",
    "NOT [!#6!#1!#7!#8!#9!#17!#35!#53!#15!#14!#16]",
    "[!#6!#1!#7!#8!#9!#17!#35!#53!#15!#14!#16]",
    "[!#1!#6!#7!#8!#16!#15!#9!#17!#35!#53]",
    "C(F)(F)C(F)(F)",
    "(H[Cl,Br,I,F]).([*])",
    "[!#1!#2!#5!#6!#7!#8!#9!#10!#14!#15!#16!#17!#18!#32!#33!#34!#35!#36!#51!#52!#53!#54!#84!#85!#86]~[#6,#7,#8,#16]",
    "([*]).([*])",
    "[#6] >0",
    "[N;!R][C;!R](=O)[!#7] OR [N;!R][C;!R](=O)[N;!R]",
    "[#16;X3v3+0]",
    "N[F,Cl,Br,I]",
    "[+,-]",
    "([*][*][*]).([*][*][*])",
    "[+]",
    "[-]",
    "[ERROR]",
]  # SMARTS to drop
df = df[~df["SMARTS"].isin(drop_values)].reset_index(drop=True)  # drop SMARTS
df["SMARTS"] = df["SMARTS"].parallel_apply(std_smarts)  # standardize SMARTS
df.dropna(inplace=True)  # drop nulls (incl. rows std_smarts mapped to null)
df.drop_duplicates(subset=["SMARTS"], inplace=True)  # drop duplicates
fg_path = Path("./data/processed/training/fg.parquet")
# Create the output directory first (as preprocess_df does) so the write does
# not fail on a fresh checkout.
fg_path.parent.mkdir(parents=True, exist_ok=True)
df.to_parquet(fg_path, index=False)  # write to disk

# Therapeutic Data Commons Prep

In [None]:
# TDC dataset names processed as classification tasks. Each name maps to the raw
# file ./data/raw/tdc_data/<name>.csv.gz read by the loop further down.
classification_tdc = [
    "herg_karim",
    "herg",
    "herg_central_inhib",
    "dili",
    "skin_reaction",
    "ames",
    "carcinogens_lagunin",
    "sarscov2_3clpro_diamond",
    "sarscov2_vitro_touret",
    "orexin1_receptor_butkiewicz",
    "m1_muscarinic_receptor_agonists_butkiewicz",
    "m1_muscarinic_receptor_antagonists_butkiewicz",
    "potassium_ion_channel_kir2.1_butkiewicz",
    "kcnq2_potassium_channel_butkiewicz",
    "cav3_t-type_calcium_channels_butkiewicz",
    "choline_transporter_butkiewicz",
    "serine_threonine_kinase_33_butkiewicz",
    "tyrosyl-dna_phosphodiesterase_butkiewicz",
    "pampa_ncats",
    "hia_hou",
    "pgp_broccatelli",
    "bioavailability_ma",
    "cyp2c9_substrate_carbonmangels",
    "cyp2d6_substrate_carbonmangels",
    "cyp3a4_substrate_carbonmangels",
]

# TDC dataset names processed as regression tasks (same file naming scheme).
regression_tdc = [
    "ld50_zhu",
    "herg_central_1uM",
    "herg_central_10uM",
    "caco2_wang",
    "solubility_aqsoldb",
    "ppbr_az",
    "vdss_lombardo",
    "half_life_obach",
    "clearance_hepatocyte_az",
    "clearance_microsome_az",
]

In [None]:
# Standardize every TDC classification dataset and write it to
# ./data/processed/tasks/<task>/<task>.parquet.
for task in tqdm(classification_tdc, desc="Processing TDC Classification datasets"):
    raw_csv = f"./data/raw/tdc_data/{task}.csv.gz"
    _ = preprocess_df(task, raw_csv, "Drug", ["Y"], classification=True, write=True)

In [None]:
# Same as the classification loop, but nulls are not filled with 0.
tdc_regression_options = {
    "smiles_col": "Drug",
    "target_cols": ["Y"],
    "classification": False,
    "write": True,
}
for task in tqdm(regression_tdc, desc="Processing TDC Regression datasets"):
    _ = preprocess_df(
        task=task,
        data_path=f"./data/raw/tdc_data/{task}.csv.gz",
        **tdc_regression_options,
    )

# MoleculeNet Prep

## Regression

In [None]:
# MoleculeNet regression datasets: dataset name -> target column names.
# The name also determines the raw file location,
# ./data/raw/moleculenet/<name>/<name>.csv.gz (see the loop below).
regression_tasks = {
    "Lipop": ["exp"],
    "ESOL": ["measured log solubility in mols per litre"],
    "FreeSolv": ["expt"],
    "PDBbind-full": ["-logKd/Ki"],
    "PDBbind-core": ["-logKd/Ki"],
    "PDBbind-refined": ["-logKd/Ki"],
    "qm7": ["u0_atom"],
    "qm8": [
        "E1-CC2",
        "E2-CC2",
        "f1-CC2",
        "f2-CC2",
        "E1-PBE0",
        "E2-PBE0",
        "f1-PBE0",
        "f2-PBE0",
        "E1-CAM",
        "E2-CAM",
        "f1-CAM",
        "f2-CAM",
    ],
    "qm9": [
        "mu",
        "alpha",
        "homo",
        "lumo",
        "gap",
        "r2",
        "zpve",
        "cv",
        "u0",
        "u298",
        "h298",
        "g298",
    ],
}

In [None]:
# Preprocess each MoleculeNet regression dataset with its own target columns.
for task, targets in tqdm(regression_tasks.items(), desc="Processing Regression Tasks"):
    raw_csv = f"./data/raw/moleculenet/{task}/{task}.csv.gz"
    _ = preprocess_df(
        task=task,
        data_path=raw_csv,
        smiles_col="smiles",
        target_cols=targets,
        classification=False,
        write=True,
    )

## Classification

In [None]:
# MoleculeNet classification datasets: dataset name -> target column names.
# The special value ["multi_target"] tells preprocess_df to use every column of
# the raw file except the first one as a target.
classification_tasks = {
    "ToxCast": ["multi_target"],
    "BACE": ["Class"],
    "ChEMBL": ["multi_target"],
    "PCBA": ["multi_target"],
    "BBBP": ["p_np"],
    "HIV": ["HIV_active"],
    "MUV": [
        "MUV-466",
        "MUV-548",
        "MUV-600",
        "MUV-644",
        "MUV-652",
        "MUV-689",
        "MUV-692",
        "MUV-712",
        "MUV-713",
        "MUV-733",
        "MUV-737",
        "MUV-810",
        "MUV-832",
        "MUV-846",
        "MUV-852",
        "MUV-858",
        "MUV-859",
    ],
    "Tox21": [
        "NR-AR",
        "NR-AR-LBD",
        "NR-AhR",
        "NR-Aromatase",
        "NR-ER",
        "NR-ER-LBD",
        "NR-PPAR-gamma",
        "SR-ARE",
        "SR-ATAD5",
        "SR-HSE",
        "SR-MMP",
        "SR-p53",
    ],
    "SIDER": ["multi_target"],
    "ClinTox": ["FDA_APPROVED", "CT_TOX"],
}

In [None]:
# Preprocess each MoleculeNet classification dataset (nulls are filled with 0).
for task, targets in tqdm(
    classification_tasks.items(), desc="Processing Classification Tasks"
):
    raw_csv = f"./data/raw/moleculenet/{task}/{task}.csv.gz"
    _ = preprocess_df(
        task=task,
        data_path=raw_csv,
        smiles_col="smiles",
        target_cols=targets,
        classification=True,
        write=True,
    )

# Peptide Data Prep

In [None]:
# Peptide datasets: dataset name -> SMILES column name and target column names
# of the raw file ./data/raw/peptide_data/<name>.csv.gz.
peptide_tasks = {
    "1625_aa": {"smiles_col": "Peptide_smiles", "target_cols": ["cleavage"]},
    "746_aa": {"smiles_col": "Peptide_smiles", "target_cols": ["cleavage"]},
    "Impens": {"smiles_col": "Peptide_smiles", "target_cols": ["cleavage"]},
    "Schilling": {"smiles_col": "Peptide_smiles", "target_cols": ["cleavage"]},
    "Ecoli": {"smiles_col": "smiles", "target_cols": ["Encoded_Activity"]},
}

In [None]:
# All peptide datasets are treated as classification tasks; the column names
# come straight from the per-dataset entry in `peptide_tasks`.
for task, columns in tqdm(peptide_tasks.items(), desc="Processing Peptide Tasks"):
    _ = preprocess_df(
        task=task,
        data_path=f"./data/raw/peptide_data/{task}.csv.gz",
        smiles_col=columns["smiles_col"],
        target_cols=columns["target_cols"],
        classification=True,
        write=True,
    )

# MolMapNet Dataset Prep

In [None]:
# MolMapNet regression datasets: task name -> SMILES column and target columns.
# The three LMC_* tasks are read from one shared raw file (LMC.csv.gz, see the
# loop below) and differ only in the target column they select.
molmapnet_regression = {
    "Malaria": {
        "smiles_col": "smiles",
        "target_cols": ["activity"],
    },
    "LMC_H": {
        "smiles_col": "Canonical_Smiles",
        "target_cols": ["hlm_clearance[mL.min-1.g-1]"],
    },
    "LMC_R": {
        "smiles_col": "Canonical_Smiles",
        "target_cols": ["rlm_clearance[mL.min-1.g-1]"],
    },
    "LMC_M": {
        "smiles_col": "Canonical_Smiles",
        "target_cols": ["mlm_clearance[mL.min-1.g-1]"],
    },
}
# MolMapNet classification datasets, same structure as above.
molmapnet_classification = {
    "CYP450": {
        "smiles_col": "smiles",
        "target_cols": [
            "label_1a2",
            "label_2c9",
            "label_2c19",
            "label_2d6",
            "label_3a4",
        ],
    },
    "BACE_ChEMBL": {"smiles_col": "smiles", "target_cols": ["Class"]},
    "BACE_NOVEL": {"smiles_col": "smiles", "target_cols": ["Class"]},
}

In [None]:
# Preprocess the MolMapNet regression datasets.
for task in tqdm(
    molmapnet_regression.keys(),
    desc="Processing MolMapNet Regression Tasks",  # label previously said "Classification"
):
    if task in ("LMC_H", "LMC_R", "LMC_M"):
        # The three LMC tasks share one raw file; only the target column differs.
        data_path = "./data/raw/molmapnet/LMC/LMC.csv.gz"
    else:
        data_path = f"./data/raw/molmapnet/{task}/{task}.csv.gz"
    _ = preprocess_df(
        task=task,
        data_path=data_path,
        smiles_col=molmapnet_regression[task]["smiles_col"],
        target_cols=molmapnet_regression[task]["target_cols"],
        classification=False,
        write=True,
    )

In [None]:
# Preprocess the MolMapNet classification datasets (nulls are filled with 0).
for task, columns in tqdm(
    molmapnet_classification.items(),
    desc="Processing MolMapNet Classification Tasks",
):
    raw_csv = f"./data/raw/molmapnet/{task}/{task}.csv.gz"
    _ = preprocess_df(
        task=task,
        data_path=raw_csv,
        smiles_col=columns["smiles_col"],
        target_cols=columns["target_cols"],
        classification=True,
        write=True,
    )

# Cell Line Dataset Prep

In [None]:
# Cancer cell lines to process; each has an SDF file under
# ./data/raw/kekulescope/cell-lines/<cell>/<cell>.sdf
cell_lines = [
    "A2780",
    "CCRF-CEM",
    "DU-145",
    "HCT-15",
    "KB",
    "LoVo",
    "PC-3",
    "SK-OV-3",
]

for cell in tqdm(cell_lines, desc="Processing Cancer Cell Lines"):
    suppl = SDMolSupplier(f"./data/raw/kekulescope/cell-lines/{cell}/{cell}.sdf")
    # Entries the supplier yields as None are skipped.
    mols = [mol for mol in suppl if mol is not None]
    smiles = list(map(MolToSmiles, mols))
    inhibition = [float(mol.GetProp("pIC50")) for mol in mols]

    # Standardize the SMILES, then drop nulls and repeated structures.
    df = (
        pd.DataFrame({"SMILES": smiles, "Target": inhibition})
        .assign(SMILES=lambda frame: frame["SMILES"].parallel_apply(std_smiles))
        .dropna()
        .drop_duplicates(subset=["SMILES"])
    )

    # Same on-disk layout as the tasks written by preprocess_df.
    write_path = Path(f"./data/processed/tasks/{cell}")
    write_path.mkdir(parents=True, exist_ok=True)
    df.to_parquet(write_path / f"{cell}.parquet", index=False)

# Summary Table

In [None]:
# Summarize every processed task; this also writes summary.parquet into the
# tasks directory.
task_df = create_summary("./data/processed/tasks/")  # Create summary DataFrame

In [None]:
task_df.head()  # Preview the first rows of the task summary

In [None]:
len(task_df)  # Number of rows in the summary, i.e. one per task directory

# Splitting Datasets

In [None]:
def split_dataset(df: pd.DataFrame, split_type: str, fold_idx: int) -> dict:
    """Create one train/valid/test split of a dataset.

    The split fractions are fixed at 0.8 / 0.1 / 0.1 and ``fold_idx`` is passed
    to the underlying splitter as its seed.

    :param df: DataFrame to split; scaffold and butina splits use its ``SMILES``
        column
    :param split_type: One of ``"random"``, ``"scaffold"`` or ``"butina"``
    :param fold_idx: Fold index, used as the random seed of the split
    :raises ValueError: If ``split_type`` is not one of the supported values
    :return: The splitter's result (used by callers as a mapping with keys
        ``"train"``, ``"valid"`` and ``"test"``)
    """
    fractions = (0.8, 0.1, 0.1)

    if split_type == "random":
        return create_fold(df=df, fold_seed=fold_idx, frac=fractions)
    if split_type == "scaffold":
        return create_scaffold_split(
            df=df, seed=fold_idx, frac=fractions, entity="SMILES"
        )
    if split_type == "butina":
        return create_butina_split(
            df=df, seed=fold_idx, frac=fractions, entity="SMILES"
        )
    raise ValueError("Invalid split type")

In [None]:
def get_descriptors(smi: str) -> np.ndarray:
    """Compute an L2-normalized RDKit descriptor vector for a SMILES string.

    One value is computed per entry of ``Descriptors._descList``. A descriptor
    that raises is recorded as 0, and NaN / infinite values are replaced by 0
    before normalization. If the SMILES cannot be parsed, or every descriptor is
    0, an all-zero vector is returned instead of dividing by a zero norm (which
    previously produced an all-NaN vector).

    :param smi: SMILES string
    :return: Descriptor vector of length ``len(Descriptors._descList)``
    """
    mol = MolFromSmiles(smi)  # None when RDKit cannot parse the SMILES
    if mol is None:
        return np.zeros(len(Descriptors._descList), dtype=np.float64)

    desc_list = []
    for _, func in Descriptors._descList:
        try:
            desc_list.append(func(mol))
        except Exception:
            # Best effort: a failing descriptor contributes 0. `Exception` (not
            # `BaseException`) so KeyboardInterrupt / SystemExit still propagate.
            desc_list.append(0)

    descriptors = np.nan_to_num(
        np.asarray(desc_list, dtype=np.float64), nan=0.0, posinf=0.0, neginf=0.0
    )  # Replace NaNs and infinities with 0

    norm = np.linalg.norm(descriptors)
    if norm == 0:
        return descriptors  # nothing to normalize; avoid 0 / 0 -> NaN
    return descriptors / norm

In [None]:
def store_as_hdf5(
    root: Path,
    task: str,
    split_types: List[str],
    fgroups_list: List[MolFromSmarts],
    tokenizer: Tokenizer,
    n_folds: int = 5,
) -> None:
    """Featurize a processed task and store all of its splits in one HDF5 file.

    Reads ``root/tasks/<task>/<task>.parquet``, computes three feature vectors
    per molecule (``fg``, ``mfg``, ``desc``) and writes
    ``root/tasks/<task>/<task>.hdf5`` (overwriting any existing file). For every
    split type, fold and partition, each molecule gets the group
    ``<split_type>/fold_<i>/<train|valid|test>/<SMILES>`` containing the
    float32, lzf-compressed datasets ``fg``, ``mfg``, ``desc`` and ``label``.

    :param root: Root directory of the processed data
    :param task: Task name
    :param split_types: Split types to generate (see ``split_dataset``)
    :param fgroups_list: Functional-group query molecules passed to
        ``smiles2vector_fg``
    :param tokenizer: Tokenizer passed to ``smiles2vector_mfg``
    :param n_folds: Number of folds per split type; fold ``i`` uses ``i`` as the
        split seed. Defaults to 5, the previously hard-coded value
    """
    task_dir = root / "tasks" / task
    df = pd.read_parquet(task_dir / f"{task}.parquet")  # Read in DataFrame

    # Labels are taken before the feature columns are added, so they contain
    # only the original target columns.
    labels = df.drop(columns=["SMILES"]).values
    df["fg"] = df["SMILES"].parallel_apply(lambda x: smiles2vector_fg(x, fgroups_list))
    df["mfg"] = df["SMILES"].parallel_apply(lambda x: smiles2vector_mfg(x, tokenizer))
    df["desc"] = df["SMILES"].parallel_apply(lambda x: get_descriptors(x))

    with h5py.File(task_dir / f"{task}.hdf5", "w") as f:
        for split_type in split_types:
            for i in range(n_folds):
                splits = split_dataset(df, split_type, i)
                for split in ["train", "valid", "test"]:
                    # Map the split back onto `df` through its SMILES values.
                    indices = df.loc[df["SMILES"].isin(splits[split]["SMILES"])].index.tolist()
                    for index in indices:
                        smile, fg, mfg, desc = df.loc[index, ["SMILES", "fg", "mfg", "desc"]]
                        # Index labels double as row positions here; this relies
                        # on the parquet frame having a default 0..n-1 index.
                        label = labels[index]
                        # NOTE(review): HDF5 treats "/" as a path separator, so a
                        # SMILES containing "/" creates nested groups. Confirm
                        # std_smiles removes such characters or that readers
                        # expect this layout.
                        group_id = f"{split_type}/fold_{i}/{split}/{smile}"
                        group = f.require_group(group_id)
                        for key, value in (
                            ("fg", fg),
                            ("mfg", mfg),
                            ("desc", desc),
                            ("label", label),
                        ):
                            group.create_dataset(
                                key, data=value, dtype="float32", compression="lzf"
                            )

In [None]:
# Inputs shared by every `store_as_hdf5` call.
root = Path("./data/processed")  # Root directory
tokenize_dataset = "pubchem"  # part of the tokenizer file name
frequency = 500  # part of the tokenizer file name

training_dir = root / "training"

# Functional groups: parse every stored SMARTS string into a query molecule.
fgroups = pd.read_parquet(str(training_dir / "fg.parquet"))["SMARTS"].tolist()
fgroups_list = list(map(MolFromSmarts, fgroups))

# Tokenizer: loaded from training/tokenizers/BPE_<dataset>_<frequency>.json
tokenizer_file = training_dir / "tokenizers" / f"BPE_{tokenize_dataset}_{frequency}.json"
tokenizer = Tokenizer.from_file(str(tokenizer_file))

In [None]:
# Build the HDF5 split file for every task listed in the summary table.
for i in trange(len(task_df), desc="Processing Splits"):
    task = task_df.at[i, "Task"]
    datapoints = task_df.at[i, "Datapoints"]
    # Scaffold and Butina splits are only generated for tasks with at most
    # 300000 datapoints; larger tasks get the random split alone.
    split_types = (
        ["random", "scaffold", "butina"] if datapoints <= 300000 else ["random"]
    )
    store_as_hdf5(root, task, split_types, fgroups_list, tokenizer)