In [None]:
# One-time environment setup: uncomment and run, then restart the kernel.
# %pip install rdkit pandas tqdm

Collecting rdkit
  Downloading rdkit-2025.9.2-cp39-cp39-macosx_10_15_x86_64.whl (31.7 MB)
[K     |████████████████████████████████| 31.7 MB 19.9 MB/s eta 0:00:01
Installing collected packages: rdkit
Successfully installed rdkit-2025.9.2
Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem.MolStandardize import rdMolStandardize
from tqdm import tqdm
from typing import Optional, Tuple
from rdkit import RDLogger
import logging

In [2]:
# Turn off RDKit's own log output for every channel matching 'rdApp.*'.
RDLogger.DisableLog('rdApp.*')

# getLogger() with no name returns the root logger; with its level at ERROR,
# loggers that do not set their own level only emit ERROR and above.
logger = logging.getLogger()
logger.setLevel(logging.ERROR)

In [16]:
def standardize_smiles(
    smiles: str,
    remove_stereo: bool = False,
) -> Tuple[Optional[str], Optional[str]]:
    """Turn a raw SMILES string into a standardized canonical SMILES.

    The input is parsed with RDKit, passed through ``LargestFragmentChooser``
    and ``TautomerEnumerator.Canonicalize``, sanitized again, and finally
    written back out as canonical SMILES.

    Args:
        smiles: Raw SMILES string. Non-string or blank values are rejected.
        remove_stereo: When True the output is written with
            ``isomericSmiles=False``; otherwise with ``isomericSmiles=True``.

    Returns:
        ``(canonical_smiles, None)`` on success, otherwise
        ``(None, error_code)`` where ``error_code`` is one of
        ``"empty_smiles"``, ``"rdkit_parse_error"``, ``"invalid_smiles"``,
        ``"fragment_chooser_error"``, ``"tautomer_error"``,
        ``"sanitize_error"`` or ``"smiles_generation_error"``.
    """
    # Only a non-blank string is worth handing to RDKit.
    if not (isinstance(smiles, str) and smiles.strip()):
        return None, "empty_smiles"

    try:
        molecule = Chem.MolFromSmiles(smiles, sanitize=True)
    except Exception:
        return None, "rdkit_parse_error"
    if molecule is None:
        return None, "invalid_smiles"

    def _largest_fragment(m):
        return rdMolStandardize.LargestFragmentChooser().choose(m)

    def _canonical_tautomer(m):
        return rdMolStandardize.TautomerEnumerator().Canonicalize(m)

    def _resanitize(m):
        Chem.SanitizeMol(m)
        return m

    def _to_smiles(m):
        return Chem.MolToSmiles(
            m,
            canonical=True,
            isomericSmiles=not remove_stereo,
        )

    # Each stage is paired with the error code reported if it raises.
    stages = (
        (_largest_fragment, "fragment_chooser_error"),
        (_canonical_tautomer, "tautomer_error"),
        (_resanitize, "sanitize_error"),
        (_to_smiles, "smiles_generation_error"),
    )

    current = molecule
    for stage, failure_code in stages:
        try:
            current = stage(current)
        except Exception:
            return None, failure_code

    # After the last stage `current` is the canonical SMILES string.
    return current, None

In [9]:
def clean_smiles_dataframe(
    df: pd.DataFrame,
    smiles_col: str = "smiles",
    target_col: Optional[str] = None,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Standardize the SMILES column of ``df``, dropping failures and duplicates.

    Every row's ``smiles_col`` value is passed to ``standardize_smiles``.
    Rows that standardize successfully are kept with an extra
    ``canonical_smiles`` column; rows that fail are recorded in a report.
    Kept rows are then de-duplicated on ``canonical_smiles`` (first
    occurrence wins) and the number of dropped duplicates is printed.

    NOTE(review): the cell execution counts are non-sequential (this cell is
    In [9], ``standardize_smiles`` is In [16]); confirm the notebook still
    reproduces under Restart & Run All.

    Args:
        df: Input frame. It is not modified.
        smiles_col: Name of the column holding the raw SMILES strings.
        target_col: Currently unused; accepted so existing callers keep
            working. Duplicates are dropped without looking at any label.

    Returns:
        ``(clean_df, report_df)``. ``clean_df`` has the input columns plus
        ``canonical_smiles`` and a fresh integer index (the original index is
        not preserved). ``report_df`` has the columns ``index``,
        ``original_smiles`` and ``error``. Both frames keep these columns even
        when they contain no rows.
    """
    clean_rows = []
    report_rows = []

    for idx, row in tqdm(df.iterrows(), total=len(df)):
        smiles = row[smiles_col]

        canon, error = standardize_smiles(smiles)

        if canon is None:
            report_rows.append({
                "index": idx,
                "original_smiles": smiles,
                "error": error,
            })
            continue

        record = row.to_dict()
        record["canonical_smiles"] = canon
        clean_rows.append(record)

    if clean_rows:
        clean_df = pd.DataFrame(clean_rows)
    else:
        # pd.DataFrame([]) has no columns at all; spell out the schema so that
        # callers can still access clean_df["canonical_smiles"] when nothing
        # survived. dict.fromkeys de-duplicates labels while keeping order.
        clean_columns = list(dict.fromkeys([*df.columns, "canonical_smiles"]))
        clean_df = pd.DataFrame(columns=clean_columns)

    # Fixed column list: with no failures the report would otherwise have no
    # columns, and report_df["error"] would raise KeyError.
    report_df = pd.DataFrame(
        report_rows,
        columns=["index", "original_smiles", "error"],
    )

    before = len(clean_df)
    clean_df = clean_df.drop_duplicates(subset=["canonical_smiles"])
    after = len(clean_df)

    print(f"Deduplicated: {before - after} molecules removed")

    return clean_df, report_df


In [17]:
# Load the raw training set; the path is relative to the kernel's working directory.
df = pd.read_csv("training_data_contaminated.csv")

print("Initial size:", len(df))

# clean_df receives the standardized, de-duplicated rows; report_df receives
# one row per SMILES that failed standardization.
clean_df, report_df = clean_smiles_dataframe(
    df,
    smiles_col="smiles",
    target_col="label",  # NOTE(review): clean_smiles_dataframe never reads target_col
)

# "Removed" counts standardization failures only; the duplicates dropped inside
# clean_smiles_dataframe are reported by that function's own print.
print("Clean size:", len(clean_df))
print("Removed:", len(report_df))


Initial size: 20498


100%|██████████| 20498/20498 [04:41<00:00, 72.76it/s] 

Deduplicated: 7401 molecules removed
Clean size: 11951
Removed: 1146



