In [1]:
# Import modules
import os
import pandas as pd
import rdkit
# from rdkit.Chem import Draw
# from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Descriptors
from rdkit.Chem.MolStandardize import rdMolStandardize

# Catch errors
# import sys
# from io import StringIO
# rdkit.Chem.WrapLogs()

# Silence RDKit's "rdApp.info" log channel so it does not clutter cell output
rdkit.RDLogger.DisableLog("rdApp.info")

# Report the library versions this notebook is running against
print(
    f"Pandas Version: {pd.__version__}",
    f"RDKit Version: {rdkit.__version__}",
    sep="\n",
)

Pandas Version: 2.2.3
RDKit Version: 2024.09.6


In [2]:
# Data directory, resolved to an absolute path
path_data = os.path.realpath("../data")

# Input / output file names inside the data directory
input_file = "chembl_approved_small_molecule_drugs.csv"
output_file = "chembl_approved_small_molecule_drugs_rdkit-profiled.csv"

# Full paths for both files
input_path, output_path = (
    os.path.join(path_data, file_name) for file_name in (input_file, output_file)
)

# Read the input table and preview the first rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, which suggests the CSV
# was written together with its index; consider `index_col=0` after confirming
# that no later cell relies on that column.
df = pd.read_csv(input_path)
df.head()

Unnamed: 0,molecule_chembl_id,smiles,molregno,pref_name,max_phase,therapeutic_flag,dosed_ingredient,structure_type,chebi_par_id,molecule_type,...,full_mwt,aromatic_rings,heavy_atoms,qed_weighted,mw_monoisotopic,full_molformula,hba_lipinski,hbd_lipinski,num_lipinski_ro5_violations,np_likeness_score
0,CHEMBL1200542,CC(=O)OCC(=O)[C@H]1CC[C@H]2[C@@H]3CCC4=CC(=O)C...,674493,DESOXYCORTICOSTERONE ACETATE,4.0,1,1,MOL,34671.0,Small molecule,...,372.51,0.0,27.0,0.69,372.2301,C23H32O4,4.0,0.0,0.0,1.96
1,CHEMBL1200728,Cl.N=C(N)N,674679,GUANIDINE HYDROCHLORIDE,4.0,1,1,MOL,32735.0,Small molecule,...,95.53,0.0,4.0,0.24,59.0483,CH6ClN3,3.0,5.0,0.0,0.32
2,CHEMBL1200982,CCC(C)C1(CC)C(=O)[N-]C(=O)NC1=O.[Na+],674933,BUTABARBITAL SODIUM,4.0,1,1,MOL,,Small molecule,...,234.23,0.0,15.0,0.68,212.1161,C10H15N2NaO3,5.0,2.0,0.0,0.32
3,CHEMBL3989520,NCCc1c[nH]cn1.O=P(O)(O)O.O=P(O)(O)O,2197391,HISTAMINE PHOSPHATE,4.0,1,1,MOL,,Small molecule,...,307.14,1.0,8.0,0.56,111.0796,C5H15N3O8P2,3.0,3.0,0.0,0.0
4,CHEMBL449,CCC(C)C1(CC)C(=O)NC(=O)NC1=O,2393,BUTABARBITAL,4.0,1,0,MOL,3228.0,Small molecule,...,212.25,0.0,15.0,0.68,212.1161,C10H16N2O3,5.0,2.0,0.0,0.32


In [3]:
# Spot-check one record by position. In the saved output this row is MOLSIDOMINE
# (CHEMBL1329455), whose SMILES contains formally charged atoms ([N-] and [n+]).
df.iloc[2903]

molecule_chembl_id                                  CHEMBL1329455
smiles                           CCOC(=O)[N-]c1c[n+](N2CCOCC2)no1
molregno                                                   753339
pref_name                                             MOLSIDOMINE
max_phase                                                     4.0
therapeutic_flag                                                1
dosed_ingredient                                                1
structure_type                                                MOL
chebi_par_id                                                  NaN
molecule_type                                      Small molecule
first_approval                                                NaN
oral                                                            0
parenteral                                                      0
topical                                                         0
natural_product                                                 1
first_in_c

In [4]:
# Build RDKit molecules from the "smiles" column and standardize each one.
#   mols            -> list of (row position, RDKit Mol) for rows processed successfully
#   skipped_indices -> row positions whose SMILES was missing, could not be parsed,
#                      or failed during sanitization / standardization
uncharge_app = rdMolStandardize.Uncharger()
mols = []
skipped_indices = []
for ii, smi in enumerate(df["smiles"]):
    # Missing SMILES (NaN/None): skip quietly, there is nothing useful to report.
    if pd.isna(smi):
        skipped_indices.append(ii)
        continue

    # Step 1: parse only. Sanitization is deferred to step 2 so that parse failures
    # and sanitization failures are reported separately.
    try:
        mol = rdkit.Chem.MolFromSmiles(smi, sanitize=False)
        if mol is None:
            # A None result is treated as a parse failure. Raise explicitly rather
            # than using `assert`, which is removed under `python -O` and produced
            # an empty "Exception: " line in the log.
            raise ValueError("RDKit returned None (SMILES could not be parsed)")
    except Exception as e:
        skipped_indices.append(ii)
        print(f"Error processing SMILES at index {ii}: {smi}")
        print(f"Exception: {e}")
        continue

    # Step 2: sanitize, reduce to the fragment parent, then apply the uncharger.
    # All three operations modify `mol` in place.
    try:
        rdkit.Chem.SanitizeMol(mol)
        rdMolStandardize.FragmentParentInPlace(mol, skipStandardize=True)
        uncharge_app.unchargeInPlace(mol)
    except Exception as e:
        skipped_indices.append(ii)
        print(f"Error processing molecule at index {ii}")
        print(f"Exception: {e}")
        continue

    mols.append((ii, mol))

# Summary of how many rows made it through
print(f"{len(mols)}/{df.shape[0]} molecules processed!")
print(f"{df.shape[0]-len(mols)}/{df.shape[0]} molecules skipped!")

3382/3517 molecules processed!
135/3517 molecules skipped!


In [6]:
# res = {}
# for nm, fn in Descriptors._descList:
#     # some of the descriptor functions can throw errors if they fail, catch those here:
#     try:
#         val = fn(mol)
#     except:
#         # print the error message:
#         import traceback
#         traceback.print_exc()
#         # and set the descriptor value to whatever missingVal is
#         val = missingVal
#     res[nm] = val        