References for this pipeline:


1.   RDKit documentation. Available at: https://www.rdkit.org/docs/index.html
2.   RSC_OpenScience_Standardization_202104 documentation. Available at: https://github.com/greglandrum/RSC_OpenScience_Standardization_202104/blob/main/MolStandardize%20pieces.ipynb
3.   practical_cheminformatics_tutorials. Available at: https://github.com/PatWalters/practical_cheminformatics_tutorials/blob/main/misc/working_with_ChEMBL_drug_data.ipynb
4.   TROPSHA, Alexander. Best practices for QSAR model development, validation, and exploitation. Molecular informatics, v. 29, n. 6‐7, p. 476-488, 2010.
5.   BENTO, A. Patrícia et al. An open source chemical structure curation pipeline using RDKit. Journal of Cheminformatics, v. 12, p. 1-16, 2020.


In [1]:
# Install the RDKit package with pip (IPython shell escape, not plain Python).
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2024.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (35.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m35.1/35.1 MB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit
Successfully installed rdkit-2024.3.1


In [93]:
import re
from rdkit import Chem
from rdkit.Chem.MolStandardize import rdMolStandardize

Smiles Standardization

1. Structure validation: the function MolFromSmiles() returns None if the SMILES string cannot be parsed into a valid chemical structure
2. Removal of explicit hydrogen (H) atoms by the function MolFromSmiles()
3. Canonicalization of the SMILES by the function MolToSmiles()



In [86]:
# Example inputs; the list deliberately includes an entry ("AAAAA") that is
# not a parseable SMILES string.
smiles_with_error = [
    "CCO[AlH2]",
    "Br[Mg]c1ccccc1CCC(=O)O[Na]",
    "AAAAA",
    "C[Hg]C",
    "[K+].[O-]C(=O)CSC1=CC=C(F)C=C1",
    "Cc1ccccc1-c1cc(N2CC[N+](C)(COP(=O)(O)O)CC2)ncc1N(C)C(=O)C(C)(C)c1cc(C(F)(F)F)cc(C(F)(F)F)c1",
    "Cc1ccccc1-c1cc(N2CC[N+](C)(COP(=O)([O-])O)CC2)ncc1N(C)C(=O)C(C)(C)c1cc(C(F)(F)F)cc(C(F)(F)F)c1",
]

# Parse every entry and print its canonical SMILES; entries for which
# MolFromSmiles() returns None are skipped without printing anything.
for raw_smiles in smiles_with_error:
    parsed = Chem.MolFromSmiles(raw_smiles, sanitize=True)
    if parsed is None:
        continue
    print(Chem.MolToSmiles(parsed, canonical=True))

CCO[AlH2]
O=C(CCc1ccccc1[Mg]Br)O[Na]
C[Hg]C
O=C([O-])CSc1ccc(F)cc1.[K+]
Cc1ccccc1-c1cc(N2CC[N+](C)(COP(=O)(O)O)CC2)ncc1N(C)C(=O)C(C)(C)c1cc(C(F)(F)F)cc(C(F)(F)F)c1
Cc1ccccc1-c1cc(N2CC[N+](C)(COP(=O)([O-])O)CC2)ncc1N(C)C(=O)C(C)(C)c1cc(C(F)(F)F)cc(C(F)(F)F)c1


Mol normalization

1. Applies a series of standard transformations to correct functional groups and recombine charges (normalization)
2. Disconnect metal atoms and counterions
3. Reionize the molecule

In [87]:
# Inputs for the normalization step (only entries that parse successfully).
smiles = [
    "CCO[AlH2]",
    "Br[Mg]c1ccccc1CCC(=O)O[Na]",
    "C[Hg]C",
    "[K+].[O-]C(=O)CSC1=CC=C(F)C=C1",
    "Cc1ccccc1-c1cc(N2CC[N+](C)(COP(=O)(O)O)CC2)ncc1N(C)C(=O)C(C)(C)c1cc(C(F)(F)F)cc(C(F)(F)F)c1",
    "Cc1ccccc1-c1cc(N2CC[N+](C)(COP(=O)([O-])O)CC2)ncc1N(C)C(=O)C(C)(C)c1cc(C(F)(F)F)cc(C(F)(F)F)c1",
]

# Run rdMolStandardize.Cleanup() on each parsed molecule and print the
# canonical SMILES of the result.
for entry in smiles:
    standardized = rdMolStandardize.Cleanup(
        Chem.MolFromSmiles(entry, sanitize=True)
    )
    print(Chem.MolToSmiles(standardized, canonical=True))

CC[O-].[AlH2+]
O=C([O-])CCc1ccccc1[Mg]Br.[Na+]
C[Hg]C
O=C([O-])CSc1ccc(F)cc1.[K+]
Cc1ccccc1-c1cc(N2CC[N+](C)(COP(=O)(O)O)CC2)ncc1N(C)C(=O)C(C)(C)c1cc(C(F)(F)F)cc(C(F)(F)F)c1
Cc1ccccc1-c1cc(N2CC[N+](C)(COP(=O)([O-])O)CC2)ncc1N(C)C(=O)C(C)(C)c1cc(C(F)(F)F)cc(C(F)(F)F)c1


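Keeps only the parent fragment of each molecule: disconnected counterions and metal ions (e.g. [Na+], [K+], [AlH2+]) are removed, while covalently bound metals (e.g. [Mg], [Hg]) remain in the structure.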

In [88]:
# Inputs for the fragment-selection step.
smiles = [
    "CC[O-].[AlH2+]",
    "O=C([O-])CCc1ccccc1[Mg]Br.[Na+]",
    "C[Hg]C",
    "O=C([O-])CSc1ccc(F)cc1.[K+]",
    "Cc1ccccc1-c1cc(N2CC[N+](C)(COP(=O)(O)O)CC2)ncc1N(C)C(=O)C(C)(C)c1cc(C(F)(F)F)cc(C(F)(F)F)c1",
    "Cc1ccccc1-c1cc(N2CC[N+](C)(COP(=O)([O-])O)CC2)ncc1N(C)C(=O)C(C)(C)c1cc(C(F)(F)F)cc(C(F)(F)F)c1",
]

# Apply rdMolStandardize.FragmentParent() to each parsed molecule and print
# the canonical SMILES of what it returns.
for entry in smiles:
    parent = rdMolStandardize.FragmentParent(
        Chem.MolFromSmiles(entry, sanitize=True)
    )
    print(Chem.MolToSmiles(parent, canonical=True))

CC[O-]
O=C([O-])CCc1ccccc1[Mg]Br
C[Hg]C
O=C([O-])CSc1ccc(F)cc1
Cc1ccccc1-c1cc(N2CC[N+](C)(COP(=O)(O)O)CC2)ncc1N(C)C(=O)C(C)(C)c1cc(C(F)(F)F)cc(C(F)(F)F)c1
Cc1ccccc1-c1cc(N2CC[N+](C)(COP(=O)([O-])O)CC2)ncc1N(C)C(=O)C(C)(C)c1cc(C(F)(F)F)cc(C(F)(F)F)c1


Neutralizes the molecule. Note that neutralizing and removing charges are not the same: in some cases (e.g. smiles[5]) the charges are kept. The neutralization process can also fail in some cases, which is why duplicates can additionally be removed by comparing InChI values (below).

In [89]:
# Inputs for the neutralization step.
smiles = [
    "CC[O-]",
    "O=C([O-])CCc1ccccc1[Mg]Br",
    "C[Hg]C",
    "O=C([O-])CSc1ccc(F)cc1",
    "Cc1ccccc1-c1cc(N2CC[N+](C)(COP(=O)(O)O)CC2)ncc1N(C)C(=O)C(C)(C)c1cc(C(F)(F)F)cc(C(F)(F)F)c1",
    "Cc1ccccc1-c1cc(N2CC[N+](C)(COP(=O)([O-])O)CC2)ncc1N(C)C(=O)C(C)(C)c1cc(C(F)(F)F)cc(C(F)(F)F)c1",
]

# A single Uncharger instance is created once and reused for every molecule.
uncharger = rdMolStandardize.Uncharger()

# Uncharge each parsed molecule and print the canonical SMILES of the result.
for entry in smiles:
    neutral = uncharger.uncharge(Chem.MolFromSmiles(entry, sanitize=True))
    print(Chem.MolToSmiles(neutral, canonical=True))

CCO
O=C(O)CCc1ccccc1[Mg]Br
C[Hg]C
O=C(O)CSc1ccc(F)cc1
Cc1ccccc1-c1cc(N2CC[N+](C)(COP(=O)(O)O)CC2)ncc1N(C)C(=O)C(C)(C)c1cc(C(F)(F)F)cc(C(F)(F)F)c1
Cc1ccccc1-c1cc(N2CC[N+](C)(COP(=O)([O-])O)CC2)ncc1N(C)C(=O)C(C)(C)c1cc(C(F)(F)F)cc(C(F)(F)F)c1


Captures the canonical tautomer for a molecule.

In [90]:
# Inputs for the tautomer-canonicalization step.
smiles = [
    "CCO",
    "O=C(O)CCc1ccccc1[Mg]Br",
    "C[Hg]C",
    "O=C(O)CSc1ccc(F)cc1",
    "Cc1ccccc1-c1cc(N2CC[N+](C)(COP(=O)(O)O)CC2)ncc1N(C)C(=O)C(C)(C)c1cc(C(F)(F)F)cc(C(F)(F)F)c1",
    "Cc1ccccc1-c1cc(N2CC[N+](C)(COP(=O)([O-])O)CC2)ncc1N(C)C(=O)C(C)(C)c1cc(C(F)(F)F)cc(C(F)(F)F)c1",
]

# A single TautomerEnumerator instance is created once and reused.
te = rdMolStandardize.TautomerEnumerator()

# Canonicalize each parsed molecule with the enumerator and print the
# canonical SMILES of the result.
for entry in smiles:
    tautomer = te.Canonicalize(Chem.MolFromSmiles(entry, sanitize=True))
    print(Chem.MolToSmiles(tautomer, canonical=True))

CCO
O=C(O)CCc1ccccc1[Mg]Br
C[Hg]C
O=C(O)CSc1ccc(F)cc1
Cc1ccccc1-c1cc(N2CC[N+](C)(COP(=O)(O)O)CC2)ncc1N(C)C(=O)C(C)(C)c1cc(C(F)(F)F)cc(C(F)(F)F)c1
Cc1ccccc1-c1cc(N2CC[N+](C)(COP(=O)([O-])O)CC2)ncc1N(C)C(=O)C(C)(C)c1cc(C(F)(F)F)cc(C(F)(F)F)c1


In [105]:
# Two SMILES that differ only in the phosphate group: (O)O versus ([O-])O.
duplicated_smiles = [
    "Cc1ccccc1-c1cc(N2CC[N+](C)(COP(=O)(O)O)CC2)ncc1N(C)C(=O)C(C)(C)c1cc(C(F)(F)F)cc(C(F)(F)F)c1",
    "Cc1ccccc1-c1cc(N2CC[N+](C)(COP(=O)([O-])O)CC2)ncc1N(C)C(=O)C(C)(C)c1cc(C(F)(F)F)cc(C(F)(F)F)c1",
]

# Convert each SMILES to a molecule and then to its InChI string.
inchi_list = [
    Chem.MolToInchi(Chem.MolFromSmiles(entry, sanitize=True))
    for entry in duplicated_smiles
]

# Cell result: whether the two InChI strings are identical.
inchi_list[0] == inchi_list[1]

False

In [106]:
# InChI strings of the two duplicated structures; they differ only by the
# trailing "/p+1" layer on the first entry.
inchi_list = [
    "InChI=1S/C31H35F6N4O5P/c1-20-8-6-7-9-24(20)25-17-27(40-10-12-41(5,13-11-40)19-46-47(43,44)45)38-18-26(25)39(4)28(42)29(2,3)21-14-22(30(32,33)34)16-23(15-21)31(35,36)37/h6-9,14-18H,10-13,19H2,1-5H3,(H-,43,44,45)/p+1",
    "InChI=1S/C31H35F6N4O5P/c1-20-8-6-7-9-24(20)25-17-27(40-10-12-41(5,13-11-40)19-46-47(43,44)45)38-18-26(25)39(4)28(42)29(2,3)21-14-22(30(32,33)34)16-23(15-21)31(35,36)37/h6-9,14-18H,10-13,19H2,1-5H3,(H-,43,44,45)",
]

# Matches a "/p+N" layer (N = one or more digits). A raw string is used so
# that "\+" reaches the regex engine as an escaped plus sign instead of being
# an invalid string escape (which triggers a warning on recent Pythons).
# Only positive counts ("/p+N") are matched, exactly as in the original pattern.
proton_layer = re.compile(r"/p\+[0-9]+")

# Strip the "/p+N" layer from every InChI so the strings can be compared
# independently of it.
new_inchi_list = [proton_layer.sub("", inchi) for inchi in inchi_list]

# Cell result: whether the two InChI strings are identical after stripping.
new_inchi_list[0] == new_inchi_list[1]

True