# Introduction and objective

This notebook unit-tests the structure-cleaning code. As we advanced further and further down the cleaning steps, we did not carefully check the functions that remove salts, neutralize charges, and convert the molecules into canonical structures.

The main objective here is to turn the cleaning procedure into a function (or object) so that we can apply it to other structures, and also share it with our lab's partners so they can incorporate it into their workflow for cleaning large sets of ligands.

In [1]:
import pandas as pd
import session_info

from rdkit import Chem, rdBase
from rdkit.Chem import Draw, Descriptors, PandasTools, AllChem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem.SaltRemover import SaltRemover, InputFormat
from rdkit.Chem import rdmolops # To clean the structures
from IPython.display import HTML

# Helper to display a DataFrame as an HTML table in the notebook
def show_df(df):
    """Return an IPython ``HTML`` object wrapping ``df.to_html(notebook=True)``."""
    table_markup = df.to_html(notebook=True)
    return HTML(table_markup)

In [2]:
# Hand-picked molecules used to exercise the cleaning steps.
# The three lists are parallel: entry i of SMILES, NAME and CASE describes
# the same test molecule. SMILES strings carry no leading/trailing
# whitespace so they can be compared or looked up exactly.
test_cases = {
    "SMILES": [
        'C(C(=O)[O-])C(CC(=O)[O-])(C(=O)[O-])O.[Na+].[Na+].[Na+]',
        'COC1=C(C=C2C(=C1)C(=NC(=N2)N3CCN(CC3)C(=O)C4=CC=CO4)N)OC.Cl',
        'CCN(CC)CCCC(C)NC1=C2C=CC(=CC2=NC=C1)Cl.OP(=O)(O)O.OP(=O)(O)O',
        'C[S+](C)[O-]',
        'CC(C(=O)[O-])O',
        'C[N+](C)(C)CCCCCCCCCC[N+](C)(C)C',
        'CC12CCC(=O)CC1CCC3C2CCC4(C3CCC4O)C',
        'C1=CC=C2C(=C1)C(=O)NS2(=O)=O',
        'CC1=C(C(CCC1)(C)C)C=CC(=CC=CC(=CC=CC=C(C)C=CC=C(C)C=CC2=C(CCCC2(C)C)C)C)C',
        'C#CCNC1CCC2=CC=CC=C12',
        'CN1C2CCC1C(C(C2)OC(=O)C3=CC=CC=C3)C(=O)OC',
        'CN1C(=C(C2=C(S1(=O)=O)C=CS2)O)C(=O)NC3=CC=CC=N3',
        'CN1C(C(=O)Nc2ccccn2)C(=O)c3sccc3[S]1(=O)=O',
        'Sc1ncnc2nc[nH]c12',
        'S=c1[nH]cnc2nc[nH]c12',
    ],
    "NAME": [
        'SODIUM CITRATE',
        'PRAZOSIN HYDROCHLORIDE',
        'CHLOROQUINE DIPHOSPHATE',
        'DIMETHYL SULFOXIDE',
        'LACTATE',
        'DECAMETHONIUM',
        'DIHYDROTESTOSTERONE',
        'SACCHARIN',
        'BETACAROTENE',
        'RASAGILINE',
        'COCAINE',
        'TENOXICAM',
        'TENOXICAM',
        'MERCAPTOPURINE',
        'MERCAPTOPURINE',
    ],
    "CASE": [
        'MULTIPLE SALT',
        'SIMPLE SALT',
        'COMPLEX SALT',
        'POSITIVE/NEGATIVE',
        'NEGATIVE',
        'POSITIVE',
        'SINGLE',
        'DOUBLE',
        'ALTERNATING DOUBLE',
        'TRIPLE',
        'BRIDGE ATOMS',
        # Each TAUTOMER label appears twice: two SMILES written for the
        # same named compound (pair A = tenoxicam, pair B = mercaptopurine).
        'TAUTOMER A',
        'TAUTOMER A',
        'TAUTOMER B',
        'TAUTOMER B',
    ],
}

test_cases_df = pd.DataFrame.from_dict(test_cases)
test_cases_df.head(3)

Unnamed: 0,SMILES,NAME,CASE
0,C(C(=O)[O-])C(CC(=O)[O-])(C(=O)[O-])O.[Na+].[N...,SODIUM CITRATE,MULTIPLE SALT
1,COC1=C(C=C2C(=C1)C(=NC(=N2)N3CCN(CC3)C(=O)C4=C...,PRAZOSIN HYDROCHLORIDE,SIMPLE SALT
2,CCN(CC)CCCC(C)NC1=C2C=CC(=CC2=NC=C1)Cl.OP(=O)(...,CHLOROQUINE DIPHOSPHATE,COMPLEX SALT


In [3]:
# Salt cleaning by keeping the largest fragment
# TODO: consider using RDKit's own salt-removal function instead (maybe?)
def keep_largest_fragment(mol):
    '''
    Return the largest disconnected fragment of a molecule, as a mol object.

    The molecule is split into its disconnected fragments and the one with
    the highest ``GetNumAtoms()`` is returned; on a tie, the first such
    fragment wins (behaviour of ``max``).

    Parameters
    ----------
    mol : rdkit mol object
        Molecule that may contain several fragments (e.g. a salt with its
        counter-ions).

    Returns
    -------
    rdkit mol object
        The largest fragment. If the molecule yields no fragments at all
        (an atom-less molecule), ``mol`` itself is returned instead of
        raising ``ValueError`` from ``max`` on an empty sequence.
    '''
    frags = rdmolops.GetMolFrags(mol, asMols=True)
    # default=mol covers the "no fragments" case without changing the result
    # for any molecule that has at least one atom.
    return max(frags, key=lambda frag: frag.GetNumAtoms(), default=mol)

def neutralize_standardize_atoms(mol):
    """
    Neutralize the charged atoms of a molecule and return the result as SMILES.

    Atoms matched by the SMARTS pattern below are set to formal charge 0 and
    their hydrogen count is adjusted by the removed charge. The pattern
    selects:
      * atoms with charge +1 that carry at least one hydrogen and are not
        bonded to a negatively charged atom, and
      * atoms with charge -1 that are not bonded to a positively charged atom.
    Charges that do not fit the pattern (e.g. +1 atoms without hydrogens, or
    adjacent +/- pairs) are left untouched.

    The input molecule is not modified: the edits are applied to a copy.

    Parameters
    ----------
    mol : rdkit mol object
        Molecule to neutralize.

    Returns
    -------
    str
        SMILES of the neutralized molecule, as produced by
        ``Chem.MolToSmiles`` with its default arguments (RDKit writes
        canonical SMILES by default).
    """
    pattern = Chem.MolFromSmarts("[+1!h0!$([*]~[-1,-2,-3,-4]),-1!$([*]~[+1,+2,+3,+4])]")
    # Work on a copy so the caller's molecule keeps its original charges.
    neutral_mol = Chem.Mol(mol)
    # Matches are collected up front, before any atom is edited.
    for match in neutral_mol.GetSubstructMatches(pattern):
        atom = neutral_mol.GetAtomWithIdx(match[0])
        chg = atom.GetFormalCharge()
        hcount = atom.GetTotalNumHs()
        atom.SetFormalCharge(0)
        # Removing a +1 charge drops one H; removing a -1 charge adds one H.
        atom.SetNumExplicitHs(hcount - chg)
        atom.UpdatePropertyCache()
    return Chem.MolToSmiles(neutral_mol)


def smiles_to_inchi(smiles:str) -> str:
    """
    Convert a SMILES string to InChI using the Open Babel command-line tool.

    Runs ``obabel -:<smiles> -oinchi`` as a subprocess (no shell involved, so
    the SMILES is passed verbatim as a single argument and the function also
    works outside of IPython/Jupyter).

    Be aware of the Open Babel dependency: the ``obabel`` executable must be
    available on the PATH.

    Parameters
    ----------
    smiles : str
        SMILES string to convert.

    Returns
    -------
    str
        The InChI line printed by ``obabel`` (the last one, if several are
        printed).

    Raises
    ------
    FileNotFoundError
        If the ``obabel`` executable cannot be found.
    ValueError
        If ``obabel`` does not print any line starting with ``InChI=``.
    """
    # Local import keeps this function self-contained when copied elsewhere.
    import subprocess

    completed = subprocess.run(
        ["obabel", f"-:{smiles}", "-oinchi"],
        capture_output=True,
        text=True,
        check=False,
    )
    # Select the result by content rather than by position in the output, so
    # status messages such as "1 molecule converted" cannot be picked up.
    inchi_lines = [
        line.strip()
        for line in completed.stdout.splitlines()
        if line.strip().startswith("InChI=")
    ]
    if not inchi_lines:
        raise ValueError(
            f"obabel produced no InChI for SMILES {smiles!r}: "
            f"{completed.stderr.strip()}"
        )
    return inchi_lines[-1]

In [4]:
# Parse the raw SMILES into mol objects (state before any cleaning)
PandasTools.AddMoleculeColumnToFrame(
    frame=test_cases_df,
    smilesCol='SMILES',
    molCol='before_cleaning',
)

# Step 1: drop the smaller fragments, keeping only the largest one
largest_fragments = test_cases_df['before_cleaning'].apply(keep_largest_fragment)
# Step 2: neutralize the charged atoms; the result is a SMILES string per row
test_cases_df['cleaned_smiles'] = largest_fragments.apply(neutralize_standardize_atoms)

# InChI computed from the cleaned SMILES
cleaned_smiles = test_cases_df['cleaned_smiles']
test_cases_df['InChI_from_smiles'] = cleaned_smiles.apply(smiles_to_inchi)

# Re-parse the cleaned SMILES to get the final mol objects
PandasTools.AddMoleculeColumnToFrame(
    frame=test_cases_df,
    smilesCol='cleaned_smiles',
    molCol='after_cleaning',
)

# Show the rows whose InChI already appeared in an earlier row
has_repeated_inchi = test_cases_df['InChI_from_smiles'].duplicated()
test_cases_df[has_repeated_inchi]

# test_cases_df.drop(columns=['SMILES', 'NAME'], inplace=True)
# test_cases_df

Unnamed: 0,SMILES,NAME,CASE,before_cleaning,cleaned_smiles,InChI_from_smiles,after_cleaning
14,S=c1[nH]cnc2nc[nH]c12,MERCAPTOPURINE,TAUTOMER B,<rdkit.Chem.rdchem.Mol object at 0x7fb0c25c6160>,S=c1[nH]cnc2nc[nH]c12,InChI=1S/C5H4N4S/c10-5-3-4(7-1-6-3)8-2-9-5/h1-...,<rdkit.Chem.rdchem.Mol object at 0x7fb0c25c6fa0>
