In [None]:
#¡¡¡¡IMPORTANT NOTE!!!!
#The input file must contain a column with the SMILES strings, and its header must be named exactly: Smiles


import pandas as pd
import rdkit
from rdkit import Chem
from rdkit.Chem import SanitizeMol
from rdkit.Chem import RemoveHs
from rdkit.Chem import AssignStereochemistry

#pip install molvs
from molvs.metal import MetalDisconnector
from molvs.fragment import LargestFragmentChooser
from molvs.normalize import Normalizer
from molvs.charge import Reionizer, Uncharger
from molvs import validate_smiles

#from molvs.standardize import Standardizer
#from molvs import standardize_smiles

In [None]:
# Load the table that holds the SMILES strings.
# Replace 'input_file.xlsx' with the name of your own file.
df = pd.read_excel(io='input_file.xlsx')
# To read a CSV database instead:
#df=pd.read_csv('name_of_the_database_file.csv')

In [None]:
#NOTE: If you intend to determine the canonical tautomer, be aware that the rdkit and molvs functions for this purpose
#sometimes do not preserve the stereochemistry. It is advised to use molvs rather than rdkit, since molvs shows this problem less frequently.

def curation(x,y):
    """Standardize one SMILES string and return its canonical isomeric SMILES.

    The molecule is parsed and sanitized with RDKit and then passed, in this
    order, through the MolVS metal disconnector, largest-fragment chooser,
    normalizer, reionizer and uncharger. Stereochemistry is reassigned before
    the canonical SMILES is written.

    Parameters
    ----------
    x : str
        SMILES string to curate.
    y : hashable
        Row label; used only in the progress messages that are printed.

    Returns
    -------
    str or None
        Canonical SMILES of the curated molecule, or None when the input is
        missing/empty, cannot be parsed, or any curation step raises.

    Notes
    -----
    Only ``Exception`` subclasses are caught, so Ctrl-C (KeyboardInterrupt)
    still stops a long run instead of being counted as a failed row.
    """
    # Missing values (None, NaN) and blank strings are reported as
    # unprocessed without handing them to RDKit.
    if not isinstance(x, str) or not x.strip():
        print(f"{y} not processed row (missing or empty SMILES)")
        return None

    try:
        # sanitize=True is RDKit's default; it is spelled out as a reminder
        # that parsing already includes sanitization (kekulization, valence
        # check, aromaticity, conjugation, hybridization).
        mol = Chem.MolFromSmiles(x, sanitize=True)
        if mol is None:
            # Fail explicitly instead of letting the next call raise on None.
            raise ValueError("RDKit could not parse the SMILES string")

        mol = Chem.RemoveHs(mol)                     # Drop explicit hydrogens.
        mol = MetalDisconnector().disconnect(mol)    # Break covalent metal / non-metal bonds.
        mol = LargestFragmentChooser().choose(mol)   # Keep only the largest fragment (drops salts, counter-ions).
        mol = Normalizer().normalize(mol)            # Normalize functional-group / charge representations.
        mol = Reionizer().reionize(mol)              # Make the strongest acids ionize first in partially ionized molecules.
        mol = Uncharger().uncharge(mol)              # Try to neutralize the molecule.

        # Works in place: recompute stereo flags after the transformations.
        Chem.AssignStereochemistry(mol, force=True, cleanIt=True)

        # isomericSmiles=True is the default; set it to False to obtain the
        # SMILES without stereochemistry information.
        smiles = Chem.MolToSmiles(mol, isomericSmiles=True)
    except Exception as exc:
        print(f"{y} not processed row ({type(exc).__name__}: {exc})")
        return None

    print(f"{y} processed row")                      # Report the label of the processed row.
    return smiles
        
# Curate every SMILES; rows that could not be processed receive None.
df["Smiles_curated"] = list(map(curation, df["Smiles"], df.index))

# Discard the rows for which no curated SMILES was produced.
df = df.dropna(subset=["Smiles_curated"])

# Keep every row whose curated SMILES occurs more than once. This table is
# used by the last part of the code to list the pairs of duplicates that are
# removed below. The curated SMILES are canonical, so two different molecules
# are not expected to share the same string.
df2 = df
duplicates = df2.loc[df2.duplicated(subset="Smiles_curated", keep=False)]

# Of each group of rows sharing a curated SMILES, keep only the first one.
df = df.drop_duplicates(subset=["Smiles_curated"], keep="first")


# Alternative ways to call the molvs functions, 
# using MetalDisconnector as an example:
# mol = self.disconnect_metals(mol) #In the Standardizer class, MetalDisconnector is defined as disconnect_metals; the remaining classes were also redefined
# mol = MetalDisconnector()(mol)

In [None]:
# Checkpoint: save the data with the 'Smiles_curated' column already computed.
# The columns that flag the issues and errors overlooked by the curation code
# are not part of this file.

df.to_excel(excel_writer='data_curated.xlsx')
#df

In [None]:
#Highlight issues and errors

# The molvs 'validate_smiles()' function detects several issues and errors in molecules that were not corrected during the curation process because 
# they cannot be corrected automatically. Nonetheless, a column that highlights the issues and errors is added so that these compounds can be removed manually. 

# ¡¡¡¡NOT ALL THE ISSUES DETECTED BY 'validate_smiles()' IMPLY THE REMOVAL OF MOLECULES!!!! That depends on the criteria of the user.

# For instance, this function highlights molecules with a charge different from zero and molecules with annotated isotopic information, and these
# molecules should not necessarily be removed.

def validation(x,y,allowed_elements=None):
    """Highlight issues in one SMILES string without modifying it.

    The MolVS ``validate_smiles`` alerts are checked first. Only when MolVS
    reports nothing is the molecule checked for elements outside
    ``allowed_elements`` (element check contributed by Ana Chávez). A flagged
    molecule is only highlighted, never removed.

    Parameters
    ----------
    x : str
        SMILES string to validate.
    y : hashable
        Row label; used only in the messages that are printed.
    allowed_elements : iterable of str, optional
        Element symbols that are accepted. Defaults to
        C, H, O, N, P, S, F, Cl, Br, I, B, Si and Se.

    Returns
    -------
    list, str or None
        The non-empty result of ``validate_smiles``; or a string describing
        an element issue or a validation failure; or None when no issue was
        found.

    Notes
    -----
    A SMILES that cannot be validated is flagged with a message instead of
    returning None, so that a failure is not mistaken for "no issue found".
    """
    if allowed_elements is None:
        allowed_elements = {"C","H","O","N","P","S","F","Cl","Br","I","B","Si","Se"}

    # Missing values (None, NaN) and blank strings cannot be validated.
    if not isinstance(x, str) or not x.strip():
        print(f"{y} not validated row (missing or empty SMILES)")
        return 'SMILES is missing or not a string'

    try:
        a = validate_smiles(x)
        if a:                                            # MolVS reported at least one alert.
            print(f"{y} processed row MolVs issue founds")
            return a

        mol = Chem.MolFromSmiles(x, sanitize=True)
        if mol is None:
            print(f"{y} not validated row (SMILES could not be parsed)")
            return 'SMILES could not be parsed by RDKit'

        actual_elements = {atom.GetSymbol() for atom in mol.GetAtoms()}
        if actual_elements - set(allowed_elements):
            print(f"{y} processed row Allowed elements issue found")
            return 'Contains elements different than the allowed'
        return None
    except Exception as exc:
        # Only Exception subclasses are caught, so Ctrl-C still interrupts.
        print(f"{y} not validated row ({type(exc).__name__}: {exc})")
        return f'Validation failed: {type(exc).__name__}: {exc}'

In [None]:
# Flag the issues and errors found in the original (non-curated) SMILES.

# The new column makes it possible to compare the non-curated SMILES with the
# curated ones and to see which errors were corrected by the curation process.
df["Issues_in_NOT_curated_smiles"] = list(map(validation, df["Smiles"], df.index))

In [None]:
# Flag the issues and errors that remain in the curated SMILES.
df["Issues_in_curated_smiles"] = list(map(validation, df["Smiles_curated"], df.index))

In [None]:
# Save the curated data together with the two issue-highlighting columns.
df.to_excel(excel_writer='data_curated_issues_or_errors_highlighted.xlsx')

In [None]:
# Table of duplicate pairs

# Lists which compounds were duplicates of each other.

# Turn the original row labels into an ordinary column ('index').
duplicates = duplicates.reset_index()

# Self-join on the curated SMILES: every row is paired with every row that
# shares its SMILES, and both original labels are kept.
pairs_df = pd.merge(
    duplicates,
    duplicates,
    on='Smiles_curated',
    suffixes=('_left', '_right'),
)

# Keep each pair once: this drops the self-pairs and the mirrored
# (right, left) copies.
pairs_df = pairs_df.loc[pairs_df['index_left'].lt(pairs_df['index_right'])]

# Write the pairs to an Excel file.
pairs_df.to_excel('duplicates.xlsx', index=False)
