In [1]:
import pandas as pd
# %pip install pubchempy
import pubchempy as pcp
# %pip install rdkit
from rdkit import Chem

In [2]:
# Reading the csv file
df0 = pd.read_csv('data/Class_1_original_dataset.csv')  
df0.head

<bound method NDFrame.head of                                  Name  \
0                          Everolimus   
1                        Posaconazole   
2                      Alatrofloxacin   
3                         Aminosidine   
4                          Isepamicin   
..                                ...   
901                        Remdesivir   
902  2-Hyroxypropyl-beta-cyclodextrin   
903         3,3'-iminodipropionitrile   
904                     chlorhexidine   
905                       Clioquinol    

                                                 Smile  Class  
0    CO[C@@H]1C[C@H](C[C@@H](C)[C@@H]2CC(=O)[C@H](C...      1  
1    CC[C@@H]([C@H](C)O)N1N=CN(C1=O)c2ccc(cc2)N3CCN...      1  
2    C[C@H](N)C(=O)N[C@@H](C)C(=O)N[C@H]1[C@@H]2CN(...      1  
3    NC[C@@H]1O[C@H](O[C@H]2[C@@H](O)[C@H](O[C@@H]3...      1  
4    CN[C@@H]1[C@@H](O)[C@@H](O[C@@H]2[C@@H](O)[C@H...      1  
..                                                 ...    ...  
901  CCC(CC)COC(=O)C(C)NP(=O)

In [3]:
# Given the name of a drug, return the related CIDs (PubChem compound ID) of this drug from PubChem
def get_CID(drug_name):
    try:
        cids = pcp.get_cids(drug_name)
        assert len(cids) > 0
        return cids
    except Exception:
        print(f'No structure can be found for {drug_name}')
        return None


In [4]:
# This step takes around 7.5min for 906 drugs.
df0["PUBCHEM_CIDs"] = df0["Name"].apply(get_CID)

No structure can be found for Polythiazide
No structure can be found for Poly-L-lysine
No structure can be found for 2-Hyroxypropyl-beta-cyclodextrin


In [5]:
df0.head

<bound method NDFrame.head of                                  Name  \
0                          Everolimus   
1                        Posaconazole   
2                      Alatrofloxacin   
3                         Aminosidine   
4                          Isepamicin   
..                                ...   
901                        Remdesivir   
902  2-Hyroxypropyl-beta-cyclodextrin   
903         3,3'-iminodipropionitrile   
904                     chlorhexidine   
905                       Clioquinol    

                                                 Smile  Class  \
0    CO[C@@H]1C[C@H](C[C@@H](C)[C@@H]2CC(=O)[C@H](C...      1   
1    CC[C@@H]([C@H](C)O)N1N=CN(C1=O)c2ccc(cc2)N3CCN...      1   
2    C[C@H](N)C(=O)N[C@@H](C)C(=O)N[C@H]1[C@@H]2CN(...      1   
3    NC[C@@H]1O[C@H](O[C@H]2[C@@H](O)[C@H](O[C@@H]3...      1   
4    CN[C@@H]1[C@@H](O)[C@@H](O[C@@H]2[C@@H](O)[C@H...      1   
..                                                 ...    ...   
901  CCC(CC)COC(=O)C(C

In [6]:
df1 = df0.copy()
# The most relevant SMILES is the SMILES corresponding to the first CID in the search result.
df1["Most_Relevant_SMILES"] = None
# Match_MostRelevant indicates if the original SMILES matches with the most relevant SMILES.
df1["Match_MostRelevant"] = False
# Matching_SMILES is the PubChem SMILES corresponding to one of the related CIDs that matches with the original SMILES.
df1["Matching_SMILES"] = None
# Matching CID is the matched CID 
df1["Matching_CID"] = None
# The matching International Chemical Identifier (InChI) Key 
df1["Matching_InChIKey"] = None


In [7]:
# Given two isomeric smiles strings, construct the molecules using Rdkit
# and then generate the isomeric smiles again to see if the uniformly generated smiles match
def compare_smiles(smiles1, smiles2):
    if smiles1 == None or smiles2 == None:
        return False
    mol1 = Chem.MolFromSmiles(smiles1)
    mol2 = Chem.MolFromSmiles(smiles2)
    
    newsmile1 = Chem.MolToSmiles(mol1,isomericSmiles=True)
    newsmile2 = Chem.MolToSmiles(mol2,isomericSmiles=True)

    if newsmile1 == newsmile2:   
        return True
    else:
        return False

In [8]:
# It takes around 8.5m for 906 drugs.
for i in range(len(df1)):
    if df1["PUBCHEM_CIDs"][i]is not None:
        # the most relevant CID is the first in the obtained CIDs
        most_CID = df1["PUBCHEM_CIDs"][i][0]
        # get the compound object identified by this CID
        c = pcp.get_compounds(most_CID)[0]
        # get the most relevant SMILES
        # if there's no isomeric SMILES for this compound, the canonical SMILES will be automatically returned.
        df1.loc[i,"Most_Relevant_SMILES"] = c.isomeric_smiles

        # First compare the most relevant SMILES with the original SMILES.
        if compare_smiles(df1["Most_Relevant_SMILES"][i],df1["Smile"][i]):
            df1.loc[i,"Match_MostRelevant"] = True
            df1.loc[i,"Matching_SMILES"] = df1["Most_Relevant_SMILES"][i]
            df1.loc[i,"Matching_CID"] = most_CID
            df1.loc[i,"Matching_InChIKey"] = c.inchikey

        # if the origininal SMILES does not match with the original SMILES and there are other records available,
        # check all the available records to find matching SMILES.
        elif len(df0["PUBCHEM_CIDs"][i])>1:
            for j in range(1,len(df1["PUBCHEM_CIDs"][i])):
                cid = df1["PUBCHEM_CIDs"][i][j]
                c = pcp.get_compounds(cid)[0]
                smile = c.isomeric_smiles
                if compare_smiles(smile,df1["Smile"][i]):
                    df1.loc[i,"Matching_SMILES"] = smile
                    df1.loc[i,"Matching_CID"] = cid
                    df1.loc[i,"Matching_InChIKey"] = c.inchikey
                    break
 



In [9]:
df1.head

<bound method NDFrame.head of                                  Name  \
0                          Everolimus   
1                        Posaconazole   
2                      Alatrofloxacin   
3                         Aminosidine   
4                          Isepamicin   
..                                ...   
901                        Remdesivir   
902  2-Hyroxypropyl-beta-cyclodextrin   
903         3,3'-iminodipropionitrile   
904                     chlorhexidine   
905                       Clioquinol    

                                                 Smile  Class  \
0    CO[C@@H]1C[C@H](C[C@@H](C)[C@@H]2CC(=O)[C@H](C...      1   
1    CC[C@@H]([C@H](C)O)N1N=CN(C1=O)c2ccc(cc2)N3CCN...      1   
2    C[C@H](N)C(=O)N[C@@H](C)C(=O)N[C@H]1[C@@H]2CN(...      1   
3    NC[C@@H]1O[C@H](O[C@H]2[C@@H](O)[C@H](O[C@@H]3...      1   
4    CN[C@@H]1[C@@H](O)[C@@H](O[C@@H]2[C@@H](O)[C@H...      1   
..                                                 ...    ...   
901  CCC(CC)COC(=O)C(C

In [13]:
df1.to_csv('data/processed_class1.csv',index=False)

In [11]:
# NoMatch=df0[df0["Matching_SMILES"].notnull()]
# NoMatch.to_csv('data/nomatch_class1.csv')

In [12]:
# # Displaying some of the mismatching molecules
# for i in range(10):
#     if df0["Same_Mol"][i]== False:
#         mol1 = Chem.MolFromSmiles(df0["Smile"][i])
#         mol2 = Chem.MolFromSmiles(df0["PUBCHEM_SMILES"][i])
#         fig1 = Chem.Draw.MolToMPL(mol1)
#         fig1.show()
#         fig2 = Chem.Draw.MolToMPL(mol2)
#         fig2.show()