In [1]:
import pandas as pd
# %pip install pubchempy
import pubchempy as pcp
# %pip install rdkit
from rdkit import Chem

In [2]:
# Reading the csv file
df0 = pd.read_csv('data/Class_0_original_dataset.csv')  
df0.head

<bound method NDFrame.head of                Name                                              Smile  Class
0      propisergide  C[C@H](CO)NC(=O)[C@H]1CN(C)[C@@H]2Cc3cn(C)c4cc...      0
1        sancycline  CN(C)[C@H]1[C@@H]2C[C@@H]3Cc4cccc(O)c4C(=O)C3=...      0
2       metoserpate  CO[C@H]1C[C@@H]2CN3CCc4c([nH]c5cc(OC)ccc45)[C@...      0
3       fludiazepam               CN1C(=O)CN=C(c2ccccc2F)c3cc(Cl)ccc13      0
4        epiestriol  C[C@]12CC[C@H]3[C@H](CCc4cc(O)ccc34)[C@H]1C[C@...      0
...             ...                                                ...    ...
1602     Demecarium  CN(CCCCCCCCCCN(C)C(=O)OC1=CC=CC(=C1)[N+](C)(C)...      0
1603  Hexafluronium  C[N+](C)(CCCCCC[N+](C)(C)C1C2=CC=CC=C2C3=CC=CC...      0
1604      Ivacaftor  CC(C)(C)C1=CC(=C(C=C1NC(=O)C2=CNC3=CC=CC=C3C2=...      0
1605    gadoteridol  CC(CN1CCN(CCN(CCN(CC1)CC(=O)[O-])CC(=O)[O-])CC...      0
1606  Fosaprepitant  CC(C1=CC(=CC(=C1)C(F)(F)F)C(F)(F)F)OC2C(N(CCO2...      0

[1607 rows x 3 columns]>

In [3]:
# Given the name of a drug, return the related CIDs (PubChem compound ID) of this drug from PubChem
def get_CID(drug_name):
    try:
        cids = pcp.get_cids(drug_name)
        assert len(cids) > 0
        return cids
    except Exception:
        print(f'No structure can be found for {drug_name}')
        return None


In [4]:
# This step takes around 13min for 1607 drugs.
df0["PUBCHEM_CIDs"] = df0["Name"].apply(get_CID)

No structure can be found for antimony thioglycollate
No structure can be found for azaguanidine
No structure can be found for hmba
No structure can be found for nsc 93236
No structure can be found for chloroethyl mesylate
No structure can be found for imidazopyrazole
No structure can be found for elderfield's pyrimidine mustard
No structure can be found for chlorodihydroxyandrostenone
No structure can be found for thioguanine alpha-deoxyriboside
No structure can be found for thioguanine beta-deoxyriboside
No structure can be found for 7-methyltestosterone
No structure can be found for pararosaniline
No structure can be found for bephenium oh-naphthoate
No structure can be found for tryptophane mustard
No structure can be found for diaminomethylphenazinium cl
No structure can be found for nsc 19622
No structure can be found for fenbuprol
No structure can be found for nsc 60795
No structure can be found for androstenetrione
No structure can be found for tylocrebin-(+)
No structure can b

In [5]:
df0.head

<bound method NDFrame.head of                Name                                              Smile  Class  \
0      propisergide  C[C@H](CO)NC(=O)[C@H]1CN(C)[C@@H]2Cc3cn(C)c4cc...      0   
1        sancycline  CN(C)[C@H]1[C@@H]2C[C@@H]3Cc4cccc(O)c4C(=O)C3=...      0   
2       metoserpate  CO[C@H]1C[C@@H]2CN3CCc4c([nH]c5cc(OC)ccc45)[C@...      0   
3       fludiazepam               CN1C(=O)CN=C(c2ccccc2F)c3cc(Cl)ccc13      0   
4        epiestriol  C[C@]12CC[C@H]3[C@H](CCc4cc(O)ccc34)[C@H]1C[C@...      0   
...             ...                                                ...    ...   
1602     Demecarium  CN(CCCCCCCCCCN(C)C(=O)OC1=CC=CC(=C1)[N+](C)(C)...      0   
1603  Hexafluronium  C[N+](C)(CCCCCC[N+](C)(C)C1C2=CC=CC=C2C3=CC=CC...      0   
1604      Ivacaftor  CC(C)(C)C1=CC(=C(C=C1NC(=O)C2=CNC3=CC=CC=C3C2=...      0   
1605    gadoteridol  CC(CN1CCN(CCN(CCN(CC1)CC(=O)[O-])CC(=O)[O-])CC...      0   
1606  Fosaprepitant  CC(C1=CC(=CC(=C1)C(F)(F)F)C(F)(F)F)OC2C(N(CCO2...      0  

In [6]:
df1 = df0.copy()
# The most relevant SMILES is the SMILES corresponding to the first CID in the search result.
df1["Most_Relevant_SMILES"] = None
# Match_MostRelevant indicates if the original SMILES matches with the most relevant SMILES.
df1["Match_MostRelevant"] = False
# Matching_SMILES is the PubChem SMILES corresponding to one of the related CIDs that matches with the original SMILES.
df1["Matching_SMILES"] = None
# Matching CID is the matched CID 
df1["Matching_CID"] = None
# The matching International Chemical Identifier (InChI) Key 
df1["Matching_InChIKey"] = None


In [7]:
# Given two isomeric smiles strings, construct the molecules using Rdkit
# and then generate the isomeric smiles again to see if the uniformly generated smiles match
def compare_smiles(smiles1, smiles2):
    if smiles1 == None or smiles2 == None:
        return False
    mol1 = Chem.MolFromSmiles(smiles1)
    mol2 = Chem.MolFromSmiles(smiles2)
    
    newsmile1 = Chem.MolToSmiles(mol1,isomericSmiles=True)
    newsmile2 = Chem.MolToSmiles(mol2,isomericSmiles=True)

    if newsmile1 == newsmile2:   
        return True
    else:
        return False

In [8]:
# It takes around 13m for 1607 drugs.
for i in range(len(df1)):
    if df1["PUBCHEM_CIDs"][i] is not None:
        # the most relevant CID is the first in the obtained CIDs
        most_CID = df1["PUBCHEM_CIDs"][i][0]
        # get the compound object identified by this CID
        c = pcp.get_compounds(most_CID)[0]
        # get the most relevant SMILES
        # if there's no isomeric SMILES for this compound, the canonical SMILES will be automatically returned.
        df1.loc[i,"Most_Relevant_SMILES"] = c.isomeric_smiles

        # First compare the most relevant SMILES with the original SMILES.
        if compare_smiles(df1["Most_Relevant_SMILES"][i],df1["Smile"][i]):
            df1.loc[i,"Match_MostRelevant"] = True
            df1.loc[i,"Matching_SMILES"] = df1["Most_Relevant_SMILES"][i]
            df1.loc[i,"Matching_CID"] = most_CID
            df1.loc[i,"Matching_InChIKey"] = c.inchikey

        # if the origininal SMILES does not match with the most relevant SMILES and there are other records available,
        # check all the available records to find matching SMILES.
        elif len(df0["PUBCHEM_CIDs"][i])>1:
            for j in range(1,len(df1["PUBCHEM_CIDs"][i])):
                cid = df1["PUBCHEM_CIDs"][i][j]
                c = pcp.get_compounds(cid)[0]
                smile = c.isomeric_smiles
                if compare_smiles(smile,df1["Smile"][i]):
                    df1.loc[i,"Matching_SMILES"] = smile
                    df1.loc[i,"Matching_CID"] = cid
                    df1.loc[i,"Matching_InChIKey"] = c.inchikey
                    break
 



In [9]:
df1.head

<bound method NDFrame.head of                Name                                              Smile  Class  \
0      propisergide  C[C@H](CO)NC(=O)[C@H]1CN(C)[C@@H]2Cc3cn(C)c4cc...      0   
1        sancycline  CN(C)[C@H]1[C@@H]2C[C@@H]3Cc4cccc(O)c4C(=O)C3=...      0   
2       metoserpate  CO[C@H]1C[C@@H]2CN3CCc4c([nH]c5cc(OC)ccc45)[C@...      0   
3       fludiazepam               CN1C(=O)CN=C(c2ccccc2F)c3cc(Cl)ccc13      0   
4        epiestriol  C[C@]12CC[C@H]3[C@H](CCc4cc(O)ccc34)[C@H]1C[C@...      0   
...             ...                                                ...    ...   
1602     Demecarium  CN(CCCCCCCCCCN(C)C(=O)OC1=CC=CC(=C1)[N+](C)(C)...      0   
1603  Hexafluronium  C[N+](C)(CCCCCC[N+](C)(C)C1C2=CC=CC=C2C3=CC=CC...      0   
1604      Ivacaftor  CC(C)(C)C1=CC(=C(C=C1NC(=O)C2=CNC3=CC=CC=C3C2=...      0   
1605    gadoteridol  CC(CN1CCN(CCN(CCN(CC1)CC(=O)[O-])CC(=O)[O-])CC...      0   
1606  Fosaprepitant  CC(C1=CC(=CC(=C1)C(F)(F)F)C(F)(F)F)OC2C(N(CCO2...      0  

In [10]:
df1.to_csv('data/processed_class0.csv',index=False)

In [11]:
# NoMatch=df0[df0["Matching_SMILES"].notnull()]
# NoMatch.to_csv('data/nomatch_class0.csv')

In [12]:
# # Displaying some of the mismatching molecules
# for i in range(10):
#     if df0["Same_Mol"][i]== False:
#         mol1 = Chem.MolFromSmiles(df0["Smile"][i])
#         mol2 = Chem.MolFromSmiles(df0["PUBCHEM_SMILES"][i])
#         fig1 = Chem.Draw.MolToMPL(mol1)
#         fig1.show()
#         fig2 = Chem.Draw.MolToMPL(mol2)
#         fig2.show()