# Preprocessing SMILES data

In [1]:
import os
import pandas as pd

In [2]:
file = "all_drugs_smi"
f = open(file)

In [3]:
#extract SMILES string and drug names
smi = []
for line in f:
    drg = []
    smile_string = line.split()[0]
    drug_name = line.split()[-1]
    drg.append(smile_string)
    drg.append(drug_name)
    smi.append(drg)
f.close()

In [4]:
df_smi = pd.DataFrame(smi, columns = ["SMILES_STRING", "DRUG_NAME"])

In [5]:
print(df_smi.shape)
df_smi.head()

(1385, 2)


Unnamed: 0,SMILES_STRING,DRUG_NAME
0,Nc1nc(NC2CC2)c2ncn([C@H]3C=C[C@@H](CO)C3)c2n1,Abacavir
1,CC(=O)O[C@H]1CC[C@@]2(C)C(=CCC3C4CC=C(c5cccnc5...,Abiraterone
2,CC(=O)NCCCS(=O)(=O)O,Acamprosate
3,CCCC(=O)Nc1ccc(OCC(O)CNC(C)C)c(C(C)=O)c1,Acebutolol
4,CC(=O)CC(c1ccc([N+](=O)[O-])cc1)c1c(O)c2ccccc2...,Acenocoumarol


# Preprocessing drug labels data

In [6]:
import json
import pandas as pd
from pandas.io.json import json_normalize

In [51]:
frames = []
directory = "./JSON"
cols_openfda = ["openfda.spl_set_id","openfda.brand_name","openfda.generic_name"
               ,"openfda.package_ndc","openfda.product_type","openfda.route","openfda.rxcui",
               "openfda.substance_name","openfda.unii","description"]
for f in os.listdir(directory):
    if f.endswith(".json"):
        print(f)
        data = json.load(open(directory + "/" + f))
        #pull openFDA drug identifiers
        df = json_normalize(data["results"])
        df_openfda = df[cols_openfda]
        print(df_openfda.shape)
        frames.append(df_openfda)

drug-label-0003-of-0007.json
(20000, 10)
drug-label-0004-of-0007.json
(20000, 10)
drug-label-0005-of-0007.json
(20000, 10)
drug-label-0002-of-0007.json
(20000, 10)
drug-label-0007-of-0007.json
(12340, 10)
drug-label-0006-of-0007.json
(20000, 10)
drug-label-0001-of-0007.json
(20000, 10)


In [52]:
#merge multiple JSON files
df_openfda_master = pd.concat(frames)

In [53]:
#one long string
df_openfda_master.iloc[6]

openfda.spl_set_id                   [4d11cc46-3942-1561-e054-00144ff8d46c]
openfda.brand_name        [SERIOUS SKINCARE SERIOUS C3 PLASMA BIO TECHNO...
openfda.generic_name      [AVOBENZONE, OCTOCRYLENE, OCTINOXATE, OCTISALA...
openfda.package_ndc                                          [53755-101-04]
openfda.product_type                                       [HUMAN OTC DRUG]
openfda.route                                                     [TOPICAL]
openfda.rxcui                                                           NaN
openfda.substance_name    [OCTINOXATE, OCTISALATE, OXYBENZONE, OCTOCRYLE...
openfda.unii              [G63QQF2NOX, 95OOS7VE0Y, 4X49Y0596W, 5A68WGF6W...
description                                                             NaN
Name: 6, dtype: object

In [54]:
#multiple strings
df_openfda_master.iloc[6].loc["openfda.generic_name"]

['AVOBENZONE, OCTOCRYLENE, OCTINOXATE, OCTISALATE, OXYBENZONE']

In [55]:
df_openfda_master.iloc[6].loc["openfda.substance_name"]

['OCTINOXATE', 'OCTISALATE', 'OXYBENZONE', 'OCTOCRYLENE', 'AVOBENZONE']

In [56]:
#extract single value from list
df_openfda_master = df_openfda_master.applymap(lambda x:x[0] if type(x) is list and len(x)==1 else x)

In [57]:
#sample data
df_openfda_master.iloc[6]

openfda.spl_set_id                     4d11cc46-3942-1561-e054-00144ff8d46c
openfda.brand_name        SERIOUS SKINCARE SERIOUS C3 PLASMA BIO TECHNOL...
openfda.generic_name      AVOBENZONE, OCTOCRYLENE, OCTINOXATE, OCTISALAT...
openfda.package_ndc                                            53755-101-04
openfda.product_type                                         HUMAN OTC DRUG
openfda.route                                                       TOPICAL
openfda.rxcui                                                           NaN
openfda.substance_name    [OCTINOXATE, OCTISALATE, OXYBENZONE, OCTOCRYLE...
openfda.unii              [G63QQF2NOX, 95OOS7VE0Y, 4X49Y0596W, 5A68WGF6W...
description                                                             NaN
Name: 6, dtype: object

In [58]:
df_openfda_master.shape

(132340, 10)

In [59]:
len(df_openfda_master["openfda.spl_set_id"].unique())

95134

In [60]:
t2 = df_openfda_master.loc[df_openfda_master["openfda.spl_set_id"].isna()]
t2.head()

Unnamed: 0,openfda.spl_set_id,openfda.brand_name,openfda.generic_name,openfda.package_ndc,openfda.product_type,openfda.route,openfda.rxcui,openfda.substance_name,openfda.unii,description
30,,,,,,,,,,
31,,,,,,,,,,
32,,,,,,,,,,
33,,,,,,,,,,
34,,,,,,,,,,


### Remove missing values

In [62]:
df_openfda_master = df_openfda_master.loc[df_openfda_master["openfda.spl_set_id"].notna()]
df_openfda_master.shape

(95133, 10)

### Data quality (missing values)

substance name, unii are more complete than RxCUI

NLM associates U.S. Food and Drug Administration (FDA) generated unique ingredient identifiers (UNIIs) to RxNorm (SAB=RXNORM) atoms of term type IN. The association is made by an exact string match to the RxNorm ingredient string (case insensitive) from the official FDA substance list. These UNII codes are found in RXNSAT.RRF as values of the attribute ATN='UNII_CODE'. The UNII is a non-proprietary, free, unique, unambiguous, non semantic, alphanumeric identifier based on a substance’s molecular structure and/or descriptive information. For more information on the FDA UNII codes, please refer to this FDA web page.


In [325]:
df_openfda_master.count()

openfda.spl_set_id        95133
openfda.brand_name        95133
openfda.generic_name      95133
openfda.package_ndc       95132
openfda.product_type      95133
openfda.route             93524
openfda.rxcui             64613
openfda.substance_name    93207
openfda.unii              92925
description               34588
dtype: int64

### Select human presciption drug

In [63]:
df_openfda_master.groupby("openfda.product_type")["openfda.spl_set_id"].count()

openfda.product_type
HUMAN OTC DRUG             60038
HUMAN PRESCRIPTION DRUG    35095
Name: openfda.spl_set_id, dtype: int64

In [64]:
df_openfda_rx = df_openfda_master.loc[df_openfda_master["openfda.product_type"] == "HUMAN PRESCRIPTION DRUG"]
df_openfda_rx.groupby("openfda.product_type")["openfda.spl_set_id"].count()

openfda.product_type
HUMAN PRESCRIPTION DRUG    35095
Name: openfda.spl_set_id, dtype: int64

### Use generic name or substance name?
case study: Morphine

In [65]:
df_openfda_rx.loc[df_openfda_rx["openfda.generic_name"]=="MORPHINE"]

Unnamed: 0,openfda.spl_set_id,openfda.brand_name,openfda.generic_name,openfda.package_ndc,openfda.product_type,openfda.route,openfda.rxcui,openfda.substance_name,openfda.unii,description
6289,3ad95865-a77c-41da-b572-e5c0bb404c79,Opium Tincture Deodorized,MORPHINE,"[0187-4203-16, 0187-4203-04]",HUMAN PRESCRIPTION DRUG,ORAL,830196.0,MORPHINE,76I7G6D29C,"DESCRIPTION Opium Tincture, USP (Deodorized), ..."
8060,e2a5697a-cc41-4cf1-b3a8-59b0268740d7,Opium Tincture Deodorized,MORPHINE,"[42799-217-02, 42799-217-01]",HUMAN PRESCRIPTION DRUG,ORAL,830196.0,MORPHINE,76I7G6D29C,"DESCRIPTION Opium Tincture, USP (Deodorized), ..."
1129,1d9a7065-45c2-0e3f-e054-00144ff8d46c,MORPHINUM,MORPHINE,10191-1004-2,HUMAN PRESCRIPTION DRUG,SUBLINGUAL,,ABT-925 ANHYDROUS FREE BASE,E6CKI5C54O,
1173,1d858c8b-9797-209b-e054-00144ff8d46c,MORPHINUM,MORPHINE,10191-1002-2,HUMAN PRESCRIPTION DRUG,SUBLINGUAL,,ABT-925 ANHYDROUS FREE BASE,E6CKI5C54O,
1904,1dac198e-49bb-725b-e054-00144ff8d46c,MORPHINUM,MORPHINE,10191-1005-2,HUMAN PRESCRIPTION DRUG,SUBLINGUAL,,ABT-925 ANHYDROUS FREE BASE,E6CKI5C54O,
8636,1d87ae72-46e2-343b-e054-00144ff88e88,MORPHINUM,MORPHINE,10191-1003-2,HUMAN PRESCRIPTION DRUG,SUBLINGUAL,,ABT-925 ANHYDROUS FREE BASE,E6CKI5C54O,
10252,1c6d882b-4f2c-2cfb-e054-00144ff88e88,MORPHINUM,MORPHINE,10191-1001-2,HUMAN PRESCRIPTION DRUG,SUBLINGUAL,,ABT-925 ANHYDROUS FREE BASE,E6CKI5C54O,


In [310]:
df_openfda_rx[df_openfda_rx["openfda.rxcui"]=="830196"]

Unnamed: 0,openfda.spl_set_id,openfda.brand_name,openfda.generic_name,openfda.package_ndc,openfda.product_type,openfda.route,openfda.rxcui,openfda.substance_name,openfda.unii,description
6289,3ad95865-a77c-41da-b572-e5c0bb404c79,Opium Tincture Deodorized,MORPHINE,"[0187-4203-16, 0187-4203-04]",HUMAN PRESCRIPTION DRUG,ORAL,830196,MORPHINE,76I7G6D29C,"DESCRIPTION Opium Tincture, USP (Deodorized), ..."
5086,674a969a-707d-41a9-b7c3-b90cbf2909e3,Opium Tincture Deodorized,MORPHINE TINCTURE,62559-153-04,HUMAN PRESCRIPTION DRUG,ORAL,830196,MORPHINE,76I7G6D29C,"DESCRIPTION Opium Tincture, USP (Deodorized), ..."
8060,e2a5697a-cc41-4cf1-b3a8-59b0268740d7,Opium Tincture Deodorized,MORPHINE,"[42799-217-02, 42799-217-01]",HUMAN PRESCRIPTION DRUG,ORAL,830196,MORPHINE,76I7G6D29C,"DESCRIPTION Opium Tincture, USP (Deodorized), ..."


Using RxNorm API, can't find Morphine in the drug label data. RxCUI = 830196 is associated with "opium" using RxNorm Nav

In [324]:
ind = df_openfda_master["openfda.rxcui"] == "7052"
df_openfda_master[ind]

Unnamed: 0,openfda.spl_set_id,openfda.brand_name,openfda.generic_name,openfda.package_ndc,openfda.product_type,openfda.route,openfda.rxcui,openfda.substance_name,openfda.unii,description


Substance name seems to be more accurate to match "drug_name" in all_drugs.smi file. However, this expects to be a fuzzy match. Can we find mappings between UNII and SMILES? If so we can link SMILES at UNII level for each prescription drug 

### SMILES strings are different pulled from different data sources: (same chemical formula C17H19NO3, same UNII 76I7G6D29C)

1. From the all_drugs.smi file <br/>
CN1CC[C@]23c4c5ccc(O)c4O[C@H]2[C@@H](O)C=C[C@H]3[C@H]1C5
2. Pubchem <br/>
CN1CC[C@]23[C@@H]4[C@H]1CC5=C2C(=C(C=C5)O)O[C@H]3[C@H](C=C4)O <br/>
https://pubchem.ncbi.nlm.nih.gov/compound/morphine#section=Top
3. Drugbank <br/>
[H][C@@]12OC3=C(O)C=CC4=C3[C@@]11CCN(C)[C@]([H])(C4)[C@]1([H])C=C[C@@H]2O <br/>
https://www.drugbank.ca/drugs/DB00295
4. ZINC <br/>
C[NH+]1CC[C@]23c4c5ccc(c4O[C@H]2[C@H](C=C[C@H]3[C@H]1C5)O)O <br/>
http://zinc.docking.org/substance/3812983

### Convert SMILES strings to canonical form
the canonical SMILES are the same for all_drugs.smi, Pubchem and Drugbank

In [66]:
from rdkit import Chem
 
smis = ["CN1CC[C@]23c4c5ccc(O)c4O[C@H]2[C@@H](O)C=C[C@H]3[C@H]1C5",
        "CN1CC[C@]23[C@@H]4[C@H]1CC5=C2C(=C(C=C5)O)O[C@H]3[C@H](C=C4)O",
        "[H][C@@]12OC3=C(O)C=CC4=C3[C@@]11CCN(C)[C@]([H])(C4)[C@]1([H])C=C[C@@H]2O",
        "C[NH+]1CC[C@@]23c4c5ccc(c4O[C@H]2[C@@H](C=C[C@H]3[C@@H]1C5)O)O"
       ]

cans = [Chem.MolToSmiles(Chem.MolFromSmiles(smi),canonical = True) for smi in smis]
cans

['CN1CC[C@]23c4c5ccc(O)c4O[C@H]2[C@@H](O)C=C[C@H]3[C@H]1C5',
 'CN1CC[C@]23c4c5ccc(O)c4O[C@H]2[C@@H](O)C=C[C@H]3[C@H]1C5',
 'CN1CC[C@]23c4c5ccc(O)c4O[C@H]2[C@@H](O)C=C[C@H]3[C@H]1C5',
 'C[NH+]1CC[C@@]23c4c5ccc(O)c4O[C@H]2[C@H](O)C=C[C@H]3[C@@H]1C5']

### Try exact drug name matching

In [67]:
#get all drug names from .smi file
smi_drugs = [x.lower() for x in df_smi.DRUG_NAME]

In [68]:
len(smi_drugs)

1385

In [91]:
spl_drugs = []

for s in df_openfda_rx["openfda.substance_name"]:
    if isinstance(s,list):
        spl_drugs = spl_drugs + s
    elif isinstance(s,str):
        spl_drugs.append(s)

In [97]:
#get unique substance names
spl_drugs = list(set(spl_drugs))

In [98]:
spl_drugs = [x.lower() for x in spl_drugs]

In [99]:
len(spl_drugs)

2829

In [100]:
spl_unii = []

for s in df_openfda_rx["openfda.unii"]:
    if isinstance(s,list):
        spl_unii = spl_unii + s
    elif isinstance(s,str):
        spl_unii.append(s)

In [101]:
#get unique substance names
spl_unii = list(set(spl_unii))

In [73]:
#intersection of drug names
drugs_intersection = list(set(smi_drugs) & set(spl_drugs))

In [74]:
print(len(drugs_intersection))
#print a few samples
print(drugs_intersection[:10])

510
['liothyronine', 'cisplatin', 'clobazam', 'stavudine', 'chlorzoxazone', 'amcinonide', 'cefotetan', 'spironolactone', 'glycine', 'loratadine']


In [75]:
#SPL drugs that are not in smiles file using exact string matching
drugs_not_in_smi = list(set(spl_drugs) - set(smi_drugs))
print(len(drugs_not_in_smi))
#print a few sampels
print(drugs_not_in_smi[:10])

2319
['sus scrofa stomach', 'azathioprine sodium', 'salmonella enterica subsp. enterica serovar typhi', 'methylparaben', 'selexipag', 'lidocaine hydrochloride', 'borage', 'yeast mannan', 'borrelia burgdorferi', 'moxetumomab pasudotox']


### Create drug - UNII dictionary

In [86]:
#pull unique drug-unii pairs from SPL 
drug_unii_dict = {}
for index, row in df_openfda_rx.iterrows():
    #if both fields are lists and have the same length
    if isinstance(row["openfda.substance_name"],list) and isinstance(row["openfda.unii"],list):
        if len(row["openfda.substance_name"]) == len(row["openfda.unii"]):
            for i in range(len(row["openfda.substance_name"])):
                key = row["openfda.substance_name"][i]
                val = row["openfda.unii"][i]
                drug_unii_dict[key] = val
    #if both fileds are single strings
    elif isinstance(row["openfda.substance_name"],str) and isinstance(row["openfda.unii"],str):
        key = row["openfda.substance_name"]
        val = row["openfda.unii"]
        drug_unii_dict[key] = val

In [89]:
len(drug_unii_dict)

2688

In [102]:
len(spl_drugs)

2829

In [103]:
len(spl_unii)

3167

### Step 1: Use UNII-List lookup file

In [104]:
#download UNII data from https://fdasis.nlm.nih.gov/srs/jsp/srs/uniiListDownload.jsp
unii_file = "./UNII_Data/UNII Records 25Oct2018.txt"
unii_data = pd.read_table(unii_file, sep="\t",low_memory=False)

In [105]:
unii_data.head()

Unnamed: 0,UNII,PT,RN,NCIT,EC,RXCUI,PUBCHEM,ITIS,NCBI,PLANTS,GRIN,MPNS,INN_ID,MF,INCHIKEY,SMILES,INGREDIENT_TYPE,Unnamed: 17
0,17462400.0,2-(5-CYANO-2-(6-(METHOXYCARBONYL)-7-METHYL-3-O...,1613620-10-2,,,,,,,,,,,C27H28F3N6O3,IHTRPSMRGYWUIM-HSZRJFAPSA-O,COC(=O)C1=C(C)N(C2=NNC(=O)N2[C@@H]1C3=CC=C(C=C...,IONIC MOIETY,
1,129526470.0,"5,8-DIMETHOXY(1,2,4)TRIAZOLO(1,5-C)PYRIMIDIN-2...",219715-62-5,,,,11446888.0,,,,,,,C7H9N5O2,DBJPBHJHAPAUQU-UHFFFAOYSA-N,COC1=CN=C(OC)N2N=C(N)N=C12,INGREDIENT SUBSTANCE,
2,258808825.0,MACROPIPER EXCELSUM LEAF,,,,,,,130373.0,,,,,,,,INGREDIENT SUBSTANCE,
3,377415922.0,"N-DESMETHYLVENLAFAXINE, (S)-",392332-59-1,,,,9860056.0,,,,,,,C16H25NO2,MKAFOJAJJMUXLW-OAHLLOKOSA-N,CNC[C@H](C1=CC=C(OC)C=C1)C2(O)CCCCC2,INGREDIENT SUBSTANCE,
4,457970679.0,VERRUCARIN,54018-05-2,,,,,,,,,,,,,,INGREDIENT SUBSTANCE,


In [158]:
import math
unii_smiles_dict = {}
for index, row in unii_data.iterrows():
    key = row.UNII
    val = row.SMILES
    if type(val) is str:
        unii_smiles_dict[key] = val

In [161]:
len(unii_smiles_dict)

69953

In [254]:
#matched 68% drugs to smiles
df_lst = []
for key in drug_unii_dict.keys():
    drug = key
    unii = drug_unii_dict[key]
    try:
        smi = unii_smiles_dict[unii]
        try:
            smile = Chem.MolToSmiles(Chem.MolFromSmiles(smi),canonical = True)
        except:
            smile = smi
    except KeyError:
        smile = float("NaN")
    df_lst.append([drug,unii,smile])
df_output = pd.DataFrame(df_lst, columns = ["drug", "unii", "smiles_unii"])

In [255]:
df_output.shape

(2688, 3)

### Step 2: Use PubChemPy API

In [230]:
import pubchempy as pcp

In [259]:
lst = []
for index, row in df_output.iterrows():
    try:
        com = pcp.get_compounds(row["drug"],"name")
        if len(com) > 0:
            smi = com[0].isomeric_smiles
            lst.append(smi)
        else:
            lst.append(float("NaN"))
    except:
        lst.append("HTTPError")

In [264]:
df_output["smiles_api"] = lst

### Step3: Add smi file as validation set

In [272]:
#create drug name: SMILES string dictionary
dict_smi = {}
for index,row in df_smi.iterrows():
    key = row["DRUG_NAME"].upper()
    val = row["SMILES_STRING"]
    dict_smi[key] = val

In [274]:
lst = []
for index, row in df_output.iterrows():
    try:
        smile = dict_smi[row["drug"]]
        lst.append(smile)
    except:
        lst.append(float("NaN"))        

In [275]:
df_output["smiles_file"] = lst

In [279]:
df_output.head()

Unnamed: 0,drug,unii,smiles_unii,smiles_api,smiles_file
0,NABUMETONE,LW0TIW155Z,COc1ccc2cc(CCC(C)=O)ccc2c1,CC(=O)CCC1=CC2=C(C=C1)C=C(C=C2)OC,COc1ccc2cc(CCC(C)=O)ccc2c1
1,BUSPIRONE HYDROCHLORIDE,207LT9J9OC,Cl.O=C1CC2(CCCC2)CC(=O)N1CCCCN1CCN(c2ncccn2)CC1,C1CCC2(C1)CC(=O)N(C(=O)C2)CCCCN3CCN(CC3)C4=NC=...,
2,OXYGEN,S88TT14065,O=O,O=O,
3,PAROXETINE HYDROCHLORIDE ANHYDROUS,3I3T11UD2S,Cl.Fc1ccc([C@@H]2CCNC[C@H]2COc2ccc3c(c2)OCO3)cc1,,
4,VENLAFAXINE HYDROCHLORIDE,7D7RX5A8MO,COc1ccc(C(CN(C)C)C2(O)CCCCC2)cc1.Cl,CN(C)CC(C1=CC=C(C=C1)OC)C2(CCCCC2)O.Cl,


### Step 4: Combine SMILES from 3 data sources

In [293]:
lst = []
for index, row in df_output.iterrows():
    if type(row["smiles_unii"]) is str:
        try:
            can = Chem.MolToSmiles(Chem.MolFromSmiles(row["smiles_unii"]),canonical = True)
        except:
            can = row["smiles_unii"]
    elif type(row["smiles_api"]) is str:
        try:
            can = Chem.MolToSmiles(Chem.MolFromSmiles(row["smiles_api"]),canonical = True)
        except:
            can = row["smiles_api"]
    elif type(row["smiles_file"]) is str:
        try:
            can = Chem.MolToSmiles(Chem.MolFromSmiles(row["smiles_file"]),canonical = True)
        except:
            can = row["smiles_file"]
    else:
        can = float("NaN")
    lst.append(can)

In [296]:
df_output["smiles_can"] = lst

In [299]:
df_output.count()

drug           2688
unii           2688
smiles_unii    1849
smiles_api     1816
smiles_file     508
smiles_can     2022
dtype: int64

In [301]:
#check drugs with missing smiles_can
ind = df_output["smiles_can"].isna()
df_output[ind]

Unnamed: 0,drug,unii,smiles_unii,smiles_api,smiles_file,smiles_can
192,CHITOSAN LOW MOLECULAR WEIGHT (20-200 MPA.S),SBD1A2I75N,,,,
203,HEPARIN SODIUM,ZZ45AB24CA,,,,
234,CONESTAT ALFA,5QS67N4551,,,,
238,COLESEVELAM HYDROCHLORIDE,P4SG24WI5Q,,,,
261,ACONITUM NAPELLUS,WQZ3G9PF0H,,,,
265,MATRICARIA RECUTITA,5EF0HWI5WU,,,,
266,COMFREY ROOT,M9VVZ08EKQ,,,,
267,BRYONIA ALBA ROOT,V5VD430YW9,,,,
268,PHYTOLACCA AMERICANA ROOT,I76KB35JEV,,,,
269,BLACK COHOSH,6L5ZL09795,,,,


Lots of them are not chemical compounds, e.g. cat's claw? Bluefish?

## Save as HDF

In [315]:
df_output.to_hdf("drugs_smi.h4",key = "rx")

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->['drug', 'unii', 'smiles_unii', 'smiles_api', 'smiles_file', 'smiles_can']]

  return pytables.to_hdf(path_or_buf, key, self, **kwargs)


# Past Code

### PubChemPy API

In [70]:
import pubchempy as pcp

In [116]:
#retreving substances
results = pcp.get_compounds("ABT-925 ANHYDROUS FREE BASE","name")

In [117]:
results

[Compound(9916104)]

In [118]:
for com in results:
    print(com.isomeric_smiles)

CC(C)(C)C1=NC(=CC(=N1)N2CCN(CC2)CCCSC3=NC=CC(=O)N3)C(F)(F)F


In [124]:
#create drug name: SMILES string dictionary
dict_smi = {}
for index,row in df_smi.iterrows():
    key = row["DRUG_NAME"].lower()
    val = row["SMILES_STRING"]
    dict_smi[key] = val

In [138]:
df_lst = []
for d in drugs_intersection:
    #SMILES from file
    smi_f = dict_smi[d]
    smi_f_norm = Chem.MolToSmiles(Chem.MolFromSmiles(smi_f),canonical = True)
    #SMILES from PubChem API
    com = pcp.get_compounds(d,"name")
    if len(com) > 0:
        smi_api = com[0].isomeric_smiles
        smi_api_norm = Chem.MolToSmiles(Chem.MolFromSmiles(smi_api),canonical = True)
    else:
        smi_api = ""
        smi_api_norm = ""
    row_lst = [d, smi_f, smi_f_norm, smi_api, smi_api_norm, smi_f_norm == smi_api_norm]
    df_lst.append(row_lst)

In [141]:
df_api_val = pd.DataFrame(df_lst, columns = ["drug_name", "smi_f", "smi_f_norm", "smi_api", "smi_api_norm", "same_smi_flag"])

In [145]:
df_api_val.groupby("same_smi_flag").size()

same_smi_flag
False     80
True     430
dtype: int64

84% exact match

In [148]:
df_api_val.loc[df_api_val["same_smi_flag"] == False]

Unnamed: 0,drug_name,smi_f,smi_f_norm,smi_api,smi_api_norm,same_smi_flag
7,crotamiton,CC=CC(=O)N(CC)c1ccccc1C,CC=CC(=O)N(CC)c1ccccc1C,CCN(C1=CC=CC=C1C)C(=O)/C=C/C,C/C=C/C(=O)N(CC)c1ccccc1C,False
14,famotidine,NC(N)=Nc1nc(CSCCC(N)=NS(N)(=O)=O)cs1,NC(N)=Nc1nc(CSCCC(N)=NS(N)(=O)=O)cs1,C1=C(N=C(S1)N=C(N)N)CSCC/C(=N/S(=O)(=O)N)/N,NC(N)=Nc1nc(CSCC/C(N)=N/S(N)(=O)=O)cs1,False
23,selenium,S=[Se],S=[Se],[Se],[Se],False
34,sulfasalazine,O=C(O)c1cc(/N=N/c2ccc(S(=O)(=O)Nc3ccccn3)cc2)c...,O=C(O)c1cc(/N=N/c2ccc(S(=O)(=O)Nc3ccccn3)cc2)c...,C1=CC=NC(=C1)NS(=O)(=O)C2=CC=C(C=C2)N/N=C/3\C=...,O=C(O)C1=C/C(=N/Nc2ccc(S(=O)(=O)Nc3ccccn3)cc2)...,False
36,doxycycline,C[C@H]1c2cccc(O)c2C(=O)C2=C(O)[C@]3(O)C(=O)C(C...,C[C@H]1c2cccc(O)c2C(=O)C2=C(O)[C@]3(O)C(=O)C(C...,C[C@@H]1[C@H]2[C@@H]([C@H]3[C@@H](C(=O)C(=C([C...,C[C@H]1c2cccc(O)c2C(O)=C2C(=O)[C@]3(O)C(O)=C(C...,False
40,alprostadil,CCCCC[C@H](O)C=C[C@H]1[C@H](O)CC(=O)[C@@H]1CCC...,CCCCC[C@H](O)C=C[C@H]1[C@H](O)CC(=O)[C@@H]1CCC...,CCCCC[C@@H](/C=C/[C@H]1[C@@H](CC(=O)[C@@H]1CCC...,CCCCC[C@H](O)/C=C/[C@H]1[C@H](O)CC(=O)[C@@H]1C...,False
46,polidocanol,CCCCCCCCCCCCOCCO,CCCCCCCCCCCCOCCO,CCCCCCCCCCCCOCCOCCOCCOCCOCCOCCOCCOCCOCCO,CCCCCCCCCCCCOCCOCCOCCOCCOCCOCCOCCOCCOCCO,False
47,estradiol,CCCCC(=O)O[C@H]1CC[C@H]2[C@@H]3CCc4cc(O)ccc4[C...,CCCCC(=O)O[C@H]1CC[C@H]2[C@@H]3CCc4cc(O)ccc4[C...,C[C@]12CC[C@H]3[C@H]([C@@H]1CC[C@@H]2O)CCC4=C3...,C[C@]12CC[C@@H]3c4ccc(O)cc4CC[C@H]3[C@@H]1CC[C...,False
57,silver,Nc1ccc(S(=O)(=O)[N-]c2ncccn2)cc1,Nc1ccc(S(=O)(=O)[N-]c2ncccn2)cc1,[Ag],[Ag],False
64,olanzapine,Cc1cc2c(s1)Nc1ccccc1N=C2N1CCN(C)CC1,Cc1cc2c(s1)Nc1ccccc1N=C2N1CCN(C)CC1,CC1=CC2=C(NC3=CC=CC=C3N=C2S1)N4CCN(CC4)C,Cc1cc2c(s1)=Nc1ccccc1NC=2N1CCN(C)CC1,False


### Alternative way to get SMILES
most drugs have UNII info. which can be used to find SMILES

In [63]:
print(sum(df_openfda_rx["openfda.unii"].isna()))
print(sum(df_openfda_rx["openfda.unii"].isna())/df_openfda_rx.shape[0])

971
0.027667758940019944


In [155]:
#download UNII data from https://fdasis.nlm.nih.gov/srs/jsp/srs/uniiListDownload.jsp
unii_file = "./UNII_Data/UNII Records 25Oct2018.txt"
unii_data = pd.read_table(unii_file, sep="\t",low_memory=False)

In [162]:
unii_data.head()

Unnamed: 0,UNII,PT,RN,NCIT,EC,RXCUI,PUBCHEM,ITIS,NCBI,PLANTS,GRIN,MPNS,INN_ID,MF,INCHIKEY,SMILES,INGREDIENT_TYPE,Unnamed: 17
0,17462400.0,2-(5-CYANO-2-(6-(METHOXYCARBONYL)-7-METHYL-3-O...,1613620-10-2,,,,,,,,,,,C27H28F3N6O3,IHTRPSMRGYWUIM-HSZRJFAPSA-O,COC(=O)C1=C(C)N(C2=NNC(=O)N2[C@@H]1C3=CC=C(C=C...,IONIC MOIETY,
1,129526470.0,"5,8-DIMETHOXY(1,2,4)TRIAZOLO(1,5-C)PYRIMIDIN-2...",219715-62-5,,,,11446888.0,,,,,,,C7H9N5O2,DBJPBHJHAPAUQU-UHFFFAOYSA-N,COC1=CN=C(OC)N2N=C(N)N=C12,INGREDIENT SUBSTANCE,
2,258808825.0,MACROPIPER EXCELSUM LEAF,,,,,,,130373.0,,,,,,,,INGREDIENT SUBSTANCE,
3,377415922.0,"N-DESMETHYLVENLAFAXINE, (S)-",392332-59-1,,,,9860056.0,,,,,,,C16H25NO2,MKAFOJAJJMUXLW-OAHLLOKOSA-N,CNC[C@H](C1=CC=C(OC)C=C1)C2(O)CCCCC2,INGREDIENT SUBSTANCE,
4,457970679.0,VERRUCARIN,54018-05-2,,,,,,,,,,,,,,INGREDIENT SUBSTANCE,


In [163]:
unii_data.loc[unii_data["UNII"] == "E6CKI5C54O"]

Unnamed: 0,UNII,PT,RN,NCIT,EC,RXCUI,PUBCHEM,ITIS,NCBI,PLANTS,GRIN,MPNS,INN_ID,MF,INCHIKEY,SMILES,INGREDIENT_TYPE,Unnamed: 17
58514,E6CKI5C54O,ABT-925 ANHYDROUS FREE BASE,220519-06-2,,,1661016.0,9916104,,,,,,,C20H27F3N6OS,KXVAICSRMHXLJN-UHFFFAOYSA-N,CC(C)(C)C1=NC(=CC(=N1)C(F)(F)F)N2CCN(CCCSC3=NC...,INGREDIENT SUBSTANCE,
