In [1]:
# Import all necessary packages
import os
from os import sep
import pubchempy as pcp
import pandas as pd
from collections import defaultdict
import numpy as np
from numpy import nan
from IPython.display import Image

In [2]:
# Read in list of chemicals data file and change the name of column.
dataset = pd.read_csv ("../Data/list_of_chemicals.csv")
dataset.rename(columns = {'Chemical name' : 'Chemical_Name'}, inplace = True)

In [3]:
# View datafile.
dataset

Unnamed: 0,ZOOL 409 student sign up (only one student per chemical),Contact (https://www.boisestate.edu/research-gutt-c/) in GUTT group who provided list,Study organism(s),Chemical_Name,CAS,IUPAC identifier,SMILES notation (2.1.4 PubChem),Links to structures (if available),Chemical formula,M/Z,Molecular weight,literature reference
0,,Marjorie Matocq UNR,Prunus fasciculata,Prunasin,,\n\n,,,,,,
1,,Marjorie Matocq UNR,Prunus fasciculata,amygdalin,,https://pubchem.ncbi.nlm.nih.gov/compound/656516,,,,,,
2,,Marjorie Matocq UNR,Frangula californica,Emodin,,https://pubchem.ncbi.nlm.nih.gov/compound/3220,,,,,,
3,,Casey Philbin UNR,Eriogonum spp.,Hordenine,,https://pubchem.ncbi.nlm.nih.gov/compound/68313,,,,,,
4,,Casey Philbin UNR,Eriogonum spp.,N-methyl-4-methoxyphenethylamine,,https://pubchem.ncbi.nlm.nih.gov/compound/104735,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
204,,Deb Conner (BSU),Sagebrush (Artemisia spp),Quercetin,117-39-5,"2-(3,4-dihydroxyphenyl)-3,5,7-trihydroxychrome...",,,,,,
205,,Deb Conner (BSU),Sagebrush (Artemisia spp),"Quercetin 7,30- dimethyl ether (Rhamnazine)",,,,,,,,
206,,Deb Conner (BSU),Sagebrush (Artemisia spp),Scoparone,0120-08-01,"6,7-dimethoxychromen-2-one",,,,,,
207,,Deb Conner (BSU),Sagebrush (Artemisia spp),Skimmin,93-39-0,"7-[(2S,3R,4S,5S,6R)-3,4,5-trihydroxy-6-(hydrox...",,,,,,


In [4]:
# Extract column ad new dataset for analysis.
ChemList_Column = dataset["Chemical_Name"]

In [5]:
# View new dataset
ChemList_Column

0                                         Prunasin
1                                        amygdalin
2                                           Emodin
3                                        Hordenine
4                 N-methyl-4-methoxyphenethylamine
                          ...                     
204                                      Quercetin
205    Quercetin 7,30- dimethyl ether (Rhamnazine)
206                                      Scoparone
207                                        Skimmin
208                                  Umbelliferone
Name: Chemical_Name, Length: 209, dtype: object

In [6]:
# Use PubChempy to obtain chemical identifiers and insert into cidlist created for exact number of chemicals.
cidlist = [] # list of cids
unfoundcid = [] # list of unfound chemicals
foundcid = []  # list of found chemicals
for i,d in enumerate(ChemList_Column):
    values = pcp.get_cids(d)
    if len(values) > 0:
        cidlist.append(values[0])
        foundcid.append(d)
    else:
        unfoundcid.append(d)

In [7]:
# cidlist and foundcid size
(len(cidlist),len(foundcid))

(156, 156)

In [8]:
# List of unfound chemical names
unfoundcid;

In [9]:
foundcid;

In [10]:
# Use identifiers to obtain properties of the chemicals (MolecularFormula, MW, and SMILES) in place in correct list
# Runs in 4 minutes
count = len(cidlist)
Formula, Weight, Smiles = [None] * count,[None] * count,[None] * count

for i,d in enumerate(cidlist):
    if d != None:
        Formula[i] = pcp.get_properties(['MolecularFormula'],d)[0]['MolecularFormula']
        Weight[i] = float(pcp.get_properties(['MolecularWeight'], d)[0]['MolecularWeight'])
        Smiles[i] = pcp.get_properties(['CanonicalSMILES'],d)[0]['CanonicalSMILES']

In [11]:
print(f'Found Smiles: {sum(x is not None for x in Smiles)} of total:{len(Smiles)}')

Found Smiles: 156 of total:156


## SwissADME
---

These must be installed. 

FireFox Browser MUST be installed on your computer. 

Install selenium package (only once)
pip install selenium

install geckodriver (only once)
pip install geckodriver-autoinstaller


In [12]:
import geckodriver_autoinstaller
geckodriver_autoinstaller.install();

from selenium import webdriver
from selenium.webdriver import Firefox
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import csv
import requests


Still issues in SwissADME code 
- Every time code is run I get a different number of output
    -Possibly caused by waits?
- When concat is run empty cells are ingnored has to show none in empty cells


In [15]:
# SwissADME code
# THIS SECTION TAKES ~20 MIN

# Create list for SwissADME links
SwissAdme_links = [None] * len(Smiles)

# Name each elements for code
name = "smiles"
button = "submitButton"
path = '//*[@id="sib_body"]/div[7]/a[1]'
website = "http://www.swissadme.ch"

# Loop through all the Smiles and get the list of links
for i,d in enumerate(Smiles):
    # This will run headless - runs in the background
    options = Options()
    options.headless = True
    driver = Firefox(options=options)
    wait = WebDriverWait(driver,30)
    # Open SwissADME database
    driver.get(website)
    
    # Locate search box on SwissADME and import the smiles from previously created list
    element1 = wait.until(EC.visibility_of_element_located((By.NAME, name)))
    smiles_search_ADME = driver.find_element(By.NAME,name).send_keys(d) 
    
    # Find and click Submit
    element2 = wait.until(EC.visibility_of_element_located((By.ID, button)))
    run = driver.find_element(By.ID,button).submit()
    
    # Locate href link (csv link)
    element3 = wait.until(EC.visibility_of_element_located((By.ID,"mol-cell-1")))
    csvlinks_ADME = driver.find_element(By.XPATH, path).get_property('href')
    
    SwissAdme_links[i] = csvlinks_ADME

    # Close browser
    driver.close()       

In [17]:
print(f'Links: {sum(x is not None for x in SwissAdme_links)} of total:{len(SwissAdme_links)}')

Links: 156 of total:156


In [20]:
SwissAdme_data = [None] * len(SwissAdme_links)
for i,d in enumerate(SwissAdme_links):
     SwissAdme_data[i] =  pd.read_csv(d, sep=",")

In [22]:
SwissAdme_data;

In [23]:
print(f' Returned data from {sum(x is not None for x in SwissAdme_data)} of total {len(SwissAdme_data)} Requested')

 Returned data from 156 of total 156 Requested


In [25]:
unfoundchemicals = []
for i,d in enumerate(SwissAdme_data):
    if d.empty:
        print(foundcid[i], "-", Smiles[i])

In [27]:
# Concatonate all the csv files into one
result = pd.concat(SwissAdme_data)

In [28]:
# View result
result

Unnamed: 0,Molecule,Canonical SMILES,Formula,MW,#Heavy atoms,#Aromatic heavy atoms,Fraction Csp3,#Rotatable bonds,#H-bond acceptors,#H-bond donors,...,Lipinski #violations,Ghose #violations,Veber #violations,Egan #violations,Muegge #violations,Bioavailability Score,PAINS #alerts,Brenk #alerts,Leadlikeness #violations,Synthetic Accessibility
0,Molecule 1,OCC1OC(OC(c2ccccc2)C#N)C(C(C1O)O)O,C14H17NO6,295.29,21,6,0.50,4,7,4,...,0,1,0,0,0,0.55,0,0,0,4.21
0,Molecule 1,OCC1OC(OCC2OC(OC(c3ccccc3)C#N)C(C(C2O)O)O)C(C(...,C20H27NO11,457.43,32,6,0.65,7,12,7,...,2,1,1,1,4,0.17,0,0,1,5.41
0,Molecule 1,Cc1cc(O)c2c(c1)C(=O)c1c(C2=O)c(O)cc(c1)O,C15H10O5,270.24,20,12,0.07,0,5,3,...,0,0,0,0,0,0.55,1,0,0,2.57
0,Molecule 1,CN(CCc1ccc(cc1)O)C,C10H15NO,165.23,12,6,0.40,3,2,1,...,0,0,0,0,1,0.55,0,0,1,1.00
0,Molecule 1,CNCCc1ccc(cc1)OC,C10H15NO,165.23,12,6,0.40,4,2,1,...,0,0,0,0,1,0.55,0,0,1,1.01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,Molecule 1,Oc1ccc(cc1O)c1oc2cc(O)c(c(c2c(=O)c1O)O)O,C15H10O8,318.24,23,16,0.00,1,8,6,...,1,0,1,1,2,0.55,1,1,0,3.27
0,Molecule 1,Oc1cc(O)c2c(c1)oc(c(c2=O)O)c1ccc(c(c1)O)O,C15H10O7,302.24,22,16,0.00,1,7,5,...,0,0,0,0,0,0.55,1,1,0,3.23
0,Molecule 1,COc1cc2oc(=O)ccc2cc1OC,C11H10O4,206.19,15,10,0.18,2,4,0,...,0,0,0,0,0,0.55,0,1,1,2.77
0,Molecule 1,OCC1OC(Oc2ccc3c(c2)oc(=O)cc3)C(C(C1O)O)O,C15H16O8,324.28,23,10,0.40,3,8,4,...,0,1,0,0,0,0.55,0,1,0,4.68


In [29]:
# Eventually export as csv. 
result.to_csv('../Output/ADME_data_JB.csv')