Importing packages

In [149]:
import numpy as np
import pandas as pd
import os
import pubchempy as pcp
import cirpy as cir
from rdkit import Chem

Importing pesticides (conventional chemical) database file into Pandas dataframe
(Data file obtained from the [EPA: United States Environmental Protection Agency](https://iaspub.epa.gov/apex/pesticides/f?p=CHEMICALSEARCH:46:::NO:::))

In [136]:
# The EPA export is expected to sit in the directory the kernel is running from.
csv_name = "pesticides_conventionalchemical.csv"
csv_path = os.path.join(os.getcwd(), csv_name)

# Anything from a "#" to the end of a line is treated as a comment by the parser.
pest_df = pd.read_csv(csv_path, comment="#")

Dropping unnecessary columns

In [137]:
# Discard the columns that are not used in the rest of the notebook.
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# was deprecated in pandas 1.3 and is rejected by pandas 2.x.
pest_df = pest_df.drop(
    columns=['Image', 'Latest Process', 'PublicParticipation', 'Open forPublic Comments']
)

Dropping compounds that have no CAS registry number or that share a CAS registry number with an earlier entry.

In [142]:
# Drop rows with missing values, keep the first row for each CAS number, and
# only then renumber the index. Resetting the index *after* de-duplication
# guarantees the labels run 0..n-1 with no gaps, which the SMILES lookup cell
# relies on when it indexes rows by integer label.
pest_df = (
    pest_df
    .dropna()
    .drop_duplicates(subset=['CAS Number'])
    .reset_index(drop=True)
)
print("Shape of dataframe: ", pest_df.shape)
pest_df.head(4)

Shape of dataframe:  (1578, 2)


Unnamed: 0,Chemical Name,CAS Number
0,(Z)-8-Dodecen-1-yl acetate,28079-04-1
1,(Z)-9-Tetradecenal,53939-27-8
2,"(Z,Z)-3,13-Octadecadien-1-ol acetate",53120-27-7
3,"(Z,Z)-3,13-Octadecadien-1-ol",66410-24-0


Identifying compounds by their CAS registry number (using the CIRpy package) and adding their SMILES strings to the data

In [144]:
%%time
n = pest_df['CAS Number'].shape[0]
smiles = pd.Series(index=np.arange(n))
for i in range(n):
    smiles[i] = cir.resolve(pest_df['CAS Number'][i], representation='smiles')
pest_df['SMILES'] = smiles

pest_df.head(3)

CPU times: user 22 s, sys: 1.63 s, total: 23.6 s
Wall time: 6min 32s


Remove compounds without a valid SMILES string

In [146]:
# Keep only the rows in which every column (including SMILES) is populated,
# then renumber the remaining rows from zero.
pest_df = (
    pest_df
    .dropna(axis=0, how='any')
    .reset_index(drop=True)
)

In [164]:
%%time
ls = ['molecular_weight', 'xlogp', 'h_bond_donor_count', 
     'h_bond_acceptor_count', 'rotatable_bond_count']
pcp.get_properties(ls, 'CCC\C=C/CCCCCCCOC(C)=O', namespace='smiles')

CPU times: user 14.3 ms, sys: 4.73 ms, total: 19.1 ms
Wall time: 844 ms


In [165]:
%%time
pcp.get_properties(ls, '5363377', namespace='cid' )

CPU times: user 15.4 ms, sys: 3.77 ms, total: 19.2 ms
Wall time: 515 ms


[{'CID': 5363377,
  'HBondAcceptorCount': 2,
  'HBondDonorCount': 0,
  'MolecularWeight': 226.36,
  'RotatableBondCount': 11,
  'XLogP': 4.8}]