In [1]:
from lxml import etree

In [2]:
import os
import glob
import pandas as pd

In [3]:
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Descriptors
from rdkit.Chem import AllChem
from rdkit import DataStructs

In [4]:
def name_smiles_inchi(gen):
    """
    Extract ``[name, smiles, inchi]`` for every element yielded by ``gen``.

    gen: generator (or any iterable) of etree elements; each one is searched,
         at any depth, for a CML ``name`` element and for CML ``identifier``
         elements whose ``dictRef`` is ``cml:smiles`` / ``cml:inchi``.

    Returns a list with one ``[name, smiles, inchi]`` list per element, in
    iteration order. Missing pieces are replaced by sentinels:

    * name   -> 'none' when no ``name`` element exists; a name equal to
                'title compound' is replaced by the text of the first
                ``nameResolved`` element, or 'unknown' when there is none.
    * smiles -> 'none' when there is no smiles identifier (or it carries no
                ``value`` attribute).
    * inchi  -> 'none' when there is no inchi identifier (or it carries no
                ``value`` attribute).
    """
    cml = '{http://www.xml-cml.org/schema}'
    name_path = './/' + cml + 'name'
    resolved_path = './/{http://bitbucket.org/dan2097}nameResolved'
    smiles_path = './/' + cml + 'identifier[@dictRef="cml:smiles"]'
    inchi_path = './/' + cml + 'identifier[@dictRef="cml:inchi"]'

    def identifier_value(element, path):
        # find() returns the first match in document order (same node as
        # findall()[0]) or None, so each query runs only once.
        node = element.find(path)
        if node is None:
            return 'none'
        # .get() keeps the 'none' sentinel if the attribute itself is absent.
        return node.get('value', 'none')

    li = []
    for element in gen:
        # Compare against None explicitly: an element without children is falsy.
        name_node = element.find(name_path)
        name = 'none' if name_node is None else name_node.text
        if name == 'title compound':
            resolved_node = element.find(resolved_path)
            name = 'unknown' if resolved_node is None else resolved_node.text

        smiles = identifier_value(element, smiles_path)
        inchi = identifier_value(element, inchi_path)
        li.append([name, smiles, inchi])
    return li

In [5]:
# Root folder of the local USPTO reaction dump. The original location stays the
# default; set the CHEM_SCI_FOLDER environment variable to point the notebook
# at a copy of the data on another machine without editing this cell.
chem_sci_folder = os.path.abspath(os.environ.get(
    'CHEM_SCI_FOLDER',
    r'/Users/wendong/Library/CloudStorage/OneDrive-Personal/Lab/Project_AI4Science/chemical science',
))
# Make the data root the working directory as well.
os.chdir(chem_sci_folder)
# Only the first os.walk() step is needed: the immediate sub-folders and files.
root_folder, main_folders, files_in_root = next(os.walk(chem_sci_folder))
# os.walk() gives no ordering guarantee; sort so positional indices are stable.
main_folders.sort()
files_in_root.sort()
main_folders

['.ipynb_checkpoints',
 '1976_Sep2016_USPTOgrants_cml',
 '2001_Sep2016_USPTOapplications_cml',
 'references']

In [6]:
# Position of the dump folder to process within the sorted `main_folders` list.
# NOTE(review): the listing above also contains '.ipynb_checkpoints', so this
# index shifts whenever hidden folders appear or disappear - confirm before use.
main_folder_ID = 2
# Build the target from the data root rather than the current directory, so the
# cell can be re-run after the working directory has already moved.
os.chdir(os.path.join(chem_sci_folder, main_folders[main_folder_ID]))
# Index 1 of the first os.walk() step is the list of immediate sub-folders;
# indexing (instead of unpacking into `_`) leaves IPython's `_` untouched.
year_folders = sorted(next(os.walk(os.getcwd()))[1])

In [7]:
# Position of the year folder to process within the sorted `year_folders` list.
year_folder_ID = 0
# Use the full path from the data root so re-running this cell does not depend
# on where the working directory currently points.
os.chdir(os.path.join(
    chem_sci_folder,
    main_folders[main_folder_ID],
    year_folders[year_folder_ID],
))
# Sorted list of the XML files in the selected year folder (names relative to cwd).
xml_file_list = sorted(glob.glob('*.xml'))
os.getcwd()

'/Users/wendong/Library/CloudStorage/OneDrive-Personal/Lab/Project_AI4Science/chemical science/2001_Sep2016_USPTOapplications_cml/2001'

In [8]:
# Empty accumulators: one independent name/smiles/inchi table per compound
# role, plus a single-column table for the reaction action labels.
df_products, df_reactants, df_solvents, df_catalysts = (
    pd.DataFrame(columns=['name', 'smiles', 'inchi']) for _role in range(4)
)
df_actions = pd.DataFrame(columns=['actions'])

In [9]:
# Collect the unique compounds (per role) and the unique reaction actions found
# in every XML file of the selected folder.
#
# Rows are gathered in plain lists and merged into the accumulator frames once
# at the end: DataFrame.append was removed in pandas 2.0, and appending plus
# de-duplicating after every file repeated the same work for each file.

def _merge_unique(existing, rows, columns):
    """Return `existing` plus `rows` with duplicate rows dropped (first kept)."""
    fresh = pd.DataFrame(rows, columns=columns)
    # Skip empty frames so concat only ever sees frames that carry data.
    frames = [frame for frame in (existing, fresh) if not frame.empty]
    if not frames:
        return pd.DataFrame(columns=columns)
    return pd.concat(frames).drop_duplicates(ignore_index=True)


product_rows, reactant_rows, solvent_rows, catalyst_rows, action_rows = [], [], [], [], []
for xml_file in xml_file_list:
    tree = etree.parse(xml_file)
    root = tree.getroot()

    product_rows.extend(name_smiles_inchi(
        root.iterfind('.//{http://www.xml-cml.org/schema}product')))
    reactant_rows.extend(name_smiles_inchi(
        root.iterfind('.//{http://www.xml-cml.org/schema}reactant')))
    solvent_rows.extend(name_smiles_inchi(
        root.iterfind('.//{http://www.xml-cml.org/schema}spectator[@role="solvent"]')))
    catalyst_rows.extend(name_smiles_inchi(
        root.iterfind('.//{http://www.xml-cml.org/schema}spectator[@role="catalyst"]')))
    # The [@action] predicate guarantees the attribute exists on every match.
    action_rows.extend(
        e.attrib['action']
        for e in root.iterfind('.//{http://bitbucket.org/dan2097}reactionAction[@action]')
    )

# Merge into the existing accumulators, so rows collected by an earlier run of
# this cell (e.g. for another folder) are kept, exactly as with append.
compound_columns = ['name', 'smiles', 'inchi']
df_products = _merge_unique(df_products, product_rows, compound_columns)
df_reactants = _merge_unique(df_reactants, reactant_rows, compound_columns)
df_solvents = _merge_unique(df_solvents, solvent_rows, compound_columns)
df_catalysts = _merge_unique(df_catalysts, catalyst_rows, compound_columns)
df_actions = _merge_unique(df_actions, action_rows, ['actions'])
    

In [10]:
df_products.count()

name      8221
smiles    8221
inchi     8221
dtype: int64

In [15]:
df_products.head()

Unnamed: 0,name,smiles,inchi
0,p-tert-butylcyclohexanol,C(C)(C)(C)C1CCC(CC1)O,"InChI=1S/C10H20O/c1-10(2,3)8-4-6-9(11)7-5-8/h8..."
1,4-Chloro-1-(4-isopropyl-phenyl)-butan-1-one,ClCCCC(=O)C1=CC=C(C=C1)C(C)C,InChI=1S/C13H17ClO/c1-10(2)11-5-7-12(8-6-11)13...
2,4-Chloro-1-(4-methyl-phenyl)-butan-1-one,ClCCCC(=O)C1=CC=C(C=C1)C,InChI=1S/C11H13ClO/c1-9-4-6-10(7-5-9)11(13)3-2...
3,1-[4-(1-Bromo-1-methyl-ethyl)-phenyl]-4-chloro...,BrC(C)(C)C1=CC=C(C=C1)C(CCCCl)=O,"InChI=1S/C13H16BrClO/c1-13(2,14)11-7-5-10(6-8-..."
4,(4-Bromomethyl-phenyl)-cyclopropyl-methanone,BrCC1=CC=C(C=C1)C(=O)C1CC1,InChI=1S/C11H11BrO/c12-7-8-1-3-9(4-2-8)11(13)1...


In [11]:
df_reactants.count()

name      11204
smiles    11204
inchi     11204
dtype: int64

In [16]:
df_reactants.head()

Unnamed: 0,name,smiles,inchi
0,p-tert-butylphenol,C(C)(C)(C)C1=CC=C(C=C1)O,"InChI=1S/C10H14O/c1-10(2,3)8-4-6-9(11)7-5-8/h4..."
1,aluminum chloride,[Cl-].[Al+3].[Cl-].[Cl-],InChI=1S/Al.3ClH/h;3*1H/q+3;;;/p-3
2,4-chlorobutyryl chloride,ClCCCC(=O)Cl,InChI=1S/C4H6Cl2O/c5-3-1-2-4(6)7/h1-3H2
3,cumene,C1(=CC=CC=C1)C(C)C,"InChI=1S/C9H12/c1-8(2)9-6-4-3-5-7-9/h3-8H,1-2H3"
4,ice,none,none


In [12]:
df_solvents.count()

name      441
smiles    441
inchi     441
dtype: int64

In [17]:
df_solvents.head()

Unnamed: 0,name,smiles,inchi
0,methylene chloride,C(Cl)Cl,InChI=1S/CH2Cl2/c2-1-3/h1H2
1,carbontetrachloride,C(Cl)(Cl)(Cl)Cl,"InChI=1S/CCl4/c2-1(3,4)5"
2,carbon tetrachloride,C(Cl)(Cl)(Cl)Cl,"InChI=1S/CCl4/c2-1(3,4)5"
3,water,O,InChI=1S/H2O/h1H2
4,toluene,C1(=CC=CC=C1)C,"InChI=1S/C7H8/c1-7-5-3-2-4-6-7/h2-6H,1H3"


In [13]:
df_catalysts.count()

name      316
smiles    316
inchi     316
dtype: int64

In [18]:
df_catalysts.head()

Unnamed: 0,name,smiles,inchi
0,nickel,[Ni],InChI=1S/Ni
1,AIBN,CC(C)(C#N)N=NC(C)(C)C#N,"InChI=1S/C8H12N4/c1-7(2,5-9)11-12-8(3,4)6-10/h..."
2,tetra-butylammonium bromide,[Br-].C(CCC)[N+](CCCC)(CCCC)CCCC,"InChI=1S/C16H36N.BrH/c1-5-9-13-17(14-10-6-2,15..."
3,tetraethylammonium bromide,[Br-].C(C)[N+](CC)(CC)CC,"InChI=1S/C8H20N.BrH/c1-5-9(6-2,7-3)8-4;/h5-8H2..."
4,silver,[Ag],InChI=1S/Ag


In [14]:
df_actions.count()

actions    24
dtype: int64

In [19]:
df_actions.head()

Unnamed: 0,actions
0,Yield
1,Add
2,Heat
3,Unknown
4,Stir


In [20]:
# Empty frame declaring the per-reaction schema; `df_reaction` is reassigned
# from the collected `reactions` rows in a later cell.
df_reaction = pd.DataFrame(columns=['source', 'reaction_smiles', 'products', 'reactants', 'solvents', 'catalysts', 'actions'])

In [26]:
# Build one record per reaction:
# [documentId, reaction SMILES, product names, reactant names,
#  solvent names, catalyst names, action labels].

def _first_text(element, path, default='none'):
    """Text of the first node matching `path` under `element`, else `default`."""
    node = element.find(path)
    return default if node is None else node.text


def _component_names(reaction, path, resolve_title=False):
    """Names of all components of `reaction` matched by `path`, in document order.

    A component without a name element yields 'none' (same sentinel as
    name_smiles_inchi) instead of raising IndexError. With `resolve_title`, a
    name equal to 'title compound' is replaced by the component's resolved
    name, or 'unknown' when no resolved name is present.
    """
    names = []
    for component in reaction.iterfind(path):
        name = _first_text(component, './/{http://www.xml-cml.org/schema}name')
        if resolve_title and name == 'title compound':
            name = _first_text(
                component,
                './/{http://bitbucket.org/dan2097}nameResolved',
                default='unknown',
            )
        names.append(name)
    return names


reactions = []
for xml_file in xml_file_list:
    tree = etree.parse(xml_file)
    root = tree.getroot()

    for reaction in root:
        # Iterating an lxml element also yields comments and processing
        # instructions, whose tag is not a string; they are not reactions.
        if not isinstance(reaction.tag, str):
            continue

        reactions.append([
            _first_text(
                reaction,
                './{http://bitbucket.org/dan2097}source/{http://bitbucket.org/dan2097}documentId',
            ),
            _first_text(reaction, './{http://bitbucket.org/dan2097}reactionSmiles'),
            # NOTE(review): 'title compound' is resolved for products only,
            # whereas name_smiles_inchi resolves it for every role - confirm
            # whether reactants/solvents/catalysts should be resolved too.
            _component_names(
                reaction, './/{http://www.xml-cml.org/schema}product', resolve_title=True),
            _component_names(
                reaction, './/{http://www.xml-cml.org/schema}reactant'),
            _component_names(
                reaction, './/{http://www.xml-cml.org/schema}spectator[@role="solvent"]'),
            _component_names(
                reaction, './/{http://www.xml-cml.org/schema}spectator[@role="catalyst"]'),
            # The [@action] predicate guarantees the attribute exists.
            [
                e.attrib['action']
                for e in reaction.iterfind(
                    './/{http://bitbucket.org/dan2097}reactionAction[@action]')
            ],
        ])

In [28]:
# One row per collected reaction; each inner list of `reactions` supplies the
# seven values in exactly this column order.
df_reaction = pd.DataFrame(reactions, columns=['source', 'reaction_smiles', 'products', 'reactants', 'solvents', 'catalysts', 'actions'])

In [29]:
df_reaction.count()

source             9399
reaction_smiles    9399
products           9399
reactants          9399
solvents           9399
catalysts          9399
actions            9399
dtype: int64

In [30]:
df_reaction.head()

Unnamed: 0,source,reaction_smiles,products,reactants,solvents,catalysts,actions
0,US20010000035A1,[C:1]([C:5]1[CH:10]=[CH:9][C:8]([OH:11])=[CH:7...,[p-tert-butylcyclohexanol],[p-tert-butylphenol],[],[nickel],[Yield]
1,US20010000038A1,[Cl-].[Al+3].[Cl-].[Cl-].[Cl:5][CH2:6][CH2:7][...,[4-Chloro-1-(4-isopropyl-phenyl)-butan-1-one],"[aluminum chloride, 4-chlorobutyryl chloride, ...","[methylene chloride, methylene chloride, methy...",[],"[Add, Heat, Unknown, Stir, Add, Stir, Add, Par..."
2,US20010000038A1,[Al+3].[Cl-].[Cl-].[Cl-].[Cl:5][CH2:6][CH2:7][...,[4-Chloro-1-(4-methyl-phenyl)-butan-1-one],"[AlCl3, 4-chlorobutyryl chloride, ice water, t...",[],[],"[Cool, Add, Stir, Add, Stir, Stir, Remove, Ext..."
3,US20010000038A1,[Cl:1][CH2:2][CH2:3][CH2:4][C:5]([C:7]1[CH:12]...,[1-[4-(1-Bromo-1-methyl-ethyl)-phenyl]-4-chlor...,"[4-chloro-1-(4-isopropyl-phenyl)-butan-1-one, ...",[carbontetrachloride],[],"[Dissolve, Add, Stir, Heat, Cool, Filter, Wash..."
4,US20010000038A1,[Cl:1][CH2:2][CH2:3][CH2:4][C:5]([C:7]1[CH:12]...,[1-[4-(1-Bromo-1-methyl-ethyl)-phenyl]-4-chlor...,"[4-chloro-1-(4-isopropyl-phenyl)-butan-1-one, ...",[carbon tetrachloride],[AIBN],"[Dissolve, Add, Stir, Heat, Irradiate, Remove,..."


In [31]:
from pubchempy import get_cids, get_compounds

In [32]:
def pubchem_compounds(df):
    """
    Look up every row of ``df`` with ``get_compounds`` and collect the results.

    df: existing dataframe with at least the columns 'name' and 'smiles'.
        A row is queried by its SMILES string unless that is the sentinel
        'none', in which case it is queried by name.

    Returns a dict mapping ``str(index label)`` to whatever ``get_compounds``
    returned for that row. Rows whose lookup raised are reported with a
    printed message and left out of the dict.
    """
    di = {}
    for ind, row in df.iterrows():
        if row['smiles'] != 'none':
            identifier, namespace = row['smiles'], 'smiles'
        else:
            identifier, namespace = row['name'], 'name'
        try:
            di[str(ind)] = get_compounds(identifier, namespace)
        except Exception:
            # Best effort per row, but only for ordinary errors: a bare
            # `except:` would also swallow KeyboardInterrupt/SystemExit and
            # make this long-running loop impossible to stop.
            print("Error of %s on %s" % (namespace, ind))
    return di