In [7]:
import random
from glob import glob
from time import time
from typing import Union

import pandas as pd
import numpy as np
from ord_data_load import ORD_PATH, ORD_REPO_PATH

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

#to disable warnings
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')


import os
import multiprocessing as mp
import gzip
from google import protobuf

from ord_schema import message_helpers
from ord_schema.proto import dataset_pb2
from ord_schema.proto import reaction_pb2

from time import time
from ord_data_load import load_dataset, filter_uspto_filenames

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
# Serial scan: load every dataset in the ORD repository and keep the paths of
# those whose dataset name contains "uspto".
# The directory tree is globbed once so that N and the loop below always refer
# to the same list of files (and the filesystem is not walked twice).
pb_paths = glob(f'{ORD_REPO_PATH}//data//*//*.pb.gz')
N = len(pb_paths)
uspto_filenames = []

start = time()
for i, pb in enumerate(pb_paths, 1):
    # "\r" keeps the progress counter on a single, continuously updated line
    print(f"{i:3d} / {N}: parsing dataset {time() - start:.1f}s", end="\r")
    dataset = load_dataset(pb)
    if "uspto" in dataset.name:
        uspto_filenames.append(pb)

len(uspto_filenames)

515 / 515: parsing dataset 130.5s

489

In [9]:
%%time

# Cached list of USPTO dataset paths: load it when present, otherwise build it
# in parallel and write it to disk.
uspto_csv_path = f"{ORD_PATH}/uspto_files.csv"

if os.path.exists(uspto_csv_path):
    print("uspto_files.csv already exists ... loading")
    # squeeze("columns") always yields a Series, even when the CSV holds a
    # single row (a bare squeeze() would collapse that case to a scalar).
    uspto_files = pd.read_csv(uspto_csv_path).squeeze("columns")
else:
    print("extracting uspto filenames ... ", end="")
    n_cores = 24
    with mp.Pool(n_cores) as p:
        uspto_filenames = p.map(filter_uspto_filenames, glob(f'{ORD_REPO_PATH}/data/*/*.pb.gz'))

    # save results
    # NOTE(review): dropna() presumably removes the placeholders that
    # filter_uspto_filenames returns for non-USPTO datasets - confirm.
    # reset_index gives this branch the same contiguous 0..n-1 index that the
    # "load from CSV" branch produces, so later positional access behaves the
    # same whichever branch ran.
    uspto_files = pd.Series(uspto_filenames).dropna().reset_index(drop=True)
    uspto_files.to_csv(uspto_csv_path, index=False)
    print("saved to uspto_files.csv")

extracting uspto filenames ... saved to uspto_files.csv
CPU times: total: 312 ms
Wall time: 26.7 s


# Create parsing method for USPTO compounds

In [10]:
# Reload the list of USPTO dataset paths from the CSV cache.
uspto_files = pd.read_csv(f"{ORD_PATH}/uspto_files.csv", index_col=None).squeeze()

# Pick one path at random (no seed is set, so the pick differs between runs).
pb = uspto_files.sample().iat[0]
pb

'./ord-data/data\\9c\\ord_dataset-9cecb3a8d3b9494191b28dcefea66af2.pb.gz'

In [11]:
%%time
# Locate one specific dataset by the id at the end of its file name;
# [0] raises IndexError if no file matches the pattern.
pb = glob(f'{ORD_REPO_PATH}/data/*/*c3c1091f873b4f40827973a6f1f9b685.pb.gz')[0]
dataset = load_dataset(pb)

CPU times: total: 1.08 s
Wall time: 1.07 s


In [12]:
# Quick summary of the loaded dataset: name, description, number of reactions.
print(dataset.name)
print(dataset.description)
print(len(dataset.reactions))

uspto-grants-2014_09
CML filenames: I20140902.xml,I20140909.xml,I20140916.xml,I20140923.xml,I20140930.xml
17639


In [13]:
def get_rxn_role(val):
    """Return the symbolic name of a reaction-role enum value.

    `val` is the integer code of `reaction_pb2.ReactionRole.ReactionRoleType`;
    the result is the corresponding enum member name as a string.
    """
    role_enum = reaction_pb2.ReactionRole.ReactionRoleType
    return role_enum.Name(val)

get_rxn_role(1)

'REACTANT'

In [14]:
%%time
# Print name / SMILES / role of every compound in one randomly chosen reaction:
# first all Compound sub-messages, then all ProductCompound sub-messages,
# followed by the reaction notes.
rxn = random.choice(dataset.reactions)

# identifier type -> label printed in front of its value
identifier_labels = (
    (reaction_pb2.CompoundIdentifier.NAME, "name:"),
    (reaction_pb2.CompoundIdentifier.SMILES, "smiles: "),
)

for message_type in (reaction_pb2.Compound, reaction_pb2.ProductCompound):
    for cmpd in message_helpers.find_submessages(rxn, message_type):
        for identifier in cmpd.identifiers:
            for id_type, label in identifier_labels:
                if identifier.type == id_type:
                    print(label, identifier.value)
        print("role:", get_rxn_role(cmpd.reaction_role))
        print("=========================================================================")
print(rxn.notes)

name: 1-(4-methyl-pyridin-3-yl)-imidazolidin-2-one
smiles:  CC1=C(C=NC=C1)N1C(NCC1)=O
role: REACTANT
name: 5-bromo-3-methyl-benzofuran
smiles:  BrC=1C=CC2=C(C(=CO2)C)C1
role: REACTANT
name: trans-1,2-diamino cyclohexane
smiles:  N[C@H]1[C@@H](CCCC1)N
role: REACTANT
name: potassium carbonate
smiles:  C([O-])([O-])=O.[K+].[K+]
role: REACTANT
name: copper iodide
smiles:  [Cu](I)I
role: CATALYST
name: 1,4-dioxane
smiles:  O1CCOCC1
role: SOLVENT
name: crude product
role: WORKUP
name: MeOH
smiles:  CO
role: WORKUP
name: chloroform
smiles:  C(Cl)(Cl)Cl
role: WORKUP
name: product
name: 1-(3-Methyl-benzofuran-5-yl)-3-(4-methyl-pyridin-3-yl)-imidazolidin-2-one
smiles:  CC1=COC2=C1C=C(C=C2)N2C(N(CC2)C=2C=NC=CC2C)=O
role: PRODUCT
procedure_details: "Using the same reaction conditions as in Example 14, 1-(4-methyl-pyridin-3-yl)-imidazolidin-2-one (I-14b: 105 mg, 0.8465 mol) was reacted with 5-bromo-3-methyl-benzofuran (197 mg, 0.9311 mmol), 1,4-dioxane (10 mL), copper iodide (20 mg), trans-1,2-diam

In [15]:
%%time
# Load one random USPTO dataset and count every Compound / ProductCompound
# sub-message found in its reactions.
#
# random.choice() picks an element with seq[i]; on a pandas Series that is a
# label lookup, which fails when the index is not a contiguous 0..n-1 range.
# Choosing from a plain list is always positional.
pb = random.choice(uspto_files.tolist())
# pb = glob(f'{ORD_REPO_PATH}/data/*/*c3c1091f873b4f40827973a6f1f9b685.pb.gz')[0]

dataset = load_dataset(pb)
cmpd_count = 0
for rxn in dataset.reactions:
    cmpd_count += len(message_helpers.find_submessages(rxn, reaction_pb2.Compound))
    cmpd_count += len(message_helpers.find_submessages(rxn, reaction_pb2.ProductCompound))

n_rxns = len(dataset.reactions)
print("Total reactions parsed:", n_rxns)
print("Total compounds parsed:", cmpd_count)
# guard against an empty dataset instead of raising ZeroDivisionError
print("Compounds per rxn     :", cmpd_count / n_rxns if n_rxns else float("nan"))

Total reactions parsed: 1663
Total compounds parsed: 13426
Compounds per rxn     : 8.073361395069153
CPU times: total: 719 ms
Wall time: 701 ms


In [16]:
%%time
# How often does a compound carry more than one NAME identifier, and what is
# the largest number of names seen on a single compound?
cmpd_count = 0
multiname_count = 0  # compounds with more than one NAME identifier
max_len = 0          # largest number of names among multi-name compounds
for rxn in dataset.reactions:
    for cmpd in message_helpers.find_submessages(rxn, reaction_pb2.Compound):
    # for cmpd in message_helpers.find_submessages(rxn, reaction_pb2.ProductCompound):
        cmpd_count += 1

        names = [
            identifier.value
            for identifier in cmpd.identifiers
            if identifier.type == reaction_pb2.CompoundIdentifier.NAME
        ]
        if len(names) > 1:
            multiname_count += 1
            max_len = max(max_len, len(names))


print("=================================================")
# The fraction must use the number of multi-name compounds; len(names) would
# only describe the last compound visited by the loop.
print("Fraction of multinames:", multiname_count / cmpd_count if cmpd_count else 0.0)
print("max len:", max_len)

Fraction of multinames: 8.557980316645271e-05
max len: 2
CPU times: total: 531 ms
Wall time: 530 ms


In [17]:
# Look up the symbolic name of role code 2.
get_rxn_role(2)

'REAGENT'

In [18]:
"""
    2D NumPy object array of compounds, indexed as arr[compound_idx, field_idx]
    - first index:
        Compound index
    - second index:
        Corresponding field index, in the order
        ['second name', 'name', 'smiles', 'role', 'rxn_id']
        arr[i, 0] - (str/optional), trivial name or compound label in the patent (None if absent)
        arr[i, 1] - (str), systematic name
        arr[i, 2] - (str), SMILES
        arr[i, 3] - (int), reaction role enum from reaction_pb2.ReactionRole.ReactionRoleType,
                           e.g. "REACTANT" - 1, "SOLVENT" - 3, "CATALYST" - 4, "PRODUCT" - 8
        arr[i, 4] - (str), reaction_id, e.g. "ord-43d5b7a6265d46a0ab8a7e2b2db5ad33"

"""

def parse_compound(arr: np.ndarray,
                   idx: int,
                   rxn: reaction_pb2.Reaction,
                   cmpd: Union[reaction_pb2.Compound, reaction_pb2.ProductCompound]
                   ):
    """Write one compound into row `idx` of `arr` (modified in place).

    Columns written:
        0 - first NAME identifier, only when the compound has two or more names
        1 - the single NAME identifier, or the second one when several exist
        2 - SMILES identifier, if present
        3 - cmpd.reaction_role (integer enum value)
        4 - rxn.reaction_id

    Columns for which the compound provides no value are left untouched.
    Names beyond the second one are ignored.

    Args:
        arr: 2D array with at least 5 columns that receives the fields.
        idx: row of `arr` to fill.
        rxn: reaction the compound belongs to (supplies the reaction id).
        cmpd: compound or product message to read identifiers and role from.
    """
    names = []
    for identifier in cmpd.identifiers:
        if identifier.type == reaction_pb2.CompoundIdentifier.NAME:
            names.append(identifier.value)
        elif identifier.type == reaction_pb2.CompoundIdentifier.SMILES:
            arr[idx, 2] = identifier.value

    if len(names) == 1:
        arr[idx, 1] = names[0]
    elif len(names) >= 2:
        arr[idx, 0] = names[0]
        arr[idx, 1] = names[1]
    # A compound without any NAME identifier keeps both name columns as they
    # are instead of raising IndexError on names[0].

    arr[idx, 3] = cmpd.reaction_role
    arr[idx, 4] = rxn.reaction_id

def parse_product(arr: np.array,
                   idx: int,
                   rxn: reaction_pb2.Reaction,
                   cmpd: Union[reaction_pb2.Compound, reaction_pb2.ProductCompound]
                   ):
    """Write a product into row `idx` of `arr`, forcing a usable role code.

    The row is filled by `parse_compound`; afterwards a role code of 0 is
    replaced by 8 (PRODUCT), because products in outcomes sometimes have an
    UNDEFINED reaction role.
    """
    parse_compound(arr, idx, rxn, cmpd)

    role_is_undefined = arr[idx, 3] == 0  # UNDEFINED
    if role_is_undefined:
        arr[idx, 3] = 8  # PRODUCT


# Time how long it takes to load one dataset file.
start = time()
# Alternative inputs kept for experimentation (another fixed file, or a random one):
# pb = glob(f'{ORD_REPO_PATH}/data/*/*c3c1091f873b4f40827973a6f1f9b685.pb.gz')[0]
pb = glob(f'{ORD_REPO_PATH}/data/*/*018fd0e1351f4fd09b20fcddd97b4c7a.pb.gz')[0]
# pb = random.choice(uspto_files)
dataset = load_dataset(pb)
print(f"Dataset {dataset.name} loaded in {time() - start:.2f}s", )

# Restart the timer; it is read again after the dataset has been parsed.
start = time()


def parse_dataset(dataset: dataset_pb2.Dataset) -> np.ndarray:
    """Flatten all compounds of a dataset into a 2D object array.

    For every reaction, the components of all inputs are emitted first
    (via `parse_compound`), followed by the products of the first outcome
    (via `parse_product`). Reactions without outcomes contribute inputs only.

    Args:
        dataset: dataset whose `reactions` are parsed.

    Returns:
        Object array of shape (number of compounds, 5); the column layout is
        whatever `parse_compound` / `parse_product` write.
    """
    # Gather the work items first so the array can be allocated with the exact
    # number of rows; a fixed "10 compounds per reaction" budget overflows on
    # compound-rich datasets.
    pending = []  # (parser, reaction, compound) in output row order
    for rxn in dataset.reactions:
        for key in rxn.inputs:
            pending.extend((parse_compound, rxn, cmpd) for cmpd in rxn.inputs[key].components)
        if rxn.outcomes:  # only the first outcome is used
            pending.extend((parse_product, rxn, cmpd) for cmpd in rxn.outcomes[0].products)

    # object-dtype np.empty starts out filled with None
    arr = np.empty((len(pending), 5), dtype=object)
    for idx, (parse, rxn, cmpd) in enumerate(pending):
        parse(arr, idx, rxn, cmpd)
    return arr

# Parse the loaded dataset and report timing, row counts and memory footprint.
arr = parse_dataset(dataset)
elapsed = time() - start

n_rxns = len(dataset.reactions)
n_cmpds = len(arr)
size_bytes = arr.nbytes  # size of the array buffer as reported by NumPy

print(f"Dataset {dataset.name} parsed in {elapsed:.2f}s", )
print(arr.shape)
print("Total reactions parsed:", n_rxns)
print("Total compounds parsed:", n_cmpds)
print("Compounds per rxn     :", n_cmpds / n_rxns)
# print("Max NAME len          :", np.char.str_len(reactants[:, 0]).max())
# print("Max SMILES len        :", np.char.str_len(reactants[:, 1]).max())
print(f"Size                 : {size_bytes / 1024 / 1024:.1f}MB")
print(f"Size per rxn         : {size_bytes / n_rxns:.0f}B")

Dataset uspto-grants-1998_03 loaded in 0.17s
Dataset uspto-grants-1998_03 parsed in 0.23s
(16563, 5)
Total reactions parsed: 2997
Total compounds parsed: 16563
Compounds per rxn     : 5.526526526526527
Size                 : 0.6MB
Size per rxn         : 221B


In [19]:
# Preview the first ten parsed compound rows.
arr[:10]

array([['( 1 )', 'p-n-octyloxybenzoic acid',
        'C(CCCCCCC)OC1=CC=C(C(=O)O)C=C1', 1,
        'ord-89aff4b1c18042e4bbb1f88ebba96f86'],
       [None, 'thionyl chloride', 'S(=O)(Cl)Cl', 1,
        'ord-89aff4b1c18042e4bbb1f88ebba96f86'],
       [None, 'p-octyloxybenzoic acid', 'C(CCCCCCC)OC1=CC=C(C(=O)O)C=C1',
        1, 'ord-89aff4b1c18042e4bbb1f88ebba96f86'],
       [None, '4-octyloxybenzoic acid chloride',
        'C(CCCCCCC)OC1=CC=C(C(=O)Cl)C=C1', 8,
        'ord-89aff4b1c18042e4bbb1f88ebba96f86'],
       [None, 'pyridine', 'N1=CC=CC=C1', 1,
        'ord-c218088b16c541088a933a7606f68c4c'],
       [None, 'p-hydroxybenzoic acid', 'OC1=CC=C(C(=O)O)C=C1', 1,
        'ord-c218088b16c541088a933a7606f68c4c'],
       [None, 'decanoic acid chloride', 'C(CCCCCCCCC)(=O)Cl', 1,
        'ord-c218088b16c541088a933a7606f68c4c'],
       [None, 'water', 'O', 3, 'ord-c218088b16c541088a933a7606f68c4c'],
       [None, 'p-decanoyloxybenzoic acid',
        'C(CCCCCCCCC)(=O)OC1=CC=C(C(=O)O)C=C1', 8,
  

In [20]:
# Rows whose column 0 (trivial name / patent label) is not None.
arr[~np.equal(arr[:, 0], None)]

array([['( 1 )', 'p-n-octyloxybenzoic acid',
        'C(CCCCCCC)OC1=CC=C(C(=O)O)C=C1', 1,
        'ord-89aff4b1c18042e4bbb1f88ebba96f86'],
       ['( 2 )', 'p-decanoyloxybenzoic acid chloride',
        'C(CCCCCCCCC)(=O)OC1=CC=C(C(=O)Cl)C=C1', 1,
        'ord-d965967cbe69411f9fd760ac1f990dfb'],
       ['colorless crystal',
        "4,4'-di(phenylthiocarbamoylamino)benzanilide",
        'C1(=CC=CC=C1)NC(=S)NC1=CC=C(C(=O)NC2=CC=C(C=C2)NC(NC2=CC=CC=C2)=S)C=C1',
        8, 'ord-e8642ae0ecb248809067cd131fde0eed'],
       ...,
       ['title compound',
        '2-[2-(2-Acetylamino-2-carboxyethyldisulfanyl) benzoylamino]-3-methyl-pentanoic acid',
        'C(C)(=O)NS(SC1=C(C(=O)NC(C(=O)O)C(CC)C)C=CC=C1)CCC(=O)O', 8,
        'ord-8b264e10282d462783a148a83796833e'],
       ['title compound',
        '[2-(2-Benzoyl-phenyldisulfanyl)-phenyl]-phenyl-methanone',
        'C(C1=CC=CC=C1)(=O)C1=C(C=CC=C1)SSC1=C(C=CC=C1)C(=O)C1=CC=CC=C1',
        8, 'ord-950cfed0e8884a9ab3f2f0fdcbee3869'],
       ['title

In [21]:
# Number of rows whose role code (column 3) equals 8, i.e. PRODUCT.
arr[arr[:, 3] == 8].shape[0]

3124

In [22]:
%%time
# All rows whose role code (column 3) equals 8, i.e. PRODUCT.
arr[arr[:, 3] == 8]

CPU times: total: 0 ns
Wall time: 1 ms


array([[None, '4-octyloxybenzoic acid chloride',
        'C(CCCCCCC)OC1=CC=C(C(=O)Cl)C=C1', 8,
        'ord-89aff4b1c18042e4bbb1f88ebba96f86'],
       [None, 'p-decanoyloxybenzoic acid',
        'C(CCCCCCCCC)(=O)OC1=CC=C(C(=O)O)C=C1', 8,
        'ord-c218088b16c541088a933a7606f68c4c'],
       [None, 'p-Acetoxybenzoic acid', 'C(C)(=O)OC1=CC=C(C(=O)O)C=C1', 8,
        'ord-d965967cbe69411f9fd760ac1f990dfb'],
       ...,
       ['title compound',
        '2-[2-(2-Acetylamino-2-carboxyethyldisulfanyl) benzoylamino]-3-methyl-pentanoic acid',
        'C(C)(=O)NS(SC1=C(C(=O)NC(C(=O)O)C(CC)C)C=CC=C1)CCC(=O)O', 8,
        'ord-8b264e10282d462783a148a83796833e'],
       ['title compound',
        '[2-(2-Benzoyl-phenyldisulfanyl)-phenyl]-phenyl-methanone',
        'C(C1=CC=CC=C1)(=O)C1=C(C=CC=C1)SSC1=C(C=CC=C1)C(=O)C1=CC=CC=C1',
        8, 'ord-950cfed0e8884a9ab3f2f0fdcbee3869'],
       ['title compound',
        '{2-[2-(hydroxyimino-phenyl-methyl)-phenyldisulfanyl]phenyl}-phenyl-methanon',
     

In [23]:
%%time

# Reaction ids that appear on more than one product row (role code 8).
product_rxn_ids = arr[arr[:, 3] == 8][:, 4]
u, c = np.unique(product_rxn_ids, return_counts=True)
u[c > 1]

CPU times: total: 31.2 ms
Wall time: 3 ms


array(['ord-034dc9437bee48c3847af5a6f9237cbd',
       'ord-07e514fe010d477b830fbb79e7963a40',
       'ord-0a27e3faf3c24930a2eeef0ff4a98ee9',
       'ord-0c4cda8c04324a8b924c6c3506ced1f7',
       'ord-0f557f1000674f4b902d975b3b805985',
       'ord-15ef5e371941461d87787ec3f8b02012',
       'ord-185abfaba55a4edabda0c165062f3325',
       'ord-1a28675e5d044056be5059476c8f3ce9',
       'ord-1c03739439424e59810e1f608eed556f',
       'ord-1c51dd5042944cafbcc75002bd31f9d1',
       'ord-1ed77840c3734a39844b39f7948bc2ab',
       'ord-20b7c42ff39d4f27aff2e357d5541bd7',
       'ord-26714608e58f4c188b984bcbdffd717d',
       'ord-2751c75ddf0c486a8d95a650128f9c7e',
       'ord-29290da4f651485f9a89caa1ac100820',
       'ord-30b24befdf374a5dae019d09392b25b1',
       'ord-32657578f2084976872d2cf3b47b25b7',
       'ord-35c0a06e7c3d4c9f82f5835490d4df81',
       'ord-36355329f50b4fba91d75f1d3bd8c47f',
       'ord-3aafaa9bc09e4d7e91b49eec25c1596b',
       'ord-3c799edf560b44e4b1b3a6fde9733781',
       'ord-3

In [24]:
# Number of distinct reaction ids (column 4) present in the parsed array.
np.unique(arr[:, 4]).size

2997

In [25]:
# Distinct compound names (column 1) and how often each occurs; rows without a
# name are excluded first. Result is a (values, counts) tuple.
has_name = ~np.equal(arr[:, 1], None)
named_rows = arr[has_name]
names_unique = np.unique(named_rows[:, 1], return_counts=True)
# np.unique(arr[:, 1])

In [26]:
# The 20 most frequent names, ordered by ascending count (most common last).
names_unique[0][np.argsort(names_unique[1])][-20:]

array(['sodium hydroxide', 'ether', 'pyridine', 'potassium carbonate',
       'hydrochloric acid', 'sodium hydride', 'acetic acid',
       'dimethylformamide', 'HCl', 'toluene', 'ethyl acetate', 'DMF',
       'dichloromethane', 'ethanol', 'triethylamine', 'methanol',
       'tetrahydrofuran', 'methylene chloride', 'THF', 'water'],
      dtype=object)

In [27]:
%%time
# Longest name in column 1, measured on str(x); a missing name (None) is
# therefore counted as the 4-character string "None".
len_vect = np.vectorize(lambda x: len(str(x)))
len_vect(arr[:, 1]).max()

CPU times: total: 0 ns
Wall time: 5 ms


188

# Parse the whole USPTO dataset into a NumPy array of compounds

In [28]:
import numpy as np
import pandas as pd
from ord_data_load import pb2_to_numpy_cmpd, ORD_PATH

# Load the list of USPTO dataset paths from the CSV cache.
uspto_files = pd.read_csv(f"{ORD_PATH}/uspto_files.csv").squeeze()

In [29]:
%%time
# Number of worker processes; hard-coded for the machine this was run on.
n_cores = 24

# Convert every USPTO dataset file in parallel, one pb2_to_numpy_cmpd call per
# path. `res` is only defined when the guard below is true; len(res) relies on it.
if __name__ == '__main__':
    with mp.Pool(n_cores) as p:
        res = p.map(pb2_to_numpy_cmpd, uspto_files)
len(res)

CPU times: total: 8.53 s
Wall time: 27.5 s


489

In [30]:
# Stack the per-file results row-wise into a single array.
cmpd_np = np.vstack(res)
cmpd_np

array([[None, 'thionyl chloride', 'S(=O)(Cl)Cl', 1,
        'ord-89aff4b1c18042e4bbb1f88ebba96f86'],
       [None, 'p-octyloxybenzoic acid', 'C(CCCCCCC)OC1=CC=C(C(=O)O)C=C1',
        1, 'ord-89aff4b1c18042e4bbb1f88ebba96f86'],
       ['( 1 )', 'p-n-octyloxybenzoic acid',
        'C(CCCCCCC)OC1=CC=C(C(=O)O)C=C1', 1,
        'ord-89aff4b1c18042e4bbb1f88ebba96f86'],
       ...,
       [None, 'MeOH', 'CO', 3, 'ord-2b0f97b346414e2083183fb1d64385e2'],
       [None, 'EtOAc', 'CCOC(=O)C', 1,
        'ord-2b0f97b346414e2083183fb1d64385e2'],
       [None,
        'N-hexyl-6-hydroxy-2,5,7,8-tetramethylchroman-2-carboxamide',
        'C(CCCCC)NC(=O)C1(OC2=C(C(=C(C(=C2CC1)C)O)C)C)C', 8,
        'ord-2b0f97b346414e2083183fb1d64385e2']], dtype=object)

In [31]:
# Drop the per-file list now that it has been stacked into cmpd_np, then
# trigger a garbage-collection pass.
import gc
del res
gc.collect()

178

In [32]:
# Size of the array buffer in MB, as reported by NumPy's nbytes.
print(f"{cmpd_np.nbytes/1024/1024:.1f}MB")

378.8MB


In [33]:
%%time
# Persist the combined compound array to disk.
np.save(f'{ORD_PATH}/cmpd_np.npy', cmpd_np)

CPU times: total: 13.1 s
Wall time: 13.5 s


In [34]:
%%time
# Read the array back. allow_pickle=True unpickles the stored Python objects,
# which can execute arbitrary code - only load files you created yourself.
cmpd_np = np.load(f'{ORD_PATH}/cmpd_np.npy', allow_pickle=True)

CPU times: total: 6.69 s
Wall time: 6.7 s


In [35]:
%%time
# Build a DataFrame from the first four columns, indexed by reaction id
# (column 4), and sort it by that index.
compound_fields = ['trivial', 'name', 'smiles', 'rxn_role']
df = pd.DataFrame(
    cmpd_np[:, :4],
    columns=compound_fields,
    index=cmpd_np[:, 4],
).sort_index()

CPU times: total: 20.5 s
Wall time: 20.6 s


In [36]:
# Persist the compound table as a pickle file.
df.to_pickle(f"{ORD_PATH}/uspto_cmpd.pkl.zst")

In [37]:
%%time
# Load the pickled Suzuki table and keep its index values for the lookup below.
# NOTE(review): the index presumably holds ORD reaction ids - confirm.
suzuki_df = pd.read_pickle(f"{ORD_PATH}/suzuki.pkl.zst")
suzuki_idx = suzuki_df.index.values

CPU times: total: 188 ms
Wall time: 201 ms


In [38]:
%%time
# Select all compound rows whose index label is in suzuki_idx.
df.loc[suzuki_idx]

CPU times: total: 4.36 s
Wall time: 4.34 s


Unnamed: 0,trivial,name,smiles,rxn_role
ord-205c29c6463642c4a3ef2726db5ff6fc,,tetrakis-(triphenylphosphine)palladium (0),,1
ord-205c29c6463642c4a3ef2726db5ff6fc,,toluene,C1(=CC=CC=C1)C,3
ord-205c29c6463642c4a3ef2726db5ff6fc,,"methyl N-methyl-4-(5,6,7,8-tetrahydro-5,5,8,8-...",CN1C(=CC(=C1)C1=CC2=CC=3C(CCC(C3C=C2C=C1)(C)C)...,8
ord-205c29c6463642c4a3ef2726db5ff6fc,,methyl N-methyl-4-iodo-2-pyrrolecarboxylate,CN1C(=CC(=C1)I)C(=O)OC,1
ord-205c29c6463642c4a3ef2726db5ff6fc,,"5,6,7,8-tetrahydro-5,5,8,8-tetramethyl-2-anthr...",CC1(C=2C=C3C=CC(=CC3=CC2C(CC1)(C)C)B(O)O)C,1
...,...,...,...,...
ord-7e9dca83f617441ea03c8f72561a3acb,,"2-[3-(2-chloro-3,6-difluoro-phenyl)-isoxazol-5...",ClC1=C(C(=CC=C1F)F)C1=NOC(=C1)COC1=NC=C(C=C1)B...,1
ord-7e9dca83f617441ea03c8f72561a3acb,,DME,COCCOC,3
ord-7e9dca83f617441ea03c8f72561a3acb,,Na2CO3,C(=O)([O-])[O-].[Na+].[Na+],1
ord-7e9dca83f617441ea03c8f72561a3acb,,Pd(PPh3)4,C=1C=CC(=CC1)[P](C=2C=CC=CC2)(C=3C=CC=CC3)[Pd]...,4
