In [5]:
import random
from glob import glob
from time import time
from typing import Union

import pandas as pd
import numpy as np
from ord_data_load import ORD_PATH, ORD_REPO_PATH, load_dataset, fahrenheit_to_celsius

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

#to disable warnings
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')


import os
import multiprocessing as mp
import gzip
from google import protobuf

from ord_schema import message_helpers
from ord_schema.proto import dataset_pb2
from ord_schema.proto import reaction_pb2

from time import time
from ord_data_load import load_dataset, filter_uspto_filenames

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
%%time

if os.path.exists(f"{ORD_PATH}/uspto_files.csv"):
    print("uspto_files.csv already exists ... loading")
    uspto_files = pd.read_csv(f"{ORD_PATH}/uspto_files.csv").squeeze()
else:
    print("extracting uspto filenames ... ", end="")
    n_cores = 24
    with mp.Pool(n_cores) as p:
        uspto_filenames = p.map(filter_uspto_filenames, glob(f'{ORD_REPO_PATH}/data/*/*.pb.gz'))

    # save results
    uspto_files = pd.Series(uspto_filenames).dropna()
    uspto_files.to_csv(f"{ORD_PATH}/uspto_files.csv", index=False)
    print("saved to uspto_files.csv")

uspto_files.csv already exists ... loading
CPU times: total: 15.6 ms
Wall time: 10 ms


In [432]:
dataset = load_dataset(random.choice(uspto_files))
rxn = random.choice(dataset.reactions)

# rxn.conditions
rxn.conditions.temperature

control {
  type: AMBIENT
}

In [151]:
# rxn.conditions.temperature.control - AMBIENT
# rxn.conditions.temperature.setpoint.value, rxn.conditions.temperature.setpoint.units

(0.0, 0)

In [183]:
rxn.notes.procedure_details

'This material was prepared in 52.7% yield from 0.06 mol of LDA and 0.024 mol of product of Example 6 according to the following general procedure as a beige solid, mp 115°-117° C. To a -78° C. solution of 2.5 eq (0.06 mol) of LDA in DME was added a solution of 1 eq (0.024 mol) of starting material, the monocarboxylate product (Example 6) in dry DME. The resulting dark colored solution was stirred for 30 minutes at -78° C. To the above solution was added excess of dry ice. The reaction mixture was stirred at -78° C. for 15 minutes and warmed to room temperature in 1 hour. The reaction mixture was poured into ice water (100 ml) and extracted with ether. The aqueous layer was made acidic. The oil precipitate was extracted with ether. The ether extract was dried (MgSO4) and concentrated in vacuo to give the desired product.'

In [372]:
rxn.provenance.patent
rxn.reaction_id
rxn.identifiers[0].value # smiles
rxn.notes.procedure_details # notes
rxn.conditions.temperature.control #AMBIENT
rxn.conditions.temperature.setpoint
rxn.outcomes[0].reaction_time #time



In [368]:
rxn.conditions.temperature.control



In [303]:
rxn = random.choice(dataset.reactions)

for p in rxn.outcomes[0].products:
    for m in p.measurements:
        if m.type == 3: # yield
            print(m.percentage.value)
            print(m.details)

19.0
PERCENTYIELD
19.200000762939453
CALCULATEDPERCENTYIELD


In [434]:

# df = pd.DataFrame(columns=['rxn_id', 'rxn_smiles', 'time', 'temp', 'yield', 'patent', 'notes'])


TEMP_CONTROL_MAP = {
    2 : 25.0,   # AMBIENT
    6 : 0.0,      # ICE_BATH
    9 : -78.0,    # DRY_ICE_BATH
    11: -120.0    # LIQUID_NITROGEN
}

def parse_dataset_rxn(dataset: dataset_pb2.Dataset) -> np.ndarray:
    N = len(dataset.reactions)
    arr = np.empty((N, 7), dtype=object)
    idx = 0

    for rxn in dataset.reactions:
        arr[idx, 0] = rxn.reaction_id
        arr[idx, 1] = rxn.identifiers[0].value

        arr[idx, 5] = rxn.provenance.patent
        arr[idx, 6] = rxn.notes.procedure_details

        # time (in hours)
        time = rxn.outcomes[0].reaction_time
        if time:
            if time.units == 1:     # HOURS
                t = time.value
            elif time.units == 2:   # MINUTE
                t = time.value / 60
            elif time.units == 4:   # DAY
                t = time.value * 24
            arr[idx, 2] = t

        # temperature (celcius)
        temp = rxn.conditions.temperature
        t = None
        if temp.control:
            t = TEMP_CONTROL_MAP.get(temp.control.type, None)
        if temp.setpoint:
            t = temp.setpoint.value
            if temp.setpoint.units == 2: # FAHRENHEIT
                t = fahrenheit_to_celsius(t)
            elif temp.setpoint.units == 3: # KELVIN
                t = t - 273.15
        if t:
            arr[idx, 3] = t

        # yield
        yields = {}
        for p in rxn.outcomes[0].products:
            for m in p.measurements:
                if m.type == 3: # yield
                    yields[m.details] = m.percentage.value
        if yields:
            if len(yields) > 1:
                y = yields.get("PERCENTYIELD", None)
            else:
                y = yields.get("CALCULATEDPERCENTYIELD", None)
            arr[idx, 4] = y
        idx += 1
    return arr

parse_dataset_rxn(dataset)

array([['ord-426abac6664346d6b4fd864a7f287fac',
        '[CH3:1][NH2:2].Cl[CH2:4][CH2:5][CH2:6][CH2:7][CH2:8][CH2:9][CH2:10][CH2:11][CH2:12][OH:13].[OH-].[Na+]>O>[CH3:1][NH:2][CH2:4][CH2:5][CH2:6][CH2:7][CH2:8][CH2:9][CH2:10][CH2:11][CH2:12][OH:13] |f:2.3|',
        1.0, ..., None, 'US05282983',
        '400 g a 40% aqueous solution of methylamine and 180 g of 9-chlorononanol were introduced into an autoclave equipped with a stirrer, a thermometer and a pressure pump and heated to 100° C. At the temperature, 93 g of a 48% aqueous solution of sodium hydroxide was forcedly introduced to the autoclave over 6 hours and the resulting mixture was aged for 1 hours. After cooling, the content was taken out from the autoclave and unreacted methylamine and water were removed therefrom under reduced pressure. Then, ethanol was added thereto and sodium chloride thus precipitated was filtered off. Ethanol was distilled off from the filtrate and the resulting solution was further subjected to distil

# Parse whole USPTO dataset to numpy array of reactions

In [444]:
import numpy as np
import pandas as pd
from ord_data_load import pb2_to_numpy_rxn, ORD_PATH

uspto_files = pd.read_csv(f"{ORD_PATH}/uspto_files.csv").squeeze()

In [445]:
%%time
n_cores = 24

if __name__ == '__main__':
    with mp.Pool(n_cores) as p:
        res = p.map(pb2_to_numpy_rxn, uspto_files)
len(res)

CPU times: total: 9.36 s
Wall time: 18.4 s


489

In [447]:
%%time
arr = np.vstack(res)

CPU times: total: 125 ms
Wall time: 127 ms


In [459]:
%%time
np.save(f'{ORD_PATH}/rxn_np.npy', arr)

CPU times: total: 7.39 s
Wall time: 20.5 s


In [458]:
arr

array([['ord-89aff4b1c18042e4bbb1f88ebba96f86',
        'S(Cl)([Cl:3])=O.[CH2:5]([O:13][C:14]1[CH:22]=[CH:21][C:17]([C:18](O)=[O:19])=[CH:16][CH:15]=1)[CH2:6][CH2:7][CH2:8][CH2:9][CH2:10][CH2:11][CH3:12]>>[CH2:5]([O:13][C:14]1[CH:22]=[CH:21][C:17]([C:18]([Cl:3])=[O:19])=[CH:16][CH:15]=1)[CH2:6][CH2:7][CH2:8][CH2:9][CH2:10][CH2:11][CH3:12]',
        None, ..., None, 'US05723069',
        'A large excess of thionyl chloride was added to 10 g of the p-octyloxybenzoic acid prepared in (1), and the mixture was refluxed for 5 hours. Excessive thionyl chloride was distilled off to give a crude end compound.'],
       ['ord-c218088b16c541088a933a7606f68c4c',
        '[OH:1][C:2]1[CH:10]=[CH:9][C:5]([C:6]([OH:8])=[O:7])=[CH:4][CH:3]=1.[C:11](Cl)(=[O:21])[CH2:12][CH2:13][CH2:14][CH2:15][CH2:16][CH2:17][CH2:18][CH2:19][CH3:20].N1C=CC=CC=1>O>[C:11]([O:1][C:2]1[CH:10]=[CH:9][C:5]([C:6]([OH:8])=[O:7])=[CH:4][CH:3]=1)(=[O:21])[CH2:12][CH2:13][CH2:14][CH2:15][CH2:16][CH2:17][CH2:18][CH2:19][CH3:20]',


In [455]:
%%time
df = pd.DataFrame(arr[:, 1:], columns=['rxn_smiles', 'time', 'temp', 'yield', 'patent', 'notes'], index=arr[:, 0])
df

CPU times: total: 15.6 ms
Wall time: 2 ms


Unnamed: 0,rxn_smiles,time,temp,yield,patent,notes
ord-89aff4b1c18042e4bbb1f88ebba96f86,S(Cl)([Cl:3])=O.[CH2:5]([O:13][C:14]1[CH:22]=[...,,0.0,,US05723069,A large excess of thionyl chloride was added t...
ord-c218088b16c541088a933a7606f68c4c,[OH:1][C:2]1[CH:10]=[CH:9][C:5]([C:6]([OH:8])=...,24.0,0.0,,US05723069,10 Grams of p-hydroxybenzoic acid and 15 g of ...
ord-d965967cbe69411f9fd760ac1f990dfb,F[C:2]1[CH:10]=[C:9]([OH:11])[CH:8]=[CH:7][C:3...,,0.0,,US05723069,p-Acetoxybenzoic acid was prepared in the same...
ord-e8642ae0ecb248809067cd131fde0eed,[NH2:1][C:2]1[CH:17]=[CH:16][C:5]([C:6]([NH:8]...,0.166667,55.0,72.300003,US05723075,"4,4'-Diaminobenzanilide (0.658 g, 2.5 mM) was ..."
ord-83e2d9b7b08846c09e3e4e23f2dc9784,[NH2:1][C:2]1[CH:17]=[CH:16][C:5]([C:6]([NH:8]...,0.166667,55.0,43.400002,US05723075,"4,4'-Diaminobenzanilide (0.568 g, 2.5 mM) was ..."
...,...,...,...,...,...,...
ord-8fa3fc051e7f496593d7c7ba2a3aa564,[OH:1][C:2]1[C:3]([CH3:18])=[C:4]2[C:9](=[C:10...,2.0,0.0,78.099998,US07968746B2,"6-Hydroxy-2,5,7,8-tetramethylchroman-2-carboxy..."
ord-efddf5d225dc462db60c67a0306a9460,[OH:1][C:2]1[C:3]([CH3:23])=[C:4]2[C:9](=[C:10...,3.0,0.0,,US07968746B2,"A solution of 319 mg of (6-hydroxy-2,5,7,8-tet..."
ord-dc172fbdf10e4ae893a012785f609af2,[OH:1][C:2]1[C:3]([CH3:18])=[C:4]2[C:9](=[C:10...,8.0,40.0,,US07968746B2,"6-Hydroxy-2,5,7,8-tetramethylchroman-2-carboxy..."
ord-d1adc6bb76454fb9b0b4a2f30a493858,[C:1]([O:4][C:5]1[C:6]([CH3:21])=[C:7]2[C:12](...,,0.0,,US07968746B2,To one of the above aliquots of crude 2-(chlor...


In [456]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1771032 entries, ord-89aff4b1c18042e4bbb1f88ebba96f86 to ord-2b0f97b346414e2083183fb1d64385e2
Data columns (total 6 columns):
 #   Column      Dtype 
---  ------      ----- 
 0   rxn_smiles  object
 1   time        object
 2   temp        object
 3   yield       object
 4   patent      object
 5   notes       object
dtypes: object(6)
memory usage: 94.6+ MB


In [457]:
%%time
df.to_pickle(f"{ORD_PATH}/uspto_rxn_df.pkl.zst")

CPU times: total: 13.3 s
Wall time: 13.4 s
