In [1]:
import random
from glob import glob
from time import time
from typing import Union

import pandas as pd
import numpy as np
from ord_data_load import ORD_PATH, ORD_REPO_PATH, load_dataset, fahrenheit_to_celsius, df_na_vals

# Raise pandas' display limits (rows, columns, line width) for wide/long frames.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)



# Silence RDKit log messages (warnings included) from the 'rdApp.*' loggers.
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')


import os
import multiprocessing as mp
import gzip
from google import protobuf

from ord_schema import message_helpers
from ord_schema.proto import dataset_pb2
from ord_schema.proto import reaction_pb2

from time import time
from ord_data_load import load_dataset, filter_uspto_filenames
from chem_render import draw_reaction

# Reload edited project modules automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

In [2]:
%%time

# Build (or load from cache) the list of ORD dataset files that belong to USPTO.
# The list is cached as a one-column CSV so the expensive scan runs only once.
uspto_files_csv = f"{ORD_PATH}/uspto_files.csv"

if os.path.exists(uspto_files_csv):
    print("uspto_files.csv already exists ... loading")
    # squeeze("columns") always returns a Series for the one-column CSV; a bare
    # squeeze() would collapse a single-row file to a scalar string.
    uspto_files = pd.read_csv(uspto_files_csv).squeeze("columns")
else:
    print("extracting uspto filenames ... ", end="")
    # Never start more workers than the machine has cores (24 is the upper bound).
    n_cores = min(24, os.cpu_count() or 1)
    with mp.Pool(n_cores) as p:
        uspto_filenames = p.map(filter_uspto_filenames, glob(f'{ORD_REPO_PATH}/data/*/*.pb.gz'))

    # save results
    # presumably filter_uspto_filenames returns None for non-USPTO files, which
    # dropna() removes — TODO confirm against ord_data_load
    uspto_files = pd.Series(uspto_filenames).dropna()
    uspto_files.to_csv(uspto_files_csv, index=False)
    print("saved to uspto_files.csv")

uspto_files.csv already exists ... loading
CPU times: total: 31.2 ms
Wall time: 9 ms


In [3]:
# Alternative: sample a random USPTO file instead of the fixed one below.
# dataset = load_dataset(random.choice(uspto_files))
dataset_matches = glob(f"{ORD_REPO_PATH}/data/*/*ord_dataset-c3c1091f873b4f40827973a6f1f9b685.pb.gz")
dataset = load_dataset(dataset_matches[0])

# Pick one reaction at random for the manual inspection in the following cells.
rxn = random.choice(dataset.reactions)

print('Dataset size:', len(dataset.reactions))

# rxn.conditions
rxn.conditions.temperature

Dataset size: 17639




In [4]:
# rxn.conditions.temperature.control - AMBIENT
# rxn.conditions.temperature.setpoint.value, rxn.conditions.temperature.setpoint.units

In [5]:
# Free-text procedure description of the sampled reaction.
rxn.notes.procedure_details

'To a solution of 8-benzyl-10-(4-chlorophenyl)-11-oxa-3,8-diazaspiro[5.5]undecane (100 mg, 0.28 mmol) and 4-isopropoxy-3-methyl-benzoic acid (54 mg, 0.28 mmol) in DMF (1 mL) was added DIEA (98 μL, 0.56 mmol) followed by the addition of HATU (128 mg, 0.34 mmol). The reaction mixture was stirred for 10 minutes, then quenched with water and the aqueous layer was extracted with ethyl acetate. The combined organic layer was washed with water (twice). The organic layer was dried over MgSO4, filtered and concentrated in vacuo. The residue was purified by silica gel column chromatography using 0 to 50% EtOAc/hexanes as eluent to obtain [8-benzyl-10-(4-chlorophenyl)-11-oxa-3,8-diazaspiro[5.5]undecan-3-yl]-(4-isopropoxy-3-methyl-phenyl)methanone (44 mg, 29%). ESI-MS m/z calc. 532.2. Found 533.3 (M+1)+; Retention time: 2.17 minutes (3 min run).'

In [6]:
# Reaction fields of interest for the extraction below. A notebook cell renders
# only its last expression, so just the reaction time is displayed here.
rxn.provenance.patent
rxn.reaction_id
rxn.identifiers[0].value # reaction SMILES
rxn.notes.procedure_details # free-text procedure notes
rxn.conditions.temperature.control # temperature control, e.g. AMBIENT
rxn.conditions.temperature.setpoint
rxn.outcomes[0].reaction_time # reaction time (value + units)

value: 10.0
units: MINUTE

In [7]:
# Inspect the temperature-control sub-message of the sampled reaction.
rxn.conditions.temperature.control



In [8]:
rxn = random.choice(dataset.reactions)

# Walk every measurement of every product of the first outcome and keep the
# ones whose type code is 3 (yield).
yield_measurements = (
    measurement
    for product in rxn.outcomes[0].products
    for measurement in product.measurements
    if measurement.type == 3
)
for measurement in yield_measurements:
    print(measurement.percentage.value)
    print(measurement.details)

In [9]:

"""
    Numpy array [0 .. N, cols]
    cols = ['rxn_id',   'rxn_smiles', 'time_unit', 'time_val', 'temp_unit', 'temp_val', 'temp_control', 'yield', 'patent', 'notes']
                0               1            2            3             4             5             6        7        8            9
"""



# Nominal temperature for temperature-control types that imply a fixed bath
# temperature (keys are TemperatureControlType enum codes). Not used in this cell.
# NOTE(review): values look like °C; liquid nitrogen boils near -196 °C, so
# confirm that -120.0 is intended.
TEMP_CONTROL_MAP = {
    2 : 25.0,       # AMBIENT
    6 : 0.0,      # ICE_BATH
    9 : -78.0,    # DRY_ICE_BATH
    11: -120.0    # LIQUID_NITROGEN
}

YIELD_TYPE = 3  # measurement type code that marks a yield measurement


N = len(dataset.reactions)
# Object array: cells that are never assigned stay None and mark missing fields.
arr = np.empty((N, 10), dtype=object)

for idx, rxn in enumerate(dataset.reactions):
    # fields always present in USPTO
    arr[idx, 0] = rxn.reaction_id
    arr[idx, 1] = rxn.identifiers[0].value  # rxn_smiles

    outcome = rxn.outcomes[0]

    # reaction time: raw unit code and value are stored as-is (no conversion here)
    if outcome.HasField('reaction_time'):
        # named rxn_time so the `time` function imported at the top of the
        # notebook is not overwritten by a protobuf message
        rxn_time = outcome.reaction_time
        arr[idx, 2] = rxn_time.units
        arr[idx, 3] = rxn_time.value

    # temperature setpoint: raw unit code and value (no conversion here)
    temp = rxn.conditions.temperature
    if temp.HasField('setpoint'):
        arr[idx, 4] = temp.setpoint.units
        arr[idx, 5] = temp.setpoint.value

    # temperature control type (enum code, e.g. 2 = AMBIENT)
    if temp.HasField('control'):
        arr[idx, 6] = temp.control.type

    # yield: collect every yield measurement of the first outcome, keyed by its
    # `details` label (a later measurement with the same label overwrites an earlier one)
    yields = {
        measurement.details: measurement.percentage.value
        for product in outcome.products
        for measurement in product.measurements
        if measurement.type == YIELD_TYPE
    }
    if yields:
        # Prefer the reported yield and fall back to the calculated one. This also
        # keeps a reaction whose only yield is PERCENTYIELD, which used to be
        # stored as None.
        arr[idx, 7] = yields.get("PERCENTYIELD", yields.get("CALCULATEDPERCENTYIELD"))

    arr[idx, 8] = rxn.provenance.patent
    arr[idx, 9] = rxn.notes.procedure_details

# arr was allocated with exactly N rows and every row is filled, so no trimming is
# needed (slicing with the last loop index silently dropped the final reaction).
arr[:10]

array([['ord-43d5b7a6265d46a0ab8a7e2b2db5ad33',
        '[NH2:1][C:2]1[C:3]([OH:12])=[CH:4][C:5]2[C:10]([CH:11]=1)=[CH:9][CH:8]=[CH:7][CH:6]=2.C(=O)([O-])[O-].[Na+].[Na+].[C:19](Cl)(=[O:21])[CH3:20].Cl>CC(C)=O>[OH:12][C:3]1[C:2]([NH:1][C:19](=[O:21])[CH3:20])=[CH:11][C:10]2[C:5]([CH:4]=1)=[CH:6][CH:7]=[CH:8][CH:9]=2 |f:1.2.3|',
        1, 4.0, 1, 0.0, None, 78.0, 'USRE045108E1',
        'An acetone solution (60 ml) of 3-amino-2-naphthol (5.0 g, 31.4 mmol) was added to an aqueous solution (20 ml) of sodium carbonate (4.77 g, 34.5 mmol). The mixture was cooled in an ice-water bath, and then acetyl chloride (2.27 ml, 32.0 mmol) was added to the mixture dropwise over 5 minutes. The resulting mixture was stirred at 0° C. for 4 hours and then allowed to stand at room temperature overnight. 2N Hydrochloric acid was added to the reaction mixture to adjust its pH to 3. The generated insoluble matter was separated, washed with water, and then dried, giving a white powder of N-(3-hydroxynaphthale

# Parse the whole USPTO dataset into a NumPy array of reactions

In [10]:
import numpy as np
import pandas as pd
import multiprocessing as mp
from ord_data_load import pb2_to_numpy_rxn, ORD_PATH

# The cache is a one-column CSV; squeeze("columns") always yields a Series, whereas a
# bare squeeze() would collapse a single-row file to a scalar string.
uspto_files = pd.read_csv(f"{ORD_PATH}/uspto_files.csv").squeeze("columns")

In [11]:
%%time
n_cores = 24

# Convert every USPTO file in a worker process; `res` holds one result per file.
if __name__ == '__main__':
    with mp.Pool(processes=n_cores) as pool:
        res = pool.map(pb2_to_numpy_rxn, uspto_files)
len(res)

CPU times: total: 8.5 s
Wall time: 18.3 s


489

In [12]:
%%time
# Stack the per-file arrays returned by the pool row-wise into one array.
arr = np.vstack(res)

CPU times: total: 156 ms
Wall time: 157 ms


In [13]:
%%time
# Persist the stacked array next to the other ORD artefacts.
# NOTE(review): the array holds Python objects (strings / None), so reading it back
# with np.load presumably needs allow_pickle=True — confirm.
np.save(f'{ORD_PATH}/rxn_np.npy', arr)

CPU times: total: 7.52 s
Wall time: 7.94 s


In [14]:
# Peek at the stacked array (NumPy abbreviates the repr of large arrays).
arr

array([['ord-89aff4b1c18042e4bbb1f88ebba96f86',
        'S(Cl)([Cl:3])=O.[CH2:5]([O:13][C:14]1[CH:22]=[CH:21][C:17]([C:18](O)=[O:19])=[CH:16][CH:15]=1)[CH2:6][CH2:7][CH2:8][CH2:9][CH2:10][CH2:11][CH3:12]>>[CH2:5]([O:13][C:14]1[CH:22]=[CH:21][C:17]([C:18]([Cl:3])=[O:19])=[CH:16][CH:15]=1)[CH2:6][CH2:7][CH2:8][CH2:9][CH2:10][CH2:11][CH3:12]',
        None, ..., None, 'US05723069',
        'A large excess of thionyl chloride was added to 10 g of the p-octyloxybenzoic acid prepared in (1), and the mixture was refluxed for 5 hours. Excessive thionyl chloride was distilled off to give a crude end compound.'],
       ['ord-c218088b16c541088a933a7606f68c4c',
        '[OH:1][C:2]1[CH:10]=[CH:9][C:5]([C:6]([OH:8])=[O:7])=[CH:4][CH:3]=1.[C:11](Cl)(=[O:21])[CH2:12][CH2:13][CH2:14][CH2:15][CH2:16][CH2:17][CH2:18][CH2:19][CH3:20].N1C=CC=CC=1>O>[C:11]([O:1][C:2]1[CH:10]=[CH:9][C:5]([C:6]([OH:8])=[O:7])=[CH:4][CH:3]=1)(=[O:21])[CH2:12][CH2:13][CH2:14][CH2:15][CH2:16][CH2:17][CH2:18][CH2:19][CH3:20]',


In [15]:
%%time
cols = ['rxn_smiles', 'time_unit', 'time_val', 'temp_unit', 'temp_val', 'temp_control', 'yield', 'patent', 'notes']
# Column 0 of the array is the reaction id and becomes the index; the remaining
# nine columns are named by `cols`.
rxn_ids, rxn_fields = arr[:, 0], arr[:, 1:]
df = pd.DataFrame(rxn_fields, index=rxn_ids, columns=cols)
df

CPU times: total: 0 ns
Wall time: 999 µs


Unnamed: 0,rxn_smiles,time_unit,time_val,temp_unit,temp_val,temp_control,yield,patent,notes
ord-89aff4b1c18042e4bbb1f88ebba96f86,S(Cl)([Cl:3])=O.[CH2:5]([O:13][C:14]1[CH:22]=[...,,,,,,,US05723069,A large excess of thionyl chloride was added t...
ord-c218088b16c541088a933a7606f68c4c,[OH:1][C:2]1[CH:10]=[CH:9][C:5]([C:6]([OH:8])=...,4,1.0,,,,,US05723069,10 Grams of p-hydroxybenzoic acid and 15 g of ...
ord-d965967cbe69411f9fd760ac1f990dfb,F[C:2]1[CH:10]=[C:9]([OH:11])[CH:8]=[CH:7][C:3...,,,,,,,US05723069,p-Acetoxybenzoic acid was prepared in the same...
ord-e8642ae0ecb248809067cd131fde0eed,[NH2:1][C:2]1[CH:17]=[CH:16][C:5]([C:6]([NH:8]...,2,10.0,1,55.0,,72.300003,US05723075,"4,4'-Diaminobenzanilide (0.658 g, 2.5 mM) was ..."
ord-83e2d9b7b08846c09e3e4e23f2dc9784,[NH2:1][C:2]1[CH:17]=[CH:16][C:5]([C:6]([NH:8]...,2,10.0,1,55.0,,43.400002,US05723075,"4,4'-Diaminobenzanilide (0.568 g, 2.5 mM) was ..."
...,...,...,...,...,...,...,...,...,...
ord-0ba13e40e6ab4628a543d7ac7faf6fdf,[CH3:1][N:2]([CH3:24])[CH2:3][CH2:4][NH:5][C:6...,,,,,,,US07968746B2,N-(2-(dimethylamino)ethyl)-2-hydroxy-2-methyl-...
ord-8fa3fc051e7f496593d7c7ba2a3aa564,[OH:1][C:2]1[C:3]([CH3:18])=[C:4]2[C:9](=[C:10...,1,2.0,,,2,78.099998,US07968746B2,"6-Hydroxy-2,5,7,8-tetramethylchroman-2-carboxy..."
ord-efddf5d225dc462db60c67a0306a9460,[OH:1][C:2]1[C:3]([CH3:23])=[C:4]2[C:9](=[C:10...,1,3.0,,,2,,US07968746B2,"A solution of 319 mg of (6-hydroxy-2,5,7,8-tet..."
ord-dc172fbdf10e4ae893a012785f609af2,[OH:1][C:2]1[C:3]([CH3:18])=[C:4]2[C:9](=[C:10...,1,8.0,1,40.0,,,US07968746B2,"6-Hydroxy-2,5,7,8-tetramethylchroman-2-carboxy..."


In [16]:
# Column dtypes and memory use of the freshly built frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1770543 entries, ord-89aff4b1c18042e4bbb1f88ebba96f86 to ord-d1adc6bb76454fb9b0b4a2f30a493858
Data columns (total 9 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   rxn_smiles    object
 1   time_unit     object
 2   time_val      object
 3   temp_unit     object
 4   temp_val      object
 5   temp_control  object
 6   yield         object
 7   patent        object
 8   notes         object
dtypes: object(9)
memory usage: 135.1+ MB


In [17]:
%%time
numerical = ['time_val', 'temp_val', 'yield']
categorical = ['time_unit', 'temp_unit', 'temp_control', 'patent']

# One target dtype per column: float for measured values, pandas 'category' for
# codes and patent ids. Columns are converted in the listed order.
target_dtypes = {
    **dict.fromkeys(numerical, float),
    **dict.fromkeys(categorical, 'category'),
}
for column, dtype in target_dtypes.items():
    df[column] = df[column].astype(dtype)

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1770543 entries, ord-89aff4b1c18042e4bbb1f88ebba96f86 to ord-d1adc6bb76454fb9b0b4a2f30a493858
Data columns (total 9 columns):
 #   Column        Dtype   
---  ------        -----   
 0   rxn_smiles    object  
 1   time_unit     category
 2   time_val      float64 
 3   temp_unit     category
 4   temp_val      float64 
 5   temp_control  category
 6   yield         float64 
 7   patent        category
 8   notes         object  
dtypes: category(4), float64(3), object(2)
memory usage: 97.9+ MB
CPU times: total: 1.97 s
Wall time: 1.96 s


In [18]:
# Show the frame after the dtype conversion.
df

Unnamed: 0,rxn_smiles,time_unit,time_val,temp_unit,temp_val,temp_control,yield,patent,notes
ord-89aff4b1c18042e4bbb1f88ebba96f86,S(Cl)([Cl:3])=O.[CH2:5]([O:13][C:14]1[CH:22]=[...,,,,,,,US05723069,A large excess of thionyl chloride was added t...
ord-c218088b16c541088a933a7606f68c4c,[OH:1][C:2]1[CH:10]=[CH:9][C:5]([C:6]([OH:8])=...,4,1.0,,,,,US05723069,10 Grams of p-hydroxybenzoic acid and 15 g of ...
ord-d965967cbe69411f9fd760ac1f990dfb,F[C:2]1[CH:10]=[C:9]([OH:11])[CH:8]=[CH:7][C:3...,,,,,,,US05723069,p-Acetoxybenzoic acid was prepared in the same...
ord-e8642ae0ecb248809067cd131fde0eed,[NH2:1][C:2]1[CH:17]=[CH:16][C:5]([C:6]([NH:8]...,2,10.0,1,55.0,,72.300003,US05723075,"4,4'-Diaminobenzanilide (0.658 g, 2.5 mM) was ..."
ord-83e2d9b7b08846c09e3e4e23f2dc9784,[NH2:1][C:2]1[CH:17]=[CH:16][C:5]([C:6]([NH:8]...,2,10.0,1,55.0,,43.400002,US05723075,"4,4'-Diaminobenzanilide (0.568 g, 2.5 mM) was ..."
...,...,...,...,...,...,...,...,...,...
ord-0ba13e40e6ab4628a543d7ac7faf6fdf,[CH3:1][N:2]([CH3:24])[CH2:3][CH2:4][NH:5][C:6...,,,,,,,US07968746B2,N-(2-(dimethylamino)ethyl)-2-hydroxy-2-methyl-...
ord-8fa3fc051e7f496593d7c7ba2a3aa564,[OH:1][C:2]1[C:3]([CH3:18])=[C:4]2[C:9](=[C:10...,1,2.0,,,2,78.099998,US07968746B2,"6-Hydroxy-2,5,7,8-tetramethylchroman-2-carboxy..."
ord-efddf5d225dc462db60c67a0306a9460,[OH:1][C:2]1[C:3]([CH3:23])=[C:4]2[C:9](=[C:10...,1,3.0,,,2,,US07968746B2,"A solution of 319 mg of (6-hydroxy-2,5,7,8-tet..."
ord-dc172fbdf10e4ae893a012785f609af2,[OH:1][C:2]1[C:3]([CH3:18])=[C:4]2[C:9](=[C:10...,1,8.0,1,40.0,,,US07968746B2,"6-Hydroxy-2,5,7,8-tetramethylchroman-2-carboxy..."


In [19]:
"""
Definition from reaction.proto

enum TimeUnit {
UNSPECIFIED = 0;
DAY = 4;
HOUR = 1;
MINUTE = 2;
SECOND = 3;
}
"""

# TimeUnit enum code -> name, mirroring the reaction.proto definition quoted above.
time_unit_map = {
    0: 'UNSPECIFIED',
    4: 'DAY',
    1: 'HOUR',
    2: 'MINUTE',
    3: 'SECOND'
}

# Relabel the categories instead of using Series.map so the cell is safe to re-run:
# rename_categories passes labels that are not keys of the mapping (i.e. names that
# were already translated) through unchanged, whereas map() would turn the whole
# column into NaN on a second execution. astype('category') is a no-op when the
# column is already categorical.
df['time_unit'] = df['time_unit'].astype('category').cat.rename_categories(time_unit_map)
df.time_unit

ord-89aff4b1c18042e4bbb1f88ebba96f86       NaN
ord-c218088b16c541088a933a7606f68c4c       DAY
ord-d965967cbe69411f9fd760ac1f990dfb       NaN
ord-e8642ae0ecb248809067cd131fde0eed    MINUTE
ord-83e2d9b7b08846c09e3e4e23f2dc9784    MINUTE
                                         ...  
ord-0ba13e40e6ab4628a543d7ac7faf6fdf       NaN
ord-8fa3fc051e7f496593d7c7ba2a3aa564      HOUR
ord-efddf5d225dc462db60c67a0306a9460      HOUR
ord-dc172fbdf10e4ae893a012785f609af2      HOUR
ord-d1adc6bb76454fb9b0b4a2f30a493858       NaN
Name: time_unit, Length: 1770543, dtype: category
Categories (4, object): ['HOUR', 'MINUTE', 'SECOND', 'DAY']

In [20]:
"""
Definition from reaction.proto

  enum TemperatureUnit {
    UNSPECIFIED = 0;
    CELSIUS = 1;
    FAHRENHEIT = 2;
    KELVIN = 3;
  }

"""
# TemperatureUnit enum code -> name, mirroring the reaction.proto definition quoted above.
temp_unit_map = {
    0: "UNSPECIFIED",
    1: "CELSIUS",
    2: "FAHRENHEIT",
    3: "KELVIN"
}
# Relabel the categories instead of using Series.map so the cell is safe to re-run:
# labels that are not keys of the mapping (already-translated names) pass through
# unchanged, whereas map() would turn the whole column into NaN on a second execution.
df['temp_unit'] = df['temp_unit'].astype('category').cat.rename_categories(temp_unit_map)
df.temp_unit

ord-89aff4b1c18042e4bbb1f88ebba96f86        NaN
ord-c218088b16c541088a933a7606f68c4c        NaN
ord-d965967cbe69411f9fd760ac1f990dfb        NaN
ord-e8642ae0ecb248809067cd131fde0eed    CELSIUS
ord-83e2d9b7b08846c09e3e4e23f2dc9784    CELSIUS
                                         ...   
ord-0ba13e40e6ab4628a543d7ac7faf6fdf        NaN
ord-8fa3fc051e7f496593d7c7ba2a3aa564        NaN
ord-efddf5d225dc462db60c67a0306a9460        NaN
ord-dc172fbdf10e4ae893a012785f609af2    CELSIUS
ord-d1adc6bb76454fb9b0b4a2f30a493858        NaN
Name: temp_unit, Length: 1770543, dtype: category
Categories (3, object): ['CELSIUS', 'FAHRENHEIT', 'KELVIN']

In [21]:
"""
Definition from reaction.proto

enum TemperatureControlType {
  UNSPECIFIED = 0;
  CUSTOM = 1;
  AMBIENT = 2;
  OIL_BATH = 3;
  WATER_BATH = 4;
  SAND_BATH = 5;
  ICE_BATH = 6;
  DRY_ALUMINUM_PLATE = 7;
  MICROWAVE = 8;
  DRY_ICE_BATH = 9;
  AIR_FAN = 10;
  LIQUID_NITROGEN = 11;
}
"""

# TemperatureControlType enum code -> name, mirroring the reaction.proto definition quoted above.
temp_control_map = {
    0: 'UNSPECIFIED',
    1: 'CUSTOM',
    2: 'AMBIENT',
    3: 'OIL_BATH',
    4: 'WATER_BATH',
    5: 'SAND_BATH',
    6: 'ICE_BATH',
    7: 'DRY_ALUMINUM_PLATE',
    8: 'MICROWAVE',
    9: 'DRY_ICE_BATH',
    10: 'AIR_FAN',
    11: 'LIQUID_NITROGEN',
}
# Relabel the categories instead of using Series.map so the cell is safe to re-run:
# labels that are not keys of the mapping (already-translated names) pass through
# unchanged, whereas map() would turn the whole column into NaN on a second execution.
df['temp_control'] = df['temp_control'].astype('category').cat.rename_categories(temp_control_map)
df.temp_control

ord-89aff4b1c18042e4bbb1f88ebba96f86        NaN
ord-c218088b16c541088a933a7606f68c4c        NaN
ord-d965967cbe69411f9fd760ac1f990dfb        NaN
ord-e8642ae0ecb248809067cd131fde0eed        NaN
ord-83e2d9b7b08846c09e3e4e23f2dc9784        NaN
                                         ...   
ord-0ba13e40e6ab4628a543d7ac7faf6fdf        NaN
ord-8fa3fc051e7f496593d7c7ba2a3aa564    AMBIENT
ord-efddf5d225dc462db60c67a0306a9460    AMBIENT
ord-dc172fbdf10e4ae893a012785f609af2        NaN
ord-d1adc6bb76454fb9b0b4a2f30a493858        NaN
Name: temp_control, Length: 1770543, dtype: category
Categories (1, object): ['AMBIENT']

In [22]:
# Re-check dtypes and memory use after the enum codes were replaced by names.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1770543 entries, ord-89aff4b1c18042e4bbb1f88ebba96f86 to ord-d1adc6bb76454fb9b0b4a2f30a493858
Data columns (total 9 columns):
 #   Column        Dtype   
---  ------        -----   
 0   rxn_smiles    object  
 1   time_unit     category
 2   time_val      float64 
 3   temp_unit     category
 4   temp_val      float64 
 5   temp_control  category
 6   yield         float64 
 7   patent        category
 8   notes         object  
dtypes: category(4), float64(3), object(2)
memory usage: 162.4+ MB


In [23]:
# Frame with human-readable time-unit, temperature-unit and temperature-control names.
df

Unnamed: 0,rxn_smiles,time_unit,time_val,temp_unit,temp_val,temp_control,yield,patent,notes
ord-89aff4b1c18042e4bbb1f88ebba96f86,S(Cl)([Cl:3])=O.[CH2:5]([O:13][C:14]1[CH:22]=[...,,,,,,,US05723069,A large excess of thionyl chloride was added t...
ord-c218088b16c541088a933a7606f68c4c,[OH:1][C:2]1[CH:10]=[CH:9][C:5]([C:6]([OH:8])=...,DAY,1.0,,,,,US05723069,10 Grams of p-hydroxybenzoic acid and 15 g of ...
ord-d965967cbe69411f9fd760ac1f990dfb,F[C:2]1[CH:10]=[C:9]([OH:11])[CH:8]=[CH:7][C:3...,,,,,,,US05723069,p-Acetoxybenzoic acid was prepared in the same...
ord-e8642ae0ecb248809067cd131fde0eed,[NH2:1][C:2]1[CH:17]=[CH:16][C:5]([C:6]([NH:8]...,MINUTE,10.0,CELSIUS,55.0,,72.300003,US05723075,"4,4'-Diaminobenzanilide (0.658 g, 2.5 mM) was ..."
ord-83e2d9b7b08846c09e3e4e23f2dc9784,[NH2:1][C:2]1[CH:17]=[CH:16][C:5]([C:6]([NH:8]...,MINUTE,10.0,CELSIUS,55.0,,43.400002,US05723075,"4,4'-Diaminobenzanilide (0.568 g, 2.5 mM) was ..."
...,...,...,...,...,...,...,...,...,...
ord-0ba13e40e6ab4628a543d7ac7faf6fdf,[CH3:1][N:2]([CH3:24])[CH2:3][CH2:4][NH:5][C:6...,,,,,,,US07968746B2,N-(2-(dimethylamino)ethyl)-2-hydroxy-2-methyl-...
ord-8fa3fc051e7f496593d7c7ba2a3aa564,[OH:1][C:2]1[C:3]([CH3:18])=[C:4]2[C:9](=[C:10...,HOUR,2.0,,,AMBIENT,78.099998,US07968746B2,"6-Hydroxy-2,5,7,8-tetramethylchroman-2-carboxy..."
ord-efddf5d225dc462db60c67a0306a9460,[OH:1][C:2]1[C:3]([CH3:23])=[C:4]2[C:9](=[C:10...,HOUR,3.0,,,AMBIENT,,US07968746B2,"A solution of 319 mg of (6-hydroxy-2,5,7,8-tet..."
ord-dc172fbdf10e4ae893a012785f609af2,[OH:1][C:2]1[C:3]([CH3:18])=[C:4]2[C:9](=[C:10...,HOUR,8.0,CELSIUS,40.0,,,US07968746B2,"6-Hydroxy-2,5,7,8-tetramethylchroman-2-carboxy..."


In [24]:
%%time
# Save the cleaned frame as a pickle; pandas infers the compression from the extension.
# NOTE(review): '.zst' presumably requires the `zstandard` package — confirm it is installed.
df.to_pickle(f"{ORD_PATH}/uspto_rxn.pkl.zst")

CPU times: total: 10.2 s
Wall time: 10.2 s
