In [2]:
import pandas as pd
import numpy as np
from ord_data_load import ORD_PATH, ORD_REPO_PATH

# Raise pandas' display limits: up to 500 rows / 500 columns and a
# 1000-character wide console before output gets truncated or wrapped.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

# Silence RDKit's log output by disabling every logger matching 'rdApp.*'.
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')

# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to local helper modules are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# all_reactions = pd.read_pickle(f"{ORD_PATH}/all_reactions.pkl")

#TODO treat those datasets separately

# Datasets of interest

### 1) Pd CAT REACTIONS DATASETS (500-5k cmpds, Good quality, quantitative)

750 Buchwald-Hartwig reactions generated from the AstraZeneca ELN, used in the yield-prediction work
*"Graph Neural Networks for Predicting Chemical Reaction Performance"*
[https://doi.org/10.26434/chemrxiv.14589498.v2](https://doi.org/10.26434/chemrxiv.14589498.v2)
[ord_dataset-00005539a1e04c809a9a78647bea649c](https://open-reaction-database.org/client/search?dataset_ids=ord_dataset-00005539a1e04c809a9a78647bea649c)


1536 reactions using HTS with Pd cat. Good quality quantitative data.
*"Nanomole-scale high-throughput chemistry for the synthesis of complex molecules"*
[https://doi.org/10.1126/science.1259203](https://doi.org/10.1126/science.1259203)
[ord_dataset-7d8f5fd922d4497d91cb81489b052746](https://open-reaction-database.org/client/search?dataset_ids=ord_dataset-7d8f5fd922d4497d91cb81489b052746)

5760 Suzuki reactions. Quantitative data. Single scaffold [6-quin]-[4-benzIm]
*A platform for automated nanomole-scale reaction screening and micromole-scale synthesis in flow*
[https://doi.org/10.1126/science.aap9112](https://doi.org/10.1126/science.aap9112)
```ord_dataset-68cb8b4b2b384e3d85b5b1efae58b203```

376 Suzuki reactions. Quantitative data.
Same scaffold as ```ord_dataset-68cb8b4b2b384e3d85b5b1efae58b203```
[6-quin]-[4-benzIm], but with multiple outcomes as byproducts.
```ord_dataset-eeba974d3c284aed86d1c1d442260a1e```


4312 C-N cross-coupling reactions. Quantitative data. Single scaffold [Ph-NH-Ph].
Different heterocycles - B, Si etc.
*"Predicting reaction performance in C–N cross-coupling using machine learning"*
[https://doi.org/10.1126/science.aar5169](https://doi.org/10.1126/science.aar5169)
```ord_dataset-46ff9a32d9e04016b9380b1b1ef949c3```


288 Buchwald reactions. Quantitative data. Single reaction:
[3-Br-py + 4-Ph-THPyrazine]
```ord_dataset-cbcc4048add7468e850b6ec42549c70d```


450 Pd/Ni cat reactions. Might be usable. Single Scaffold [Ph-Ph]
*"Linking Mechanistic Analysis of Catalytic Reactivity Cliffs to Ligand Classification"*
[https://doi.org/10.26434/chemrxiv.14388557.v1](https://doi.org/10.26434/chemrxiv.14388557.v1)
```ord_dataset-3b5db90e337942ea886b8f5bc5e3aa72```

### 2) 500k REACTIONS, NO CONDITIONS, CLEANED SINGLE DATASET.

500k reactions. Split into train and test sets. Different reaction classes. No conditions
*"A graph-convolutional neural network model for the prediction of chemical reactivity"*
[https://doi.org/10.1039/C8SC04228D](https://doi.org/10.1039/C8SC04228D)
```
ord_dataset-de0979205c84441190feef587fef8d6d
ord_dataset-488402f6ec0d441ca2f7d6fabea7c220
ord_dataset-5481550056a14935b76e031fb94b88be
```

### 3) USPTO DATABASE ~1.7M reactions. Needs cleaning. Raw data
Identified by a non-null "patent" field


### 4) everything else - Low quality or small datasets

1728 data points with low variability.
*Nano CN PhotoChemistry Informers Library*
```ord_dataset-ac78456835404910b3a4c840248b6ac9```


1728 + 256 quantitative data points. Imidazole arylation. Single reaction.
https://doi.org/10.1038/s41586-021-03213-y
```ord_dataset-d26118acda314269becc35db5c22dc59```
```ord_dataset-0c75d67751634f0594b24b9f498b77c2```




In [3]:
# uspto_df = all_reactions.loc[~all_reactions.patent.isna()]
# uspto_df

In [3]:
from glob import glob
import os
import multiprocessing as mp
import gzip
from google import protobuf

from ord_schema import message_helpers
from ord_schema.proto import dataset_pb2
from ord_schema.proto import reaction_pb2

In [4]:
from time import time
from ord_data_load import load_dataset, filter_uspto_filenames

# Serial scan, kept for reference but disabled: it loads every *.pb.gz
# dataset one by one, prints a running progress line, and collects the paths
# of datasets whose name contains "uspto".
# uspto_filenames = []
# N = len(glob(f'{ORD_REPO_PATH}/data/*/*.pb.gz'))
#
# start = time()
# for i, pb in enumerate(glob(f'{ORD_REPO_PATH}/data/*/*.pb.gz'), 1):
#     print(f"{i:3d} / {N}: parsing dataset {time() - start:.1f}s", end="\r")
#     dataset = load_dataset(pb)
#     if "uspto" in dataset.name:
#         uspto_filenames.append(pb)
#
# len(uspto_filenames)

# Hard-coded copy of the loop's final progress line (presumably recorded from
# an earlier run); nothing is parsed or timed when this cell executes.
print("515 / 515: parsing dataset 134.3s")

515 / 515: parsing dataset 134.3s


In [5]:
%%time
n_cores = 24

with mp.Pool(n_cores) as p:
    uspto_filenames = p.map(filter_uspto_filenames, glob(f'{ORD_REPO_PATH}/data/*/*.pb.gz'))

pd.Series(uspto_filenames)

Wall time: 30.6 s


0                                                   None
1      ./ord-data/data\01\ord_dataset-018fd0e1351f4fd...
2      ./ord-data/data\01\ord_dataset-01dbb772c5e2491...
3      ./ord-data/data\02\ord_dataset-02ee22616630481...
4      ./ord-data/data\03\ord_dataset-0387783899c642a...
                             ...                        
510    ./ord-data/data\fd\ord_dataset-fdef1f30cad6443...
511    ./ord-data/data\fe\ord_dataset-fe016e2f90e741a...
512    ./ord-data/data\fe\ord_dataset-fea3ada7aaad45a...
513    ./ord-data/data\ff\ord_dataset-ff0bcb6c2300494...
514    ./ord-data/data\ff\ord_dataset-ffbef48837674f3...
Length: 515, dtype: object

In [6]:
# Persist the USPTO dataset paths: drop the None placeholders left for
# non-USPTO files and write one path per line, with no header and no index.
# (Optional, currently unused: reduce each path to its bare dataset id with
#  .apply(lambda x: os.path.basename(x).split(".")[0]).)
uspto_csv_path = f"{ORD_PATH}/uspto_files.csv"
uspto_files = pd.Series(uspto_filenames).dropna()
uspto_files.to_csv(uspto_csv_path, header=False, index=False)

In [7]:
# The CSV is written without a header row (to_csv(..., header=False)), so it
# must be read with header=None; with the default header handling pandas
# consumes the first filename as the column name and silently drops it.
uspto_files = pd.read_csv(f"{ORD_PATH}/uspto_files.csv", header=None).squeeze("columns")

# Pick one USPTO dataset file at random for inspection (unseeded, so the
# chosen file differs between runs).
pb = uspto_files.sample().iat[0]
pb

'./ord-data/data\\48\\ord_dataset-48929f64ce614f1181a555eafd7c97a6.pb.gz'

In [8]:
# Load the sampled dataset file with the project's load_dataset helper.
dataset = load_dataset(pb)

In [9]:
# Print the name of every field that ListFields() reports for the dataset
# message (protobuf returns (descriptor, value) pairs for the set fields).
for field_descriptor, _field_value in dataset.ListFields():
    print(field_descriptor.name)

# print(dataset.description)
# print(len(dataset.reactions))

name
description
reactions
dataset_id


In [10]:
# Show the inputs map of the third reaction (index 2) of the loaded dataset.
dataset.reactions[2].inputs

{'m4': components {
  identifiers {
    type: NAME
    value: "petrol-ether"
  }
  amount {
    volume {
      value: 500.0
      units: MILLILITER
    }
  }
  reaction_role: REACTANT
}
, 'm1_m2_m3': components {
  identifiers {
    type: NAME
    value: "isophorone"
  }
  identifiers {
    type: SMILES
    value: "O=C1C=C(CC(C)(C)C1)C"
  }
  identifiers {
    type: INCHI
    value: "InChI=1S/C9H14O/c1-7-4-8(10)6-9(2,3)5-7/h4H,5-6H2,1-3H3"
  }
  amount {
    mass {
      value: 13.819999694824219
      units: GRAM
    }
  }
  reaction_role: REACTANT
}
components {
  identifiers {
    type: NAME
    value: "N-bromo-succinimide"
  }
  identifiers {
    type: SMILES
    value: "BrN1C(CCC1=O)=O"
  }
  identifiers {
    type: INCHI
    value: "InChI=1S/C4H4BrNO2/c5-6-3(7)1-2-4(6)8/h1-2H2"
  }
  amount {
    mass {
      value: 17.799999237060547
      units: GRAM
    }
  }
  reaction_role: REACTANT
}
components {
  identifiers {
    type: NAME
    value: "\316\261,\316\261\'-azodiisobutyron

In [11]:
# Show the products recorded on the first outcome of the fifth reaction (index 4).
dataset.reactions[4].outcomes[0].products

[identifiers {
  type: NAME
  value: "2,3-dimethylpent-3-yl allyl carbonate"
}
identifiers {
  type: SMILES
  value: "C(OC(C(C)C)(CC)C)(OCC=C)=O"
}
identifiers {
  type: INCHI
  value: "InChI=1S/C11H20O3/c1-6-8-13-10(12)14-11(5,7-2)9(3)4/h6,9H,1,7-8H2,2-5H3"
}
measurements {
  type: AMOUNT
  details: "MASS"
  amount {
    mass {
      value: 10.789999961853027
      units: GRAM
    }
  }
}
measurements {
  type: YIELD
  details: "CALCULATEDPERCENTYIELD"
  percentage {
    value: 53.900001525878906
  }
}
reaction_role: PRODUCT
]

In [12]:
# Peek at the field list of the reaction's protobuf descriptor. The repr is
# opaque (<MessageFields sequence>); iterate over it to see the field names.
dataset.reactions[2].DESCRIPTOR.fields

<MessageFields sequence>

In [13]:
%%time
# Flatten the whole dataset message into a single dict; as the output shows,
# keys are field paths such as 'reactions[0].identifiers[0].type'.
message_helpers.message_to_row(dataset)

Wall time: 621 ms


{'name': 'uspto-grants-1978_06',
 'description': 'CML filenames: pftaps19780606_wk23.xml,pftaps19780613_wk24.xml,pftaps19780620_wk25.xml,pftaps19780627_wk26.xml',
 'reactions[0].identifiers[0].type': 'REACTION_CXSMILES',
 'reactions[0].identifiers[0].value': '[OH-:1].[Na+].[CH3:3][C:4]1[C:9](=[O:10])[C:8]([CH3:12])([CH3:11])[CH2:7][C:6](=[O:13])[CH:5]=1.OO.O>CO>[CH3:3][C:4]12[O:1][CH:5]1[C:6](=[O:13])[CH2:7][C:8]([CH3:12])([CH3:11])[C:9]2=[O:10] |f:0.1|',
 'reactions[0].identifiers[0].is_mapped': True,
 'reactions[0].inputs["m1_m2_m3_m4_m5_m7"].components[0].identifiers[0].type': 'NAME',
 'reactions[0].inputs["m1_m2_m3_m4_m5_m7"].components[0].identifiers[0].value': 'aqueous solution',
 'reactions[0].inputs["m1_m2_m3_m4_m5_m7"].components[0].amount.moles.value': 0.0,
 'reactions[0].inputs["m1_m2_m3_m4_m5_m7"].components[0].amount.moles.precision': 1.0,
 'reactions[0].inputs["m1_m2_m3_m4_m5_m7"].components[0].amount.moles.units': 'MOLE',
 'reactions[0].inputs["m1_m2_m3_m4_m5_m7"].compon

In [14]:
%%time
# Collect every Reaction submessage of the dataset and display the one at index 50.
rxn_msg_list = message_helpers.find_submessages(dataset, reaction_pb2.Reaction)
rxn_msg_list[50]

Wall time: 0 ns


identifiers {
  type: REACTION_CXSMILES
  value: "[CH3:1][C:2]1[C:7]([CH3:8])=[CH:6][CH:5]=[C:4]([C:9]([CH3:16])([CH3:15])[CH2:10][C:11]([CH3:14])([CH3:13])[CH3:12])[C:3]=1[OH:17].[C:18]([O:22][CH3:23])(=[O:21])[CH:19]=[CH2:20]>>[CH3:8][C:7]1[C:2]([CH3:1])=[C:3]([OH:17])[C:4]([C:9]([CH2:10][C:11]([CH3:12])([CH3:14])[CH3:13])([CH3:16])[CH3:15])=[CH:5][C:6]=1[CH2:20][CH2:19][C:18]([O:22][CH3:23])=[O:21]"
  is_mapped: true
}
inputs {
  key: "m2"
  value {
    components {
      identifiers {
        type: NAME
        value: "2,3-dimethyl-6-(1,1,3,3-tetramethylbutyl)phenol"
      }
      identifiers {
        type: SMILES
        value: "CC1=C(C(=CC=C1C)C(CC(C)(C)C)(C)C)O"
      }
      identifiers {
        type: INCHI
        value: "InChI=1S/C16H26O/c1-11-8-9-13(14(17)12(11)2)16(6,7)10-15(3,4)5/h8-9,17H,10H2,1-7H3"
      }
      amount {
        moles {
          value: 0.0
          precision: 1.0
          units: MOLE
        }
      }
      reaction_role: REACTANT
    }
  }
}
inputs

In [15]:
# Collect the Compound submessages of reaction 50 and show the second
# identifier of the second compound (a SMILES entry in the output).
cmpds = message_helpers.find_submessages(rxn_msg_list[50], reaction_pb2.Compound)
cmpds[1].identifiers[1]

type: SMILES
value: "C(C=C)(=O)OC"

In [16]:
%%time
from time import time
reactants = np.empty((80000, 2), dtype='<U512')
# reactant_smiles = np.empty((80000, 2), dtype='U256')
# reactant_smiles = np.char.array([""])
reactants_df = pd.DataFrame(columns=['name', 'smiles'])

start = time()
pb = glob(f'{ORD_REPO_PATH}/data/*/*c3c1091f873b4f40827973a6f1f9b685.pb.gz')[0]
# dataset = message_helpers.load_message(pb, dataset_pb2.Dataset)
dataset = load_dataset(pb)
print(f"Dataset {dataset.name} loaded in {time() - start:.2f}s", )

start = time()
idx = 0

for rxn in message_helpers.find_submessages(dataset, reaction_pb2.Reaction):
    cmpds = message_helpers.find_submessages(rxn, reaction_pb2.Compound)
    for cmpd in cmpds:
        if cmpd.reaction_role == reaction_pb2.ReactionRole.REACTANT:
            name = message_helpers.get_compound_identifier(cmpd, reaction_pb2.CompoundIdentifier.NAME)
            smiles = message_helpers.get_compound_identifier(cmpd, reaction_pb2.CompoundIdentifier.SMILES)
            # print(name)
            # reactants = np.append(reactants, [[name, smiles]], axis=0)
            # reactants[idx] = [name, smiles]
            reactants[idx, 0] = name
            reactants[idx, 1] = smiles
            # reactants_df.loc[len(reactants_df), reactants_df.columns] = name, smiles
            idx += 1

print(f"Dataset {dataset.name} parsed in {time() - start:.2f}s", )

print(reactants.shape)
print("Total reactions parsed", len(dataset.reactions))
reactants

Dataset uspto-grants-2014_09 loaded in 1.05s
Dataset uspto-grants-2014_09 parsed in 2.79s
(80000, 2)
Total reactions parsed 17639
Wall time: 3.84 s


array([['Hydrochloric acid', 'Cl'],
       ['acetyl chloride', 'C(C)(=O)Cl'],
       ['3-amino-2-naphthol', 'NC=1C(=CC2=CC=CC=C2C1)O'],
       ...,
       ['', ''],
       ['', ''],
       ['', '']], dtype='<U512')

In [17]:
# Summarise reactants_df: index size, column dtypes and non-null counts.
reactants_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   name    0 non-null      object
 1   smiles  0 non-null      object
dtypes: object(2)
memory usage: 0.0+ bytes


In [18]:
# Experiment with row insertion: start from an empty two-column frame and add
# one row at label 0 through .loc enlargement.
# NOTE: `name` and `smiles` are whatever the reactant-parsing loop assigned
# last, so this cell only works after that cell has run.
reactants_df = pd.DataFrame(columns=['name', 'smiles'])
reactants_df.loc[0, reactants_df.columns] = name, smiles
reactants_df.head()

Unnamed: 0,name,smiles
0,trifluoroacetic acid,FC(C(=O)O)(F)F


In [19]:
# Display the current contents of reactants_df.
reactants_df

Unnamed: 0,name,smiles
0,trifluoroacetic acid,FC(C(=O)O)(F)F


In [20]:
# Scratch check of how assignment into the NumPy string array behaves.
# Work on a copy of the first row so the parsed reactant data in `reactants`
# is not overwritten by the test values.
idx = 0
scratch = reactants[:1].copy()
scratch[idx] = "ac"      # a scalar is broadcast to both columns of the row
scratch[idx, 1] = "bb"   # overwrite only the second column
# reactants[0] = name, smiles
scratch[idx, 1]

'bb'

In [21]:
# Report the array's memory footprint in bytes, then display the row whose
# name (column 0) has the most characters.
print(reactants.nbytes)
name_lengths = np.char.str_len(reactants[:, 0])
reactants[name_lengths.argmax()]

327680000


array(['(2S,3S,4S,5R,6R)-benzyl 6-((4-((1R,3aS,5aR,5bR,7aR,11aS,11bR,13aR,13bR)-3a-((2-(1,1-dioxidothiomorpholino)ethyl)amino)-5a,5b,8,8,11a-pentamethyl-1-(prop-1-en-2-yl)-2,3,3a,4,5,5a,5b,6,7,7a,8,11,11a,11b,12,13,13a,13b-octadecahydro-1H-cyclopenta[a]chrysen-9-yl)benzoyl)oxy)-3,4,5-trihydroxytetrahydro-2H-pyran-2-carboxylate',
       'O=S1(CCN(CC1)CCN[C@]12[C@@H]([C@H]3CC[C@@H]4[C@]5(CC=C(C([C@@H]5CC[C@]4([C@@]3(CC1)C)C)(C)C)C1=CC=C(C(=O)O[C@@H]3[C@@H]([C@H]([C@@H]([C@H](O3)C(=O)OCC3=CC=CC=C3)O)O)O)C=C1)C)[C@@H](CC2)C(=C)C)=O'],
      dtype='<U512')