## Imports

In [1]:
import logging
import re
from datetime import datetime

import pandas as pd
import typer
from ord_schema import message_helpers, validations
from ord_schema.proto import dataset_pb2, reaction_pb2
from rdkit.Chem import MolFromSmiles, MolToSmiles
from rich.progress import track
from typing_extensions import Annotated


In [2]:
from surf_utils.helpers import pubchem_property_from_cas
from surf_utils.mappings import (
    doi_pattern,
    email_pattern,
    mapping_analyses_ord,
    mapping_atmo,
    mapping_role,
    mapping_stirring,
    orcid_pattern,
)

In [3]:
from surf2ord import surf2ord


  temp = max([float(t) for t in re.findall("[0-9\.]+", row.temperature_deg_c)])
  time_h = max([float(t) for t in re.findall("[0-9\.]+", row.time_h)])
  for cpd in sorted(set(re.findall("\w+_[1-9]", " ".join(row.index.tolist())))):
  "[0-9\.]",
  re.findall("[Ff]ound[:\s\w\(\)\+\-]+[0-9\.]+", row[f"{cpd}_ms"])[-1].strip(
  "[0-9\.]",
  re.findall("[Ee]xpect[ed]*[:\s\w\(\)\+\-]+[0-9\.]+", row[f"{cpd}_ms"])[


In [4]:
# Load the tab-separated borylation literature table and show its first five rows.
df = pd.read_csv("./data/borylation_lit.txt", sep="\t")
print(df.head())

         rxn_id                 source_id  source_type    rxn_type  \
0  lit_pub_bo_1  10.1021/acscatal.0c00152  publication  borylation   
1  lit_pub_bo_2  10.1021/acscatal.0c00152  publication  borylation   
2  lit_pub_bo_3  10.1021/acscatal.0c00152  publication  borylation   
3  lit_pub_bo_4  10.1021/acscatal.0c00152  publication  borylation   
4  lit_pub_bo_5  10.1021/acscatal.0c00152  publication  borylation   

   temperature_deg_c  time_h atmosphere stirring_shaking  scale_mol  \
0                100    48.0      ARGON      UNSPECIFIED   0.004437   
1                100    48.0      ARGON      UNSPECIFIED   0.004437   
2                100    48.0      ARGON      UNSPECIFIED   0.004437   
3                100    48.0      ARGON      UNSPECIFIED   0.004437   
4                100    48.0      ARGON      UNSPECIFIED   0.004437   

   concentration_mol_l  ... product_1_nmr            product_1_smiles  \
0                  NaN  ...           NaN  O1B(OC(C)(C)C1(C)C)C2COCC2   
1     

In [5]:
# Load the tab-separated Minisci literature table and show its first five rows.
# Note: this rebinds `df`, replacing the borylation frame loaded in the previous cell.
df = pd.read_csv("./data/minisci_lit.txt", sep="\t")
print(df.head())

              rxn_id                    source_id source_type    rxn_type  \
0  lsf_lit_minisci_1  10.1021/acs.orglett.8b02988       paper  alkylation   
1  lsf_lit_minisci_2  10.1021/acs.orglett.8b02988       paper  alkylation   
2  lsf_lit_minisci_3  10.1021/acs.orglett.8b02988       paper  alkylation   
3  lsf_lit_minisci_4  10.1021/acs.orglett.8b02988       paper  alkylation   
4  lsf_lit_minisci_5  10.1021/acs.orglett.8b02988       paper  alkylation   

  rxn_name  rxn_tech  temperature_deg_c  time_h atmosphere stirring_shaking  \
0  minisci  standard                 40    16.0         ar              st_   
1  minisci  standard                 40    16.0         ar              st_   
2  minisci  standard                 40    16.0         ar              st_   
3  minisci  standard                 40    16.0         ar              st_   
4  minisci  standard                 40    16.0         ar              st_   

   ...  product_2_ms  product_2_nmr product_3_cas product_3_sm

In [6]:
#df = pd.read_csv('minisci_tidy_lit.txt', delimiter = '\t')
#print(df.head(5))

## Borylation

In [7]:
# Settings for the borylation conversion; the following cell passes them to surf2ord.
# NOTE(review): `input` shadows the built-in input(); the name is kept because the
# next cell reads it.
input = "./data/borylation_lit.txt"
output = "./data/borylation_ord.pbtxt"
dataset_name = "Borylation dataset from Roche"
description = (
    "Borylation reactions which were extracted from the literature by Roche "
    "scientists and published in the SURF format "
    "(ChemRxiv, 2024, DOI: 10.26434/chemrxiv-2023-nfq7h)"
)

In [8]:
# Run the conversion with the borylation settings from the previous cell.
# Only the file paths, dataset name and description are passed here; every other
# surf2ord option is left at whatever default the function defines.
surf2ord(
    input_file=input,
    output_file=output,
    dataset_name=dataset_name,
    dataset_description=description,
)



Output()



INFO 2024-07-30 00:38:13,320 surf2ord.py:449: Running final ORD dataset validation...
INFO 2024-07-30 00:38:17,480 surf2ord.py:454: Writing ORD file ./data/borylation_ord.pbtxt


## Minisci

In [9]:
# --- Minisci conversion: file paths and dataset metadata ---
# NOTE(review): `input` shadows the built-in input(); the name is kept unchanged.
input = "./data/minisci_lit.txt"
output = "./data/minisci_ord.pbtxt"
dataset_name = "Minisci dataset from Roche"
description = (
    "Minisci reactions which were extracted from the literature by Roche "
    "scientists and published in the SURF format "
    "(ChemRxiv, 2024, DOI: 10.26434/chemrxiv-2023-nfq7h)"
)

# --- Submitter details and validation switch forwarded to surf2ord ---
username = "Alex T. Mueller"
email = "alex.mueller@roche.com"
orcid = "0000-0001-8063-9952"
organization = "F. Hoffmann-La Roche Ltd."
validate = True

# Stubs that were left commented out in this run (presumably further surf2ord
# options; confirm against the surf2ord signature before enabling):
#delimiter =
#overwrite_provenance = True
#validate_cat_smls

surf2ord(
    input_file=input,
    output_file=output,
    dataset_name=dataset_name,
    dataset_description=description,
    validate=validate,
    username=username,
    email=email,
    orcid=orcid,
    organization=organization,
)



Output()

INFO 2024-07-30 00:38:48,986 surf2ord.py:449: Running final ORD dataset validation...
INFO 2024-07-30 00:38:49,352 surf2ord.py:454: Writing ORD file ./data/minisci_ord.pbtxt
