In [None]:
from rdkit import Chem
from rdkit.Chem import AllChem,Descriptors
import itertools as itertools
import copy, random,re,collections
import pandas as pd
import numpy as np

# This notebook extracts reaction data for the Reaxys IDs used in the publication by Beker et al. (https://doi.org/10.1002/anie.201806920).
These structures were previously verified in that work, so the task here is to extract as many of these reactions as possible.

In [None]:
def clean_smiles(smiles,stereo=True):
    """Round-trip a SMILES string through RDKit to get a consistent spelling.

    Parameters
    ----------
    smiles : str
        SMILES string to normalise.
    stereo : bool, default True
        Forwarded to ``Chem.MolToSmiles`` as ``isomericSmiles``; pass False
        to write the result without stereochemistry.

    Returns
    -------
    str
        The SMILES string written back by RDKit.

    dependencies: rdkit.Chem
    """
    parsed = Chem.MolFromSmiles(smiles)
    return Chem.MolToSmiles(parsed, isomericSmiles=stereo)

# 1) Get Data from CSV Files

In [None]:
### Your Reaxys results file
# Read the ID column as text so an ID is never parsed as a float
# (which would later be rendered as e.g. '123.0' and fail to match).
df_master = pd.read_csv('mydataset.csv', dtype={'Reaction ID': str})

df_master = df_master.rename(columns={'Reaction ID':'rxid'})
my_cols = list(df_master.columns.values)
# Column-wise conversion: every rxid becomes a str (missing values become 'nan'),
# without the dtype upcasting a row-wise apply can introduce.
df_master['rxid'] = df_master['rxid'].astype(str)

### A file containing all the donated Reaxys IDs
df_grzy = pd.read_csv('donatedIDs.csv')
df_grzy = df_grzy.drop_duplicates()
# Each entry is later split on ',' so it must be a string even when the file
# happens to contain only single numeric IDs.
df_grzy['rxid'] = df_grzy['rxid'].astype(str)


## 1.1) Find any missing data

In [None]:
def _largest_yield(cell):
    """Return the largest number found in one 'Yield (numerical)' cell.

    Float cells (including NaN for blanks) are returned unchanged; any other
    value is scanned for numbers. NaN is returned when no number is present.
    """
    if isinstance(cell, float):
        return cell
    numbers = re.findall(r'[0-9]{1,3}\.{0,1}[0-9]{0,4}', str(cell))
    return max(float(n) for n in numbers) if numbers else float('nan')


def _per_product_yields(cell):
    """Split a ';'-separated yield cell into floats; [] if any piece is not numeric."""
    try:
        return [float(piece) for piece in str(cell).split(';')]
    except ValueError:
        return []


def _has_single_product(rxn_smiles):
    """True when the reaction has two '.'-separated pieces and no '.' on the product side."""
    return len(rxn_smiles.split('.')) == 2 and '.' not in rxn_smiles.partition('>>')[2]


df_ls = []

no_rxn_data = []            # groups for which every 'Reaction' cell is blank
prod_not_identifiable = []  # groups with reaction data but no identifiable product
my_cols = list(df_master.columns.values)
for x in df_grzy['rxid']:
    ##### Collect all rows of df_master belonging to this group. A group can hold several comma-separated IDs
    group_ids = x.split(',')
    temp_df = pd.concat([df_master.loc[df_master['rxid']==y,:] for y in group_ids])

    ##### Extract the yields and get the maximum, ignoring blank (NaN) cells so
    ##### that the result does not depend on the order of the rows
    yields = [_largest_yield(z) for z in temp_df['Yield (numerical)'].to_list()]
    known_yields = [z for z in yields if pd.notna(z)]
    if len(yields) ==0:
        max_yield = ""
    elif known_yields:
        max_yield = max(known_yields)
    else:
        max_yield = float('nan')

    ##### Distinct non-blank reactions in first-seen order (a set would make the
    ##### "first" reaction depend on string hashing and differ between kernels)
    reactions = [z for z in temp_df['Reaction'].tolist() if type(z) is not float]
    temp_rxns = [z for z in dict.fromkeys(reactions) if _has_single_product(z)]

    if len(temp_rxns) ==1:
        ##### Exactly one reaction with one product: take it
        my_rxn = temp_rxns[0]

    elif len(temp_rxns) >1:
        ### Several single-product reactions: pick the first one with stereochemistry, otherwise the first one
        temp_rxns_stereo = [z for z in temp_rxns if "@" in z]
        my_rxn = temp_rxns_stereo[0] if temp_rxns_stereo else temp_rxns[0]

    else:
        #### No single-product reaction: identify the major product by yield or molecular mass,
        #### or fall back to a product without stereocentres
        my_rxn = None
        for _, row in temp_df.iterrows():
            row_reaction = row['Reaction']
            if type(row_reaction) == float: #### Ignore blanks
                continue
            row_reactants = row_reaction.split('>>')[0]
            row_products = row_reaction.split('>>')[1].split('.')

            ### One yield per listed product: keep the product with the highest yield
            row_yield = _per_product_yields(row['Yield (numerical)'])
            if len(row_yield) == len(row_products):
                my_rxn = f"""{row_reactants}>>{max(zip(row_yield,row_products))[1]}"""
                break

            ### If the yields do not help, we check for molecular masses. DA reaction is 100% atom efficient so mass difference should be less than 1
            reactant_mass = Descriptors.MolWt(Chem.MolFromSmiles(row_reactants))
            for pm in row_products:
                if abs(reactant_mass - Descriptors.MolWt(Chem.MolFromSmiles(pm))) <1:
                    my_rxn = f"""{row_reactants}>>{pm}"""
                    break
            if my_rxn is not None:
                # Stop at the first row that yields a product; later rows must not overwrite it
                break

        #### If nothing else works, the solution is to make a reaction where no stereochemistry is involved
        if my_rxn is None:
            for _, row in temp_df.iterrows():
                row_reaction = row['Reaction']
                if type(row_reaction) == float: #### Ignore blanks
                    continue
                #### Remove stereochemistry from products then check if set() makes it 1 product. This generates regio/site-selective reactions
                row_simp_prods = [Chem.MolToSmiles(Chem.MolFromSmiles(y),isomericSmiles=False) for y in row_reaction.split('>>')[1].split('.')]
                if len(set(row_simp_prods)) ==1:
                    my_rxn = f"""{row_reaction.split('>>')[0]}>>{row_simp_prods[0]}"""
                    break

        #### If nothing works, record why: either no reaction data is present (blank) or the product could not be identified
        if my_rxn is None:
            if len(reactions) ==0:
                no_rxn_data.append(x)
            else:
                prod_not_identifiable.append(x)
            continue

    ### Make sure the reaction has been cleaned then add the dictionary to a list so that it can become a dataframe
    reactant_side, product_side = my_rxn.split('>>')[0], my_rxn.split('>>')[1]
    my_rxn = f"""{Chem.MolToSmiles(Chem.MolFromSmiles(reactant_side))}>>{Chem.MolToSmiles(Chem.MolFromSmiles(product_side))}"""
    for y in group_ids:
        df_ls.append({'rxid':y,'group':x,'yield':max_yield,'reaction':my_rxn})

df_gryz_rxns = pd.DataFrame(df_ls)
print(f'no reaction data for {len(no_rxn_data)} reactions\nno product was identified for {len(prod_not_identifiable)} reactions\n')

### For reactions with no data, the RXIDs are extracted so that they can be imported into Reaxys and missing reactions exported in a different format

In [None]:
# no_rxn_IDs = [x for x in ','.join(no_rxn_data).split(',') if x[0].isdigit()]
# with open('rxns_noRXN.txt','w') as wf:
#     for x in no_rxn_IDs:
#         wf.write(f'RX.ID={x}\n')

# 2) Extract Reactions from RDF format (.txt file)
Sometimes the data is not exported properly into certain formats so instead of the .csv format, I have chosen the RDF format. Other formats can also be used but the corresponding code needs to be changed.

Check that the yields in the RDF file match the yields on the Reaxys website for the following IDs:
2127634,4908869,8996022,2015581,3076836


In [None]:
def _rdf_yield_values(record):
    """Return the yield numbers of one RDF record as a list of floats.

    The fields NYD, YD and YDO are tried in that order and the first one that
    contains at least one number is used. [''] is returned when none does.
    """
    for field in ('NYD', 'YD', 'YDO'):
        hit = re.search(r'\$DTYPE ROOT\:RXD\([\d]*\)\:' + field + r'\n(\$[^\$]*)', record)
        if hit is None:
            continue
        # Only the text after the $DTYPE line is scanned, so the RXD(n) index is never read as a yield
        numbers = re.findall(r'[\d]+\.{0,1}\d{0,3}', hit.group(1))
        if numbers:
            return [float(n) for n in numbers]
    return ['']


with open('missingRXNdata.txt','r') as rf:
    my_file = rf.read()
reactions = my_file.split('$RFMT $RIREG ')

# Map every individual Reaxys ID to the comma-separated group it was donated in
# (if an ID occurs in several groups, the last group wins)
group_of_rxid = {}
for donated_group in df_grzy['rxid'].to_list():
    for member in donated_group.split(','):
        group_of_rxid[member] = donated_group

rxn_data = []
for r in reactions[1:]:
    rxid = re.search(r'\$DTYPE ROOT\:RX_ID\n\$DATUM [\d]*',r).group(0).split()[-1]
    # '' when the ID is in no donated group, instead of reusing the previous record's group
    my_group = group_of_rxid.get(rxid, '')

    molecules = re.findall(r'\$MOL[^\$]*',r)
    smiles = [Chem.MolToSmiles(Chem.MolFromMolBlock(x[5:])) for x in molecules]  # x[5:] drops the leading '$MOL' line marker

    # $RXN header: three free-text lines (name, program/timestamp, comment) followed by
    # the counts line "<n reactants> <n products>"; the header text itself is not matched
    counts = re.search(r'\$RXN\n[^\n]*\n[^\n]*\n[^\n]*\n *(\d+) +(\d+)',r)
    num_r,num_p = int(counts.group(1)),int(counts.group(2))
    reactants,products = smiles[:num_r],smiles[num_r:num_r+num_p]
    rxn_yield = _rdf_yield_values(r)

    # A single listed reactant is written twice (A.A>>P).
    # NOTE(review): presumably meant for dimerisations - confirm this is also wanted for intramolecular reactions
    if num_r == 1:
        reactants = reactants+reactants

    if len(products) ==1:
        if len(reactants) == 2:
            rxn_smiles = f'{reactants[0]}.{reactants[1]}>>{products[0]}'
            # Store the reaction; previously it was built but never added to rxn_data
            rxn_data.append({'rxid':rxid,'group':my_group,'yield':rxn_yield[0],'reaction':rxn_smiles})
        else:
            print(f'Rxn {rxid} has more than 2 reactants!')
        continue

    ### Several products: one yield per product -> keep the product with the highest yield
    if len(rxn_yield) == len(products):
        my_rxn = f"""{reactants[0]}.{reactants[1]}>>{max(zip(rxn_yield,products))[1]}"""
        rxn_data.append({'rxid':rxid,'group':my_group,'yield':max(rxn_yield),'reaction':my_rxn})
        continue

    ### Otherwise keep the product whose mass matches the combined reactants (within 1), if there is exactly one
    reactant_mass = Descriptors.MolWt(Chem.MolFromSmiles('.'.join(reactants)))
    valid_combos = [p for p in products if abs(Descriptors.MolWt(Chem.MolFromSmiles(p)) - reactant_mass) <1]
    if len(valid_combos) ==1:
        my_rxn = f"""{reactants[0]}.{reactants[1]}>>{valid_combos[0]}"""
        rxn_data.append({'rxid':rxid,'group':my_group,'yield':max(rxn_yield),'reaction':my_rxn})
        continue

    ### Last resort: convert to non-stereo and accept if all products collapse to one structure
    non_stereoproducts = [Chem.MolToSmiles(Chem.MolFromSmiles(x),isomericSmiles=False) for x in products]
    if len(set(non_stereoproducts)) == 1:
        my_rxn = f"""{Chem.MolToSmiles(Chem.MolFromSmiles(reactants[0]),isomericSmiles=False)}.{Chem.MolToSmiles(Chem.MolFromSmiles(reactants[1]),isomericSmiles=False)}>>{non_stereoproducts[0]}"""
        rxn_data.append({'rxid':rxid,'group':my_group,'yield':max(rxn_yield),'reaction':my_rxn})
        continue
    print('This reaction cannot be identified',rxid,reactants,products)
df_gryz_rxns_missing = pd.DataFrame(rxn_data)

### Concat the previously extracted data with these additional reactions
df_gryz_rxns_full = pd.concat([df_gryz_rxns,df_gryz_rxns_missing])
df_gryz_rxns_full
    


# 3) Ensure Reactions are Unique


In [None]:
### Remove repeats but keep the first
df_gryz_rxns_uniq = df_gryz_rxns_full.drop_duplicates(subset='reaction')
df_gryz_rxns_uniq = df_gryz_rxns_uniq.drop_duplicates(subset='rxid')
print(df_gryz_rxns_uniq.shape)
df_gryz_rxns_uniq = df_gryz_rxns_uniq.drop_duplicates(subset='group')
print(df_gryz_rxns_uniq.shape)

#### Remove later entries whose reaction matches an earlier one once product stereochemistry is ignored
my_cols = list(df_gryz_rxns_uniq.columns.values)
unique_reactions = []            # stereo-free reactions in first-seen order
ids_non_unique_reactions = []    # rxids of rows that repeat an earlier stereo-free reaction
seen_nostereo = set()            # same content as unique_reactions, for constant-time lookup
for _, row in df_gryz_rxns_uniq.iterrows():
    # Columns are read by label; positional integer access on a labelled Series is deprecated in pandas
    reaction = row['reaction']
    reactant_side, product_side = reaction.split('>>')[0], reaction.split('>>')[1]
    reaction_nostereo = f"""{reactant_side}>>{Chem.MolToSmiles(Chem.MolFromSmiles(product_side),isomericSmiles=False)}"""
    if reaction_nostereo in seen_nostereo:
        ids_non_unique_reactions.append(row['rxid'])
    else:
        seen_nostereo.add(reaction_nostereo)
        unique_reactions.append(reaction_nostereo)
df_gryz_rxns_uniq = df_gryz_rxns_uniq[~df_gryz_rxns_uniq['rxid'].isin(ids_non_unique_reactions)]
print(df_gryz_rxns_uniq.shape)

# 4) Put Reaction Data into CSV and Combine with Previous Dataset

In [None]:
def clean_rxn(rxnsmiles):
    """Rewrite both sides of a 'reactants>>products' SMILES through RDKit.

    Values that are not of type ``str`` (e.g. NaN from a blank cell) are
    returned unchanged. The string must contain exactly one '>>'.
    """
    if type(rxnsmiles) is not str:
        return rxnsmiles
    lhs, rhs = rxnsmiles.split('>>')
    rewritten = [Chem.MolToSmiles(Chem.MolFromSmiles(side)) for side in (lhs, rhs)]
    return '>>'.join(rewritten)
def create_rxn(diene,dienophile,product):
    """Build a reaction SMILES 'diene.dienophile>>product'.

    Parameters
    ----------
    diene, product : str
        SMILES of the diene and of the product.
    dienophile : str or missing value
        SMILES of the dienophile. Any non-string value (NaN from a blank
        cell, None, pd.NA) is treated as "no separate dienophile" and the
        reaction is written as 'diene>>product'.

    Returns
    -------
    str
    """
    # Only a real string is a usable SMILES; checking for float alone would
    # let None/NA through and produce 'diene.None>>product'.
    if not isinstance(dienophile, str):
        return f'{diene}>>{product}'
    return f'{diene}.{dienophile}>>{product}'

### Previous dataset from Reaxys_Data_Extraction.ipynb
# Read the reaction IDs as text: the IDs extracted above are strings, and an
# integer 123 would never be recognised as a duplicate of the string '123'.
df12773 = pd.read_csv('firstdataset.csv', dtype={'Reaction ID': str, 'rxid': str})
gryzrxns = df_gryz_rxns_uniq.copy()
df12773 = df12773.rename(columns={'Reaction ID':'rxid'})

### Format/Canonicalize SMILES
df12773['reactionsmiles'] = [
    create_rxn(diene, dienophile, product)
    for diene, dienophile, product in zip(df12773['Diene'], df12773['Dienophile'], df12773['Major Product'])
]
df12773['Cleaned_Reaction_Smiles'] = df12773['reactionsmiles'].map(clean_rxn)
gryzrxns['Cleaned_Reaction_Smiles'] = gryzrxns['reaction'].map(clean_rxn)

### Merge files; rows of the previous dataset come first, so they win when duplicates are dropped
dfmerged = pd.concat([df12773,gryzrxns])
dfmerged = dfmerged.drop_duplicates('rxid')
dfmerged = dfmerged.drop_duplicates('Cleaned_Reaction_Smiles')
dfmerged.to_csv(f'unverifiedrxns_{dfmerged.shape[0]}.csv')
dfmerged