In [1]:
import os, sys
import numpy as np
import pandas as pd
from pprint import pprint
sys.path.append('scripts')

#from rdkit.Chem import PandasTools
#PandasTools.RenderImagesInAllDataFrames(images=True)

from Synthesis import init_LocalTransform, predict_product, predict_product_batch 

# --- Run configuration ---------------------------------------------------
dataset = 'USPTO_480k'  # dataset directory the derived template info is read from
scenario = 'mix'        # 'sep' or 'mix'
device = 'cpu'          # cpu or cuda

# Paths derived from the settings above.
model_name = 'LocalTransform_%s' % scenario
model_path = 'models/%s.pth' % model_name
config_path = 'data/configs/default_config'
data_dir = 'data/%s' % dataset

# Argument bundle passed to init_LocalTransform (and reused by the
# prediction helpers in later cells).
args = {
    'data_dir': data_dir,
    'model_path': model_path,
    'config_path': config_path,
    'device': device,
    'mode': 'test',
}
model, graph_functions, template_dicts, template_infos = init_LocalTransform(args)

# Report the number of parameters that have requires_grad set, in millions.
# NOTE: model_parameters is a one-shot iterator and is exhausted by the sum.
model_parameters = (p for p in model.parameters() if p.requires_grad)
params = sum(np.prod(p.size()) for p in model_parameters)
print('# model parameters: %.2fM' % (params / 1000000))

Parameters of loaded LocalTransform:
{'attention_heads': 8, 'attention_layers': 3, 'edge_hidden_feats': 32, 'node_out_feats': 256, 'num_step_message_passing': 3, 'Template_rn': 4370, 'Template_vn': 2535, 'in_node_feats': 80, 'in_edge_feats': 13}
loaded 4370 real templates
loaded 2535 virtual templates
# model parameters: 9.09M


In [7]:
# Simple reaction outcome prediction
# Each entry is one reaction input: reactant/reagent SMILES joined by '.'.
# The third entry carries atom-map numbers; the first two are identical.
reactants = ['N#Cc1ccsc1N.O=[N+]([O-])c1cc(F)c(F)cc1F.C1CCOC1.[H-].[Na+]',
              'N#Cc1ccsc1N.O=[N+]([O-])c1cc(F)c(F)cc1F.C1CCOC1.[H-].[Na+]',
              '[CH3:2][CH2:1][NH:3][c:4]1[cH:13][cH:12][c:11]2[c:6]([cH:5]1)[C:7]([CH3:16])=[CH:8][CH2:9][C:10]2([CH3:14])[CH3:15].[O:23]=[C:22]([OH:24])[c:21]1[cH:25][cH:26][c:18](F)[n:19][cH:20]1']
sep = False
# The original cell set `verbose = 1` but hard-coded `verbose = 0` in the call,
# so the variable had no effect. It is now wired through, and kept at 0 so the
# call behaves exactly as before; change it here to adjust verbosity.
verbose = 0
results_df = predict_product_batch(args, reactants, model, graph_functions, template_dicts, template_infos, verbose = verbose, sep = sep)
# Bare last expression so the notebook displays the batch result.
results_df

['N#Cc1ccsc1Nc1cc(F)c(F)cc1[N+](=O)[O-]']
['N#Cc1ccsc1Nc1cc(F)c(F)cc1[N+](=O)[O-]']
['O=C(O)c1cccnc1']


['N#Cc1ccsc1Nc1cc(F)c(F)cc1[N+](=O)[O-]',
 'N#Cc1ccsc1Nc1cc(F)c(F)cc1[N+](=O)[O-]',
 'O=C(O)c1cccnc1']

In [3]:
# Human benchmark
from tqdm import tqdm

# Download the benchmark spreadsheet (network access required) and keep the
# first 80 entries of its 'Reaction smiles' column.
url = 'https://github.com/connorcoley/rexgen_direct/blob/master/human/benchmarking.xlsx?raw=true'
human_benchmark_rxns = pd.read_excel(url)['Reaction smiles'][:80]

# Parallel per-reaction result lists, consumed by the summary table cell.
reactants, products, scores, corrects = [], [], [], []

for rxn in tqdm(human_benchmark_rxns):
    # Reaction SMILES are split on '>>' into the input side and the
    # recorded product, which is passed along as `product`.
    reactant, product = rxn.split('>>')
    results_df, results_dict = predict_product(
        args, reactant, model, graph_functions, template_dicts, template_infos,
        product = product, top_k = 3, verbose = 0,
    )

    # Record the first row's values and the top-1 score for this reaction.
    reactants.append(results_df['Reactants'][0])
    products.append(results_df['Top-1'][0])
    scores.append(results_dict['Top-1']['score'])
    corrects.append(results_df['Correct at'][0])

100%|███████████████████████████████████████████| 80/80 [00:41<00:00,  1.91it/s]


In [4]:
# Remove the row display limit so every benchmark row is rendered.
# Note: this changes the pandas option for the rest of the session.
pd.set_option('display.max_rows', None)

# Assemble the per-reaction lists from the benchmark loop into one table;
# as the last expression of the cell it is displayed by the notebook.
pd.DataFrame({
    'Reactants': reactants,
    'Top-1 products': products,
    'Scores': scores,
    'Correct': corrects,
})

Unnamed: 0,Reactants,Top-1 products,Scores,Correct
0,,,0.947657,1
1,,,0.6684,1
2,,,0.996591,1
3,,,0.99313,1
4,,,0.608948,1
5,,,0.524128,1
6,,,0.994528,1
7,,,0.871567,1
8,,,0.913686,1
9,,,0.999624,1
