# Get ASKCOS results

In [1]:
import sys
from pathlib import Path
root_directory = Path('__file__').parent.parent.resolve()
print('root_directory: ', root_directory)
project_directory = root_directory.parent
print('project_directory: ', project_directory)
search_tool_path = project_directory/'pathway_search_standalone'
sys.path.append(str(search_tool_path))
sfscore_path = project_directory/'sfscore'
sys.path.append(str(sfscore_path))
from sfscore import SFScore
from scripts.utils import *
from scripts.process_utils import (compare_results_with_askcos,
                                   print_rxns,print_bypass_rxns,
                                   print_rxns_list)
from rdkit.Chem import AllChem as Chem
import networkx as nx
import numpy as np
import json
import gzip

root_directory:  /home/xuan/GitLab/AceRetro/evaluate_score
project_directory:  /home/xuan/GitLab/AceRetro


In [2]:
sfscore_model = SFScore()
sfscore_model.load()

Loading model /home/xuan/GitLab/AceRetro/process_reaction_database/saved_model/ecfp4_4096_3_layer_epoch10.pt


<sfscore.SFScore at 0x7f52065bc828>

In [3]:
# 1.5 min
path = '../../hybmind/data/boutique_1000_retrosynthesis_graphs.json.gz'
with gzip.open(path, 'r') as fin:
    graph_dict = json.loads(fin.read().decode('utf-8'))
print(len(graph_dict))

1001


In [4]:
# 7min
askcos_shortest_path_dict = {}
askcos_spg_dict = {}

for smiles in graph_dict:
    askcos_shortest_path_dict[smiles] = {}
    askcos_spg_dict[smiles] = {}

    for prioritizer in graph_dict[smiles]:
        try:
            g = nx.node_link_graph(graph_dict[smiles][prioritizer]['output'])
            g = g.reverse()

            if g and len(g.nodes)>1:
                std_smiles = Chem.MolToSmiles(Chem.MolFromSmiles(smiles))
                chem_nodes = [n for n in g.nodes if '>>' not in n]
                starting_nodes = [n for n in chem_nodes if g.nodes[n]['terminal']]
                s_p_g = metabolic_dijkstra(g, starting_nodes)
                askcos_spg_dict[smiles][prioritizer] = s_p_g
                try:
                    askcos_shortest_path_dict[smiles][prioritizer] = s_p_g.nodes[std_smiles]['path_length']
                except:
                    askcos_shortest_path_dict[smiles][prioritizer] = s_p_g.nodes[smiles]['path_length']

            else:
                print ('No Graph for', smiles, prioritizer)
        except:
            print ('Graph retrieval failed for', smiles, prioritizer)

print ('Short Paths found')

Short Paths found


In [5]:
smi = 'N[C@H](CO)C(=O)N[C@H](CC(=O)O)C(=O)O'
askcos_spg_dict[smi]['bkms,reaxys'].nodes[smi]

{'type': 'chemical',
 'as_reactant': 0,
 'purchase_price': 0.0,
 'as_product': 0,
 'terminal': False,
 'path_length': 1,
 'visited': False,
 'shortest_pathway': ['N[C@H](CC(=O)O)C(=O)O.N[C@H](CO)C(=O)O>>N[C@H](CO)C(=O)N[C@H](CC(=O)O)C(=O)O']}

In [6]:
shortest_paths = pd.DataFrame(askcos_shortest_path_dict).T    
# shortest_paths_cascade = pd.DataFrame(shortest_path_cascade_dict).T
any_path_found = shortest_paths.copy()
for idx in shortest_paths.index:
    for col in shortest_paths.columns:
        addl_data = shortest_paths.loc[idx, col]
        if addl_data < np.inf:
            any_path_found.loc[idx, col] = 1
        elif addl_data > 12:
            any_path_found.loc[idx, col] = 0
any_path_found = any_path_found.dropna(axis=0)
any_path_found = any_path_found>0

In [7]:
shortest_paths

Unnamed: 0,"bkms,reaxys",bkms,reaxys
CC[C@H](C)N(C[C@@H](O)c1cccn1Cc1ccccc1Cl)[C@@H](C)CC,inf,inf,10.0
C=C(C)[C@@H]1CC=C(C)C[C@@H]1O,1.0,1.0,inf
C[C@H](C(=O)CC[C@@H](C)CO)[C@H]1C(=O)C[C@H]2[C@@H]3CC=C4C[C@@H](O)CC[C@]4(C)[C@H]3CC[C@@]21C,inf,inf,inf
C=CC1=C(C)[C@@H](CC2=N/C(=C3\c4[nH]c(Cc5[nH]c(C=O)c(C)c5CC)c(C)c4C(=O)[C@@H]3C(=O)OC)[C@@H](CCC(=O)O)[C@@H]2C)NC1=O,inf,inf,inf
O=C(COP(=O)(O)O)[C@H](O)[C@H](O)CO,1.0,2.0,1.0
...,...,...,...
C=C(C)[C@H](CC=C(C)C)Cc1c(O)cc(OC)c2c1O[C@H](c1ccc(O)cc1O)[C@H](O)C2=O,inf,inf,inf
c1ccc(-c2nc(N3CCNCC3)cc3ccccc23)cc1,2.0,inf,2.0
CC1(C)O[C@](C)([C@H]2CC[C@](C)(Cl)[C@@H](Br)C2)[C@H](O)C[C@H]1Br,inf,inf,inf
CNC(=O)CCC(=O)c1cccnc1,2.0,2.0,2.0


In [8]:
any_path_found

Unnamed: 0,"bkms,reaxys",bkms,reaxys
CC[C@H](C)N(C[C@@H](O)c1cccn1Cc1ccccc1Cl)[C@@H](C)CC,False,False,True
C=C(C)[C@@H]1CC=C(C)C[C@@H]1O,True,True,False
C[C@H](C(=O)CC[C@@H](C)CO)[C@H]1C(=O)C[C@H]2[C@@H]3CC=C4C[C@@H](O)CC[C@]4(C)[C@H]3CC[C@@]21C,False,False,False
C=CC1=C(C)[C@@H](CC2=N/C(=C3\c4[nH]c(Cc5[nH]c(C=O)c(C)c5CC)c(C)c4C(=O)[C@@H]3C(=O)OC)[C@@H](CCC(=O)O)[C@@H]2C)NC1=O,False,False,False
O=C(COP(=O)(O)O)[C@H](O)[C@H](O)CO,True,True,True
...,...,...,...
C=C(C)[C@H](CC=C(C)C)Cc1c(O)cc(OC)c2c1O[C@H](c1ccc(O)cc1O)[C@H](O)C2=O,False,False,False
c1ccc(-c2nc(N3CCNCC3)cc3ccccc23)cc1,True,False,True
CC1(C)O[C@](C)([C@H]2CC[C@](C)(Cl)[C@@H](Br)C2)[C@H](O)C[C@H]1Br,False,False,False
CNC(=O)CCC(=O)c1cccnc1,True,True,True


In [9]:
len(shortest_paths[shortest_paths.loc[:, 'bkms,reaxys']<np.inf])

493

In [11]:
max_depth = 10
max_trees = 5000
validate_paths = True
NIL_UUID = '00000000-0000-0000-0000-000000000000'

In [3]:
# 36 min
try:
    with open(f"../data/askcos_benchmark_result/enumerated_paths_in_hybrid.json", "r") as outfile:
        enumerated_paths_in_hybrid = json.load(outfile)
    with open(f"../data/askcos_benchmark_result/molecule_label_in_all_paths_found.json", "r") as outfile:
        molecule_label_in_all_paths_found = json.load(outfile)
    with open(f"../data/askcos_benchmark_result/molecule_label_in_shortest_paths_found.json", "r") as outfile:
        molecule_label_in_shortest_paths_found = json.load(outfile)
    print('Loading data from files.')
except:
    print('Running the script ...')
    enumerated_paths_in_hybrid = {}
    molecule_label_in_all_paths_found = {}
    molecule_label_in_shortest_paths_found = {}

    path_num_count = 0
    rxn_num_count = 0
    shortest_path_num_count = 0
    shortest_rxn_num_count = 0

    #for smiles in shortest_paths[shortest_paths.loc[:, 'bkms,reaxys']<2].index:

    for smiles in shortest_paths[shortest_paths.loc[:, 'bkms,reaxys']<np.inf].index:
        g = askcos_spg_dict[smiles]['bkms,reaxys']
        std_smiles = Chem.MolToSmiles(Chem.MolFromSmiles(smiles))
        try:
            shortest_path = g.nodes[smiles]['shortest_pathway']
        except:
            shortest_path = g.nodes[std_smiles]['shortest_pathway']
        
        paths = get_paths(
    #                    graph_dict[smiles]['bkms,reaxys'].reverse(),
                        nx.node_link_graph(graph_dict[smiles]['bkms,reaxys']['output']),
                        root=smiles,
                        root_uuid=NIL_UUID,
                        max_depth=max_depth,
                        max_trees=max_trees,
                        validate_paths=validate_paths,
                    )
        all_paths = nx_paths_to_json(paths, NIL_UUID)
        enumerated_paths_in_hybrid[smiles] = all_paths

        shortest_path_length = shortest_paths.loc[smiles, 'bkms,reaxys']
        
        for path in all_paths:
            rxn_list = get_all_children(path)
            path_length = len(rxn_list)
            path_num_count += 1
            if shortest_path_length == path_length:
                shortest_path_num_count += 1
                for rxn in rxn_list:
                    rxn_num_count += 1
                    shortest_rxn_num_count += 1
                    product = rxn.split('>>')[-1]
                    if product not in molecule_label_in_all_paths_found:
                        molecule_label_in_all_paths_found[product] = {'any_bkms':False,'any_reaxys':False}
                    if product not in molecule_label_in_shortest_paths_found:
                        molecule_label_in_shortest_paths_found[product] = {'any_bkms':False,'any_reaxys':False}
                    if 'bkms' in g.nodes[rxn]['template_set']:
                        molecule_label_in_all_paths_found[product]['any_bkms'] = True
                        molecule_label_in_shortest_paths_found[product]['any_bkms'] = True
                    if 'reaxys' in g.nodes[rxn]['template_set']:
                        molecule_label_in_all_paths_found[product]['any_reaxys'] = True
                        molecule_label_in_shortest_paths_found[product]['any_reaxys'] = True
            else:
                for rxn in rxn_list:
                    rxn_num_count += 1
                    product = rxn.split('>>')[-1]
                    if product not in molecule_label_in_all_paths_found:
                        molecule_label_in_all_paths_found[product] = {'any_bkms':False,'any_reaxys':False}
                    if 'bkms' in g.nodes[rxn]['template_set']:
                        molecule_label_in_all_paths_found[product]['any_bkms'] = True
                    if 'reaxys' in g.nodes[rxn]['template_set']:
                        molecule_label_in_all_paths_found[product]['any_reaxys'] = True
    with open(f"../data/askcos_benchmark_result/enumerated_paths_in_hybrid.json", "w") as outfile:
        json.dump(enumerated_paths_in_hybrid, outfile)
    with open(f"../data/askcos_benchmark_result/molecule_label_in_all_paths_found.json", "w") as outfile:
        json.dump(molecule_label_in_all_paths_found, outfile)
    with open(f"../data/askcos_benchmark_result/molecule_label_in_shortest_paths_found.json", "w") as outfile:
        json.dump(molecule_label_in_shortest_paths_found, outfile)
    print('Files saved')

Loading data from files.


In [12]:
molecule_label_in_all_paths_found = {}
molecule_label_in_shortest_paths_found = {}

path_num_count = 0
rxn_num_count = 0
shortest_path_num_count = 0
shortest_rxn_num_count = 0

for smiles in shortest_paths[shortest_paths.loc[:, 'bkms,reaxys']<np.inf].index:
    #if smiles == 'CNC(=O)CCC(=O)c1cccnc1':
    g = askcos_spg_dict[smiles]['bkms,reaxys']
    std_smiles = Chem.MolToSmiles(Chem.MolFromSmiles(smiles))
    all_paths = enumerated_paths_in_hybrid[smiles]
    shortest_path_length = shortest_paths.loc[smiles, 'bkms,reaxys']        
    for path in all_paths:
        rxn_list = get_all_children(path)
        path_length = len(rxn_list)
        path_num_count += 1
        if shortest_path_length == path_length:
            shortest_path_num_count += 1
            for rxn in rxn_list:
                rxn_num_count += 1
                shortest_rxn_num_count += 1
                product = rxn.split('>>')[-1]
                if product not in molecule_label_in_all_paths_found:
                    molecule_label_in_all_paths_found[product] = {'any_bkms':False,'any_reaxys':False}
                if product not in molecule_label_in_shortest_paths_found:
                    molecule_label_in_shortest_paths_found[product] = {'any_bkms':False,'any_reaxys':False}
                if 'bkms' in g.nodes[rxn]['template_set']:
                    molecule_label_in_all_paths_found[product]['any_bkms'] = True
                    molecule_label_in_shortest_paths_found[product]['any_bkms'] = True
                if 'reaxys' in g.nodes[rxn]['template_set']:
                    molecule_label_in_all_paths_found[product]['any_reaxys'] = True
                    molecule_label_in_shortest_paths_found[product]['any_reaxys'] = True
        else:
            for rxn in rxn_list:
                rxn_num_count += 1
                product = rxn.split('>>')[-1]
                if product not in molecule_label_in_all_paths_found:
                    molecule_label_in_all_paths_found[product] = {'any_bkms':False,'any_reaxys':False}
                if 'bkms' in g.nodes[rxn]['template_set']:
                    molecule_label_in_all_paths_found[product]['any_bkms'] = True
                if 'reaxys' in g.nodes[rxn]['template_set']:
                    molecule_label_in_all_paths_found[product]['any_reaxys'] = True
print(f'''
path_num_count = {path_num_count}
rxn_num_count = {rxn_num_count}
shortest_path_num_count = {shortest_path_num_count}
shortest_rxn_num_count = {shortest_rxn_num_count}
molecule_label_in_all_paths_found = {len(molecule_label_in_all_paths_found)}
molecule_label_in_shortest_paths_found = {len(molecule_label_in_shortest_paths_found)}
''')


path_num_count = 397040
rxn_num_count = 3719157
shortest_path_num_count = 1531
shortest_rxn_num_count = 5383
molecule_label_in_all_paths_found = 26741
molecule_label_in_shortest_paths_found = 1544



## field coverage in shortest paths

In [31]:
shortest_paths_found_smiles_list = list(molecule_label_in_shortest_paths_found.keys())
shortest_paths_found_smiles_label = []

for smi in shortest_paths_found_smiles_list:
    if molecule_label_in_shortest_paths_found [smi]['any_bkms'] and molecule_label_in_shortest_paths_found [smi]['any_reaxys']:
        shortest_paths_found_smiles_label.append(0)
    elif molecule_label_in_shortest_paths_found [smi]['any_bkms'] and not molecule_label_in_shortest_paths_found [smi]['any_reaxys']:
        shortest_paths_found_smiles_label.append(-1)
    elif not molecule_label_in_shortest_paths_found [smi]['any_bkms'] and molecule_label_in_shortest_paths_found [smi]['any_reaxys']:
        shortest_paths_found_smiles_label.append(1)
print('shortest_paths_found_smiles_label = ',len(shortest_paths_found_smiles_label))

shortest_paths_found_smiles_label =  1544


In [32]:
print('Chem:',shortest_paths_found_smiles_label.count(1),
' Overlap:',shortest_paths_found_smiles_label.count(0),
' Enzy:',shortest_paths_found_smiles_label.count(-1))

Chem: 788  Overlap: 275  Enzy: 481


In [33]:
try:
    shortest_paths_found_molecules_sfscore = np.load('../data/askcos_benchmark_result/shortest_paths_found_molecules_sfscore.npy')
    print('Load saved file.')
except:
    shortest_paths_found_molecules_sfscore = sfscore_model.score_from_smi_many(shortest_paths_found_smiles_list)
    np.save('../data/askcos_benchmark_result/shortest_paths_found_molecules_sfscore.npy',shortest_paths_found_molecules_sfscore)
print('shortest_paths_found_molecules_sfscore = ',len(shortest_paths_found_molecules_sfscore))

Load saved file.
shortest_paths_found_molecules_sfscore =  1544


In [64]:
def get_search_field_coverage(molecules_sfscore, smiles_label, margin = 0.15):
    chem_cor = 0
    chem_over = 0
    bio_cor = 0
    bio_over = 0
    over_num = 0
    for pred_sfscore, real_label in zip(molecules_sfscore, smiles_label):
        score_diff = pred_sfscore[0] - pred_sfscore[1]
        if real_label == 1 and score_diff>margin:
            chem_cor += 1
        if real_label == 1 and score_diff>-margin:
            chem_over += 1
        if real_label == -1 and score_diff<margin:
            bio_over += 1
        if real_label == -1 and score_diff<-margin:
            bio_cor += 1
        if real_label == 0 and score_diff>-margin and score_diff<margin:
            over_num += 1

    chem_mol_num = len([i for i in smiles_label if i == 1])
    bio_mol_num = len([i for i in smiles_label if i == -1])
    overlap_mol_num = len([i for i in smiles_label if i == 0])

    sf_coverage = (chem_over+bio_over+over_num)/len(molecules_sfscore)
    overall_acc = (chem_cor+bio_cor)/(chem_mol_num+bio_mol_num)
    print(f'margin = {margin}')
    print(f'Cover {sf_coverage*100 :0.1f}% of molecule\'s synthesis field in original paths, while {overall_acc*100 :0.1f}% of {chem_mol_num+bio_mol_num} molecules only search one synthesis field guided by SFScore.')

    return sf_coverage, overall_acc

# user defined margin
margin_list = [0.05, 0.1, 0.15, 0.2, 0.25]
result_kept_list = []
overall_acc_list = []
for margin in margin_list:
    result_kept, overall_acc = get_search_field_coverage(shortest_paths_found_molecules_sfscore, shortest_paths_found_smiles_label, margin=margin)
    result_kept_list.append(result_kept)
    overall_acc_list.append(overall_acc)

margin = 0.05
Cover 66.8% of molecule's synthesis field in original paths, while 66.9% of 1269 molecules only search one synthesis field guided by SFScore.
margin = 0.1
Cover 74.4% of molecule's synthesis field in original paths, while 58.9% of 1269 molecules only search one synthesis field guided by SFScore.
margin = 0.15
Cover 85.8% of molecule's synthesis field in original paths, while 40.2% of 1269 molecules only search one synthesis field guided by SFScore.
margin = 0.2
Cover 95.9% of molecule's synthesis field in original paths, while 18.7% of 1269 molecules only search one synthesis field guided by SFScore.
margin = 0.25
Cover 99.0% of molecule's synthesis field in original paths, while 6.4% of 1269 molecules only search one synthesis field guided by SFScore.


## field coverage in all paths

In [3]:
all_paths_found_smiles_list = list(molecule_label_in_all_paths_found.keys())
all_paths_found_smiles_label = []
for smi in all_paths_found_smiles_list:
    if molecule_label_in_all_paths_found[smi]['any_bkms'] and molecule_label_in_all_paths_found[smi]['any_reaxys']:
        all_paths_found_smiles_label.append(0)
    elif molecule_label_in_all_paths_found[smi]['any_bkms'] and not molecule_label_in_all_paths_found[smi]['any_reaxys']:
        all_paths_found_smiles_label.append(-1)
    elif not molecule_label_in_all_paths_found[smi]['any_bkms'] and molecule_label_in_all_paths_found[smi]['any_reaxys']:
        all_paths_found_smiles_label.append(1)
print('all_paths_found_smiles_label = ',len(all_paths_found_smiles_label))

NameError: name 'molecule_label_in_all_paths_found' is not defined

In [14]:
print('Chem:',all_paths_found_smiles_label.count(1),
' Overlap:',all_paths_found_smiles_label.count(0),
' Enzy:',all_paths_found_smiles_label.count(-1))

Chem: 9162  Overlap: 7368  Enzy: 10211


In [15]:
try:
    all_paths_found_molecules_sfscore = np.load('../data/askcos_benchmark_result/all_paths_found_molecules_sfscore.npy')
    print('Load saved file.')
except:
    all_paths_found_molecules_sfscore = sfscore_model.score_from_smi_many(all_paths_found_smiles_list)
    np.save('../data/askcos_benchmark_result/all_paths_found_molecules_sfscore.npy',all_paths_found_molecules_sfscore)
    np.savetxt('../data/askcos_benchmark_result/all_paths_found_molecules_sfscore.csv', all_paths_found_molecules_sfscore, delimiter=",")
print('all_paths_found_molecules_sfscore = ',len(all_paths_found_molecules_sfscore))

Load saved file.
all_paths_found_molecules_sfscore =  26741


In [16]:
import matplotlib as plt
def draw_sfscore_difference_distribution(score_list_1, title):
    a= score_list_1[:,0] - score_list_1[:,1]
    plt.hist(a, color = "steelblue", label = "in-vivo", bins=140,alpha = 0.7)
    #plt.ylim((0,1400))
    plt.ylabel('Number of molecules')
    plt.xlabel('S_Chem - S_Enzy')
    plt.show()

def get_diff_margin_distribution(score_list,margin_list=[0.05,0.1,0.15,0.2,0.25],title=None):
    a= score_list[:,0]
    b= score_list[:,1]
    sfscore_diff = a - b
    chem_search_precent_list = []
    enzy_search_precent_list = []
    overlap_search_precent_list = []
    for margin in margin_list:
        chem_search_idx = [x > margin for x in sfscore_diff]
        enzy_search_idx = [x < -margin for x in sfscore_diff]
        # overlap_search_idx = np.multiply(enzy_search_idx, chem_search_idx)
        overlap_search_idx = [x >= -margin and x <= margin for x in sfscore_diff]
        chem_search_precent = sum(chem_search_idx)/len(sfscore_diff)
        enzy_search_precent = sum(enzy_search_idx)/len(sfscore_diff)
        overlap_search_precent = sum(overlap_search_idx)/len(sfscore_diff)

        chem_search_precent_list.append(chem_search_precent*100)
        enzy_search_precent_list.append(enzy_search_precent*100)
        overlap_search_precent_list.append(overlap_search_precent*100)
        print(f'margin: {margin}, chem_search_precent: {chem_search_precent:.4%}, enzy_search_precent: {enzy_search_precent:.4%}, overlap_search_precent: {overlap_search_precent:.4%}')
    return chem_search_precent_list, enzy_search_precent_list, overlap_search_precent_list
get_diff_margin_distribution(all_paths_found_molecules_sfscore)

margin: 0.05, chem_search_precent: 59.5079%, enzy_search_precent: 27.6242%, overlap_search_precent: 12.8679%
margin: 0.1, chem_search_precent: 51.6772%, enzy_search_precent: 21.3455%, overlap_search_precent: 26.9773%
margin: 0.15, chem_search_precent: 38.5475%, enzy_search_precent: 10.8635%, overlap_search_precent: 50.5890%
margin: 0.2, chem_search_precent: 20.9341%, enzy_search_precent: 2.6140%, overlap_search_precent: 76.4519%
margin: 0.25, chem_search_precent: 7.4006%, enzy_search_precent: 0.4600%, overlap_search_precent: 92.1394%


([59.50787180733704,
  51.67719980554205,
  38.54754870797652,
  20.934146067835908,
  7.400620769604727],
 [27.624247410343667,
  21.34549942036573,
  10.86346808271942,
  2.613963576530421,
  0.45996783964698407],
 [12.867880782319286,
  26.977300774092217,
  50.58898320930406,
  76.45189035563367,
  92.1394113907483])

In [17]:
all_paths_found_smiles_sfscore_dict  = {}
for smi,sfscore in zip(all_paths_found_smiles_list,all_paths_found_molecules_sfscore):
    all_paths_found_smiles_sfscore_dict[smi] = sfscore

In [25]:
len(all_paths_found_smiles_label)

26741

In [23]:
len(all_paths_found_molecules_sfscore)

26741

In [22]:
len(all_paths_found_smiles_list)

26741

In [63]:
margin_list = [0.05, 0.1, 0.15, 0.2, 0.25]
result_kept_list = []
overall_acc_list = []
for margin in margin_list:
    result_kept, overall_acc = get_search_field_coverage(all_paths_found_molecules_sfscore, all_paths_found_smiles_label, margin=margin)
    result_kept_list.append(result_kept)
    overall_acc_list.append(overall_acc)

margin = 0.05
Cover 52.7% of molecule's synthesis field in original paths, while 55.0% of 19373 molecules only search one synthesis field guided by SFScore.
margin = 0.1
Cover 61.4% of molecule's synthesis field in original paths, while 47.5% of 19373 molecules only search one synthesis field guided by SFScore.
margin = 0.15
Cover 75.0% of molecule's synthesis field in original paths, while 33.8% of 19373 molecules only search one synthesis field guided by SFScore.
margin = 0.2
Cover 89.0% of molecule's synthesis field in original paths, while 17.3% of 19373 molecules only search one synthesis field guided by SFScore.
margin = 0.25
Cover 96.7% of molecule's synthesis field in original paths, while 6.3% of 19373 molecules only search one synthesis field guided by SFScore.


In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

def pca_fn(all_smi, chem_smi, overlap_smi, enzy_smi, size: int=2, filename=None):
    all  = [Chem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(m), 2, nBits=1024) for m in all_smi]
    chem  = [Chem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(m), 2, nBits=1024) for m in chem_smi]
    overlap  = [Chem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(m), 2, nBits=1024) for m in overlap_smi]
    enzy  = [Chem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(m), 2, nBits=1024) for m in enzy_smi]
    pca = PCA(n_components=2)
    products_data = all + chem + overlap + enzy
    crds = pca.fit_transform(products_data)

    type_list = ['all']*len(all) + ['chem']*len(chem) + ['overlap']*len(overlap) +  ['enzy']*len(enzy) 
    crds_df = pd.DataFrame(crds,columns=["PC_1","PC_2"])
    crds_df['type'] = type_list
    ax = sns.scatterplot(data=crds_df.query("type == 'all'"),x="PC_1",y="PC_2",color='black', s=size)
    ax = sns.scatterplot(data=crds_df.query("type == 'chem'"),x="PC_1",y="PC_2",color='steelblue', s=size)
    ax = sns.scatterplot(data=crds_df.query("type == 'overlap'"),x="PC_1",y="PC_2",color='green', s=size)
    ax = sns.scatterplot(data=crds_df.query("type == 'enzy'"),x="PC_1",y="PC_2",color='purple', s=size)
    ax.set(xlabel='PC 1', ylabel='PC 2')

    _ = plt.legend(labels=['All','Chem', 'Both', 'Enzy'])
    for handle in _.legendHandles:
        handle.set_sizes([42])
    if filename:
        plt.savefig(filename,dpi=600)

In [2]:
def get_failed_molecules(molecules_sfscore, smiles_label, smiles_list, margin = 0.15):
    chem_cor = 0
    chem_over = 0
    chem_failed_list = []
    bio_cor = 0
    bio_over = 0
    bio_failed_list = []
    over_num = 0
    over_failed_list = []
    for pred_sfscore, real_label, smiles in zip(molecules_sfscore, smiles_label, smiles_list):
        score_diff = pred_sfscore[0] - pred_sfscore[1]
        if real_label == 1 and score_diff>margin:
            chem_cor += 1
        if real_label == 1 and score_diff>-margin:
            chem_over += 1
        if real_label == 1 and score_diff<-margin:
            chem_failed_list.append(smiles)
        if real_label == -1 and score_diff<margin:
            bio_over += 1
        if real_label == -1 and score_diff<-margin:
            bio_cor += 1
        if real_label == -1 and score_diff>margin: 
            bio_failed_list.append(smiles)
        if real_label == 0 and score_diff>-margin and score_diff<margin:
            over_num += 1
        if real_label == 0 and (score_diff<-margin or score_diff>margin):
            over_failed_list.append(smiles)

    chem_mol_num = len([i for i in smiles_label if i == 1])
    bio_mol_num = len([i for i in smiles_label if i == -1])
    overlap_mol_num = len([i for i in smiles_label if i == 0])

    #sf_coverage = (chem_over+bio_over+over_num)/len(molecules_sfscore)
    #overall_acc = (chem_cor+bio_cor)/(chem_mol_num+bio_mol_num)
    #print(f'margin = {margin}')
    #print(f'Cover {sf_coverage*100 :0.1f}% of molecule\'s synthesis field in original paths, while {overall_acc*100 :0.1f}% of molecules only search one synthesis field guided by SFScore.')
    return chem_failed_list, over_failed_list, bio_failed_list

chem_failed_list, over_failed_list, bio_failed_list = get_failed_molecules(all_paths_found_molecules_sfscore, all_paths_found_smiles_label, all_paths_found_smiles_list, margin=0.15)
all_smi = [i for i in all_paths_found_smiles_list if i not in chem_failed_list and i not in over_failed_list and i not in bio_failed_list]
pca_fn(all_smi,chem_failed_list,over_failed_list,bio_failed_list,size=4,filename='margin0.15.pdf')
print(len(all_smi))
len(chem_failed_list+bio_failed_list+over_failed_list)


NameError: name 'all_paths_found_molecules_sfscore' is not defined

In [79]:
print(len(chem_failed_list),len(over_failed_list),len(bio_failed_list))

410 3332 2931


In [53]:
a = [1,2]
b= [2]
a-b

TypeError: unsupported operand type(s) for -: 'list' and 'list'

# Routes in shortest paths

In [23]:
i = 0
for smiles in shortest_paths[shortest_paths.loc[:, 'bkms,reaxys']<np.inf].index:
    i += 1
i

493

In [39]:
# 3 min
try:
    with open(f"../data/askcos_benchmark_result/enumerated_paths_count.json", "r") as outfile:
        enumerated_paths_count = json.load(outfile)
    print('Load data from file.')
except:
    print('Running the script...')
    enumerated_paths_count = {}
    margin_list = [0.05, 0.1, 0.15, 0.2, 0.25]
    for margin in margin_list:
        enumerated_paths_count[margin] = {}
        for smiles in shortest_paths[shortest_paths.loc[:, 'bkms,reaxys']<np.inf].index:
            #if smiles == 'CNC(=O)CCC(=O)c1cccnc1':
            g = askcos_spg_dict[smiles]['bkms,reaxys']
            # std_smiles = Chem.MolToSmiles(Chem.MolFromSmiles(smiles))
            all_paths = enumerated_paths_in_hybrid[smiles]
            shortest_path_length = shortest_paths.loc[smiles, 'bkms,reaxys']
            path_num_count += len(all_paths)
            
            enumerated_paths_count[margin][smiles] = {}
            enumerated_paths_count[margin][smiles]['paths_count'] = len(all_paths)
            enumerated_paths_count[margin][smiles]['shortest_paths_count'] = 0
            enumerated_paths_count[margin][smiles]['rxns_count'] = 0
            enumerated_paths_count[margin][smiles]['shortest_rxns_count'] = 0

            # enumerated_paths_count[margin][smiles]['kept_paths_count'] = 0
            enumerated_paths_count[margin][smiles]['kept_shortest_paths_count'] = 0
            enumerated_paths_count[margin][smiles]['kept_rxns_count'] = 0
            enumerated_paths_count[margin][smiles]['kept_shortest_rxns_count'] = 0

            enumerated_paths_count[margin][smiles]['num_rxns_each_path'] = []
            enumerated_paths_count[margin][smiles]['kept_num_rxns_each_path'] = []
            enumerated_paths_count[margin][smiles]['if_path_kept'] = []
            
            for path in all_paths:
                has_not_matched_sfscore = False

                rxn_list = get_all_children(path)
                path_length = len(rxn_list)
                
                num_rxns_each_path = 0
                kept_num_rxns_each_path = 0

                if shortest_path_length == path_length:
                    # shortest_path_num_count += 1
                    enumerated_paths_count[margin][smiles]['shortest_paths_count'] += 1
                    
                    for rxn in rxn_list:
                        num_rxns_each_path += 1
                        #rxn_num_count += 1
                        #shortest_rxn_num_count += 1
                        enumerated_paths_count[margin][smiles]['rxns_count'] += 1
                        enumerated_paths_count[margin][smiles]['shortest_rxns_count'] += 1

                        product = rxn.split('>>')[-1]
                        product_sfscore = all_paths_found_smiles_sfscore_dict[product]
                        product_sfscore_diff = product_sfscore[0] - product_sfscore[1]
                        if 'bkms' in g.nodes[rxn]['template_set'] and 'reaxys' not in g.nodes[rxn]['template_set']:
                            if product_sfscore_diff > margin:
                                has_not_matched_sfscore = True
                            else:
                                #kept_rxn_count += 1
                                #kept_shortest_rxn_count += 1
                                enumerated_paths_count[margin][smiles]['kept_rxns_count'] += 1
                                enumerated_paths_count[margin][smiles]['kept_shortest_rxns_count'] += 1
                                kept_num_rxns_each_path += 1
                        elif 'reaxys' in g.nodes[rxn]['template_set'] and 'bkms' not in g.nodes[rxn]['template_set']:
                            if product_sfscore_diff < -margin:
                                has_not_matched_sfscore = True
                            else:
                                #kept_rxn_count += 1
                                #kept_shortest_rxn_count += 1
                                enumerated_paths_count[margin][smiles]['kept_rxns_count'] += 1
                                enumerated_paths_count[margin][smiles]['kept_shortest_rxns_count'] += 1
                                kept_num_rxns_each_path += 1
                        else:
                            if product_sfscore_diff < -margin or product_sfscore_diff > margin:
                                has_not_matched_sfscore = True
                            else:
                                #kept_rxn_count += 1
                                #kept_shortest_rxn_count += 1
                                enumerated_paths_count[margin][smiles]['kept_rxns_count'] += 1
                                enumerated_paths_count[margin][smiles]['kept_shortest_rxns_count'] += 1
                                kept_num_rxns_each_path += 1
                    enumerated_paths_count[margin][smiles]['num_rxns_each_path'].append(num_rxns_each_path)
                    enumerated_paths_count[margin][smiles]['kept_num_rxns_each_path'].append(kept_num_rxns_each_path)
                    if not has_not_matched_sfscore:
                        #kept_path_count += 1
                        #kept_shortest_path_count += 1
                        # enumerated_paths_count[margin][smiles]['kept_paths_count'] += 1
                        enumerated_paths_count[margin][smiles]['kept_shortest_paths_count'] += 1
                        enumerated_paths_count[margin][smiles]['if_path_kept'].append(True)
                    else:
                        enumerated_paths_count[margin][smiles]['if_path_kept'].append(False)
                else:
                    for rxn in rxn_list:
                        num_rxns_each_path += 1
                        # rxn_num_count += 1
                        enumerated_paths_count[margin][smiles]['rxns_count'] += 1
                        product = rxn.split('>>')[-1]
                        product_sfscore = all_paths_found_smiles_sfscore_dict[product]
                        product_sfscore_diff = product_sfscore[0] - product_sfscore[1]
                        if 'bkms' in g.nodes[rxn]['template_set'] and 'reaxys' not in g.nodes[rxn]['template_set']:
                            if product_sfscore_diff > margin:
                                has_not_matched_sfscore = True
                            else:
                                #kept_rxn_count += 1
                                enumerated_paths_count[margin][smiles]['kept_rxns_count'] += 1
                                kept_num_rxns_each_path += 1

                        elif 'reaxys' in g.nodes[rxn]['template_set'] and 'bkms' not in g.nodes[rxn]['template_set']:
                            if product_sfscore_diff < -margin:
                                has_not_matched_sfscore = True
                            else:
                                #kept_rxn_count += 1
                                enumerated_paths_count[margin][smiles]['kept_rxns_count'] += 1
                                kept_num_rxns_each_path += 1
                        else:
                            if product_sfscore_diff < -margin or product_sfscore_diff > margin:
                                has_not_matched_sfscore = True
                            else:
                                #kept_rxn_count += 1
                                enumerated_paths_count[margin][smiles]['kept_rxns_count'] += 1
                                kept_num_rxns_each_path += 1
                    enumerated_paths_count[margin][smiles]['num_rxns_each_path'].append(num_rxns_each_path)
                    enumerated_paths_count[margin][smiles]['kept_num_rxns_each_path'].append(kept_num_rxns_each_path)
                    if not has_not_matched_sfscore:
                        #kept_path_count += 1
                        #enumerated_paths_count[margin][smiles]['kept_paths_count'] += 1
                        enumerated_paths_count[margin][smiles]['if_path_kept'].append(True)
                    else:
                        enumerated_paths_count[margin][smiles]['if_path_kept'].append(False)
    with open(f"../data/askcos_benchmark_result/enumerated_paths_count.json", "w") as outfile:
        json.dump(enumerated_paths_count, outfile)
    print('File saved')

Load data from file.


In [42]:
margin_list = ['0.05', '0.1', '0.15', '0.2', '0.25']
for margin in margin_list:
    shortest_rxns_count = 0
    kept_shortest_rxns_count = 0

    near_shortest_rxns_count = 0
    kept_near_shortest_rxns_count = 0

    for smi,result_dict in enumerated_paths_count[margin].items():
        if_path_kept = result_dict['if_path_kept']
        num_rxns_each_path = result_dict['num_rxns_each_path']
        kept_num_rxns_each_path = result_dict['kept_num_rxns_each_path']
        min_length = shortest_paths.loc[smi, 'bkms,reaxys']
        # min_length = min(num_rxns_each_path)
        for num_rxns,kept_num_rxns in zip(num_rxns_each_path,kept_num_rxns_each_path):
            if num_rxns <= min_length+2:
                near_shortest_rxns_count += num_rxns
                kept_near_shortest_rxns_count += kept_num_rxns
            if num_rxns == min_length:
                shortest_rxns_count += num_rxns
                kept_shortest_rxns_count += kept_num_rxns


                    
    print(f'''
    margin = {margin}
    shortest_rxns_count = {shortest_rxns_count}
    kept_shortest_rxns_count = {kept_shortest_rxns_count}
    kept_shortest_rxns_rate = {kept_shortest_rxns_count/shortest_rxns_count:.4%}
    near_shortest_rxns_count = {near_shortest_rxns_count}
    kept_near_shortest_rxns_count = {kept_near_shortest_rxns_count}
    kept_near_shortest_rxns_rate = {kept_near_shortest_rxns_count/near_shortest_rxns_count:.4%}
    ''')


    margin = 0.05
    shortest_rxns_count = 5383
    kept_shortest_rxns_count = 3810
    kept_shortest_rxns_rate = 70.7784%
    near_shortest_rxns_count = 63491
    kept_near_shortest_rxns_count = 43454
    kept_near_shortest_rxns_rate = 68.4412%
    

    margin = 0.1
    shortest_rxns_count = 5383
    kept_shortest_rxns_count = 4209
    kept_shortest_rxns_rate = 78.1906%
    near_shortest_rxns_count = 63491
    kept_near_shortest_rxns_count = 48647
    kept_near_shortest_rxns_rate = 76.6203%
    

    margin = 0.15
    shortest_rxns_count = 5383
    kept_shortest_rxns_count = 4841
    kept_shortest_rxns_rate = 89.9313%
    near_shortest_rxns_count = 63491
    kept_near_shortest_rxns_count = 54599
    kept_near_shortest_rxns_rate = 85.9949%
    

    margin = 0.2
    shortest_rxns_count = 5383
    kept_shortest_rxns_count = 5272
    kept_shortest_rxns_rate = 97.9380%
    near_shortest_rxns_count = 63491
    kept_near_shortest_rxns_count = 60825
    kept_near_shortest_rxns_rate = 95.8

In [45]:
margin_list = ['0.05', '0.1', '0.15', '0.2', '0.25']
for margin in margin_list:
    one_shortest_path_count = 0
    kept_one_shortest_path_count = 0

    three_shortest_paths_count = 0
    kept_three_shortest_paths_count = 0

    for smi,result_dict in enumerated_paths_count[margin].items():
        if_path_kept = result_dict['if_path_kept']
        num_rxns_each_path = result_dict['num_rxns_each_path']
        kept_num_rxns_each_path = result_dict['kept_num_rxns_each_path']
        one_shortest_path_count += 1
        min_length = shortest_paths.loc[smi, 'bkms,reaxys']
        num_min_length = num_rxns_each_path.count(min_length)
        #has_three_shortest_paths_count = False
        # has_kept_one_shortest_path_count = False
        num_of_shortest_paths_kept = 0
        for path_kept,num_rxns,kept_num_rxns in zip(if_path_kept,num_rxns_each_path,kept_num_rxns_each_path):
            if num_rxns == min_length and path_kept:
                num_of_shortest_paths_kept += 1
        # print(three_shortest_paths_count)
        if num_min_length >= 3:
            three_shortest_paths_count += 1
            if num_of_shortest_paths_kept >= 3:
                kept_three_shortest_paths_count += 1
        if num_of_shortest_paths_kept > 0:
            kept_one_shortest_path_count += 1
    # print(kept_three_shortest_paths_count)
    print(f'''
    margin = {margin}
    one_shortest_path_count = {one_shortest_path_count}
    kept_one_shortest_path_count = {kept_one_shortest_path_count}
    kept_one_shortest_path_rate = {kept_one_shortest_path_count/one_shortest_path_count:.4%}
    three_shortest_paths_count = {three_shortest_paths_count}
    kept_three_shortest_paths_count = {kept_three_shortest_paths_count}
    kept_three_shortest_paths_rate = {kept_three_shortest_paths_count/three_shortest_paths_count:.4%}
    ''')


    margin = 0.05
    one_shortest_path_count = 493
    kept_one_shortest_path_count = 279
    kept_one_shortest_path_rate = 56.5923%
    three_shortest_paths_count = 148
    kept_three_shortest_paths_count = 60
    kept_three_shortest_paths_rate = 40.5405%
    

    margin = 0.1
    one_shortest_path_count = 493
    kept_one_shortest_path_count = 319
    kept_one_shortest_path_rate = 64.7059%
    three_shortest_paths_count = 148
    kept_three_shortest_paths_count = 76
    kept_three_shortest_paths_rate = 51.3514%
    

    margin = 0.15
    one_shortest_path_count = 493
    kept_one_shortest_path_count = 387
    kept_one_shortest_path_rate = 78.4990%
    three_shortest_paths_count = 148
    kept_three_shortest_paths_count = 105
    kept_three_shortest_paths_rate = 70.9459%
    

    margin = 0.2
    one_shortest_path_count = 493
    kept_one_shortest_path_count = 461
    kept_one_shortest_path_rate = 93.5091%
    three_shortest_paths_count = 148
    kept_three_shortest_paths_count =

In [73]:
def get_diff_margin_distribution(score_list,margin_list=[0.05,0.1,0.15,0.2,0.25],title=None):
    a= score_list[:,0]
    b= score_list[:,1]
    sfscore_diff = a - b
    for margin in margin_list:
        chem_search_idx = [x > margin for x in sfscore_diff]
        enzy_search_idx = [x < -margin for x in sfscore_diff]
        # overlap_search_idx = np.multiply(enzy_search_idx, chem_search_idx)
        overlap_search_idx = [x >= -margin and x <= margin for x in sfscore_diff]
        chem_search_precent = sum(chem_search_idx)/len(sfscore_diff)
        enzy_search_precent = sum(enzy_search_idx)/len(sfscore_diff)
        overlap_search_precent = sum(overlap_search_idx)/len(sfscore_diff)
        print(f'margin: {margin}, chem_search_precent: {chem_search_precent:.4%}, enzy_search_precent: {enzy_search_precent:.4%}, overlap_search_precent: {overlap_search_precent:.4%}')
get_diff_margin_distribution(all_paths_found_molecules_sfscore)
# get_diff_margin_distribution(shortest_paths_found_molecules_sfscore)

margin: 0.05, chem_search_precent: 59.5079%, enzy_search_precent: 27.6242%, overlap_search_precent: 12.8679%
margin: 0.1, chem_search_precent: 51.6772%, enzy_search_precent: 21.3455%, overlap_search_precent: 26.9773%
margin: 0.15, chem_search_precent: 38.5475%, enzy_search_precent: 10.8635%, overlap_search_precent: 50.5890%
margin: 0.2, chem_search_precent: 20.9341%, enzy_search_precent: 2.6140%, overlap_search_precent: 76.4519%
margin: 0.25, chem_search_precent: 7.4006%, enzy_search_precent: 0.4600%, overlap_search_precent: 92.1394%


# Compare pathways with Async

In [10]:
import urllib.parse
from IPython.display import Image,display
def print_rxns_askcos(spg_dict,prioritizer,smi):
    #images = []
    for rxn in spg_dict[smi][prioritizer].nodes[smi]['shortest_pathway']:
        url_rxn = urllib.parse.quote(rxn)
        img = Image(url=f'https://askcos.mit.edu/api/v2/draw/?smiles={url_rxn}&draw_map=false&highlight=false')
        #images.append(img)
        model = spg_dict[smi][prioritizer].nodes[rxn]['template_set']
        #print(spg_dict[smi][prioritizer].nodes[rxn])
        product = rxn.split('>>')[-1]
        # product_sfscore = spg_dict[smi][prioritizer].nodes[product]['sfscore']
        print(model,': ',rxn)
        display(img)

In [45]:
smi = 'CC[C@H](C)Cn1c(=O)n(C)c(=O)c2[nH]c(C)nc21'
print_rxns_askcos(askcos_spg_dict,'bkms,reaxys',smi)

['reaxys'] :  CC[C@H](C)CBr.Cn1c(=O)cc(N)[nH]c1=O>>CC[C@H](C)Cn1c(N)cc(=O)n(C)c1=O


['reaxys'] :  CC[C@H](C)Cn1c(N)cc(=O)n(C)c1=O>>CC[C@H](C)Cn1c(N)c(N=O)c(=O)n(C)c1=O


['reaxys'] :  CC[C@H](C)Cn1c(N)c(N=O)c(=O)n(C)c1=O>>CC[C@H](C)Cn1c(N)c(N)c(=O)n(C)c1=O


['reaxys'] :  CC(=O)O.CC[C@H](C)Cn1c(N)c(N)c(=O)n(C)c1=O>>CC[C@H](C)Cn1c(=O)n(C)c(=O)c2[nH]c(C)nc21


In [53]:
# 2 steps shorter example
smi = 'CC(=O)/C=C/C1=C(C)C[C@H](O)CC1(C)C'
print_rxns_askcos(askcos_spg_dict,'bkms,reaxys',smi)

['reaxys'] :  CC(C)=O.CC1=C(C=O)C(C)(C)CCC1>>CC(=O)/C=C/C1=C(C)CCCC1(C)C


['reaxys'] :  CC(=O)/C=C/C1=C(C)CCCC1(C)C>>CC1=C(/C=C/C(C)O)C(C)(C)CCC1


['bkms', 'bkms'] :  CC1=C(/C=C/C(C)O)C(C)(C)CCC1>>CC1=C(/C=C/C(C)O)C(C)(C)C[C@@H](O)C1


['reaxys', 'reaxys'] :  CC1=C(/C=C/C(C)O)C(C)(C)C[C@@H](O)C1>>CC(=O)/C=C/C1=C(C)C[C@H](O)CC1(C)C


In [56]:
# 2 steps shorter example
smi = 'CCOC(C(=O)OCCN(C)C)(c1ccccc1)c1ccccc1'
print_rxns_askcos(askcos_spg_dict,'bkms,reaxys',smi)

['reaxys'] :  CC(O)(c1ccccc1)c1ccccc1.CCI>>CCOC(C)(c1ccccc1)c1ccccc1


['bkms'] :  CCOC(C)(c1ccccc1)c1ccccc1>>CCOC(CO)(c1ccccc1)c1ccccc1


['reaxys'] :  CCOC(CO)(c1ccccc1)c1ccccc1>>CCOC(C(=O)O)(c1ccccc1)c1ccccc1


['reaxys'] :  CCOC(C(=O)O)(c1ccccc1)c1ccccc1.CN(C)CCCl>>CCOC(C(=O)OCCN(C)C)(c1ccccc1)c1ccccc1


In [13]:
smi = 'CO[C@H]1Cc2c(O)cc(O)cc2O[C@@H]1c1ccc(O)c(O)c1'
print_rxns_askcos(askcos_spg_dict,'bkms,reaxys',smi)

['reaxys'] :  O[C@H](CCCl)c1ccccc1.Oc1ccccc1Br>>ClCC[C@H](Oc1ccccc1Br)c1ccccc1


['reaxys'] :  ClCC[C@H](Oc1ccccc1Br)c1ccccc1>>c1ccc([C@@H]2CCc3ccccc3O2)cc1


['bkms'] :  c1ccc([C@@H]2CCc3ccccc3O2)cc1>>O[C@H]1Cc2ccccc2O[C@@H]1c1ccccc1


['reaxys'] :  CI.O[C@H]1Cc2ccccc2O[C@@H]1c1ccccc1>>CO[C@H]1Cc2ccccc2O[C@@H]1c1ccccc1


['bkms'] :  CO[C@H]1Cc2ccccc2O[C@@H]1c1ccccc1>>CO[C@H]1Cc2c(O)cccc2O[C@@H]1c1ccccc1


['bkms'] :  CO[C@H]1Cc2c(O)cccc2O[C@@H]1c1ccccc1>>CO[C@H]1Cc2c(O)cc(O)cc2O[C@@H]1c1ccccc1


['bkms'] :  CO[C@H]1Cc2c(O)cc(O)cc2O[C@@H]1c1ccccc1>>CO[C@H]1Cc2c(O)cc(O)cc2O[C@@H]1c1ccc(O)cc1


['bkms', 'reaxys'] :  CO[C@H]1Cc2c(O)cc(O)cc2O[C@@H]1c1ccc(O)cc1>>CO[C@H]1Cc2c(O)cc(O)cc2O[C@@H]1c1ccc(O)c(O)c1


In [46]:
smi = 'CO/N=C(\C(=O)N[C@@H]1C(=O)N2C(C(=O)O)=C(COC(C)=O)C[S@@](=O)[C@H]12)c1csc(N)n1'
print_rxns_askcos(askcos_spg_dict,'bkms,reaxys',smi)

['reaxys'] :  CCOC(=O)C(=NO)C(C)=O.CI>>CCOC(=O)C(=NOC)C(C)=O


['reaxys'] :  CCOC(=O)C(=NOC)C(C)=O>>CCOC(=O)C(=NOC)C(=O)CBr


['reaxys'] :  CCOC(=O)C(=NOC)C(=O)CBr.NC(N)=S>>CCOC(=O)/C(=N\OC)c1csc(N)n1


['reaxys'] :  CCOC(=O)/C(=N\OC)c1csc(N)n1>>CO/N=C(\C(=O)O)c1csc(N)n1


['reaxys'] :  CCOP(=O)(Cl)OCC.CO/N=C(\C(=O)O)c1csc(N)n1>>CCOP(=O)(OCC)OC(=O)/C(=N\OC)c1csc(N)n1


['reaxys'] :  CCOP(=O)(OCC)OC(=O)/C(=N\OC)c1csc(N)n1.N[C@@H]1C(=O)N2C(C(=O)O)=C(CO)CS[C@H]12>>CO/N=C(\C(=O)N[C@@H]1C(=O)N2C(C(=O)O)=C(CO)CS[C@H]12)c1csc(N)n1


['reaxys'] :  CO/N=C(\C(=O)N[C@@H]1C(=O)N2C(C(=O)O)=C(CO)CS[C@H]12)c1csc(N)n1>>CO/N=C(\C(=O)N[C@@H]1C(=O)N2C(C(=O)O)=C(CO)CS(=O)[C@H]12)c1csc(N)n1


['bkms', 'bkms', 'bkms'] :  CO/N=C(\C(=O)N[C@@H]1C(=O)N2C(C(=O)O)=C(CO)CS(=O)[C@H]12)c1csc(N)n1>>CO/N=C(\C(=O)N[C@@H]1C(=O)N2C(C(=O)O)=C(COC(C)=O)C[S@@](=O)[C@H]12)c1csc(N)n1


In [15]:
smi = 'O=C(O)CCC#CC#C/C=C\CO'
print_rxns_askcos(askcos_spg_dict,'bkms,reaxys',smi)

['reaxys'] :  C#C[Si](C)(C)C.C=CCBr>>C=CCC#C[Si](C)(C)C


['reaxys'] :  C=CCC#C[Si](C)(C)C>>C/C=C\C#C[Si](C)(C)C


['reaxys', 'reaxys'] :  C/C=C\C#C[Si](C)(C)C>>C#C/C=C\C


['reaxys', 'reaxys'] :  C#C/C=C\C.O=C(O)CCC#CBr>>C/C=C\C#CC#CCCC(=O)O


['bkms'] :  C/C=C\C#CC#CCCC(=O)O>>O=C(O)CCC#CC#C/C=C\CO


In [16]:
smi = 'Cc1ccc(C(C)C)o1'
print_rxns_askcos(askcos_spg_dict,'bkms,reaxys',smi)

['reaxys'] :  CC(=O)c1ccc(C)o1.C[Mg]Br>>Cc1ccc(C(C)(C)O)o1


['reaxys', 'reaxys'] :  Cc1ccc(C(C)(C)O)o1>>C=C(C)c1ccc(C)o1


['reaxys'] :  C=C(C)c1ccc(C)o1>>Cc1ccc(C(C)C)o1


In [17]:
smi = 'CC(C)CNC(=O)/C=C/C=C\c1ccc2c(c1)OCO2'
print_rxns_askcos(askcos_spg_dict,'bkms,reaxys',smi)

['reaxys'] :  CC(=O)c1ccc2c(c1)OCO2.CN(C)C=O>>O=C/C=C(\Cl)c1ccc2c(c1)OCO2


['reaxys'] :  CCOC(=O)CP(=O)(OCC)OCC.O=C/C=C(\Cl)c1ccc2c(c1)OCO2>>CCOC(=O)/C=C/C=C(\Cl)c1ccc2c(c1)OCO2


['reaxys'] :  CCOC(=O)/C=C/C=C(\Cl)c1ccc2c(c1)OCO2>>O=C(O)/C=C/C#Cc1ccc2c(c1)OCO2


['reaxys', 'reaxys'] :  CC(C)CN.O=C(O)/C=C/C#Cc1ccc2c(c1)OCO2>>CC(C)CNC(=O)/C=C/C#Cc1ccc2c(c1)OCO2


['reaxys'] :  CC(C)CNC(=O)/C=C/C#Cc1ccc2c(c1)OCO2>>CC(C)CNC(=O)/C=C/C=C\c1ccc2c(c1)OCO2


In [22]:
smi = 'CC/C=C\CC/C=C/C(=O)NC1CC1'
print_rxns_askcos(askcos_spg_dict,'bkms,reaxys',smi)

['reaxys'] :  CCC#CCCCCC>>CC/C=C\CCCCC


['bkms'] :  CC/C=C\CCCCC>>CC/C=C\CCCCCO


['reaxys'] :  CC/C=C\CCCCCO>>CC/C=C\CCCCC(=O)O


['bkms'] :  CC/C=C\CCCCC(=O)O>>CC/C=C\CC/C=C/C(=O)O


['reaxys', 'reaxys', 'reaxys'] :  CC/C=C\CC/C=C/C(=O)O.NC1CC1>>CC/C=C\CC/C=C/C(=O)NC1CC1


In [24]:
smi = 'O=C(O)CC[C@@H](NC(=O)[C@H]1CCCN1)C(=O)O'
print_rxns_askcos(askcos_spg_dict,'bkms,reaxys',smi)

['reaxys'] :  CC(C)(C)OC(=O)N1CCC[C@@H]1C(=O)O.N[C@H](CCC(=O)O)C(=O)OCc1ccccc1>>CC(C)(C)OC(=O)N1CCC[C@@H]1C(=O)N[C@H](CCC(=O)O)C(=O)OCc1ccccc1


['reaxys'] :  CC(C)(C)OC(=O)N1CCC[C@@H]1C(=O)N[C@H](CCC(=O)O)C(=O)OCc1ccccc1>>CC(C)(C)OC(=O)N1CCC[C@@H]1C(=O)N[C@H](CCC(=O)O)C(=O)O


['reaxys'] :  CC(C)(C)OC(=O)N1CCC[C@@H]1C(=O)N[C@H](CCC(=O)O)C(=O)O>>O=C(O)CC[C@@H](NC(=O)[C@H]1CCCN1)C(=O)O
