In [None]:
%load_ext autoreload
%autoreload 2

In [1]:
import os, sys
import time
import pickle

import numpy as np
import pandas as pd

import rdkit
import rdkit.Chem as chem
import rdkit.Chem.AllChem as allchem

from gryffin import Gryffin

from MolarInterface import MolarInterface

## Make connection to the MolarDB

In [3]:
db = MolarInterface(
    db_name="madness_laser",
    fragments=("fragment_a", "fragment_b", "fragment_c")
)

## Retrieve the first set of molecules in the DB

We use a filtered query to return all of the molecules for which we have tried to measure the optical properties. 

In [4]:
previous_syntheses = db.get_all_syntheses()

In [5]:
print(previous_syntheses.shape)
previous_syntheses.head()

(236, 12)


Unnamed: 0,product.hid,product.smiles,product.optical_properties,fragment_a.hid,fragment_a.smiles,fragment_b.hid,fragment_b.smiles,fragment_c.hid,fragment_c.smiles,synthesis.synthesis_id,synthesis.status,synthesis.molecule_id
0,A001B046C003,Cc1cc(-c2cc(F)cc(-c3ccc(-n4c5ccccc5c5ccccc54)c...,{},A001,CC1(C)OB(c2ccc(-n3c4ccccc4c4ccccc43)cc2)OC1(C)C,B046,CC1(C)C(=O)O[B-]2(c3cc(F)cc(Br)c3)OC(=O)C(C)(C...,C003,Cc1cc(Br)c(C)cc1Br,4f21e4b4-5259-42e3-ae43-f6a276e563f6,AVAILABLE,ced41854-1953-4071-b48e-1dd1189f3a9e
1,A041B058C100,Cn1cnc2ccc(-c3ccc(C=Cc4cnc(C=Cc5ccc(-c6ccc7ncn...,{},A041,Cn1cnc2ccc(B3OC(C)(C)C(C)(C)O3)cc21,B058,CC1(C)C(=O)O[B-]2(/C=C/c3ccc(Br)cc3)OC(=O)C(C)...,C100,Brc1cnc(Br)s1,63bea729-df1c-43ca-8078-90ba52716ffd,AVAILABLE,e9b6f686-e5d5-4a2e-a21d-e52cca5036dd
2,A001B031C021,Cc1cc(-c2ccc(-c3cc(C)c(-c4ccc(-n5c6ccccc6c6ccc...,{},A001,CC1(C)OB(c2ccc(-n3c4ccccc4c4ccccc43)cc2)OC1(C)C,B031,Cc1cc([B-]23OC(=O)C(C)(C)[N+]2(C)C(C)(C)C(=O)O...,C021,FC(F)(F)c1cc(Br)ccc1Br,97b29bb5-16fb-4b5c-9fe8-ffdd6a869ebe,AVAILABLE,50db756f-a682-4e30-b39c-e3af5f2b111d
3,A028B038C085,CCCCCCCCCCCCc1cc(-c2ccccc2-c2ccc(N3c4ccccc4CCc...,{},A028,CC1(C)OB(c2ccc(N3c4ccccc4CCc4ccccc43)cc2)OC1(C)C,B038,CC1(C)C(=O)O[B-]2(c3ccccc3Br)OC(=O)C(C)(C)[N+]21C,C085,CCCCCCCCCCCCc1cc(Br)sc1Br,9839e50f-d7dc-4f63-a682-5feb2f99579b,AVAILABLE,b8e0476d-00d2-45df-9f0b-5c9b2029d1a2
4,A039B060C079,CCn1c2ccccc2c2cc(-c3ccccc3-c3ccccc3-c3ccc(C)c(...,{},A039,CCn1c2ccccc2c2cc(B3OC(C)(C)C(C)(C)O3)ccc21,B060,CC1(C)C(=O)O[B-]2(c3ccccc3-c3ccccc3Br)OC(=O)C(...,C079,Cc1ccc(Br)cc1Br,df641953-a3eb-4c88-9d04-a8b024e4a502,AVAILABLE,5963ac30-5d0c-479b-a11c-86c95a56adc5


In [6]:
# get the sub dataframe of all molecules that are either SHIPPED OR IN PROCESSING
# these will be constrained in the Gryffin acquisition
df_proc = previous_syntheses[previous_syntheses['synthesis.status'].isin(["ACQUIRED", "PROCESSING", "SYNTHESIZED", "SHIPPED", "RECEIVED"])]
print('NUM IN PROGRESS : ', df_proc.shape[0])

# get all the entries for which the status is DONE
df_done = previous_syntheses[previous_syntheses['synthesis.status'].isin(['DONE', 'FAILED'])]
print('NUM FINISHED : ', df_done.shape[0])

print(f'PERCENT DONE {(df_done.shape[0] / (33*30*165) *100):.3f}')  # TODO: Make space calculaiton automatic from DB?

df_proc

NUM IN PROGRESS :  13
NUM FINISHED :  188
PERCENT DONE 0.115


Unnamed: 0,product.hid,product.smiles,product.optical_properties,fragment_a.hid,fragment_a.smiles,fragment_b.hid,fragment_b.smiles,fragment_c.hid,fragment_c.smiles,synthesis.synthesis_id,synthesis.status,synthesis.molecule_id
13,,CCCCCCN1c2ccc(-c3ccc(-c4ccc5c(c4)c4ccccc4n5CC)...,{},A012,CCn1c2ccccc2c2cc(B(O)O)ccc21,B025,C[N+]12CC(=O)O[B-]1(c1ccc(Br)s1)OC(=O)C2,C130,CCCCCCN1c2ccc(Br)cc2Sc2cc(Br)ccc21,8cf4e82b-6075-460d-a2ad-cb492f4c4b93,PROCESSING,b33b1cc9-81d8-4833-803f-466411581a4d
14,,CN(C)c1ccc(-c2ccc(-c3csc(C(=O)O)c3-c3ccc(-c4cc...,{},A015,CN(C)c1ccc(B2OC(C)(C)C(C)(C)O2)c2ccccc12,B025,C[N+]12CC(=O)O[B-]1(c1ccc(Br)s1)OC(=O)C2,C116,O=C(O)c1scc(Br)c1Br,43283ee2-6110-4460-aa57-08cedd716cbd,RECEIVED,322b8304-39b5-49bc-a307-99719f0d5911
136,,c1ccc(-c2nc3ccccc3n2-c2ccc(-c3cncc(-c4cc(-c5cn...,"{'MS': {'data': [[0, 100.0687, 994.79614257812...",A013,OB(O)c1ccc(-n2c(-c3ccccc3)nc3ccccc32)cc1,B005,C[N+]12CC(=O)O[B-]1(c1cncc(Br)c1)OC(=O)C2,C154,Brc1cc(Br)cc(N2CCC3(CC2)OCCO3)c1,c74040cf-8e5f-4f3f-8e06-e77363663672,SYNTHESIZED,036ef848-5792-4501-b904-63301e6f1027
139,,CCCCCCN1c2ccc(-c3ccc(-c4ccc5c(ccn5C)c4)s3)cc2S...,"{'MS': {'data': [[0, 100.076, 2040.015625], [1...",A017,Cn1ccc2cc(B(O)O)ccc21,B025,C[N+]12CC(=O)O[B-]1(c1ccc(Br)s1)OC(=O)C2,C130,CCCCCCN1c2ccc(Br)cc2Sc2cc(Br)ccc21,649fe653-970e-4b7e-9571-c944032aef39,RECEIVED,416db1b2-15d6-4a2b-b2d2-a0e688f0dce4
150,A001B012C005,CC1(C)c2cc(-c3cc(F)cc(-c4ccc(-n5c6ccccc6c6cccc...,{},A001,CC1(C)OB(c2ccc(-n3c4ccccc4c4ccccc43)cc2)OC1(C)C,B012,C[N+]12CC(=O)O[B-]1(c1cc(F)cc(Br)c1)OC(=O)C2,C005,CC1(C)c2cc(Br)ccc2-c2ccc(Br)cc21,5e6ce632-3571-46ba-b25c-718274b3dd05,ACQUIRED,c08b6d18-3a1d-4b8a-91e8-ec7fa281cf8e
162,A015B001C028,CN(C)c1ccc(-c2ccc(C=Cc3cc(F)c(C=Cc4ccc(-c5ccc(...,{},A015,CN(C)c1ccc(B2OC(C)(C)C(C)(C)O2)c2ccccc12,B001,C[N+]12CC(=O)O[B-]1(/C=C/c1ccc(Br)cc1)OC(=O)C2,C028,Fc1cc(Br)c(F)cc1Br,936d4d55-a669-48ed-a5ec-c41300c7ff64,ACQUIRED,f336bff6-b429-4d4b-9202-af690e1eacc1
190,A012B003C158,CCn1c2ccccc2c2cc(C=Cc3cc(C(=O)OC)c(C=Cc4ccc5c(...,{},A012,CCn1c2ccccc2c2cc(B(O)O)ccc21,B003,C[N+]12CC(=O)O[B-]1(/C=C/Br)OC(=O)C2,C158,COC(=O)c1cc(Br)oc1Br,05f4ee39-f0b8-4b5f-969e-4fcdb07da878,ACQUIRED,d77e729b-2284-4282-b2c4-78e55928364b
191,A015B002C031,CN(C)c1ccc(-c2ccsc2-c2ccc(-c3sccc3-c3ccc(N(C)C...,{},A015,CN(C)c1ccc(B2OC(C)(C)C(C)(C)O2)c2ccccc12,B002,C[N+]12CC(=O)O[B-]1(c1sccc1Br)OC(=O)C2,C031,Brc1ccc(Br)nn1,fecae425-531a-478e-ab03-9f7ff26cca24,ACQUIRED,5a14f557-6ba2-4afe-b48a-4a56040822f4
192,A016B004C074,Cc1cc(-c2cccnc2-c2ccc3ncn(C)c3c2)cc(-c2cccnc2-...,{},A016,Cn1cnc2ccc(B(O)O)cc21,B004,C[N+]12CC(=O)O[B-]1(c1cccnc1Br)OC(=O)C2,C074,Cc1cc(Br)cc(Br)c1O,c6191dcf-5f49-4a66-836b-b8474a14c269,ACQUIRED,57427931-f0bc-4b77-98e8-1959255f7c4c
205,A012B001C003,CCn1c2ccccc2c2cc(-c3ccc(C=Cc4cc(C)c(C=Cc5ccc(-...,{},A012,CCn1c2ccccc2c2cc(B(O)O)ccc21,B001,C[N+]12CC(=O)O[B-]1(/C=C/c1ccc(Br)cc1)OC(=O)C2,C003,Cc1cc(Br)c(C)cc1Br,7809efa1-062f-4b61-a410-083cf01a29ad,ACQUIRED,8ac8aa07-c5cd-4334-9c63-983b5a4a50d1


In [7]:
# get the fragment details for the fragments

def make_gryffin_observations(df_done):
    """
    prepare typical list of dictionary-style observations for Gryffin. This function
    considers all failed experiments as nan-valued objectives. We do not consider the cause
    for failure here (future improvement). This function is quite slow with all the calls to 
    the db about the fragments - make more efficient
    """
    # prepare observation list for Gryffin
    observations = []

    for _, row in df_done.iterrows():
        row = row.to_dict()

        if row["synthesis.status"]=='DONE':
            try:
                gain_cross_section = row["product.optical_properties"]["gain_cross_section"]
            except KeyError:
                print(f"Target property could not be found for {row['hid']}")

        elif row['synthesis.status']=='FAILED':
            # failed experiment, pass value to gryffin with nan objective value
            gain_cross_section = np.nan

        else:
            raise NotImplementedError

        observations.append(
            {
                'fragment': row["fragment_a.hid"],
                'fragment_b': row["fragment_b.hid"],
                'fragment_c': row["fragment_c.hid"],
                'obj': gain_cross_section
            }
        )
        
    return observations

def known_constraints(param):
    '''
    known constraint that the suggested molecule must not be either shipping or
    in progress. Avoids redundant suggestions
    '''
    # check if the param is within the known molecules
    query_hid = ''.join([param['fragment_a'], param['fragment_b'], param['fragment_c']])
    crit1 = query_hid not in proc_hids
    
    # check to see if the sample can be made in at least one single location
    crit2 = False
    # TODO: this in only for the first batch, constrain to Toronto ONLY!
    for lab_frags in [toronto_frags, uiuc_frags]:
        if (
            (param['fragment_a'] in lab_frags['fragment_a']) & (param['fragment_b'] in lab_frags['fragment_b']) & (param['fragment_c'] in lab_frags['fragment_c'])):
            crit2=True
            
    return crit1 & crit2


In [8]:

# get hids of molecules in process
proc_hids = df_proc["product.hid"].tolist()
proc_hids

[None,
 None,
 None,
 None,
 'A001B012C005',
 'A015B001C028',
 'A012B003C158',
 'A015B002C031',
 'A016B004C074',
 'A012B001C003',
 'A001B020C005',
 'A001B020C003',
 'A015B028C158']

In [9]:
# get gryffin observations
observations = make_gryffin_observations(df_done)

observations

[{'frag_a': 'A001', 'frag_b': 'B003', 'frag_c': 'C001', 'obj': 1.869e-16},
 {'frag_a': 'A001', 'frag_b': 'B003', 'frag_c': 'C002', 'obj': 1.968e-16},
 {'frag_a': 'A001', 'frag_b': 'B003', 'frag_c': 'C003', 'obj': 2.309e-16},
 {'frag_a': 'A001', 'frag_b': 'B003', 'frag_c': 'C004', 'obj': 6.72e-17},
 {'frag_a': 'A001',
  'frag_b': 'B003',
  'frag_c': 'C005',
  'obj': 2.6859999999999997e-16},
 {'frag_a': 'A001', 'frag_b': 'B003', 'frag_c': 'C006', 'obj': 1.19e-17},
 {'frag_a': 'A001', 'frag_b': 'B003', 'frag_c': 'C008', 'obj': 1.89e-17},
 {'frag_a': 'A001', 'frag_b': 'B003', 'frag_c': 'C011', 'obj': 2.442e-16},
 {'frag_a': 'A001', 'frag_b': 'B003', 'frag_c': 'C015', 'obj': 2.63e-17},
 {'frag_a': 'A001', 'frag_b': 'B003', 'frag_c': 'C017', 'obj': 1.212e-16},
 {'frag_a': 'A001', 'frag_b': 'B003', 'frag_c': 'C018', 'obj': 1.649e-16},
 {'frag_a': 'A001',
  'frag_b': 'B003',
  'frag_c': 'C019',
  'obj': 9.330000000000001e-17},
 {'frag_a': 'A001', 'frag_b': 'B003', 'frag_c': 'C021', 'obj': 1.42

In [10]:
# get all the unique frags in the previous observations to add onto the available 
# fragments (otherwise Gryffin will crash)
a_prev_hids = list(set([o['fragment_a'] for o in observations]))
b_prev_hids = list(set([o['fragment_b'] for o in observations]))
c_prev_hids = list(set([o['fragment_c'] for o in observations]))

In [11]:
def get_lab_frags(lab):
    ''' returns the set of available fragments for a specific laboratory
    '''
    a_hids = db.get_available_fragments("fragment_a", lab)["molecule.hid"].tolist()
    b_hids = db.get_available_fragments("fragment_b", lab)["molecule.hid"].tolist()
    c_hids = db.get_available_fragments("fragment_c", lab)["molecule.hid"].tolist()
    
    return {'fragment_a': a_hids, 'fragment_b': b_hids, 'fragment_c': c_hids}

def get_frags():
    ''' return all the unique molecules across all labs
    '''
    a_hids_all = []
    b_hids_all = []
    c_hids_all = []
    
    for lab in ['Toronto', 'Illinois']:
        lab_frags = get_lab_frags(lab)
        a_hids_all.extend(lab_frags['fragment_a'])
        b_hids_all.extend(lab_frags['fragment_b'])
        c_hids_all.extend(lab_frags['fragment_c'])
    
    a_hids_all = list(set(a_hids_all))
    b_hids_all = list(set(b_hids_all))
    c_hids_all = list(set(c_hids_all))
    
    return a_hids_all, b_hids_all, c_hids_all
    

In [12]:
a_hids_all, b_hids_all, c_hids_all = get_frags()

# add fragments from previous iterations
a_hids_all = list(set(a_hids_all + a_prev_hids))
b_hids_all = list(set(b_hids_all + b_prev_hids))
c_hids_all = list(set(c_hids_all + c_prev_hids))

print('NUM A FRAGS : ', len(a_hids_all))
print('NUM B FRAGS : ', len(b_hids_all))
print('NUM C FRAGS : ', len(c_hids_all))

print('NUM ACCESSIBLE MOLS : ', len(a_hids_all)*len(b_hids_all)*len(c_hids_all))

NUM A FRAGS :  33
NUM B FRAGS :  55
NUM C FRAGS :  161
NUM ACCESSIBLE MOLS :  292215


In [15]:
# get toronto and UIUC available frags
toronto_frags = get_lab_frags('Toronto')
uiuc_frags = get_lab_frags('Illinois')

{'frag_a': ['A014',
  'A015',
  'A018',
  'A021',
  'A020',
  'A002',
  'A004',
  'A003',
  'A022',
  'A023',
  'A024',
  'A029',
  'A028',
  'A027',
  'A026',
  'A032',
  'A001',
  'A008',
  'A034',
  'A035',
  'A036',
  'A038',
  'A039',
  'A040',
  'A042'],
 'frag_b': ['B033',
  'B031',
  'B043',
  'B049',
  'B051',
  'B047',
  'B057',
  'B059',
  'B038',
  'B042',
  'B054',
  'B068',
  'B036',
  'B035',
  'B032',
  'B055',
  'B037',
  'B039',
  'B044',
  'B046',
  'B048',
  'B052',
  'B058',
  'B060',
  'B062',
  'B063'],
 'frag_c': ['C002',
  'C003',
  'C004',
  'C005',
  'C006',
  'C007',
  'C001',
  'C008',
  'C009',
  'C012',
  'C013',
  'C010',
  'C011',
  'C014',
  'C015',
  'C016',
  'C017',
  'C018',
  'C019',
  'C020',
  'C021',
  'C022',
  'C023',
  'C024',
  'C025',
  'C026',
  'C028',
  'C029',
  'C030',
  'C031',
  'C032',
  'C035',
  'C036',
  'C034',
  'C037',
  'C038',
  'C039',
  'C040',
  'C041',
  'C042',
  'C043',
  'C044',
  'C047',
  'C062',
  'C064',
  'C067'

## Build Gryffin 

In [None]:
def get_descriptors(hid, descriptors):
    ''' looks up and returns descriptors for a particular fragment
    
    Args:
        hid (str): human readable identifier of the fragment
        descriptors (pd.DataFrame): dataframe corresponding to the type of fragment
    '''
    return descriptors[descriptors.hid==hid].iloc[0, 1:].values.astype(np.float).tolist()

def mol_hid_from_params(params):
    mol_hids = []
    for param in params:
        mol_hids.append(''.join([param['fragment_a'], param['fragment_b'], param['fragment_c']]))
    
    return mol_hids
        

In [16]:
# load descriptors from disk
pca_desc_a = pd.read_csv('descriptors/gen2_pca_desc_a.csv', index_col=None)
pca_desc_b = pd.read_csv('descriptors/gen2_pca_desc_b.csv', index_col=None)
pca_desc_c = pd.read_csv('descriptors/gen2_pca_desc_c.csv', index_col=None)
print('NUM DESC A ', pca_desc_a.shape[1]-1)
print('NUM DESC B ', pca_desc_b.shape[1]-1)
print('NUM DESC C ', pca_desc_c.shape[1]-1)

NUM DESC A  30
NUM DESC B  30
NUM DESC C  35


In [17]:
# set the available fragments to make the Gryffin search space 
# TODO: change this to only those available
avail_frags_a = a_hids_all
avail_frags_b = b_hids_all
avail_frags_c = c_hids_all
print('NUM AVAIL FRAGS A : ', len(avail_frags_a))
print('NUM AVAIL FRAGS B : ', len(avail_frags_b))
print('NUM AVAIL FRAGS C : ', len(avail_frags_c))

NUM AVAIL FRAGS A :  33
NUM AVAIL FRAGS B :  55
NUM AVAIL FRAGS C :  161


In [None]:
# Gryffin config

#sampling_strategies = np.linspace(-1, 1, 40)
sampling_strategies_1 = np.linspace(0.6, 1, 30)
sampling_strategies_2 = np.linspace(-1, 0.5, 10)
sampling_strategies = np.concatenate((sampling_strategies_1, sampling_strategies_2))
#sampling_strategies = np.linspace(-1, 0.45, 20)

# generate descriptors
FRAG_A_DESC = {i: get_descriptors(i, pca_desc_a) for i in avail_frags_a}
FRAG_B_DESC = {i: get_descriptors(i, pca_desc_b) for i in avail_frags_b}
FRAG_C_DESC = {i: get_descriptors(i, pca_desc_c) for i in avail_frags_c}

# FRAG_A_DESC = {i: None for i in avail_frags_a}
# FRAG_B_DESC = {i: None for i in avail_frags_b}
# FRAG_C_DESC = {i: None for i in avail_frags_c}

# fwa = feasibility-weighted acquisition
# fia = feasibility-interpolated acquisition
# fca = feasibility-constrained acquisition --> use, param=0.2 ish works best on cat benchmarks

config = {
     "general": {
             "backend": 'tensorflow',
             "num_cpus": 4,
             "auto_desc_gen": False, # dynamic Gryffin
             "batches": 1,
             "sampling_strategies": 1,
             "feas_approach": 'fca',
             "feas_param": 0.2,
             "boosted":  True,
             "caching": False,
             "random_seed": 22031996,
             "acquisition_optimizer": 'genetic',
             "verbosity": 3
                },
    "parameters": [
        {"name": "fragment_a", "type": "categorical", "category_details": FRAG_A_DESC },
        {"name": "fragment_b", "type": "categorical", "category_details": FRAG_B_DESC },
        {"name": "fragment_c", "type": "categorical", "category_details": FRAG_C_DESC },
      
    ],
    "objectives": [
        {"name": "obj", "goal": "max"},
    ]
}

In [None]:
# instantiate gryffin
gryffin = Gryffin(config_dict=config, known_constraints=known_constraints)

In [None]:
# recommend a batch of samples
samples = []
for ix, sampling_strat in enumerate(sampling_strategies):
    
    sample = gryffin.recommend(observations, sampling_strategies=[sampling_strat])
    print(f'\nSAMPLING STRATEGY : {sampling_strat} SAMPLE : {sample}\n')
    samples.extend(sample) 
    

In [None]:
def check_duplicates(samples, observations):
    ''' returns True if there are duplicates, False if there are none
    '''
    sample_hids = [''.join([s['fragment_a'],s['fragment_b'],s['fragment_c']]) for s in samples]
    observ_hids = [''.join([o['fragment_a'],o['fragment_b'],o['fragment_c']]) for o in observations]
    
    # check duplicates over previous observations 
    is_obs_dup = any(s in observ_hids for s in sample_hids)
    
    # check duplicates within the batch
    is_samp_dup = len(set(sample_hids))<len(sample_hids)
    
    return is_obs_dup, is_samp_dup

In [None]:
check_duplicates(samples, observations)

In [None]:
samples

In [None]:
rxn_transmut = allchem.ReactionFromSmarts("[*:1][Br,I]>>[*:1][Po]")
rxn_b_c = allchem.ReactionFromSmarts(
    "[Br,I][*:1].C[N+]12CC(=O)O[B-]1([*:2])OC(=O)C2>>[*:1]-[*:2]"
)
rxn_a_b_c = allchem.ReactionFromSmarts("CC1(C)OB([*:1])OC1(C)C.[*:2][Po]>>[*:1]-[*:2]")
rxn_a_b_c2 = allchem.ReactionFromSmarts("[*:1]B(O)O.[*:2][Po]>>[*:1]-[*:2]")


def assemble_fragments(hid_a, hid_b, hid_c):
    smiles_a = db.get_fragment_details(hid_a, identifier_type='hid')['smiles']
    smiles_b = db.get_fragment_details(hid_b, identifier_type='hid')['smiles']
    smiles_c = db.get_fragment_details(hid_c, identifier_type='hid')['smiles']

    a = chem.MolFromSmiles(smiles_a)
    b = chem.MolFromSmiles(smiles_b)
    c = chem.MolFromSmiles(smiles_c)
    if a is None or b is None or c is None:
        raise ValueError("Invalid SMILES")

    # Building the molecule... there is probably a better way
    b_transmut = rxn_transmut.RunReactants([b])[0][0]
    b_c = rxn_b_c.RunReactants([c, b_transmut])[0][0]
    b_c_b = rxn_b_c.RunReactants([b_c, b_transmut])[0][0]
    try:
        a_b_c_b = rxn_a_b_c.RunReactants([a, b_c_b])[0][0]
        a_b_c_b_a = rxn_a_b_c.RunReactants([a, a_b_c_b])[0][0]
    except:
        a_b_c_b = rxn_a_b_c2.RunReactants([a, b_c_b])[0][0]
        a_b_c_b_a = rxn_a_b_c2.RunReactants([a, a_b_c_b])[0][0]
    return chem.MolToSmiles(a_b_c_b_a)

In [None]:
molecule_smiles = []
molecule_hids = []

for sample in samples:
    molecule_hids.append(''.join([sample['fragment_a'],sample['fragment_b'],sample['fragment_c']]))
    molecule_smiles.append(assemble_fragments(sample['fragment_a'], sample['fragment_b'], sample['fragment_c']))
    

In [None]:
molecule_mols = [chem.MolFromSmiles(s) for s in molecule_smiles]

img = chem.Draw.MolsToGridImage(
    molecule_mols, molsPerRow=5, subImgSize=(400,400), returnPNG=False, legends=molecule_hids,
)
img.save('round_3_samples_all.png')

In [None]:
with open('round_3_samples_all.txt', 'w') as f:
    for smile, hid in zip(molecule_smiles, molecule_hids):
        f.write(f'{smile}\t{hid}\n')


## Write the new target molecules/syntheses to the DB

In [None]:
for sample, smi in zip(samples, molecule_smiles):
    print(f'>> Creating DB entry for sample {sample["fragment_a"]} {sample["fragment_b"]} {sample["fragment_c"]}...')
    db.create_target_compound(
        fragments=[sample["fragment_a"], sample["fragment_b"], sample["fragment_c"]],
        smiles=smi
    )

In [None]:
# db.get_fragment_details('A011', identifier_type='hid')

In [None]:
# db.client.query_database('synthesis', limit=1000)

In [None]:
# # load molecules from the first bacth
# with open('first_batch_smiles_uoft_only.txt', 'r') as f:
#     smiles_ = f.readlines()
    

In [None]:
# smiles = []
# hids = []
# for line in smiles_:
#
#     split = line.split('\t')
#     smiles.append(split[0])
#     hids.append(split[1].strip())

In [None]:
# mols = [chem.MolFromSmiles(s) for s in smiles]

In [None]:
# img = chem.Draw.MolsToGridImage(
#     mols, molsPerRow=10, subImgSize=(400,400), returnPNG=False, legends=hids,
# )
# img.save('first_round_samples_uoft_only_landscape.png')