In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os, sys
import time
import pickle

import numpy as np
import pandas as pd

import rdkit
import rdkit.Chem as chem
import rdkit.Chem.AllChem as allchem

from gryffin import Gryffin

from molar_interface import MolarInterface

2022-02-13 16:07:10.335403: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib/cuda/include:/usr/lib/cuda/lib64:/home/riley/Software/orca:
2022-02-13 16:07:10.335424: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


## Make connection to the MolarDB 

In [3]:
# define user details
# Credentials are read from the environment so that no secret is stored in the
# notebook or its version history. Set MOLAR_EMAIL and MOLAR_PASSWORD before
# starting the kernel.
# NOTE(review): the password that used to be hardcoded in this cell should be
# treated as leaked and rotated.
user_details = {
    'email': os.environ.get('MOLAR_EMAIL'),
    'password': os.environ.get('MOLAR_PASSWORD'),
}
_missing_credentials = [key for key, value in user_details.items() if not value]
if _missing_credentials:
    # warn instead of raising so that this cell never fails on its own;
    # presumably the connection cell fails if the credentials really are absent
    print(
        f'WARNING: no value found for {_missing_credentials}; '
        'set the MOLAR_EMAIL / MOLAR_PASSWORD environment variables',
        file=sys.stderr,
    )


In [4]:
# connect to the 'madness_laser' database with the user details defined above
db = MolarInterface(user_details, database_name='madness_laser') # client in client attribute
# sanity check on the login token: the output lists the account it belongs to
db.client.test_token()

{'email': 'riley.hickman13@gmail.com',
 'is_superuser': False,
 'is_active': True,
 'full_name': 'rileyhickman',
 'created_on': '2022-01-18T19:16:33.241173',
 'user_id': 3}

## Retrieve the first set of molecules in the DB

We use a filtered query to return all of the molecules for which we have tried to measure the optical properties. 

In [5]:
# custom query to get the molecules with the optical properties

def _join_spec(table, column1, column2):
    ''' build one entry of the `joins` argument for `table` and a pair of columns '''
    return {'type': table, 'on': {'column1': column1, 'column2': column2}}


# tables / columns requested from the DB
_query_fields = [
    'machine',
    'lab',
    'synthesis',
    'molecule',
    'molecule_molecule',
    'molecule.optical_properties',
]

# how the requested tables are tied together
_query_joins = [
    _join_spec('molecule', 'molecule.molecule_id', 'synthesis.molecule_id'),
    _join_spec('molecule_molecule', 'molecule_molecule.molecule_id', 'molecule.molecule_id'),
    _join_spec('machine', 'machine.machine_id', 'synthesis.machine_id'),
    _join_spec('lab', 'lab.lab_id', 'synthesis.lab_id'),
]

df = db.client.query_database(_query_fields, joins=_query_joins, limit=1000)

In [6]:
# quick look at the joined table: its dimensions, then the first few rows
print(df.shape)
df.head()

(118, 32)


Unnamed: 0,machine.machine_id,machine.created_on,machine.updated_on,machine.name,machine.lab_id,lab.lab_id,lab.created_on,lab.updated_on,lab.name,synthesis.synthesis_id,...,molecule_molecule.created_on,molecule_molecule.updated_on,molecule_molecule.fragment_a,molecule_molecule.fragment_b,molecule_molecule.fragment_c,molecule_molecule.molecule_id,synthesis.parcel_tracking_num,molecule.hid,molecule.commercially_available,molecule.molecule_type_id
0,5600f641-b180-41b1-8043-921e66107744,2022-01-18T12:59:47.995614,2022-01-18T12:59:47.995614,The Machine,b65835e3-b908-45c6-9995-ec98d0caafe7,b65835e3-b908-45c6-9995-ec98d0caafe7,2022-01-18T12:59:47.145756,2022-01-18T12:59:47.145756,Illinois,8cf4e82b-6075-460d-a2ad-cb492f4c4b93,...,2022-02-03T14:24:29.817992,2022-02-03T14:24:29.817992,56471498-825e-47e7-a772-136f0f7bd817,d1bb232c-b83a-49cb-bcec-0e02558a6c41,fb544fb8-5588-41af-ab92-fcc8c3b4e052,b33b1cc9-81d8-4833-803f-466411581a4d,,,,
1,5600f641-b180-41b1-8043-921e66107744,2022-01-18T12:59:47.995614,2022-01-18T12:59:47.995614,The Machine,b65835e3-b908-45c6-9995-ec98d0caafe7,b65835e3-b908-45c6-9995-ec98d0caafe7,2022-01-18T12:59:47.145756,2022-01-18T12:59:47.145756,Illinois,43283ee2-6110-4460-aa57-08cedd716cbd,...,2022-02-09T22:03:57.061626,2022-02-09T22:03:57.061626,05ca4304-bea5-41b7-a7f1-c2bc46462153,d1bb232c-b83a-49cb-bcec-0e02558a6c41,b0ee7203-c40b-4f5b-9b32-c647bd61ddf1,322b8304-39b5-49bc-a307-99719f0d5911,775853526901.0,,,
2,f158c26e-a94e-4994-9a95-de324aa1da23,2022-01-18T12:59:46.312079,2022-01-18T12:59:46.312079,ChemSpeed,08cc75ca-8d1e-4ead-9b56-ad1b6ba83a41,08cc75ca-8d1e-4ead-9b56-ad1b6ba83a41,2022-01-18T12:59:44.418607,2022-01-18T12:59:44.418607,Toronto,695c1c55-bf6a-4265-b68d-4655999a7b85,...,2022-02-11T16:19:41.240088,2022-02-11T16:19:41.240088,6bf2b5a1-c157-4b5a-8a80-bf0b44704273,45cb409f-376c-4f89-8ee7-79595d2668e7,2d83fadb-4b36-42a3-a29d-1ff30b8972d6,539c5e5e-25fa-457d-a2c0-ecf3f88c36f3,,A001B003C001,False,a06a38b6-78b6-4fcc-8a61-3fe9d0af7cea
3,f158c26e-a94e-4994-9a95-de324aa1da23,2022-01-18T12:59:46.312079,2022-01-18T12:59:46.312079,ChemSpeed,08cc75ca-8d1e-4ead-9b56-ad1b6ba83a41,08cc75ca-8d1e-4ead-9b56-ad1b6ba83a41,2022-01-18T12:59:44.418607,2022-01-18T12:59:44.418607,Toronto,9a71f775-c341-42b1-ab7a-cd0917b5e80f,...,2022-02-11T16:19:42.453093,2022-02-11T16:19:42.453093,6bf2b5a1-c157-4b5a-8a80-bf0b44704273,45cb409f-376c-4f89-8ee7-79595d2668e7,d84002d6-88ab-4007-80e3-629404e64478,3275cec5-919c-4f18-bb4c-2ec941f7db3b,,A001B003C002,False,a06a38b6-78b6-4fcc-8a61-3fe9d0af7cea
4,f158c26e-a94e-4994-9a95-de324aa1da23,2022-01-18T12:59:46.312079,2022-01-18T12:59:46.312079,ChemSpeed,08cc75ca-8d1e-4ead-9b56-ad1b6ba83a41,08cc75ca-8d1e-4ead-9b56-ad1b6ba83a41,2022-01-18T12:59:44.418607,2022-01-18T12:59:44.418607,Toronto,8cf70e33-a879-4ab7-879f-5284da51c300,...,2022-02-11T16:19:43.720683,2022-02-11T16:19:43.720683,6bf2b5a1-c157-4b5a-8a80-bf0b44704273,45cb409f-376c-4f89-8ee7-79595d2668e7,c995b067-f927-40cc-98ce-f067b07ef086,63bd1d6d-3e81-48ae-9535-e04dc6d9d32f,,A001B003C003,False,a06a38b6-78b6-4fcc-8a61-3fe9d0af7cea


In [7]:
# distinct synthesis states present in the table; the next cell splits the rows on them
df['synthesis.status'].unique()

array(['PROCESSING', 'SHIPPED', 'DONE', 'FAILED'], dtype=object)

In [8]:
# split the joined table on the synthesis status
_synthesis_status = df['synthesis.status']

# molecules that are either SHIPPED or still PROCESSING:
# these will be constrained in the Gryffin acquisition
df_proc = df.loc[_synthesis_status.isin(['SHIPPED', 'PROCESSING'])]
print('NUM IN PROGRESS : ', len(df_proc))

# finished syntheses, whether they succeeded (DONE) or not (FAILED)
df_done = df.loc[_synthesis_status.isin(['DONE', 'FAILED'])]
print('NUM FINISHED : ', len(df_done))

df_proc.head()

NUM IN PROGRESS :  2
NUM FINISHED :  116


Unnamed: 0,machine.machine_id,machine.created_on,machine.updated_on,machine.name,machine.lab_id,lab.lab_id,lab.created_on,lab.updated_on,lab.name,synthesis.synthesis_id,...,molecule_molecule.created_on,molecule_molecule.updated_on,molecule_molecule.fragment_a,molecule_molecule.fragment_b,molecule_molecule.fragment_c,molecule_molecule.molecule_id,synthesis.parcel_tracking_num,molecule.hid,molecule.commercially_available,molecule.molecule_type_id
0,5600f641-b180-41b1-8043-921e66107744,2022-01-18T12:59:47.995614,2022-01-18T12:59:47.995614,The Machine,b65835e3-b908-45c6-9995-ec98d0caafe7,b65835e3-b908-45c6-9995-ec98d0caafe7,2022-01-18T12:59:47.145756,2022-01-18T12:59:47.145756,Illinois,8cf4e82b-6075-460d-a2ad-cb492f4c4b93,...,2022-02-03T14:24:29.817992,2022-02-03T14:24:29.817992,56471498-825e-47e7-a772-136f0f7bd817,d1bb232c-b83a-49cb-bcec-0e02558a6c41,fb544fb8-5588-41af-ab92-fcc8c3b4e052,b33b1cc9-81d8-4833-803f-466411581a4d,,,,
1,5600f641-b180-41b1-8043-921e66107744,2022-01-18T12:59:47.995614,2022-01-18T12:59:47.995614,The Machine,b65835e3-b908-45c6-9995-ec98d0caafe7,b65835e3-b908-45c6-9995-ec98d0caafe7,2022-01-18T12:59:47.145756,2022-01-18T12:59:47.145756,Illinois,43283ee2-6110-4460-aa57-08cedd716cbd,...,2022-02-09T22:03:57.061626,2022-02-09T22:03:57.061626,05ca4304-bea5-41b7-a7f1-c2bc46462153,d1bb232c-b83a-49cb-bcec-0e02558a6c41,b0ee7203-c40b-4f5b-9b32-c647bd61ddf1,322b8304-39b5-49bc-a307-99719f0d5911,775853526901.0,,,


In [12]:
# get the fragment details for the fragments

def make_gryffin_observations(df_done):
    ''' prepare typical list of dictionary-style observations for Gryffin. This function
    considers all failed experiments as nan-valued objectives. We do not consider the cause
    for failure here (future improvement).

    Fragment lookups go through the global `db`; each distinct fragment id is only
    looked up once per call and then served from a local cache.

    Args:
        df_done (pd.DataFrame): rows with the columns `molecule_molecule.fragment_a/b/c`,
            `molecule.optical_properties` and `synthesis.status`; the status must be
            'DONE' or 'FAILED'

    Returns:
        list of dict: one {'frag_a', 'frag_b', 'frag_c', 'obj'} entry per row, where
            `obj` is the gain cross section as a float ('DONE') or nan ('FAILED')

    Raises:
        ValueError: a 'DONE' row has no `gain_cross_section` in its optical properties
        TypeError: the `gain_cross_section` of a 'DONE' row is not a real number
        NotImplementedError: a row has a status other than 'DONE' / 'FAILED'
    '''
    hid_cache = {}

    def fragment_hid(molecule_id):
        # the same fragments recur in many rows, so remember what the db returned
        if molecule_id not in hid_cache:
            hid_cache[molecule_id] = db.get_fragment_details(
                molecule_id, identifier_type='molecule_id'
            )['id']
        return hid_cache[molecule_id]

    # prepare observation list for Gryffin
    observations = []

    for _, row in df_done.iterrows():
        row = row.to_dict()
        status = row['synthesis.status']

        if status == 'DONE':
            # we should have some optical properties, we can use this data to train Gryffin
            opt_prop = row['molecule.optical_properties']
            if not isinstance(opt_prop, dict) or 'gain_cross_section' not in opt_prop:
                raise ValueError(
                    f'DONE synthesis without a gain_cross_section: {opt_prop!r}'
                )
            obj = opt_prop['gain_cross_section']
            # accept any real number (int, numpy scalar, ...) but not bool or str
            if isinstance(obj, bool) or not isinstance(
                obj, (int, float, np.integer, np.floating)
            ):
                raise TypeError(
                    f'gain_cross_section must be a real number, got {obj!r}'
                )
            obj = float(obj)

        elif status == 'FAILED':
            # failed experiment, pass value to gryffin with nan objective value
            obj = np.nan
        else:
            raise NotImplementedError(f'unsupported synthesis status: {status!r}')

        observations.append(
            {
                'frag_a': fragment_hid(row['molecule_molecule.fragment_a']),
                'frag_b': fragment_hid(row['molecule_molecule.fragment_b']),
                'frag_c': fragment_hid(row['molecule_molecule.fragment_c']),
                'obj': obj,
            }
        )

    return observations


    
def get_processing_molecules(df_proc):
    ''' build the hid of every in-progress molecule

    For each row of `df_proc` the three fragment ids are resolved through the
    global `db` and the resulting fragment hids are concatenated in a, b, c order.
    '''
    fragment_columns = (
        'molecule_molecule.fragment_a',
        'molecule_molecule.fragment_b',
        'molecule_molecule.fragment_c',
    )
    return [
        ''.join(
            db.get_fragment_details(record[column], identifier_type='molecule_id')['id']
            for column in fragment_columns
        )
        for record in (row.to_dict() for _, row in df_proc.iterrows())
    ]
    
def known_constraints(param, in_progress=None, lab_frag_sets=None):
    ''' known constraint that the suggested molecule must not be either shipping or
    in progress (avoids redundant suggestions) and must be makeable in at least
    one lab

    Args:
        param (dict): candidate with the keys 'frag_a', 'frag_b' and 'frag_c'
        in_progress (collection of str, optional): molecule hids that are already
            shipped / processing. Defaults to the notebook global `proc_hids`
        lab_frag_sets (list of dict, optional): one {'frag_a', 'frag_b', 'frag_c'}
            mapping of available fragment hids per lab. Defaults to
            `[toronto_frags]` (notebook global)

    Returns:
        bool: True if the candidate is allowed, False otherwise
    '''
    if in_progress is None:
        in_progress = proc_hids
    if lab_frag_sets is None:
        # TODO: this is only for the first batch, constrain to Toronto ONLY!
        lab_frag_sets = [toronto_frags]  # [toronto_frags, uiuc_frags]

    # reject molecules that are already being made
    query_hid = ''.join([param['frag_a'], param['frag_b'], param['frag_c']])
    if query_hid in in_progress:
        return False

    # the sample must be makeable in at least one single location, i.e. one lab
    # has to stock all three fragments
    return any(
        all(param[key] in lab_frags[key] for key in ('frag_a', 'frag_b', 'frag_c'))
        for lab_frags in lab_frag_sets
    )


In [10]:
# get hids of molecules in process (status SHIPPED or PROCESSING);
# `known_constraints` reads this global to keep them out of the new suggestions
proc_hids = get_processing_molecules(df_proc)
proc_hids

['A012B025C130', 'A015B025C116']

In [11]:
# get gryffin observations: one dict per DONE / FAILED row,
# failed syntheses carry a nan objective
observations = make_gryffin_observations(df_done)

In [13]:
# collect, per fragment type, the unique hids that occur in the previous observations;
# they are added onto the available fragments later (otherwise Gryffin will crash)
a_prev_hids, b_prev_hids, c_prev_hids = (
    list(set(obs[frag_key] for obs in observations))
    for frag_key in ('frag_a', 'frag_b', 'frag_c')
)

In [14]:
def get_lab_frags(lab):
    ''' returns the set of available fragments for a specific laboratory

    Args:
        lab (str): laboratory name, passed on to `db.get_lab_avail_frags`

    Returns:
        dict: {'frag_a': [...], 'frag_b': [...], 'frag_c': [...]} with the hids
            from the `molecule.hid` column that start with 'A', 'B' and 'C'
            respectively; entries without an hid are ignored
    '''
    frags = db.get_lab_avail_frags(lab=lab)
    hids = frags['molecule.hid']

    lab_frags = {}
    for key, prefix in (('frag_a', 'A'), ('frag_b', 'B'), ('frag_c', 'C')):
        # anchor on the first letter; na=False keeps missing hids from turning the
        # mask into a non-boolean array, which pandas refuses to index with
        lab_frags[key] = hids[hids.str.startswith(prefix, na=False)].tolist()

    return lab_frags

def get_frags(strategy='set'):
    ''' gather the fragments of every lab and drop the repeats

    Queries `get_lab_frags` for 'Toronto' and then 'Illinois' and returns three
    lists (a, b and c hids) holding each hid once. `strategy` is accepted but
    not used.
    '''
    per_lab = [get_lab_frags(lab) for lab in ['Toronto', 'Illinois']]

    def unique_hids(frag_key):
        # pool the hids of all labs for one fragment type, then de-duplicate
        return list(set(hid for lab_frags in per_lab for hid in lab_frags[frag_key]))

    return unique_hids('frag_a'), unique_hids('frag_b'), unique_hids('frag_c')
    

In [15]:

# fragments that are available in at least one lab
a_hids_all, b_hids_all, c_hids_all = get_frags()

# add fragments from previous iterations
# (the resulting list order comes from `set`, so it is not sorted)
a_hids_all = list(set(a_hids_all + a_prev_hids))
b_hids_all = list(set(b_hids_all + b_prev_hids))
c_hids_all = list(set(c_hids_all + c_prev_hids))

print('NUM A FRAGS : ', len(a_hids_all))
print('NUM B FRAGS : ', len(b_hids_all))
print('NUM C FRAGS : ', len(c_hids_all))

# size of the full A x B x C product, before `known_constraints` is applied
print('NUM ACCESSIBLE MOLS : ', len(a_hids_all)*len(b_hids_all)*len(c_hids_all))

NUM A FRAGS :  12
NUM B FRAGS :  27
NUM C FRAGS :  161
NUM ACCESSIBLE MOLS :  52164


In [16]:
# get toronto and UIUC available frags
# `toronto_frags` is read as a global by `known_constraints`; `uiuc_frags` only
# appears there in a commented-out alternative
toronto_frags = get_lab_frags('Toronto')
uiuc_frags = get_lab_frags('Illinois')

## Build Gryffin 

In [17]:
def get_descriptors(hid, descriptors):
    ''' looks up and returns descriptors for a particular fragment

    Args:
        hid (str): human readable identifier of the fragment
        descriptors (pd.DataFrame): dataframe corresponding to the type of fragment;
            needs an `hid` column, and every column after the first one is read
            as a descriptor value

    Returns:
        list of float: descriptor values of the first row whose `hid` matches

    Raises:
        IndexError: if no row of `descriptors` has the requested hid
    '''
    matches = descriptors[descriptors.hid == hid]
    if matches.empty:
        raise IndexError(f'no descriptors found for fragment {hid!r}')
    # `np.float` was only an alias of the builtin `float` and was removed in NumPy 1.24
    return matches.iloc[0, 1:].values.astype(float).tolist()

def mol_hid_from_params(params):
    ''' turn parameter dicts into molecule hids

    Each entry of `params` contributes one string: its 'frag_a', 'frag_b' and
    'frag_c' values concatenated in that order.
    '''
    return [
        ''.join([entry['frag_a'], entry['frag_b'], entry['frag_c']])
        for entry in params
    ]
        

In [18]:
# load descriptors from disk, one table per fragment type
pca_desc_a = pd.read_csv('descriptors/pca_desc_a.csv', index_col=None)
pca_desc_b = pd.read_csv('descriptors/pca_desc_b.csv', index_col=None)
pca_desc_c = pd.read_csv('descriptors/pca_desc_c.csv', index_col=None)
# one column is not counted as a descriptor (presumably the `hid` column that
# `get_descriptors` filters on), hence the -1
print('NUM DESC A ', pca_desc_a.shape[1]-1)
print('NUM DESC B ', pca_desc_b.shape[1]-1)
print('NUM DESC C ', pca_desc_c.shape[1]-1)

NUM DESC A  30
NUM DESC B  30
NUM DESC C  35


In [19]:
# set the available fragments to make the Gryffin search space
# (lab stock plus every fragment seen in previous observations; the trailing
# comments show the alternative of using every fragment that has descriptors)
# TODO: change this to only those available
avail_frags_a = a_hids_all # pca_desc_a['hid'].tolist()
avail_frags_b = b_hids_all # pca_desc_b['hid'].tolist()
avail_frags_c = c_hids_all # pca_desc_c['hid'].tolist()
print('NUM AVAIL FRAGS A : ', len(avail_frags_a))
print('NUM AVAIL FRAGS B : ', len(avail_frags_b))
print('NUM AVAIL FRAGS C : ', len(avail_frags_c))

NUM AVAIL FRAGS A :  12
NUM AVAIL FRAGS B :  27
NUM AVAIL FRAGS C :  161


In [52]:
# Gryffin config

# generate descriptors: map every available fragment hid to its list of descriptor values
FRAG_A_DESC = {i: get_descriptors(i, pca_desc_a) for i in avail_frags_a}
FRAG_B_DESC = {i: get_descriptors(i, pca_desc_b) for i in avail_frags_b}
FRAG_C_DESC = {i: get_descriptors(i, pca_desc_c) for i in avail_frags_c}

# fwa = feasibility-weighted acquisition
# fia = feasibility-interpolated acquisition
# fca = feasibility-constrained acquisition --> use, param=0.2 ish works best on cat benchmarks
# NOTE(review): the note above suggests ~0.2 but `feas_param` below is 0.5 — confirm which is intended

config = {
     "general": {
             "backend": 'tensorflow',
             "num_cpus": 4,
             "auto_desc_gen": True, # dynamic Gryffin
             "batches": 1,
             "sampling_strategies": 40,
             "feas_approach": 'fca',
             "feas_param": 0.5,
             "boosted":  True,
             "caching": False,
             "random_seed": 22031996,
             "acquisition_optimizer": 'genetic',
             "verbosity": 3
                },
    # one categorical parameter per fragment type, described by the tables above
    "parameters": [
        {"name": "frag_a", "type": "categorical", "category_details": FRAG_A_DESC },
        {"name": "frag_b", "type": "categorical", "category_details": FRAG_B_DESC },
        {"name": "frag_c", "type": "categorical", "category_details": FRAG_C_DESC },
      
    ],
    # single objective to maximise; 'obj' is the key used in the observation dicts
    "objectives": [
        {"name": "obj", "goal": "max"},
    ]
}

In [53]:
# instantiate gryffin
# `known_constraints` is handed over as a callback; presumably Gryffin evaluates it on
# candidate parameter dicts and only keeps those it returns True for — confirm
gryffin = Gryffin(config_dict=config, known_constraints=known_constraints)

In [54]:
# recommend a batch of samples, conditioned on all previous observations;
# the result is a list of {'frag_a', 'frag_b', 'frag_c'} dicts
samples = gryffin.recommend(observations)

Output()

2022-02-13 11:50:43.497819: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-02-13 11:50:43.527273: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE: forward compatibility was attempted on non supported HW
2022-02-13 11:50:43.527313: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: medusa
2022-02-13 11:50:43.527320: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: medusa
2022-02-13 11:50:43.527458: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: 470.103.1
2022-02-13 11:50:43.527488: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc

2022-02-13 11:50:46.597462: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-02-13 11:50:46.623384: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE: forward compatibility was attempted on non supported HW
2022-02-13 11:50:46.623432: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: medusa
2022-02-13 11:50:46.623445: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: medusa
2022-02-13 11:50:46.623628: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: 470.103.1
2022-02-13 11:50:46.623675: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc

In [55]:
def check_duplicates(samples, observations):
    ''' checks a batch of suggestions for repeated molecules

    Args:
        samples (list of dict): suggestions with the keys 'frag_a', 'frag_b', 'frag_c'
        observations (list of dict): previous observations with the same three keys

    Returns:
        tuple of (bool, bool): `(is_obs_dup, is_samp_dup)` where `is_obs_dup` is True
            if at least one sample is already among the observations and
            `is_samp_dup` is True if a molecule occurs more than once within the batch
    '''
    sample_hids = [''.join([s['frag_a'], s['frag_b'], s['frag_c']]) for s in samples]
    # a set makes the membership test below independent of the number of observations
    observ_hids = {''.join([o['frag_a'], o['frag_b'], o['frag_c']]) for o in observations}

    # check duplicates over previous observations
    is_obs_dup = any(s in observ_hids for s in sample_hids)

    # check duplicates within the batch
    is_samp_dup = len(set(sample_hids)) < len(sample_hids)

    return is_obs_dup, is_samp_dup

In [56]:
# both flags should be False: no suggestion repeats a past observation or another suggestion
check_duplicates(samples, observations)

(False, False)

In [57]:
# show the recommended fragment combinations
samples

[{'frag_a': 'A011', 'frag_b': 'B016', 'frag_c': 'C036'},
 {'frag_a': 'A011', 'frag_b': 'B016', 'frag_c': 'C025'},
 {'frag_a': 'A032', 'frag_b': 'B016', 'frag_c': 'C025'},
 {'frag_a': 'A002', 'frag_b': 'B016', 'frag_c': 'C025'},
 {'frag_a': 'A002', 'frag_b': 'B007', 'frag_c': 'C025'},
 {'frag_a': 'A015', 'frag_b': 'B023', 'frag_c': 'C036'},
 {'frag_a': 'A015', 'frag_b': 'B020', 'frag_c': 'C036'},
 {'frag_a': 'A015', 'frag_b': 'B008', 'frag_c': 'C036'},
 {'frag_a': 'A010', 'frag_b': 'B015', 'frag_c': 'C058'},
 {'frag_a': 'A016', 'frag_b': 'B019', 'frag_c': 'C058'},
 {'frag_a': 'A012', 'frag_b': 'B016', 'frag_c': 'C025'},
 {'frag_a': 'A017', 'frag_b': 'B012', 'frag_c': 'C026'},
 {'frag_a': 'A011', 'frag_b': 'B006', 'frag_c': 'C007'},
 {'frag_a': 'A010', 'frag_b': 'B010', 'frag_c': 'C020'},
 {'frag_a': 'A012', 'frag_b': 'B009', 'frag_c': 'C010'},
 {'frag_a': 'A002', 'frag_b': 'B026', 'frag_c': 'C035'},
 {'frag_a': 'A032', 'frag_b': 'B022', 'frag_c': 'C034'},
 {'frag_a': 'A012', 'frag_b': '

In [58]:
# reaction templates used by `assemble_fragments` to stitch fragments together in silico

# swap a Br/I substituent for Po; the Po-tagged atom is the coupling site of the last two templates
rxn_transmut = allchem.ReactionFromSmarts("[*:1][Br,I]>>[*:1][Po]")
# bond the atom carrying Br/I (reactant 1) to the atom attached to boron in the
# bicyclic N-methyl boronate (reactant 2; looks like a MIDA boronate — confirm)
rxn_b_c = allchem.ReactionFromSmarts(
    "[Br,I][*:1].C[N+]12CC(=O)O[B-]1([*:2])OC(=O)C2>>[*:1]-[*:2]"
)
# bond the atom attached to boron in the tetramethyl cyclic boronate ester
# (reactant 1) to the atom carrying Po (reactant 2)
rxn_a_b_c = allchem.ReactionFromSmarts("CC1(C)OB([*:1])OC1(C)C.[*:2][Po]>>[*:1]-[*:2]")
# same coupling for reactant 1 given as B(O)O, e.g. a boronic acid
rxn_a_b_c2 = allchem.ReactionFromSmarts("[*:1]B(O)O.[*:2][Po]>>[*:1]-[*:2]")


def _first_product(rxn, reactants, step):
    ''' run `rxn` on `reactants` and return the first product of the first product set '''
    products = rxn.RunReactants(reactants)
    if not products:
        # same exception type as indexing the empty result, but with a usable message
        raise IndexError(f'reaction step {step!r} produced no product')
    return products[0][0]


def assemble_fragments(hid_a, hid_b, hid_c):
    ''' builds the SMILES of the a-b-c-b-a molecule from three fragment hids

    The fragment SMILES are fetched through the global `db` and combined with the
    module-level reaction templates `rxn_transmut`, `rxn_b_c`, `rxn_a_b_c` and
    `rxn_a_b_c2`.

    Args:
        hid_a (str): hid of the A fragment
        hid_b (str): hid of the B fragment
        hid_c (str): hid of the C fragment

    Returns:
        str: SMILES of the assembled molecule

    Raises:
        ValueError: the SMILES of at least one fragment cannot be parsed
        IndexError: one of the reaction steps does not produce a product
    '''
    fragments = {}
    for hid in (hid_a, hid_b, hid_c):
        fragment_smiles = db.get_fragment_details(hid, identifier_type='hid')['smiles']
        fragments[hid] = chem.MolFromSmiles(fragment_smiles)

    invalid = [hid for hid, mol in fragments.items() if mol is None]
    if invalid:
        raise ValueError(f"Invalid SMILES for fragment(s): {', '.join(invalid)}")

    a, b, c = fragments[hid_a], fragments[hid_b], fragments[hid_c]

    # Building the molecule... there is probably a better way
    b_transmut = _first_product(rxn_transmut, [b], 'transmute b')
    b_c = _first_product(rxn_b_c, [c, b_transmut], 'c + b')
    b_c_b = _first_product(rxn_b_c, [b_c, b_transmut], 'b-c + b')
    try:
        a_b_c_b = _first_product(rxn_a_b_c, [a, b_c_b], 'a + b-c-b')
        a_b_c_b_a = _first_product(rxn_a_b_c, [a, a_b_c_b], 'a + a-b-c-b')
    except IndexError:
        # the first template did not match fragment a: retry both couplings with
        # the B(O)O template. Only the "no product" case is caught, so that
        # interrupts and genuine errors are no longer swallowed.
        a_b_c_b = _first_product(rxn_a_b_c2, [a, b_c_b], 'a + b-c-b (B(O)O)')
        a_b_c_b_a = _first_product(rxn_a_b_c2, [a, a_b_c_b], 'a + a-b-c-b (B(O)O)')
    return chem.MolToSmiles(a_b_c_b_a)

In [59]:
# hid and assembled SMILES of every suggested molecule, in the order of `samples`
molecule_smiles, molecule_hids = [], []

for sample in samples:
    fragment_hids = [sample['frag_a'], sample['frag_b'], sample['frag_c']]
    molecule_hids.append(''.join(fragment_hids))
    molecule_smiles.append(assemble_fragments(*fragment_hids))
    

In [60]:
# parse the assembled SMILES back into RDKit molecules for drawing
molecule_mols = [chem.MolFromSmiles(s) for s in molecule_smiles]


In [61]:
# draw all suggested molecules on one grid (5 per row), labelled with their hid,
# and save the picture to disk
# NOTE(review): `chem.Draw` is only reachable if something has imported rdkit.Chem.Draw —
# confirm this works on a fresh kernel
img = chem.Draw.MolsToGridImage(
    molecule_mols, molsPerRow=5, subImgSize=(400,400), returnPNG=False, legends=molecule_hids,
)
img.save('first_round_samples_uoft_only.png')

In [62]:
# export the batch as tab-separated '<smiles>\t<hid>' lines
with open('first_batch_smiles_uoft_only.txt', 'w') as f:
    f.writelines(
        f'{smile}\t{hid}\n' for smile, hid in zip(molecule_smiles, molecule_hids)
    )
    

## Write the new target molecules/syntheses to the DB

In [70]:
# register every suggested molecule in the DB, identifying its fragments by hid
# NOTE(review): re-running this cell presumably submits the same targets again —
# confirm whether `create_target_compound` de-duplicates before re-executing
for sample, smi in zip(samples, molecule_smiles):
    print(f'>> Creating DB entry for sample {sample["frag_a"]} {sample["frag_b"]} {sample["frag_c"]}...')
    db.create_target_compound(
            smiles=smi,
            fragment_a=sample['frag_a'],
            fragment_b=sample['frag_b'],
            fragment_c=sample['frag_c'],
            fragment_identifier='hid',
    )

>> Creating DB entry for sample A011 B016 C036...
>> Creating DB entry for sample A011 B016 C025...
>> Creating DB entry for sample A032 B016 C025...
>> Creating DB entry for sample A002 B016 C025...
>> Creating DB entry for sample A002 B007 C025...
>> Creating DB entry for sample A015 B023 C036...
>> Creating DB entry for sample A015 B020 C036...
>> Creating DB entry for sample A015 B008 C036...
>> Creating DB entry for sample A010 B015 C058...
>> Creating DB entry for sample A016 B019 C058...
>> Creating DB entry for sample A012 B016 C025...
>> Creating DB entry for sample A017 B012 C026...
>> Creating DB entry for sample A011 B006 C007...
>> Creating DB entry for sample A010 B010 C020...
>> Creating DB entry for sample A012 B009 C010...
>> Creating DB entry for sample A002 B026 C035...
>> Creating DB entry for sample A032 B022 C034...
>> Creating DB entry for sample A012 B011 C011...
>> Creating DB entry for sample A001 B006 C014...
>> Creating DB entry for sample A011 B022 C043...


In [69]:
# spot-check a single fragment record (hid, SMILES, CAS number, molecule_id)
db.get_fragment_details('A011', identifier_type='hid')

{'id': 'A011',
 'smiles': 'OB(O)c1ccc2c3ccccc3n(-c3ccccc3)c2c1',
 'CAS': '1001911-63-2',
 'molecule_id': 'b230ce95-ad00-4920-9cd3-5d344ea521b6'}

In [72]:
# list the synthesis table to check the DB after the write above
# (presumably the newly created targets are the rows with status AVAILABLE — confirm)
db.client.query_database('synthesis', limit=1000)

Unnamed: 0,synthesis_id,created_on,updated_on,status,lab_id,molecule_id,machine_id,parcel_tracking_num
0,8cf4e82b-6075-460d-a2ad-cb492f4c4b93,2022-02-03T14:24:29.904436,2022-02-03T14:24:29.904436,PROCESSING,b65835e3-b908-45c6-9995-ec98d0caafe7,b33b1cc9-81d8-4833-803f-466411581a4d,5600f641-b180-41b1-8043-921e66107744,
1,43283ee2-6110-4460-aa57-08cedd716cbd,2022-02-09T22:03:57.156014,2022-02-11T09:25:36.506576,SHIPPED,b65835e3-b908-45c6-9995-ec98d0caafe7,322b8304-39b5-49bc-a307-99719f0d5911,5600f641-b180-41b1-8043-921e66107744,775853526901
2,695c1c55-bf6a-4265-b68d-4655999a7b85,2022-02-11T16:19:41.358733,2022-02-11T16:19:41.861241,DONE,08cc75ca-8d1e-4ead-9b56-ad1b6ba83a41,539c5e5e-25fa-457d-a2c0-ecf3f88c36f3,f158c26e-a94e-4994-9a95-de324aa1da23,
3,9a71f775-c341-42b1-ab7a-cd0917b5e80f,2022-02-11T16:19:42.607900,2022-02-11T16:19:43.118409,DONE,08cc75ca-8d1e-4ead-9b56-ad1b6ba83a41,3275cec5-919c-4f18-bb4c-2ec941f7db3b,f158c26e-a94e-4994-9a95-de324aa1da23,
4,8cf70e33-a879-4ab7-879f-5284da51c300,2022-02-11T16:19:43.835325,2022-02-11T16:19:44.321553,DONE,08cc75ca-8d1e-4ead-9b56-ad1b6ba83a41,63bd1d6d-3e81-48ae-9535-e04dc6d9d32f,f158c26e-a94e-4994-9a95-de324aa1da23,
...,...,...,...,...,...,...,...,...
153,c85fd6e2-2a8c-4da3-a540-2c847e6c15c8,2022-02-13T17:09:27.732358,2022-02-13T17:09:27.732358,AVAILABLE,,5cd51cd3-aa08-4008-9f9c-d59488563f8f,,
154,31351e5e-fc8f-4a77-ba16-b2cb3e04d652,2022-02-13T17:09:29.271548,2022-02-13T17:09:29.271548,AVAILABLE,,2025ee1f-aa4b-4945-8079-d1c094a3bb6f,,
155,814a898b-7c01-4a15-bae3-a97ef6f202b0,2022-02-13T17:09:30.314880,2022-02-13T17:09:30.314880,AVAILABLE,,85742e82-2bfe-4c23-a3c7-6d5548028b0b,,
156,5895211e-c565-4945-8f24-187bd7e9a002,2022-02-13T17:09:32.263675,2022-02-13T17:09:32.263675,AVAILABLE,,d29e0792-15ac-4145-8789-dd64a998f57a,,


In [6]:
# load molecules from the first batch
# each line is '<smiles>\t<hid>', the format written by the export cell earlier in this notebook
with open('first_batch_smiles_uoft_only.txt', 'r') as f:
    smiles_ = f.readlines()
    

In [19]:
# split every '<smiles>\t<hid>' line into its two fields
_fields_per_line = [line.split('\t') for line in smiles_]
smiles = [fields[0] for fields in _fields_per_line]
# the hid is the last field on the line, so it still carries the newline
hids = [fields[1].strip() for fields in _fields_per_line]

In [21]:
# parse the reloaded SMILES into RDKit molecules
mols = [chem.MolFromSmiles(s) for s in smiles]

In [22]:
# same grid drawing as for the first image, but with 10 molecules per row (landscape layout)
# NOTE(review): `chem.Draw` is only reachable if something has imported rdkit.Chem.Draw —
# confirm this works on a fresh kernel
img = chem.Draw.MolsToGridImage(
    mols, molsPerRow=10, subImgSize=(400,400), returnPNG=False, legends=hids,
)
img.save('first_round_samples_uoft_only_landscape.png')