In [100]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import itertools

import olympus
from olympus.datasets import Dataset
from olympus.objects import ParameterVector

In [101]:
# Load the five Buchwald datasets (kinds a through e), one name per kind.
dataset_a, dataset_b, dataset_c, dataset_d, dataset_e = [
    Dataset(kind=kind)
    for kind in (
        'buchwald_a',
        'buchwald_b',
        'buchwald_c',
        'buchwald_d',
        'buchwald_e',
    )
]
# Show the first rows of dataset a as this cell's output.
dataset_a.data.head()

Unnamed: 0,aryl_halide,additive,base,ligand,yield
0,FC(F)(F)c1ccc(Cl)cc1,o1nccc1c2ccccc2,CN(C)P(N(C)C)(N(C)C)=NP(N(C)C)(N(C)C)=NCC,CC(C)C1=CC(C(C)C)=CC(C(C)C)=C1C2=C(P(C3CCCCC3)...,10.657812
1,FC(F)(F)c1ccc(Cl)cc1,o1nccc1c2ccccc2,CN(C)/C(N(C)C)=N\C(C)(C)C,CC(C)C1=CC(C(C)C)=CC(C(C)C)=C1C2=C(P(C3CCCCC3)...,14.018641
2,FC(F)(F)c1ccc(Cl)cc1,o1nccc1c2ccccc2,CN1CCCN2CCCN=C12,CC(C)C1=CC(C(C)C)=CC(C(C)C)=C1C2=C(P(C3CCCCC3)...,14.967396
3,FC(F)(F)c1ccc(Cl)cc1,CCOC(=O)c1onc(C)c1,CN(C)P(N(C)C)(N(C)C)=NP(N(C)C)(N(C)C)=NCC,CC(C)C1=CC(C(C)C)=CC(C(C)C)=C1C2=C(P(C3CCCCC3)...,3.606677
4,FC(F)(F)c1ccc(Cl)cc1,CCOC(=O)c1onc(C)c1,CN(C)/C(N(C)C)=N\C(C)(C)C,CC(C)C1=CC(C(C)C)=CC(C(C)C)=C1C2=C(P(C3CCCCC3)...,5.008761


In [102]:
def lookup(df_results, aryl_halide, additive, base, ligand):
    """Return the 'yield' of the row matching all four reaction components.

    Args:
        df_results: DataFrame with 'aryl_halide', 'additive', 'base',
            'ligand' and 'yield' columns.
        aryl_halide, additive, base, ligand: values compared for equality
            against the column of the same name.

    Returns:
        The 'yield' entry of the single matching row, or 0.0 when no row
        matches.

    Raises:
        AssertionError: if more than one row matches.
        ValueError: if more than one row matches while assertions are
            disabled (the only way the final raise can be reached).
    """
    # Combine the four equality tests one column at a time.
    is_hit = df_results['aryl_halide'] == aryl_halide
    is_hit = is_hit & (df_results['additive'] == additive)
    is_hit = is_hit & (df_results['base'] == base)
    is_hit = is_hit & (df_results['ligand'] == ligand)

    hits = df_results.loc[is_hit]
    n_hits = len(hits)
    assert n_hits in [1, 0]

    if n_hits == 1:
        return hits.loc[:, 'yield'].to_numpy()[0]
    if n_hits == 0:
        return  0.0
    raise ValueError()
        
        
def param_to_ohe(dataset, aryl_halide, additive, base, ligand):
    """Encode one reaction as concatenated one-hot segments.

    The parameters of ``dataset.param_space`` are paired, in order, with
    (aryl_halide, additive, base, ligand). Each pair contributes a segment
    as long as that parameter's ``options``, holding 1.0 at the position of
    the chosen value and 0.0 elsewhere.

    Returns:
        1-D numpy array holding all segments back to back.

    Raises:
        Whatever ``options.index`` raises when a value is not among the
        parameter's options (ValueError for a list).
    """
    chosen_values = (aryl_halide, additive, base, ligand)
    encoded = []
    for param, val in zip(dataset.param_space, chosen_values):
        hot = param.options.index(val)
        encoded += [1. if k == hot else 0. for k in range(len(param.options))]
    return np.array(encoded)


def create_options(dataset):
    """List every combination of parameter options for a dataset.

    Args:
        dataset: object whose ``param_space`` iterates over parameters that
            each expose an ``options`` sequence.

    Returns:
        A list of lists: the Cartesian product of all ``options``, in
        ``itertools.product`` order (last parameter varies fastest). Each
        inner list holds one option per parameter, in ``param_space`` order.
    """
    # Only the option lists are needed; parameter names play no part here.
    option_lists = [p.options for p in dataset.param_space]
    return [list(combo) for combo in itertools.product(*option_lists)]

In [103]:
# Enumerate every option combination of dataset a and evaluate each one.
product = create_options(dataset_a)
yields, params = [], []
for param in product:
    # Name the positional entries of the combination.
    p = dict(
        aryl_halide=param[0],
        additive=param[1],
        base=param[2],
        ligand=param[3],
    )
    y = dataset_a.run(
        ParameterVector().from_dict(p, dataset_a.param_space)
    )
    # Keep y[0][0]: the first element of the first item returned by run().
    yields.append(y[0][0])
    ohe = param_to_ohe(dataset_a, *param[:4])
    params.append(ohe)

# Order the combinations from the largest recorded value to the smallest.
ix = np.argsort(yields)[::-1]
sort_yields = [yields[rank] for rank in ix]
sort_params = [product[rank] for rank in ix]

In [104]:
# First ten entries of the descending-sorted lists for dataset a.
best_a = dict(params=sort_params[:10], values=sort_yields[:10])
# Every evaluated combination: encoded inputs plus values as one column.
task_a = dict(
    params=np.array(params),
    values=np.array(yields).reshape(-1, 1),
)

In [105]:
# Enumerate every option combination of dataset b and evaluate each one.
product = create_options(dataset_b)
yields, params = [], []
for param in product:
    # Name the positional entries of the combination.
    p = dict(
        aryl_halide=param[0],
        additive=param[1],
        base=param[2],
        ligand=param[3],
    )
    y = dataset_b.run(
        ParameterVector().from_dict(p, dataset_b.param_space)
    )
    # Keep y[0][0]: the first element of the first item returned by run().
    yields.append(y[0][0])
    ohe = param_to_ohe(dataset_b, *param[:4])
    params.append(ohe)

# Order the combinations from the largest recorded value to the smallest.
ix = np.argsort(yields)[::-1]
sort_yields = [yields[rank] for rank in ix]
sort_params = [product[rank] for rank in ix]

In [106]:
# First ten entries of the descending-sorted lists for dataset b.
best_b = dict(params=sort_params[:10], values=sort_yields[:10])
# Every evaluated combination: encoded inputs plus values as one column.
task_b = dict(
    params=np.array(params),
    values=np.array(yields).reshape(-1, 1),
)

In [107]:
# Enumerate every option combination of dataset c and evaluate each one.
product = create_options(dataset_c)
yields, params = [], []
for param in product:
    # Name the positional entries of the combination.
    p = dict(
        aryl_halide=param[0],
        additive=param[1],
        base=param[2],
        ligand=param[3],
    )
    y = dataset_c.run(
        ParameterVector().from_dict(p, dataset_c.param_space)
    )
    # Keep y[0][0]: the first element of the first item returned by run().
    yields.append(y[0][0])
    ohe = param_to_ohe(dataset_c, *param[:4])
    params.append(ohe)

# Order the combinations from the largest recorded value to the smallest.
ix = np.argsort(yields)[::-1]
sort_yields = [yields[rank] for rank in ix]
sort_params = [product[rank] for rank in ix]

In [108]:
# First ten entries of the descending-sorted lists for dataset c.
best_c = dict(params=sort_params[:10], values=sort_yields[:10])
# Every evaluated combination: encoded inputs plus values as one column.
task_c = dict(
    params=np.array(params),
    values=np.array(yields).reshape(-1, 1),
)

In [109]:
# Enumerate every option combination of dataset d and evaluate each one.
product = create_options(dataset_d)
yields, params = [], []
for param in product:
    # Name the positional entries of the combination.
    p = dict(
        aryl_halide=param[0],
        additive=param[1],
        base=param[2],
        ligand=param[3],
    )
    y = dataset_d.run(
        ParameterVector().from_dict(p, dataset_d.param_space)
    )
    # Keep y[0][0]: the first element of the first item returned by run().
    yields.append(y[0][0])
    ohe = param_to_ohe(dataset_d, *param[:4])
    params.append(ohe)

# Order the combinations from the largest recorded value to the smallest.
ix = np.argsort(yields)[::-1]
sort_yields = [yields[rank] for rank in ix]
sort_params = [product[rank] for rank in ix]

In [110]:
# First ten entries of the descending-sorted lists for dataset d.
best_d = dict(params=sort_params[:10], values=sort_yields[:10])
# Every evaluated combination: encoded inputs plus values as one column.
task_d = dict(
    params=np.array(params),
    values=np.array(yields).reshape(-1, 1),
)

In [111]:
# Enumerate every option combination of dataset e and evaluate each one.
product = create_options(dataset_e)
yields, params = [], []
for param in product:
    # Name the positional entries of the combination.
    p = dict(
        aryl_halide=param[0],
        additive=param[1],
        base=param[2],
        ligand=param[3],
    )
    y = dataset_e.run(
        ParameterVector().from_dict(p, dataset_e.param_space)
    )
    # Keep y[0][0]: the first element of the first item returned by run().
    yields.append(y[0][0])
    ohe = param_to_ohe(dataset_e, *param[:4])
    params.append(ohe)

# Order the combinations from the largest recorded value to the smallest.
ix = np.argsort(yields)[::-1]
sort_yields = [yields[rank] for rank in ix]
sort_params = [product[rank] for rank in ix]

In [112]:
# First ten entries of the descending-sorted lists for dataset e.
best_e = dict(params=sort_params[:10], values=sort_yields[:10])
# Every evaluated combination: encoded inputs plus values as one column.
task_e = dict(
    params=np.array(params),
    values=np.array(yields).reshape(-1, 1),
)

In [113]:
# Gather the per-dataset results, keeping the a-to-e order in both lists.
best = [
    best_a,
    best_b,
    best_c,
    best_d,
    best_e,
]
tasks = [
    task_a,
    task_b,
    task_c,
    task_d,
    task_e,
]

In [114]:
# Write both result lists to disk. The `with` blocks close each file as soon
# as its dump finishes, so the data is flushed and no handle is left open in
# the kernel.
with open('buchwald_best.pkl', 'wb') as best_file:
    pickle.dump(best, best_file)
with open('buchwald_tasks.pkl', 'wb') as tasks_file:
    pickle.dump(tasks, tasks_file)