In [1]:
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from dataset_utils import *

In [2]:
def get_mean_perf_by_model(perf_df, metric):
    for model in perf_df["model"].unique():
        print(model, round(perf_df[perf_df["model"]==model][metric].mean(), 3))

# With full dataset

## Amine dataset

In [3]:
amine_fp_perfs = pd.DataFrame(joblib.load("performance_excels/natureHTE/amine_fp.joblib"))

In [4]:
print("RR")
get_mean_perf_by_model(amine_fp_perfs, "reciprocal_rank")

RR
baseline 0.718
RPC 0.672
IBM 0.717
IBPL 0.629
LRRF 0.715
LRT 0.695


In [5]:
rfr_amine_fp_perfs = pd.DataFrame(joblib.load("performance_excels/natureHTE/amine_fp_rfr.joblib"))
get_mean_perf_by_model(rfr_amine_fp_perfs, "reciprocal_rank")


rfr 0.71


## Sulfonamide dataset

In [2]:
### Looking at sulfonamide dataset
sulfon_random_perfs = pd.DataFrame(joblib.load("performance_excels/natureHTE/sulfonamide_random.joblib"))
sulfon_onehot_perfs = pd.DataFrame(joblib.load("performance_excels/natureHTE/sulfonamide_onehot.joblib"))
sulfon_fp_perfs = pd.DataFrame(joblib.load("performance_excels/natureHTE/sulfonamide_fp.joblib"))

In [11]:
print("RR")
get_mean_perf_by_model(sulfon_fp_perfs, "reciprocal_rank")
print()
print("KT")
get_mean_perf_by_model(sulfon_fp_perfs, "kendall_tau")

RR
baseline 0.732
RPC 0.729
IBM 0.682
IBPL 0.688
LRRF 0.471
LRT 0.773
rfr 0.768

KT
baseline 0.507
RPC 0.527
IBM 0.466
IBPL 0.456
LRRF 0.03
LRT 0.548
rfr 0.521


In [12]:
print("RR")
get_mean_perf_by_model(sulfon_onehot_perfs, "reciprocal_rank")
print()
print("KT")
get_mean_perf_by_model(sulfon_onehot_perfs, "kendall_tau")

RR
baseline 0.732
RPC 0.732
IBM 0.591
IBPL 0.591
LRRF 0.716
LRT 0.732
rfr 0.682

KT
baseline 0.507
RPC 0.53
IBM 0.392
IBPL 0.392
LRRF 0.476
LRT 0.509
rfr 0.479


In [13]:
print("RR")
get_mean_perf_by_model(sulfon_random_perfs, "reciprocal_rank")
print()
print("KT")
get_mean_perf_by_model(sulfon_random_perfs, "kendall_tau")

RR
baseline 0.732
RPC 0.716
IBM 0.664
IBPL 0.69
LRRF 0.375
LRT 0.552
rfr 0.755

KT
baseline 0.507
RPC 0.467
IBM 0.342
IBPL 0.311
LRRF -0.117
LRT 0.267
rfr 0.531


## Amide dataset

In [14]:
### Looking at sulfonamide dataset
amide_random_perfs = pd.DataFrame(joblib.load("performance_excels/natureHTE/amide_random.joblib"))
amide_onehot_perfs = pd.DataFrame(joblib.load("performance_excels/natureHTE/amide_onehot.joblib"))
amide_fp_perfs = pd.DataFrame(joblib.load("performance_excels/natureHTE/amide_fp.joblib"))

In [15]:
print("RR")
get_mean_perf_by_model(amide_fp_perfs, "reciprocal_rank")
print()
print("KT")
get_mean_perf_by_model(amide_fp_perfs, "kendall_tau")

RR
baseline 0.631
RPC 0.616
IBM 0.482
IBPL 0.5
LRRF 0.479
LRT 0.601
rfr 0.669

KT
baseline 0.319
RPC 0.368
IBM 0.228
IBPL 0.272
LRRF -0.069
LRT 0.198
rfr 0.375


In [16]:
print("RR")
get_mean_perf_by_model(amide_onehot_perfs, "reciprocal_rank")
print()
print("KT")
get_mean_perf_by_model(amide_onehot_perfs, "kendall_tau")

RR
baseline 0.631
RPC 0.631
IBM 0.455
IBPL 0.455
LRRF 0.586
LRT 0.417
rfr 0.521

KT
baseline 0.319
RPC 0.164
IBM 0.204
IBPL 0.204
LRRF 0.2
LRT 0.081
rfr 0.262


In [17]:
print("RR")
get_mean_perf_by_model(amide_random_perfs, "reciprocal_rank")
print()
print("KT")
get_mean_perf_by_model(amide_random_perfs, "kendall_tau")

RR
baseline 0.631
RPC 0.613
IBM 0.533
IBPL 0.613
LRRF 0.58
LRT 0.381
rfr 0.571

KT
baseline 0.319
RPC 0.319
IBM 0.152
IBPL 0.224
LRRF 0.064
LRT -0.13
rfr 0.262


# With balanced dataset

In [13]:
balanced_sulfon_perf = pd.DataFrame(joblib.load("performance_excels/natureHTE/balanced_sulfonamide_fp.joblib"))

In [16]:
print("RR")
get_mean_perf_by_model(balanced_sulfon_perf, "reciprocal_rank")
print()
print("KT")
get_mean_perf_by_model(balanced_sulfon_perf, "kendall_tau")

RR
baseline 0.645
RPC 0.614
IBM 0.645
IBPL 0.645
LRRF 0.474
LRT 0.588
rfr 0.632
MCLR 0.588
MCSVM 0.561
MCRFC 0.732

KT
baseline 0.363
RPC 0.328
IBM 0.437
IBPL 0.348
LRRF -0.093
LRT 0.258
rfr 0.404
MCLR -0.402
MCSVM -0.242
MCRFC -0.472


In [17]:
balanced_amide_perf = pd.DataFrame(joblib.load("performance_excels/natureHTE/balanced_amide_fp.joblib"))
print("RR")
get_mean_perf_by_model(balanced_amide_perf, "reciprocal_rank")
print()
print("KT")
get_mean_perf_by_model(balanced_amide_perf, "kendall_tau")

RR
baseline 0.456
RPC 0.535
IBM 0.544
IBPL 0.544
LRRF 0.531
LRT 0.386
rfr 0.548
MCLR 0.5
MCSVM 0.469
MCRFC 0.469

KT
baseline 0.087
RPC 0.317
IBM 0.231
IBPL 0.249
LRRF 0.11
LRT -0.13
rfr 0.228
MCLR -0.07
MCSVM 0.433
MCRFC -0.055


In [19]:
balanced_amine_perf = pd.DataFrame(joblib.load("performance_excels/natureHTE/balanced_amine_fp.joblib"))
print("RR")
get_mean_perf_by_model(balanced_amine_perf, "reciprocal_rank")
print()
print("KT")
get_mean_perf_by_model(balanced_amine_perf, "kendall_tau")

RR
baseline 0.597
RPC 0.571
IBM 0.546
IBPL 0.539
LRRF 0.617
LRT 0.625
rfr 0.63
MCLR 0.611
MCSVM 0.62
MCRFC 0.724

KT
baseline -0.026
RPC 0.128
IBM 0.174
IBPL 0.141
LRRF 0.162
LRT 0.247
rfr 0.204
MCLR -0.129
MCSVM -0.168
MCRFC -0.214


# Dataset structure analysis

In [3]:
RAW_DATA = pd.read_excel(
    "datasets/natureHTE/natureHTE.xlsx",
    sheet_name="Report - Variable Conditions",
    usecols=["BB SMILES", "Chemistry", "Catalyst", "Base", "Rel. % Conv."],
)

AMINE_DATA = RAW_DATA[RAW_DATA["Chemistry"] == "Amine"]
SULFON_DATA = RAW_DATA[RAW_DATA["Chemistry"] == "Sulfonamide"].reset_index()
AMIDE_DATA = RAW_DATA[RAW_DATA["Chemistry"] == "Amide"].reset_index()

In [18]:
a = np.zeros((4,3))
a[[0,2,3],[0,1,2]] = 1
print(a)

[[1. 0. 0.]
 [0. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]


In [11]:
AMINE_DATA["BB SMILES"].unique()

array(['c1cn[nH]c1', 'C1CCNC1', 'Nc1ccon1', 'Nc1nc[nH]n1', 'C1CCNCC1',
       'C1COCCN1', 'Nc1ccccn1', 'Nc1cccnc1', 'Nc1ncncn1', 'Cn1nccc1N',
       'Cc1onc(N)c1', 'Cc1coc(N)n1', 'Nc1nccs1', 'Nc1nncs1', 'Cc1ccccc1N',
       'NCc1ccccc1', 'Nc1n[nH]cc1C#N', 'Cc1ccc(N)nc1', 'Cc1ccnc(N)c1',
       'Cc1ccnc(N)n1', 'Cc1csc(N)n1', 'CCN1CCNCC1', 'Cc1nnc(N)s1',
       'CC1CNCC(C)O1', 'Nc1cccc(c1)C#N', 'Nc1ccc(cc1)C#N',
       'c1ccc2[nH]ncc2c1', 'C1NCc2ccccc12', 'C1NCc2cnccc12',
       'Cc1ccc(N)cc1C', 'CNCc1ccccc1', 'Cc1cc(C)nc(N)c1', 'COc1ccccc1N',
       'Nc1sccc1C#N', 'COc1ncccc1N', 'COc1cc(N)ncn1', 'Cc1ccc(N)cc1F',
       'CNc1ccc(F)cc1', 'CNCc1occc1', 'CC(C)N1CCNCC1', 'CCc1nnc(N)s1',
       'Nc1ccc(F)c(F)c1', 'Nc1ccc2[nH]ccc2c1', 'Nc1nc2ccccc2[nH]1',
       'Nc1n[nH]c2ccccc12', 'Nc1cccc2CCCc12', 'NC1Cc2ccccc2C1',
       'C1Cc2ccccc2CN1', 'Nc1oc2ccccc2n1', 'Nc1noc2ccccc12',
       'Nc1ccc2OCCc2c1', 'Cc1ncc2CNCc2n1', 'CNCCc1ccccc1',
       'CN(C)c1ccc(N)cc1', 'Nc1ccc2OCOc2c1', 'COc1ccc(C)c(

### Looking at which substrates to remove in each class

In [12]:
amine_yields = AMINE_DATA["Rel. % Conv."].to_numpy().reshape((96,4))
amine_rank = yield_to_ranking(amine_yields)
best_conds = np.where(amine_rank==1)[1]
for i in range(4):
    print(np.sum(best_conds==i))

amine_yields[np.argwhere(np.isnan(amine_yields))] = 0

amine_easiest_cases = {}
amines_to_remove = []
for i, row in enumerate(amine_yields) :
    if np.sum(row) == 0 :
        amines_to_remove.append(i)
    else:
        if np.argmax(row) == 0 :
            two_best = np.argpartition(row, kth=2)[:2]
            amine_easiest_cases.update({
                i:row[two_best[0]] - row[two_best[1]]
            })

sorted_easiest_cases = sorted(amine_easiest_cases.items(), key=lambda x:x[1])
more_amines_to_remove = [sorted_easiest_cases[i][0] for i in range(35)]
amines_to_remove += more_amines_to_remove # still 54 left

51
12
16
15


In [19]:
sorted_easiest_cases

[(32, -69.0),
 (66, -48.0),
 (40, -47.0),
 (45, -45.0),
 (41, -39.0),
 (44, -39.0),
 (34, -38.0),
 (7, -29.0),
 (13, -26.0),
 (72, -25.0),
 (82, -25.0),
 (73, -24.0),
 (89, -24.0),
 (3, -22.0),
 (86, -21.0),
 (74, -20.0),
 (55, -19.0),
 (93, -19.0),
 (12, -18.0),
 (18, -16.0),
 (64, -16.0),
 (76, -16.0),
 (85, -15.0),
 (69, -14.0),
 (19, -13.0),
 (54, -13.0),
 (6, -12.0),
 (88, -11.0),
 (14, -10.0),
 (92, -9.0),
 (38, -8.0),
 (5, -7.0),
 (80, -6.0),
 (10, -5.0),
 (20, -5.0),
 (57, -5.0),
 (79, -5.0),
 (23, -3.0),
 (91, -3.0),
 (95, -3.0),
 (27, -2.0),
 (35, -2.0),
 (59, -2.0),
 (83, -2.0),
 (87, -2.0),
 (8, -1.0),
 (17, -1.0),
 (30, -1.0),
 (67, -1.0),
 (70, -1.0),
 (4, 0.0),
 (9, 0.0),
 (28, 0.0),
 (51, 0.0),
 (60, 0.0),
 (62, 0.0)]

In [10]:
amide_yields = AMIDE_DATA["Rel. % Conv."].to_numpy().reshape((32,4))
amide_yields[np.argwhere(np.isnan(amide_yields))] = 0
amide_rank = yield_to_ranking(amide_yields)
best_conds = np.where(amide_rank==1)[1]
for i in range(4):
    print(np.sum(best_conds==i))
# remove 3 and 6 examples from labels 1 and 3, respectively.
amide_label1 = {}
amide_label3 = {}
amides_to_remove = []
for i, row in enumerate(amide_yields) :
    if np.sum(row) == 0 :
        amides_to_remove.append(i)
    else :
        if np.argmax(row) == 1 :
            two_best = np.argpartition(row, kth=2)[:2]
            amide_label1.update({
                i:row[two_best[0]] - row[two_best[1]]
            })
        elif np.argmax(row) == 3 :
            two_best = np.argpartition(row, kth=2)[:2]
            amide_label3.update({
                i:row[two_best[0]] - row[two_best[1]]
            })

sorted_amide_label1 = sorted(amide_label1.items(), key=lambda x:x[1])
sorted_amide_label3 = sorted(amide_label3.items(), key=lambda x:x[1])
more_amides_to_remove = [sorted_amide_label1[i][0] for i in range(3)] + [sorted_amide_label3[i][0] for i in range(6)]
amides_to_remove += more_amides_to_remove

print(len(amides_to_remove))

3
9
4
12
13


In [11]:
sulfon_yields = SULFON_DATA["Rel. % Conv."].to_numpy().reshape((32,4))
sulfon_yields[np.argwhere(np.isnan(sulfon_yields))] = 0
sulfon_rank = yield_to_ranking(sulfon_yields)
best_conds = np.where(sulfon_rank==1)[1]
for i in range(4):
    print(np.sum(best_conds==i))
# remove 3 and 6 examples from labels 1 and 3, respectively.
sulfon_label0 = {}
sulfon_label2 = {}
sulfon_to_remove = []
for i, row in enumerate(sulfon_yields) :
    if np.sum(row) == 0 :
        sulfon_to_remove.append(i)
    else :
        if np.argmax(row) == 0 :
            two_best = np.argpartition(row, kth=2)[:2]
            sulfon_label0.update({
                i:row[two_best[0]] - row[two_best[1]]
            })
        elif np.argmax(row) == 2 :
            two_best = np.argpartition(row, kth=2)[:2]
            sulfon_label2.update({
                i:row[two_best[0]] - row[two_best[1]]
            })

sorted_sulfon_label0 = sorted(sulfon_label0.items(), key=lambda x:x[1])
sorted_sulfon_label2 = sorted(sulfon_label2.items(), key=lambda x:x[1])
more_sulfon_to_remove = [sorted_sulfon_label0[i][0] for i in range(3)] + [sorted_sulfon_label2[i][0] for i in range(10)]
sulfon_to_remove += more_sulfon_to_remove
print()
print(len(sulfon_to_remove))

10
1
17
4

13


In [12]:
joblib.dump(amines_to_remove, "datasets/natureHTE/nature_amine_inds_to_remove.joblib")
joblib.dump(amides_to_remove, "datasets/natureHTE/nature_amide_inds_to_remove.joblib")
joblib.dump(sulfon_to_remove, "datasets/natureHTE/nature_sulfon_inds_to_remove.joblib")

['datasets/natureHTE/nature_sulfon_inds_to_remove.joblib']