In [1]:
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
def get_mean_perf_by_model(perf_df, metric):
    for model in perf_df["model"].unique():
        print(model, round(perf_df[perf_df["model"]==model][metric].mean(), 3))

# Amine dataset

In [3]:
amine_fp_perfs = pd.DataFrame(joblib.load("performance_excels/natureHTE/amine_fp.joblib"))

In [4]:
print("RR")
get_mean_perf_by_model(amine_fp_perfs, "reciprocal_rank")

RR
baseline 0.718
RPC 0.672
IBM 0.717
IBPL 0.629
LRRF 0.715
LRT 0.695


In [5]:
rfr_amine_fp_perfs = pd.DataFrame(joblib.load("performance_excels/natureHTE/amine_fp_rfr.joblib"))
get_mean_perf_by_model(rfr_amine_fp_perfs, "reciprocal_rank")


rfr 0.71


# Sulfonamide dataset

In [2]:
### Looking at sulfonamide dataset
sulfon_random_perfs = pd.DataFrame(joblib.load("performance_excels/natureHTE/sulfonamide_random.joblib"))
sulfon_onehot_perfs = pd.DataFrame(joblib.load("performance_excels/natureHTE/sulfonamide_onehot.joblib"))
sulfon_fp_perfs = pd.DataFrame(joblib.load("performance_excels/natureHTE/sulfonamide_fp.joblib"))

In [11]:
print("RR")
get_mean_perf_by_model(sulfon_fp_perfs, "reciprocal_rank")
print()
print("KT")
get_mean_perf_by_model(sulfon_fp_perfs, "kendall_tau")

RR
baseline 0.732
RPC 0.729
IBM 0.682
IBPL 0.688
LRRF 0.471
LRT 0.773
rfr 0.768

KT
baseline 0.507
RPC 0.527
IBM 0.466
IBPL 0.456
LRRF 0.03
LRT 0.548
rfr 0.521


In [12]:
print("RR")
get_mean_perf_by_model(sulfon_onehot_perfs, "reciprocal_rank")
print()
print("KT")
get_mean_perf_by_model(sulfon_onehot_perfs, "kendall_tau")

RR
baseline 0.732
RPC 0.732
IBM 0.591
IBPL 0.591
LRRF 0.716
LRT 0.732
rfr 0.682

KT
baseline 0.507
RPC 0.53
IBM 0.392
IBPL 0.392
LRRF 0.476
LRT 0.509
rfr 0.479


In [13]:
print("RR")
get_mean_perf_by_model(sulfon_random_perfs, "reciprocal_rank")
print()
print("KT")
get_mean_perf_by_model(sulfon_random_perfs, "kendall_tau")

RR
baseline 0.732
RPC 0.716
IBM 0.664
IBPL 0.69
LRRF 0.375
LRT 0.552
rfr 0.755

KT
baseline 0.507
RPC 0.467
IBM 0.342
IBPL 0.311
LRRF -0.117
LRT 0.267
rfr 0.531


# Amide dataset

In [14]:
### Looking at sulfonamide dataset
amide_random_perfs = pd.DataFrame(joblib.load("performance_excels/natureHTE/amide_random.joblib"))
amide_onehot_perfs = pd.DataFrame(joblib.load("performance_excels/natureHTE/amide_onehot.joblib"))
amide_fp_perfs = pd.DataFrame(joblib.load("performance_excels/natureHTE/amide_fp.joblib"))

In [15]:
print("RR")
get_mean_perf_by_model(amide_fp_perfs, "reciprocal_rank")
print()
print("KT")
get_mean_perf_by_model(amide_fp_perfs, "kendall_tau")

RR
baseline 0.631
RPC 0.616
IBM 0.482
IBPL 0.5
LRRF 0.479
LRT 0.601
rfr 0.669

KT
baseline 0.319
RPC 0.368
IBM 0.228
IBPL 0.272
LRRF -0.069
LRT 0.198
rfr 0.375


In [16]:
print("RR")
get_mean_perf_by_model(amide_onehot_perfs, "reciprocal_rank")
print()
print("KT")
get_mean_perf_by_model(amide_onehot_perfs, "kendall_tau")

RR
baseline 0.631
RPC 0.631
IBM 0.455
IBPL 0.455
LRRF 0.586
LRT 0.417
rfr 0.521

KT
baseline 0.319
RPC 0.164
IBM 0.204
IBPL 0.204
LRRF 0.2
LRT 0.081
rfr 0.262


In [17]:
print("RR")
get_mean_perf_by_model(amide_random_perfs, "reciprocal_rank")
print()
print("KT")
get_mean_perf_by_model(amide_random_perfs, "kendall_tau")

RR
baseline 0.631
RPC 0.613
IBM 0.533
IBPL 0.613
LRRF 0.58
LRT 0.381
rfr 0.571

KT
baseline 0.319
RPC 0.319
IBM 0.152
IBPL 0.224
LRRF 0.064
LRT -0.13
rfr 0.262


# Dataset structure analysis

In [2]:
RAW_DATA = pd.read_excel(
    "datasets/natureHTE/natureHTE.xlsx",
    sheet_name="Report - Variable Conditions",
    usecols=["BB SMILES", "Chemistry", "Catalyst", "Base", "Rel. % Conv."],
)

AMINE_DATA = RAW_DATA[RAW_DATA["Chemistry"] == "Amine"]
SULFON_DATA = RAW_DATA[RAW_DATA["Chemistry"] == "Sulfonamide"].reset_index()
AMIDE_DATA = RAW_DATA[RAW_DATA["Chemistry"] == "Amide"].reset_index()

In [15]:
amine_yields = AMINE_DATA["Rel. % Conv."].to_numpy().reshape((96,4))
amine_yields[np.argwhere(np.isnan(amine_yields))] = 0
np.mean(amine_yields, axis=0)

array([59.4375    , 36.48958333, 32.07291667, 33.27083333])

In [11]:
from dataset_utils import *
amine_rank = yield_to_ranking(AMINE_DATA["Rel. % Conv."].to_numpy().reshape((96,4)))
best_conds = np.where(amine_rank==1)[1]
for i in range(4):
    print(np.sum(best_conds==i))

51
12
16
15


In [12]:
sec_best_conds = np.where(amine_rank==2)[1]
for i in range(4):
    print(np.sum(sec_best_conds==i))

19
35
13
20


In [17]:
from collections import Counter
tlac_y = np.argmin(amine_yields, axis=1)
Counter(tlac_y)

Counter({2: 44, 3: 21, 0: 18, 1: 13})