In [None]:
%pip install matplotlib pandas tabulate

In [None]:
# General configuration

import os
import sys

import matplotlib.pyplot as plt
import pandas as pd

sys.path.append(os.path.abspath("src"))
from helper import path

# Render floats with three decimals and let wide frames print on one line
# (up to 160 characters) instead of wrapping.
pd.set_option("display.float_format", "{:.3f}".format)
pd.set_option("display.width", 160)

In [None]:
# Path management

OUTPUT_PATH = path("output")


def _perf_csv(filename, subdir=""):
    """Resolve a performance CSV below OUTPUT_PATH through the `path` helper.

    `subdir` is inserted verbatim between OUTPUT_PATH and `filename`, so a
    non-empty value must carry its own trailing "/".
    """
    return path(f"{OUTPUT_PATH}/{subdir}{filename}")


# Performance tables stored directly in the output directory.
PERF_2007_PATH = _perf_csv("2007.csv")
PERF_2013_PATH = _perf_csv("2013.csv")
PERF_2016_PATH = _perf_csv("2016.csv")
PERF_2019_PATH = _perf_csv("2019.csv")
PERF_2020_PATH = _perf_csv("2020.csv")

# Same benchmark years, stored in the "orig_input" subdirectory ("OI").
OI_PERF_2007_PATH = _perf_csv("2007.csv", "orig_input/")
OI_PERF_2013_PATH = _perf_csv("2013.csv", "orig_input/")
OI_PERF_2016_PATH = _perf_csv("2016.csv", "orig_input/")
OI_PERF_2019_PATH = _perf_csv("2019.csv", "orig_input/")
OI_PERF_2020_PATH = _perf_csv("2020.csv", "orig_input/")

In [None]:
# Data preparation


def map_perf_to_row(name, df):
    """Flatten a per-model metrics table into a single-row DataFrame.

    Parameters
    ----------
    name
        Label stored in the "Benchmark" column of the result.
    df
        Table indexed as ``df[<model>][<metric>]`` for the models "Ligand",
        "Amino" and "Amino+Ligand" and the metrics "PearsonR" and "RMSE".

    Returns
    -------
    pandas.DataFrame
        One row (index 0) with "Benchmark" followed by a "<prefix> PersonR"
        and "<prefix> RMSE" column for each model.
    """
    # Output column prefix -> model key inside `df`.
    model_keys = {
        "RDKit": "Ligand",
        "Amino count": "Amino",
        "RDKit+amino": "Amino+Ligand",
    }

    row = {"Benchmark": name}
    for prefix, model in model_keys.items():
        scores = df[model]
        # The output header is spelled "PersonR" (sic); the Table 1 cell
        # builds the same header strings, so the spelling must stay in sync.
        row[f"{prefix} PersonR"] = scores["PearsonR"]
        row[f"{prefix} RMSE"] = scores["RMSE"]

    return pd.DataFrame(row, index=[0])


def create_perf_dataframe(path, name, orig_input):
    """Load one performance CSV and return it as a single labelled row.

    Parameters
    ----------
    path
        Location of the CSV; its first column is used as the index.
    name
        Benchmark name used as the start of the row label.
    orig_input
        Truthy -> label suffix "[Original Input]", otherwise "[Reproduction]".

    Returns
    -------
    pandas.DataFrame
        The row produced by ``map_perf_to_row``, indexed by "Benchmark".
    """
    variant = "Original Input" if orig_input else "Reproduction"
    metrics = pd.read_csv(path, index_col=0)
    # Transpose so the CSV's row labels become the columns that
    # map_perf_to_row indexes first.
    row = map_perf_to_row(f"{name} [{variant}]", metrics.T)
    return row.set_index("Benchmark")


# Rows labelled "[Reproduction]" (orig_input=False), one per benchmark year.
perf_2007, perf_2013, perf_2016, perf_2019, perf_2020 = (
    create_perf_dataframe(csv_path, benchmark, False)
    for csv_path, benchmark in (
        (PERF_2007_PATH, "CASF-2007"),
        (PERF_2013_PATH, "CASF-2013"),
        (PERF_2016_PATH, "CASF-2016"),
        (PERF_2019_PATH, "CASF-2019"),
        (PERF_2020_PATH, "CASF-2020"),
    )
)

# Rows labelled "[Original Input]" (orig_input=True), same benchmark years.
oi_perf_2007, oi_perf_2013, oi_perf_2016, oi_perf_2019, oi_perf_2020 = (
    create_perf_dataframe(csv_path, benchmark, True)
    for csv_path, benchmark in (
        (OI_PERF_2007_PATH, "CASF-2007"),
        (OI_PERF_2013_PATH, "CASF-2013"),
        (OI_PERF_2016_PATH, "CASF-2016"),
        (OI_PERF_2019_PATH, "CASF-2019"),
        (OI_PERF_2020_PATH, "CASF-2020"),
    )
)

# Reproduction rows first, then the original-input rows.
perf_dfs = [
    perf_2007,
    perf_2013,
    perf_2016,
    perf_2019,
    perf_2020,
    oi_perf_2007,
    oi_perf_2013,
    oi_perf_2016,
    oi_perf_2019,
    oi_perf_2020,
]
for perf_df in perf_dfs:
    print(perf_df)

In [None]:
# Table 1
#
# Combines hard-coded reference values ("[Original Data]") with the rows in
# `perf_dfs` and prints one markdown table, all PersonR columns first and all
# RMSE columns after them.

TABLE_1_COLS = ["RDKit", "Amino count", "RDKit+amino"]
TABLE_1_DATA = {
    "Benchmark": [
        "CASF-2007 [Original Data]",
        "CASF-2013 [Original Data]",
        "CASF-2016 [Original Data]",
        "CASF-2019 [Original Data]",
        "CASF-2020 [Original Data]",
    ],
    # Values taken from Table 1 and Table 3.
    # Each entry is a (PearsonR, RMSE) pair, one per benchmark above.
    # NOTE(review): the (0, 0) pairs for CASF-2020 look like placeholders for
    # values that are not available; they are rendered as 0.000 in the table
    # and as zero-height bars in the plot. Confirm this is intended.
    TABLE_1_COLS[0]: [
        (0.713, 1.730),
        (0.675, 1.700),
        (0.715, 1.551),
        (0.677, 1.449),
        (0, 0),
    ],
    TABLE_1_COLS[1]: [
        (0.775, 1.583),
        (0.646, 1.722),
        (0.706, 1.543),
        (0.728, 1.348),
        (0, 0),
    ],
    TABLE_1_COLS[2]: [
        (0.832, 1.365),
        (0.777, 1.480),
        (0.844, 1.233),
        (0.779, 1.233),
        (0, 0),
    ],
}

# Expand every (PearsonR, RMSE) pair into two flat columns in one step, using
# the same "<prefix> PersonR" / "<prefix> RMSE" headers as the rows in
# `perf_dfs`, so the concat below aligns on column names.
paper_perf = pd.DataFrame(
    {
        f"{col} {metric}": [pair[pos] for pair in TABLE_1_DATA[col]]
        for col in TABLE_1_COLS
        for pos, metric in enumerate(("PersonR", "RMSE"))
    },
    index=pd.Index(TABLE_1_DATA["Benchmark"], name="Benchmark"),
)

# Sorting by label groups the rows of each benchmark year together.
df = pd.concat([paper_perf, *perf_dfs]).sort_index()

# Select the metric groups by header suffix rather than by column position,
# so the grouping does not depend on the columns alternating.
personR = [col for col in df.columns if col.endswith(" PersonR")]
rmse = [col for col in df.columns if col.endswith(" RMSE")]

df = df[personR + rmse]

print(df.to_markdown(floatfmt=".3f"))

In [None]:
# Plot 1
#
# One subplot per benchmark year; within a subplot, one group of bars per
# metric column of `df` and one bar per row (variant) of that year.

# Distinct four-digit years found in the row labels; labels without a year
# are ignored instead of producing a NaN "year".
years = df.index.to_series().str.extract(r"(\d{4})")[0].dropna().unique()

# Rows of `df` whose label contains the given year.
dfs = {year: df.filter(like=year, axis=0) for year in years}

BAR_COLORS = ["lightblue", "cornflowerblue", "royalblue"]
BAR_WIDTH = 0.2

# squeeze=False always returns a 2-D array of axes, so indexing also works
# when there is only a single year.
fig, axs = plt.subplots(
    1, len(years), figsize=(10, 5), sharey=True, squeeze=False
)
axs = axs[0]

for ax, year in zip(axs, years):
    year_data = dfs[year]
    metrics = range(len(year_data.columns))
    n_rows = len(year_data)

    for y in range(n_rows):
        # Shift each row's bars by one bar width within the metric group.
        pos = [m + y * BAR_WIDTH for m in metrics]
        values = year_data.iloc[y].values
        # Strip this subplot's own "CASF-<year> " prefix so only the variant
        # (e.g. "[Reproduction]") remains as the legend label.
        label = year_data.index[y].replace(f"CASF-{year} ", "")
        ax.bar(
            pos,
            values,
            width=BAR_WIDTH,
            label=label,
            # Cycle the palette so more rows than colors cannot raise.
            color=BAR_COLORS[y % len(BAR_COLORS)],
        )

    ax.set_title(year)
    # Put each tick at the center of its group of bars.
    ax.set_xticks([m + BAR_WIDTH * (n_rows - 1) / 2 for m in metrics])
    ax.set_xticklabels(year_data.columns, rotation=90)

axs[0].set_ylabel("Metric value")
# A single legend, attached to the last subplot and placed outside it.
axs[-1].legend(bbox_to_anchor=(1.05, 1), loc="upper left")
plt.tight_layout()
plt.show()