In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from rdkit import Chem
from rdkit.Chem import Draw

import polaris as po
from src.data import PolarisDataset
from src.polaris import Polaris
from src.utils import scaffold_split

# Enable IPython's autoreload extension in mode 2, so imported modules
# (e.g. the local src.* helpers) are re-imported before each cell executes.
%load_ext autoreload
%autoreload 2

# set seaborn theme
# (applies seaborn's default styling to every matplotlib figure created below)
sns.set_theme()

# Data Evaluation

## Potency

### Datasets

Show the label distribution for Train Polaris, Train Scaffold, and Test Scaffold.
Also show the number of duplicate entries.



In [None]:
# Potency task configuration. Alternative target: "pIC50 (SARS-CoV-2 Mpro)".
task = "potency"
target_task = "pIC50 (MERS-CoV Mpro)"

# Datasets are stored under ./data/polaris/<task>.
root = Path("./data", "polaris", task)

# Official Polaris train/test datasets, loaded without log transform.
train_polaris = PolarisDataset(
    root=root,
    task=task,
    target_task=target_task,
    train=True,
    log_transform=False,
)
test_polaris = PolarisDataset(
    root=root,
    task=task,
    target_task=target_task,
    train=False,
    log_transform=False,
)

# Hold out 10% of the Polaris training data via the project's scaffold split.
train_scaffold, test_scaffold = scaffold_split(
    dataset=train_polaris,
    test_size=0.1,
)

In [None]:
# Label distribution (histogram + KDE) for each split. sns.displot is a
# figure-level function, so every call creates a new figure and the following
# plt.title applies to that figure.
train_plot = sns.displot(data=train_polaris.y, kde=True, legend=False)
train_plot.set_xlabels(target_task)
plt.title("Train Polaris")

train_scaffold_plot = sns.displot(data=train_scaffold.y, kde=True, legend=False)
train_scaffold_plot.set_xlabels(target_task)
plt.title("Train Scaffold")

test_scaffold_plot = sns.displot(data=test_scaffold.y, kde=True, legend=False)
test_scaffold_plot.set_xlabels(target_task)
plt.title("Test Scaffold")

### Inference

In [None]:
# Load the GNN and ECFP result tables and stack them into a single frame
# with a fresh 0..n-1 index and pandas' best-fitting nullable dtypes.
df_potency_gnn = pd.read_csv("./results/polaris/potency_gnn_results.csv")
df_potency_ecfp = pd.read_csv("./results/polaris/potency_ecfp_results.csv")
df_potency_test = (
    pd.concat([df_potency_gnn, df_potency_ecfp])
    .reset_index(drop=True)
    .convert_dtypes()
)
df_potency_test

In [None]:
# For every (target_task, repr_model) pair keep the single row with the
# lowest mae_test_scaffold, then renumber the rows from 0.
df_best_potency = (
    df_potency_test
    .loc[df_potency_test.groupby(['target_task', 'repr_model'])['mae_test_scaffold'].idxmin()]
    .reset_index(drop=True)
)

In [None]:
# Bar plot of the best scaffold-test MAE per representation model,
# with one bar colour per target task.
df_best_potency_hist = df_best_potency[
    ["target_task", "repr_model", "mae_test_scaffold"]
]
g_potency = sns.catplot(
    data=df_best_potency_hist,
    x="repr_model",
    y="mae_test_scaffold",
    hue="target_task",
    kind="bar",
    aspect=1.5,
)

### Submission

In [None]:
# Pick best model for each task (lowest scaffold-test MAE).
# idxmin() returns index *labels*, so the rows are selected with label-based
# .loc; positional .iloc only worked because df_best_potency happens to carry
# a fresh 0..n-1 index.
idx = df_best_potency.groupby("target_task")["mae_test_scaffold"].idxmin().tolist()
potency_submission_models_list = df_best_potency.loc[idx].to_dict("records")

# Retrain each selected configuration on the full Polaris training set and
# predict the Polaris test set, keyed by target task.
y_pred = {}
for model in potency_submission_models_list:
    target_task = model["target_task"]
    polaris = Polaris(model)
    polaris.train_final(polaris.train_polaris)
    preds = polaris.predict(polaris.test_polaris)
    y_pred[target_task] = preds

# Keep only element 1 of every prediction tuple
# (presumably the predicted value; confirm against Polaris.predict).
y_submission = {k: [tup[1] for tup in v] for k, v in y_pred.items()}

sns.displot(y_submission)

In [None]:
# Retrain one selected model several times and record the MAE on the unblinded
# Polaris test labels, to gauge run-to-run variation.
#
# NOTE: this cell reads `df_potency_test_polaris`, which is built in the
# "Unblinded Dataset" section further down; run those cells first.
#
# Predictions are kept in loop-local names so the global `y_pred` and
# `y_submission` dictionaries (used for the MAE check and the competition
# submission) are not overwritten with a single task.
model = potency_submission_models_list[1]
target_task = model["target_task"]

# Despite its name this list holds mean absolute errors; the name is kept so
# other cells that read `mse_list` keep working.
mse_list = []
for _ in range(2):
    polaris = Polaris(model)
    polaris.train_final(polaris.train_polaris)
    preds = polaris.predict(polaris.test_polaris)

    # Element 1 of every prediction tuple
    # (presumably the predicted value; confirm against Polaris.predict).
    run_predictions = [tup[1] for tup in preds]
    y_true = df_potency_test_polaris[target_task]

    # Mean absolute error; np.mean on a pandas Series skips NaN labels.
    mae = np.mean(np.abs(y_true - run_predictions))
    mse_list.append(mae)

mse_list

In [None]:
import matplotlib.pyplot as plt
# Box plot of the per-run test MAE values.
# `mae_total` is not defined anywhere in this notebook; the per-run MAE values
# are collected in `mse_list` by the repeated-training cell above.
t = mse_list
print(t)
plt.figure(figsize=(3, 6))
plt.boxplot(t)

### Unblinded Dataset

In [None]:
# Load the unblinded potency dataset through polaris. Later cells read its
# "Set", "CXSMILES" and pIC50 fields to build the test-label frame.
potency_dataset = po.load_dataset("asap-discovery/antiviral-potency-2025-unblinded")

In [None]:
# Dataset indices whose two pIC50 labels are replaced by NaN in the test frame.
filtered_idxs = [1036, 1039, 1219, 1225, 1306]

# Collect [smiles, MERS pIC50, SARS pIC50] for every row marked as "Test".
data = []
for i in range(len(potency_dataset)):
    item: dict = potency_dataset[i]  # type: ignore
    if item["Set"] != "Test":
        continue
    smiles = item["CXSMILES"]
    mers = item["pIC50 (MERS-CoV Mpro)"]
    sars = item["pIC50 (SARS-CoV-2 Mpro)"]
    if i in filtered_idxs:
        mers, sars = np.nan, np.nan
    data.append([smiles, mers, sars])

df_potency_test_polaris = pd.DataFrame(data).set_axis(
    ["smiles", "pIC50 (MERS-CoV Mpro)", "pIC50 (SARS-CoV-2 Mpro)"],
    axis="columns",
)

In [None]:
# Label distribution (histogram + KDE) of the unblinded Polaris test set.
target_task = "pIC50 (MERS-CoV Mpro)"
test_polaris_plot = sns.displot(
    data=df_potency_test_polaris[target_task],
    kde=True,
    legend=False,
)
test_polaris_plot.set_xlabels(target_task)
plt.title("Test Polaris")

In [None]:
# Calculate MAE on Test Polaris Dataset
# Ground truth comes from the unblinded test frame `df_potency_test_polaris`;
# `df_potency_test` is the hyper-parameter results table and is not the
# source of test labels.
y_true = df_potency_test_polaris["pIC50 (MERS-CoV Mpro)"]
y_submitted = y_submission["pIC50 (MERS-CoV Mpro)"]

# np.mean on a pandas Series skips the NaN labels of filtered-out rows.
mae = np.mean(np.abs(y_true - y_submitted))
mae

### Submission

In [None]:
# Upload the potency predictions to the Polaris competition.
competition = po.load_competition("asap-discovery/antiviral-potency-2025")

competition.submit_predictions(
    predictions=y_submission,
    prediction_name="test_2",
    prediction_owner="aehrlich",
    report_url="https://www.example.com",
    description="Second submission",
)

## Admet

### Datasets

In [None]:
# ADMET task configuration.
task = "admet"
target_task = "KSOL"

# Datasets are stored under ./data/polaris/<task>.
root = Path("./data", "polaris", task)

# Official Polaris train/test datasets, loaded with log_transform enabled.
train_polaris = PolarisDataset(
    root=root,
    task=task,
    target_task=target_task,
    train=True,
    log_transform=True,
)
test_polaris = PolarisDataset(
    root=root,
    task=task,
    target_task=target_task,
    train=False,
    log_transform=True,
)

# Hold out 10% of the Polaris training data via the project's scaffold split.
train_scaffold, test_scaffold = scaffold_split(
    dataset=train_polaris,
    test_size=0.1,
)

In [None]:
# Label distribution (histogram + KDE) for each ADMET split. Every displot
# call creates a new figure, which the following plt.title then labels.
admet_train_plot = sns.displot(data=train_polaris.y, kde=True, legend=False)
admet_train_plot.set_xlabels(target_task)
plt.title("Train Polaris")

admet_train_scaffold_plot = sns.displot(data=train_scaffold.y, kde=True, legend=False)
admet_train_scaffold_plot.set_xlabels(target_task)
plt.title("Train Scaffold")

admet_test_scaffold_plot = sns.displot(data=test_scaffold.y, kde=True, legend=False)
admet_test_scaffold_plot.set_xlabels(target_task)
plt.title("Test Scaffold")

### Inference

In [None]:
# Load the GNN and ECFP result tables and stack them into a single frame
# with a fresh 0..n-1 index and pandas' best-fitting nullable dtypes.
df_admet_gnn = pd.read_csv("./results/polaris/admet_gnn_results.csv")
df_admet_ecfp = pd.read_csv("./results/polaris/admet_ecfp_results.csv")
df_admet = (
    pd.concat([df_admet_gnn, df_admet_ecfp])
    .reset_index(drop=True)
    .convert_dtypes()
)
df_admet

In [None]:
# For every (target_task, repr_model) pair keep the single row with the
# lowest mae_test_scaffold, then renumber the rows from 0.
df_best_admet = (
    df_admet
    .loc[df_admet.groupby(['target_task', 'repr_model'])['mae_test_scaffold'].idxmin()]
    .reset_index(drop=True)
)
df_best_admet

In [None]:
# Bar plot of the best scaffold-test MAE per representation model,
# with one bar colour per target task.
df_best_admet_hist = df_best_admet[
    ["target_task", "repr_model", "mae_test_scaffold"]
]
g_admet = sns.catplot(
    data=df_best_admet_hist,
    x="repr_model",
    y="mae_test_scaffold",
    hue="target_task",
    kind="bar",
    aspect=1.5,
)

### Submission

In [None]:
# Extract the best model (lowest scaffold-test MAE) for each target_task.
# idxmin() returns index *labels*, so the rows are selected with label-based
# .loc; positional .iloc only worked because df_best_admet happens to carry a
# fresh 0..n-1 index.
idx = df_best_admet.groupby("target_task")["mae_test_scaffold"].idxmin().tolist()
admet_submission_models_list = df_best_admet.loc[idx].to_dict("records")

# Retrain each selected configuration on the full Polaris training set and
# predict the Polaris test set, keyed by target task.
y_pred = {}
for model in admet_submission_models_list:
    target_task = model["target_task"]
    # Override the stored configuration value for the final training run.
    model["final_avg_epochs"] = 200
    polaris = Polaris(model)
    polaris.train_final(polaris.train_polaris)
    preds = polaris.predict(polaris.test_polaris)
    y_pred[target_task] = preds

# Keep only element 1 of every prediction tuple
# (presumably the predicted value; confirm against Polaris.predict).
y_submission_admet = {k: [tup[1] for tup in v] for k, v in y_pred.items()}

sns.displot(y_submission_admet)

In [None]:
# Upload the ADMET predictions to the Polaris competition.
competition = po.load_competition("asap-discovery/antiviral-admet-2025")

competition.submit_predictions(
    predictions=y_submission_admet,
    prediction_name="cv-5",
    prediction_owner="aehrlich",
    report_url="https://www.example.com",
    description="Second submission",
)

### Unblinded

In [None]:
admet_dataset = po.load_dataset("asap-discovery/antiviral-admet-2025-unblinded")

# Dataset indices whose HLM / MLM label is replaced by NaN in the test frame.
hlm_filter_idx = [519, 524, 547]
mlm_filter_idx = [515, 518, 521, 524, 525]

# Collect one row of labels for every dataset entry marked as "Test".
data = []
for i in range(len(admet_dataset)):
    item: dict = admet_dataset[i]  # type: ignore
    if item["Set"] != "Test":
        continue
    smiles = item["CXSMILES"]

    # Take the log10 of all values apart from LogD.
    hlm = np.log10(item["HLM"])
    ksol = np.log10(item["KSOL"])
    logd = item["LogD"]
    mdr1 = np.log10(item["MDR1-MDCKII"])
    mlm = np.log10(item["MLM"])

    # Blank out the filtered measurements.
    if i in hlm_filter_idx:
        hlm = np.nan
    if i in mlm_filter_idx:
        mlm = np.nan

    data.append([hlm, ksol, logd, mdr1, mlm])

df_admet_test_polaris = pd.DataFrame(data).set_axis(
    ['HLM', 'KSOL', 'LogD', 'MDR1-MDCKII', 'MLM'],
    axis="columns",
)

df_admet_test_polaris

In [None]:
# Label distribution (histogram + KDE) of one endpoint in the unblinded test set.
target_task = "LogD"
sns.displot(
    data=df_admet_test_polaris[target_task],
    kde=True,
    legend=False,
)

In [None]:
# Mean absolute error per task between the unblinded test labels and the
# submitted predictions; NaN labels are skipped when averaging.
df_y_submission_admet = pd.DataFrame(y_submission_admet)
mae_submission = (
    (df_admet_test_polaris - df_y_submission_admet)
    .abs()
    .mean(skipna=True)
)
mae_submission


In [None]:
# Per-task MAE of this notebook's submission.
submission_dict = mae_submission.to_dict()

# Hard-coded reference MAE values per task for the "Best" and "Baseline" rows.
# NOTE(review): the source of these numbers is not recorded in the notebook.
best_dict = {
    'MLM': 0.317,
    'HLM': 0.275,
    'MDR1-MDCKII': 0.151,
    'KSOL': 0.101,
    'LogD': 0.254,
}
baseline_dict = {
    'MLM': 0.416,
    'HLM': 0.438,
    'MDR1-MDCKII': 0.519,
    'KSOL': 0.234,
    'LogD': 0.545,
}

# One row per result type, one column per task.
df_plot = pd.DataFrame(
    [submission_dict, best_dict, baseline_dict],
    index=['Submission', 'Best', 'Baseline'],
)

In [None]:
# Reshape the comparison table to long format (one row per result type and
# task) and draw a grouped bar plot of the test MAE.
df = (
    df_plot
    .reset_index()
    .melt(id_vars="index", var_name="Dataset", value_name="MAE Test Polaris")
    .rename(columns={"index": "Type"})
)
sns.set_theme()
g = sns.catplot(
    data=df,
    x="Dataset",
    y="MAE Test Polaris",
    hue="Type",
    kind="bar",
    height=6,
    aspect=1.5,
)
plt.title("Performance across Datasets")
plt.show()