In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score, roc_curve
import seaborn as sns
import matplotlib.pyplot as plt
from artifact_detector_model import MARKER_NAMES
plt.rcParams["font.family"] = "Serif"
from sklearn.metrics import confusion_matrix

In [2]:
# Per-image artefact table: keep only the image path and the marker columns
# listed in MARKER_NAMES.
marker_columns = ["image_path"] + MARKER_NAMES
df_markers = pd.read_csv("predicted_all_embed.csv")[marker_columns]

In [None]:
# Where the evaluated model's predictions live (read in a later cell).
output_dir = "output/non-negative-baseline/version_0"

df_main = pd.read_csv("/vol/biomedic3/data/EMBED/tables/mammo-net-csv/embed-non-negative.csv")
print(len(df_main))

# Both tables are matched on the file name, i.e. the last "/"-separated
# component of the image path.
df_main["image_id"] = df_main["image_path"].map(
    lambda path: path.rsplit("/", 1)[-1]
)
df_markers["image_id"] = df_markers["image_path"].map(
    lambda path: path.rsplit("/", 1)[-1]
)

# Drop the main table's own path column so it does not take part in the merge;
# with no explicit key, pandas joins on every column the two frames share.
df_main = df_main.drop(columns="image_path")
df_full = df_main.merge(df_markers, how="inner")
print(len(df_full))

# Keep only rows whose "compression" marker is 0.
df_full = df_full.loc[df_full["compression"].eq(0)]
print(len(df_full))

In [5]:
# Load the model predictions and derive the same file-name key used for the
# marker tables (last "/"-separated component of the path).
predictions = pd.read_csv(f"{output_dir}/predictions.csv")
predictions["image_id"] = predictions["image_path"].map(
    lambda path: path.rsplit("/", 1)[-1]
)

# Parallel arrays, all in the row order of `predictions`.
img_ids, preds, targets = (
    predictions[column].values for column in ("image_id", "probability", "label")
)

In [None]:
def _subgroup(marker_mask):
    """Return ``(image_ids, positions)`` for evaluated images matching ``marker_mask``.

    Parameters
    ----------
    marker_mask : boolean Series aligned with ``df_full``
        Row filter describing the subgroup (e.g. ``df_full["devices"] == 1``).

    Returns
    -------
    image_ids : pandas.Series
        ``image_id`` of the ``df_full`` rows that satisfy ``marker_mask`` and
        also occur in ``img_ids``.
    positions : numpy.ndarray
        Integer positions into ``img_ids`` (and hence into ``preds`` and
        ``targets``, which share its row order) of the images in ``image_ids``.
    """
    image_ids = df_full.loc[df_full["image_id"].isin(img_ids) & marker_mask, "image_id"]
    # Hash-based membership test: one pass over img_ids instead of scanning the
    # whole id array once per image in a Python loop.
    positions = np.flatnonzero(pd.Index(img_ids).isin(image_ids))
    return image_ids, positions


circle_image_id, circle_idx = _subgroup(df_full["circle marker"] == 1)
triangle_image_id, triangle_idx = _subgroup(df_full["triangle marker"] == 1)
pacemaker_image_id, pacemaker_idx = _subgroup(df_full["devices"] == 1)
breast_implant_image_id, breast_implant_idx = _subgroup(df_full["breast implant"] == 1)
# NOTE: df_full is filtered to compression == 0 when it is built, so this
# subgroup is expected to be empty unless that filter is removed.
compression_image_id, compression_idx = _subgroup(df_full["compression"] == 1)
# "Normal" images carry none of the five artefact flags.
normal_image_id, normal_idx = _subgroup(
    (df_full["compression"] == 0)
    & (df_full["devices"] == 0)
    & (df_full["circle marker"] == 0)
    & (df_full["triangle marker"] == 0)
    & (df_full["breast implant"] == 0)
)
circle_idx.shape, triangle_idx.shape, pacemaker_idx.shape, breast_implant_idx.shape, compression_idx.shape, normal_idx.shape

In [None]:
# ROC-AUC over every evaluated image, then per artefact subgroup together with
# the subgroup's positive count and size.
print(f"All: {roc_auc_score(targets, preds):.3f}")

subgroup_rows = [
    ("Normal images", normal_idx),
    ("Images with circle", circle_idx),
    ("Images with triangle", triangle_idx),
    ("Images with implant", breast_implant_idx),
    ("Pacemaker", pacemaker_idx),
]
for row_label, row_idx in subgroup_rows:
    row_targets = targets[row_idx]
    row_preds = preds[row_idx]
    print(
        f"{row_label}: {roc_auc_score(row_targets, row_preds):.3f} ({row_targets.sum()} positives out of {row_targets.shape[0]})"
    )

In [9]:
# Attach the marker flags to every prediction row, matched on the file name.
predictions_full = predictions.merge(df_markers, on="image_id")

# "no marker" is 1 only when all four artefact flags below are 0, else 0.
artefact_columns = ["circle marker", "triangle marker", "breast implant", "devices"]
predictions_full["no marker"] = (
    predictions_full[artefact_columns].eq(0).all(axis=1).astype(int)
)

In [None]:
# One figure per marker type (all entries of MARKER_NAMES except the last):
# model-output distributions for images with that artefact versus images with
# no artefact, drawn separately for label 0 (left) and label 1 (right).
for marker_col in MARKER_NAMES[:-1]:
    f, ax = plt.subplots(1, 2, figsize=(7, 3))

    # The comparison frame does not depend on the label, so build it once per
    # marker. `.copy()` makes it an independent frame: the column assignments
    # below would otherwise write into a slice of `predictions_full` and raise
    # SettingWithCopyWarning.
    df = predictions_full.loc[
        (predictions_full["no marker"] == 1) | (predictions_full[marker_col] == 1)
    ].copy()
    df.loc[df[marker_col] == 1, 'Artefact'] = marker_col.capitalize()
    df.loc[df[marker_col] == 0, 'Artefact'] = 'No artefact'
    df['Model output'] = df['probability']

    for i in range(2):
        g = sns.histplot(
            # Sorting by the marker flag puts the flag-0 rows first, which
            # fixes the order in which the hue levels are encountered.
            data=df.loc[df.label == i].sort_values(by=marker_col),
            x='Model output',
            hue='Artefact',
            common_norm=False,  # normalise each hue group on its own
            stat="density",
            ax=ax[i],
            bins=25,
            kde=True,
        )
        # seaborn may not create a legend (e.g. nothing to plot for this
        # label); only strip the legend title when a legend exists.
        if g.legend_ is not None:
            g.legend_.set_title(None)
        # Panel titles are only drawn on the triangle-marker figure.
        if marker_col == 'triangle marker':
            ax[i].set_title("Cancer screening status:\nno finding" if i == 0 else "Cancer screening status:\nnon negative")
        if i == 1:
            ax[i].set_ylabel("")
    plt.savefig(f"output/distribution_cancer_{marker_col}.pdf", bbox_inches="tight")
    plt.show()
    # Release the figure so repeated iterations do not accumulate open figures.
    plt.close(f)

In [None]:
def get_op_threshold(true, pred, operating_point="diag"):
    """Pick a decision threshold from the ROC curve of ``pred`` against ``true``.

    Parameters
    ----------
    true : array-like
        Ground-truth labels, passed straight to ``roc_curve``.
    pred : array-like
        Scores, passed straight to ``roc_curve``.
    operating_point : {"diag", "spec90"}
        ``"diag"`` selects the ROC point where TPR is closest to ``1 - FPR``;
        ``"spec90"`` selects the ROC point whose FPR is closest to 0.10.

    Returns
    -------
    The entry of the ``roc_curve`` threshold array at the selected point.

    Raises
    ------
    ValueError
        If ``operating_point`` is neither ``"diag"`` nor ``"spec90"``.
    """
    fpr, tpr, threshold = roc_curve(true, pred)
    if operating_point not in ("diag", "spec90"):
        raise ValueError("Operating point has to be diag, spec90")
    if operating_point == "diag":
        distance = np.abs(tpr - (1 - fpr))
    else:
        distance = np.abs(fpr - 0.10)
    return threshold[np.argmin(distance)]

In [None]:
# Single threshold derived from all evaluated images at the "diag" operating
# point; it is reused unchanged for every subgroup below.
op_global = get_op_threshold(true=targets, pred=preds, operating_point="diag")

In [None]:

# Row-normalised confusion matrices at the global operating point: one panel
# for all images and one per artefact subgroup, in a 2x3 grid.
plt.rcParams["font.family"] = "Serif"
f, ax = plt.subplots(2, 3, figsize=(8, 5), facecolor="none")
f.subplots_adjust(hspace=0.4, wspace=0.3)
ax = ax.ravel()
# key -> [positions into targets/preds, panel title]
results = {
    'all': [np.arange(targets.shape[0]), "All images"],
    'no markers': [normal_idx, "No markers"],
    'circle': [circle_idx, "Circle markers"],
    'triangles': [triangle_idx, "Triangle markers"],
    'implants': [breast_implant_idx, "Breast implants"],
    'devices': [pacemaker_idx, "Devices"]
}
for i, (select_idx, title) in enumerate(results.values()):
    # roc_curve defines each threshold for `score >= threshold`, so the
    # comparison must be inclusive; a strict `>` would flip the samples that
    # score exactly at the chosen operating point.
    predicted_positive = (preds[select_idx] >= op_global).astype(int)
    # labels=[0, 1] keeps the matrix 2x2 even when a subgroup contains a single
    # class (assumes binary 0/1 targets, as the 2x2 reshape below already does).
    cf_matrix = confusion_matrix(targets[select_idx], predicted_positive, labels=[0, 1])
    # Normalise each true-label row to fractions. A row with no samples yields
    # NaN here rather than a misleading percentage.
    cf_normalised = cf_matrix / np.sum(cf_matrix, 1, keepdims=True)
    group_counts = [f"{value:0.0f}" for value in cf_matrix.flatten()]
    group_percentages = [f"{value:.2%}" for value in cf_normalised.flatten()]
    labels = [f"{v3}\n(N={v2})" for v2, v3 in zip(group_counts, group_percentages)]
    labels = np.asarray(labels).reshape(2, 2)
    print(labels)
    sns.heatmap(
        cf_normalised,
        annot=labels,
        fmt="",
        cmap="Blues",
        ax=ax[i],
        vmin=0,
        vmax=1,
        cbar=False,
    )
    # Bold every word of the title separately through mathtext.
    ax[i].set_title(' '.join([r'$\bf{' + t +'}$' for t in title.split(' ')]), fontsize=14)

# Axis labels only on the outer edge of the grid: x labels on the bottom row,
# y labels on the left column.
for i in range(3):
    ax[i].set_xlabel('')
for i in [1, 2, 4, 5]:
    ax[i].set_ylabel('')
for i in [0, 3]:
    ax[i].set_ylabel('True label', fontsize=14)
for i in [3, 4, 5]:
    ax[i].set_xlabel('Predicted label', fontsize=14)

plt.savefig("output/confusion_cancer.pdf", bbox_inches="tight")