### Author: Ally Sprik
### Last-updated: 25-02-2024

The goal of this notebook is to generate results for my paper, specifically a comparison between the original model and the model with basic changes. The results are presented as ROC curves, and the AUC is calculated for each curve. The data used for this comparison are the Brno and Tubingen data sets. The cleaned data and the fitted models are loaded, and the fitted models are used to generate the probabilities for LNM and Survival. These probabilities are then used to generate the ROC curves and to calculate the AUC.

In [None]:
import pandas as pd
import numpy as np
import pyAgrum as gum
import pyAgrum.lib.notebook as gnb
from collections import Counter

# Load the two Bayesian networks that are compared throughout this notebook.
# NOTE(review): judging by the file names, the first is the original fitted
# Casper network and the second the network with the changes - confirm.
origineel_net = gum.loadBN("../0.3. Original_Casper_files/Results/Casper_fitted_763.net")
WOPlat = gum.loadBN("../3. Model/Fitted_Networks/R_WP_original_763.net")

# Load the Brno and Tubingen data sets that provide the evidence and the targets below.
df_brno = pd.read_csv("../0.1. Cleaned_data/Cleaned_Brno_model_complete.csv")
df_tubingen = pd.read_csv("../0.1. Cleaned_data/Tubingen_Validation_wMSI.csv")

In [None]:
# Get the SVG code by using gnb.getBN, for more customisability
#gnb.showBN(origineel_net)
#gnb.getBN(origineel_net)

In [None]:
# Build one frame per endpoint, keeping only the rows where that endpoint is
# recorded. The frames are separate so that a missing LNM value does not
# discard a row with a usable Survival5yr value (and vice versa).
#
# The "yes"/"no" labels are mapped to 1/0 and then cast to int explicitly:
# relying on `replace` to downcast the object column is deprecated in pandas,
# and an object-dtype target is not a valid binary target for the metrics below.
df_brno_LNM = df_brno.dropna(subset=["LNM"])
df_brno_surv = df_brno.dropna(subset=["Survival5yr"])
LNM_target_brno = df_brno_LNM["LNM"].replace({"yes": 1, "no": 0}).astype(int)
Surv_target_brno = df_brno_surv["Survival5yr"].replace({"yes": 1, "no": 0}).astype(int)

df_tubingen_LNM = df_tubingen.dropna(subset=["LNM"])
df_tubingen_surv = df_tubingen.dropna(subset=["Survival5yr"])
LNM_target_tubingen = df_tubingen_LNM["LNM"].replace({"yes": 1, "no": 0}).astype(int)
Surv_target_tubingen = df_tubingen_surv["Survival5yr"].replace({"yes": 1, "no": 0}).astype(int)

In [None]:
def _collectPosteriors(engine, evidence, target, label):
    """Return one posterior of ``target`` per row of ``evidence``.

    Parameters
    ----------
    engine : inference engine exposing ``setEvidence``, ``makeInference``,
        ``posterior`` and ``eraseAllEvidence``.
    evidence : pandas.DataFrame
        One row per case; missing values are not entered as evidence.
    target : str
        Name of the variable whose posterior is requested.
    label : str
        Human-readable endpoint name used in the error message.

    Returns
    -------
    list
        Exactly ``len(evidence)`` posteriors, in row order.
    """
    posteriors = []
    prior = None  # evidence-free posterior, computed lazily on the first failure

    for j in range(len(evidence)):
        evidencerow = evidence.iloc[j].dropna().to_dict()

        try:
            engine.setEvidence(evidencerow)
            engine.makeInference()
            posteriors.append(engine.posterior(target))
        except Exception as error:
            print(f"Error at row regarding {label}", j)
            print(error)

            # Keep the output aligned with the targets, but never reuse the
            # previous row's posterior: that would attribute another case's
            # prediction to this row (and is undefined when row 0 fails).
            # Fall back to the posterior without any evidence instead.
            if prior is None:
                engine.eraseAllEvidence()
                engine.makeInference()
                prior = engine.posterior(target)
            posteriors.append(prior)

    return posteriors


def getProbabilities(model, evidence_LNM, evidence_Surv, Surv="Survival5yr", samples=100):
    """Compute per-row posteriors for LNM and for the survival variable.

    Parameters
    ----------
    model : network passed to ``gum.LazyPropagation``.
    evidence_LNM : pandas.DataFrame
        Evidence rows used for the "LNM" posterior.
    evidence_Surv : pandas.DataFrame
        Evidence rows used for the survival posterior.
    Surv : str, default "Survival5yr"
        Name of the survival variable to query.
    samples : int, default 100
        Unused; kept so existing calls remain valid.

    Returns
    -------
    tuple(list, list)
        ``(resultsLNM, resultsSurvival)`` with one posterior per evidence row.
        Rows whose inference raised are reported on stdout and filled with the
        evidence-free posterior of the respective variable.
    """
    net = gum.LazyPropagation(model)
    net.setNumberOfThreads(10)

    resultsLNM = _collectPosteriors(net, evidence_LNM, "LNM", "LNM")
    resultsSurvival = _collectPosteriors(net, evidence_Surv, Surv, "Survival")

    return resultsLNM, resultsSurvival


def getProbResults(results, target):
    """Pick entry ``target`` out of every item in ``results``.

    Returns a DataFrame with one row per item of ``results``, in the same order.
    """
    picked = [entry[target] for entry in results]
    return pd.DataFrame(picked)




In [None]:
from sklearn.metrics import roc_curve, f1_score
# Find the accuracy, roc auc, precision and recall for the results and the targets data
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, confusion_matrix, log_loss, \
    brier_score_loss


def getSlimMetrics(results, targets):
    """Summarise predicted probabilities against binary targets.

    Parameters
    ----------
    results : DataFrame with a single column, Series, list or array
        Predicted probability of the positive class, one value per case.
    targets : Series, list or array
        Observed outcome per case, encoded as 0/1.

    Returns
    -------
    pandas.DataFrame
        Single column (label 0) indexed by "ROC AUC", "Log Loss", "Brier",
        "N Predicted/N Observed" and "Ratio". The numeric scores are rounded
        to 4 decimals; "Ratio" is NaN when no positive case was observed.

    Raises
    ------
    ValueError
        If ``results`` and ``targets`` do not describe the same number of cases.
    """
    if len(results) != len(targets):
        raise ValueError(
            f"Results and targets are not the same length. "
            f"Results: {len(results)} Targets: {len(targets)}"
        )

    # Flatten to 1-D so a one-column DataFrame, a Series and a plain array are
    # all handled identically (the column label no longer matters).
    probs = np.asarray(results, dtype=float).ravel()
    truth = np.asarray(targets).ravel().astype(int)
    if probs.shape[0] != truth.shape[0]:
        raise ValueError(
            f"Results and targets are not the same length. "
            f"Results: {probs.shape[0]} Targets: {truth.shape[0]}"
        )

    # Get ROC AUC, Log Loss, Brier, and N Predicted/N Observed
    ROC = round(roc_auc_score(truth, probs), 4)
    LL = round(log_loss(truth, probs), 4)
    Brier = round(brier_score_loss(truth, probs), 4)
    N_pred = probs.sum()  # expected number of positives = sum of probabilities
    N_obs = int(truth.sum())
    N_br = f"{int(N_pred)}/{int(N_obs)}"
    Ratio = round(N_pred / N_obs, 4) if N_obs else float("nan")

    x = pd.DataFrame([ROC, LL, Brier, N_br, Ratio],
                     index=["ROC AUC", "Log Loss", "Brier", "N Predicted/N Observed", "Ratio"])
    return x

In [None]:
# Candidate evidence variables for this run
evidence_columns = [
    "ER", "PR", "p53", "L1CAM", "CA125", "Platelets",
    "Cytology", "MRI_MI", "MSI", "POLE", "PreoperativeGrade",
]

or_net_nodes = origineel_net.names()
wo_net_nodes = WOPlat.names()

# For every (network, cohort) pair keep only the candidates that are both a
# column of the cohort's data and a node of the network; the order of
# evidence_columns is preserved.
evi_or_net_brno = [col for col in evidence_columns if col in df_brno.columns and col in or_net_nodes]
evi_or_net_tubingen = [col for col in evidence_columns if col in df_tubingen.columns and col in or_net_nodes]

evi_wo_net_brno = [col for col in evidence_columns if col in df_brno.columns and col in wo_net_nodes]
evi_wo_net_tubingen = [col for col in evidence_columns if col in df_tubingen.columns and col in wo_net_nodes]



In [None]:

# Per-row posteriors of both networks on both cohorts, using the evidence
# columns selected for each (network, cohort) pair.
or_net_Brno_LNM, or_net_Brno_Surv = getProbabilities(
    model=origineel_net,
    evidence_LNM=df_brno_LNM[evi_or_net_brno],
    evidence_Surv=df_brno_surv[evi_or_net_brno],
)
or_net_Tubingen_LNM, or_net_Tubingen_Surv = getProbabilities(
    model=origineel_net,
    evidence_LNM=df_tubingen_LNM[evi_or_net_tubingen],
    evidence_Surv=df_tubingen_surv[evi_or_net_tubingen],
)

wo_net_Brno_LNM, wo_net_Brno_Surv = getProbabilities(
    model=WOPlat,
    evidence_LNM=df_brno_LNM[evi_wo_net_brno],
    evidence_Surv=df_brno_surv[evi_wo_net_brno],
)
wo_net_Tubingen_LNM, wo_net_Tubingen_Surv = getProbabilities(
    model=WOPlat,
    evidence_LNM=df_tubingen_LNM[evi_wo_net_tubingen],
    evidence_Surv=df_tubingen_surv[evi_wo_net_tubingen],
)

# Extract entry 1 of every posterior.
# NOTE(review): presumably index 1 is the "yes" state of LNM / Survival5yr -
# confirm against the label order in the networks.
or_net_Brno_LNM_res = getProbResults(results=or_net_Brno_LNM, target=1)
or_net_Brno_Surv_res = getProbResults(results=or_net_Brno_Surv, target=1)

or_net_Tubingen_LNM_res = getProbResults(results=or_net_Tubingen_LNM, target=1)
or_net_Tubingen_Surv_res = getProbResults(results=or_net_Tubingen_Surv, target=1)

wo_net_Brno_LNM_res = getProbResults(results=wo_net_Brno_LNM, target=1)
wo_net_Brno_Surv_res = getProbResults(results=wo_net_Brno_Surv, target=1)

wo_net_Tubingen_LNM_res = getProbResults(results=wo_net_Tubingen_LNM, target=1)
wo_net_Tubingen_Surv_res = getProbResults(results=wo_net_Tubingen_Surv, target=1)


In [None]:
# Create ROC curves for the LNM and Survival predictions
from matplotlib_inline.backend_inline import set_matplotlib_formats
import matplotlib.pyplot as plt
import seaborn as sns

# Set the style
sns.set_style("whitegrid")
# Emit inline figures in both PNG and PDF format
set_matplotlib_formats('png', 'pdf')
plt.rcParams['figure.dpi'] = 150

# Create the ROC curves
fpr_or_net_Brno_LNM, tpr_or_net_Brno_LNM, _ = roc_curve(LNM_target_brno, or_net_Brno_LNM_res)
fpr_or_net_Brno_Surv, tpr_or_net_Brno_Surv, _ = roc_curve(Surv_target_brno, or_net_Brno_Surv_res)

fpr_or_net_Tubingen_LNM, tpr_or_net_Tubingen_LNM, _ = roc_curve(LNM_target_tubingen, or_net_Tubingen_LNM_res)
fpr_or_net_Tubingen_Surv, tpr_or_net_Tubingen_Surv, _ = roc_curve(Surv_target_tubingen, or_net_Tubingen_Surv_res)

fpr_wo_net_Brno_LNM, tpr_wo_net_Brno_LNM, _ = roc_curve(LNM_target_brno, wo_net_Brno_LNM_res)
fpr_wo_net_Brno_Surv, tpr_wo_net_Brno_Surv, _ = roc_curve(Surv_target_brno, wo_net_Brno_Surv_res)

fpr_wo_net_Tubingen_LNM, tpr_wo_net_Tubingen_LNM, _ = roc_curve(LNM_target_tubingen, wo_net_Tubingen_LNM_res)
fpr_wo_net_Tubingen_Surv, tpr_wo_net_Tubingen_Surv, _ = roc_curve(Surv_target_tubingen, wo_net_Tubingen_Surv_res)

# Legend names shared by all four panels, so both models are labelled
# identically in every figure.
ORIGINAL_LABEL = "Original model"
CHANGED_LABEL = "New model with changes"


def plotRocPanel(title, targets, curves):
    """Draw one 5x5 ROC figure comparing several models on the same targets.

    Parameters
    ----------
    title : str
        Axes title.
    targets : binary targets the curves were computed against; used for the
        AUC shown in the legend.
    curves : iterable of (label, fpr, tpr, scores)
        One entry per model: legend label, the ROC points to draw, and the
        scores from which the AUC is computed.

    Returns
    -------
    (fig, ax) of the newly created figure.
    """
    fig, ax = plt.subplots(1, 1, figsize=(5, 5))

    for label, fpr, tpr, scores in curves:
        auc = round(roc_auc_score(targets, scores), 2)
        ax.plot(fpr, tpr, label=f"{label} (AUC = {auc})")

    # Diagonal = chance-level reference line
    ax.plot([0, 1], [0, 1], 'k--')
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title(title)
    ax.legend(loc="lower right")
    return fig, ax


# Plot the LNM ROC curves, AUC scores in legend, and diagonal line
plot, ax = plotRocPanel('LNM Brno', LNM_target_brno, [
    (ORIGINAL_LABEL, fpr_or_net_Brno_LNM, tpr_or_net_Brno_LNM, or_net_Brno_LNM_res),
    (CHANGED_LABEL, fpr_wo_net_Brno_LNM, tpr_wo_net_Brno_LNM, wo_net_Brno_LNM_res),
])

plot, ax = plotRocPanel('LNM Tubingen', LNM_target_tubingen, [
    (ORIGINAL_LABEL, fpr_or_net_Tubingen_LNM, tpr_or_net_Tubingen_LNM, or_net_Tubingen_LNM_res),
    (CHANGED_LABEL, fpr_wo_net_Tubingen_LNM, tpr_wo_net_Tubingen_LNM, wo_net_Tubingen_LNM_res),
])

# Plot the Survival ROC curves, AUC scores in legend, and diagonal line
plot, ax = plotRocPanel('Survival Brno', Surv_target_brno, [
    (ORIGINAL_LABEL, fpr_or_net_Brno_Surv, tpr_or_net_Brno_Surv, or_net_Brno_Surv_res),
    (CHANGED_LABEL, fpr_wo_net_Brno_Surv, tpr_wo_net_Brno_Surv, wo_net_Brno_Surv_res),
])

plot, ax = plotRocPanel('Survival Tubingen', Surv_target_tubingen, [
    (ORIGINAL_LABEL, fpr_or_net_Tubingen_Surv, tpr_or_net_Tubingen_Surv, or_net_Tubingen_Surv_res),
    (CHANGED_LABEL, fpr_wo_net_Tubingen_Surv, tpr_wo_net_Tubingen_Surv, wo_net_Tubingen_Surv_res),
])




# When using the whole evidence set

In [None]:
# Complete evidence set
evidence_columns = [
    "ER", "PR", "p53", "L1CAM", "CA125", "Platelets", "Cytology", "MRI_MI",
    "MSI", "POLE", "PostoperativeGrade", "MyometrialInvasion", "LVSI",
    "Therapy", "Chemotherapy", "Radiotherapy",
]

# Node names of each network
or_net_nodes = origineel_net.names()
wo_net_nodes = WOPlat.names()

# Per (network, cohort) pair: the candidates that are both a data column and a
# network node, kept in the order of evidence_columns.
evi_or_net_brno = [col for col in evidence_columns if col in df_brno.columns and col in or_net_nodes]
evi_or_net_tubingen = [col for col in evidence_columns if col in df_tubingen.columns and col in or_net_nodes]

evi_wo_net_brno = [col for col in evidence_columns if col in df_brno.columns and col in wo_net_nodes]
evi_wo_net_tubingen = [col for col in evidence_columns if col in df_tubingen.columns and col in wo_net_nodes]


In [None]:
# Get probabilities: per-row posteriors of both networks on both cohorts,
# now using the complete evidence selection.
or_net_Brno_LNM, or_net_Brno_Surv = getProbabilities(origineel_net, df_brno_LNM[evi_or_net_brno], df_brno_surv[evi_or_net_brno])
or_net_Tubingen_LNM, or_net_Tubingen_Surv = getProbabilities(origineel_net, df_tubingen_LNM[evi_or_net_tubingen], df_tubingen_surv[evi_or_net_tubingen])

wo_net_Brno_LNM, wo_net_Brno_Surv = getProbabilities(WOPlat, df_brno_LNM[evi_wo_net_brno], df_brno_surv[evi_wo_net_brno])
wo_net_Tubingen_LNM, wo_net_Tubingen_Surv = getProbabilities(WOPlat, df_tubingen_LNM[evi_wo_net_tubingen], df_tubingen_surv[evi_wo_net_tubingen])

# Probabilities extract: entry 1 of every posterior.
# NOTE(review): presumably index 1 is the "yes" state - confirm against the
# label order in the networks.
or_net_Brno_LNM_res = getProbResults(or_net_Brno_LNM, 1)
or_net_Brno_Surv_res = getProbResults(or_net_Brno_Surv, 1)

or_net_Tubingen_LNM_res = getProbResults(or_net_Tubingen_LNM, 1)
or_net_Tubingen_Surv_res = getProbResults(or_net_Tubingen_Surv, 1)

wo_net_Brno_LNM_res = getProbResults(wo_net_Brno_LNM, 1)
wo_net_Brno_Surv_res = getProbResults(wo_net_Brno_Surv, 1)

wo_net_Tubingen_LNM_res = getProbResults(wo_net_Tubingen_LNM, 1)
wo_net_Tubingen_Surv_res = getProbResults(wo_net_Tubingen_Surv, 1)


In [None]:
# Create ROC curves for the Survival predictions
from matplotlib_inline.backend_inline import set_matplotlib_formats
import matplotlib.pyplot as plt
import seaborn as sns

# ROC points for the survival predictions of each model on each cohort
fpr_or_net_Brno_Surv, tpr_or_net_Brno_Surv, _ = roc_curve(Surv_target_brno, or_net_Brno_Surv_res)
fpr_or_net_Tubingen_Surv, tpr_or_net_Tubingen_Surv, _ = roc_curve(Surv_target_tubingen, or_net_Tubingen_Surv_res)
fpr_wo_net_Brno_Surv, tpr_wo_net_Brno_Surv, _ = roc_curve(Surv_target_brno, wo_net_Brno_Surv_res)
fpr_wo_net_Tubingen_Surv, tpr_wo_net_Tubingen_Surv, _ = roc_curve(Surv_target_tubingen, wo_net_Tubingen_Surv_res)

# AUC values shown in the legends, rounded to two decimals
auc_or_brno = round(roc_auc_score(Surv_target_brno, or_net_Brno_Surv_res), 2)
auc_wo_brno = round(roc_auc_score(Surv_target_brno, wo_net_Brno_Surv_res), 2)
auc_or_tubingen = round(roc_auc_score(Surv_target_tubingen, or_net_Tubingen_Surv_res), 2)
auc_wo_tubingen = round(roc_auc_score(Surv_target_tubingen, wo_net_Tubingen_Surv_res), 2)

# One figure, two side-by-side panels: Brno on the left, Tubingen on the right
fig, ax = plt.subplots(1, 2, figsize=(10, 4))
ax_brno, ax_tubingen = ax

# Left panel: both models, then the chance-level diagonal
ax_brno.plot(fpr_or_net_Brno_Surv, tpr_or_net_Brno_Surv, label=f"Original Casper (AUC = {auc_or_brno})")
ax_brno.plot(fpr_wo_net_Brno_Surv, tpr_wo_net_Brno_Surv, label=f"WOPlat (AUC = {auc_wo_brno})")
ax_brno.plot([0, 1], [0, 1], 'k--')
ax_brno.set(xlabel='False Positive Rate', ylabel='True Positive Rate', title='Survival Brno')
ax_brno.legend(loc="lower right")

# Right panel: same layout for the Tubingen cohort
ax_tubingen.plot(fpr_or_net_Tubingen_Surv, tpr_or_net_Tubingen_Surv, label=f"Original Casper (AUC = {auc_or_tubingen})")
ax_tubingen.plot(fpr_wo_net_Tubingen_Surv, tpr_wo_net_Tubingen_Surv, label=f"WOPlat (AUC = {auc_wo_tubingen})")
ax_tubingen.plot([0, 1], [0, 1], 'k--')
ax_tubingen.set(xlabel='False Positive Rate', ylabel='True Positive Rate', title='Survival Tubingen')
ax_tubingen.legend(loc="lower right")
