### Author: Ally Sprik
### Last-updated: 25-02-2024

The goal of this notebook is to evaluate different evidence sets by comparing their ROC curves and AUC scores.


In [None]:
import pandas as pd
import numpy as np
import pyAgrum as gum
import pyAgrum.lib.notebook as gnb
from collections import Counter


pd.options.mode.copy_on_write = True  # This will allow the code to run faster and keep Pandas happy. Technical detail: https://pandas.pydata.org/pandas-docs/stable/user_guide/copy_on_write.html#

origineel_net = gum.loadBN("../../../0. Source_files/0.4. Original_Casper_files/Results/Casper_fitted_952.net")
net = gum.loadBN("../../1.3. Model/Fitted_Networks/R_WP_all_952.net")



df_brno = pd.read_csv("../../../0. Source_files/0.2. Cleaned_data/Cleaned_Brno_model_complete.csv")

Data preparation

In [None]:
df_brno_LNM = df_brno.dropna(subset=["LNM"]).reset_index(drop=True)
df_brno_surv = df_brno.dropna(subset=["Survival5yr"]).reset_index(drop=True)
LNM_target_brno = df_brno_LNM["LNM"].replace({"yes": 1, "no": 0})
Surv_target_brno = df_brno_surv["Survival5yr"].replace({"yes": 1, "no": 0})



Pseudocode:
- Create a function that takes a model, evidence for LNM and evidence for Survival as input
    - Create an empty list for the results
    - For each row in the evidence for LNM
        - Get the evidence for the row
        - Set the evidence in the model
        - Make the inference
        - Get the posterior for LNM
        - Append the result to the list
    - For each row in the evidence for Survival
        - Get the evidence for the row
        - Set the evidence in the model
        - Make the inference
        - Get the posterior for Survival
        - Append the result to the list
    - Return both lists (the LNM results and the Survival results)


In [None]:
def getProbabilities(model, evidence_LNM, evidence_Surv, Surv = "Survival5yr", samples = 100):
    ls_result_LNM = []
    ls_result_Surv = []
    
    #for i in range(1, samples):
    resultsLNM = []
    resultsSurvival = []
    net = gum.LazyPropagation(model)
    net.getNumberOfThreads()
    net.setNumberOfThreads(10)
    
    for j in range(len(evidence_LNM)):
        evidencerow = evidence_LNM.iloc[j]
        evidencerow = evidencerow.dropna().to_dict()
        
        try:
            net.setEvidence(evidencerow)
            
            net.makeInference()

            resultLNM = net.posterior("LNM")
            
            resultsLNM.append(resultLNM)
        except Exception as error:
            print("Error at row regarding LNM", j)
            print(error)
            
            resultsLNM.append(resultLNM)

    
    for j in range(len(evidence_Surv)):
        evidencerow = evidence_Surv.iloc[j]
        evidencerow = evidencerow.dropna().to_dict()
        
        try:
            net.setEvidence(evidencerow)
            
            net.makeInference()

            resultSurvival = net.posterior("Survival5yr")
            
            resultsSurvival.append(resultSurvival)
        except Exception as error:
            print("Error at row regarding Survival", j)
            print(error)
            
            resultsSurvival.append(resultSurvival)

        
    return resultsLNM, resultsSurvival


In [None]:
def getProbResults(results, target):
    res = []
    
    for i in range(len(results)):
        res.append(results[i][target])
    return pd.DataFrame(res)

Pseudocode:
- Create a function that takes the results and the targets as input
    - Check if the length of the results and the targets are the same
    - Get the ROC AUC, Log Loss, Brier, N Predicted/N Observed, and the ratio of N Predicted/N Observed
    - Return the results in a dataframe

In [None]:
from sklearn.metrics import roc_curve, f1_score
# Find the accuracy, roc auc, precision and recall for the results and the targets data
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, confusion_matrix, log_loss, brier_score_loss

def getSlimMetrics(results, targets):
    
    if not len(results) == len(targets):
        raise Exception('Results and targets are not the same length. Results:', len(results), ' Targets:', len(targets))
    
    # Get ROC AUC, Log Loss, Brier, and N Predicted/N Observed
    curve = roc_curve(targets, results, pos_label=1)
    ROC = round(roc_auc_score(targets, results), 4)
    LL = round(log_loss(targets, results), 4)
    Brier = round(brier_score_loss(targets, results), 4)
    N_pred = results.sum()[0]
    N_obs = int(targets.sum())
    N_br = f"{int(N_pred)}/{int(N_obs)}"
    Ratio = round(N_pred/N_obs, 4)
    
    x = pd.DataFrame([ROC, LL, Brier, N_br, Ratio], index=["ROC AUC", "Log Loss", "Brier", "N Predicted/N Observed", "Ratio"])
    return x

# Define different groups of sets that I would like to test and create mean ROC curves for

In [None]:
# Make different evidence groups, per group, fill in the possible combinations of evidences and create average ROC curves and AUC scores with spreads/confidence
# Different evidence groups:
# 1. CA125, Platelets, MRI_MI, MSI, POLE, ER, PR, p53, L1CAM, PreoperativeGrade (all)
# 2. [CA125 or Platelets](1 of these), [ER, PR, p53, L1CAM] (3 of these),  PreoperativeGrade
# 3. [CA125 or Platelets](1 of these), [ER, PR, p53, L1CAM] (1 of these),  PreoperativeGrade, MSI, POLE
# 4. [CA125 or Platelets](1 of these), [MSI, POLE] (1 of these),  PreoperativeGrade, MRI_MI
# 5. PreoperativeGrade, ER, PR, p53, L1CAM
# 6. CA125, Platelets, MRI_MI, MSI, POLE

# Generate all possible combinations of evidence in these categories
fullSet = ["CA125", "Platelets", "MRI_MI", "MSI", "POLE", "ER", "PR", "p53", "L1CAM", "PreoperativeGrade"]
CAbaseSet = [["CA125", "ER", "PR", "p53", "PreoperativeGrade"], 
           ["CA125", "PR", "p53", "L1CAM", "PreoperativeGrade"], 
           ["CA125", "p53", "L1CAM", "ER", "PreoperativeGrade"]]

PlatBaseSet = [["Platelets", "ER", "PR", "p53", "PreoperativeGrade"], 
               ["Platelets", "PR", "p53", "L1CAM", "PreoperativeGrade"],
               ["Platelets", "p53", "L1CAM", "ER", "PreoperativeGrade"]]

CAbaseSet_w_MSI_POLE = [["CA125", "ER", "PR", "p53", "PreoperativeGrade", "MSI", "POLE"], 
                        ["CA125", "PR", "p53", "L1CAM", "PreoperativeGrade", "MSI", "POLE"], 
                        ["CA125", "p53", "L1CAM", "ER", "PreoperativeGrade", "MSI", "POLE"]] 
                      
PlatBaseSet_w_MSI_POLE =  [["Platelets", "ER", "PR", "p53", "PreoperativeGrade", "MSI", "POLE"], 
                            ["Platelets", "PR", "p53", "L1CAM", "PreoperativeGrade", "MSI", "POLE"], 
                            ["Platelets", "p53", "L1CAM", "ER", "PreoperativeGrade", "MSI", "POLE"]]

CAbaseSet_w_MRI = [["CA125", "ER", "PR", "p53", "PreoperativeGrade", "MRI_MI"],
                ["CA125", "PR", "p53", "L1CAM", "PreoperativeGrade", "MRI_MI"], 
                ["CA125", "p53", "L1CAM", "ER", "PreoperativeGrade", "MRI_MI"]]
                
CA_one_IHC_MRI  = [["CA125", "ER", "PR", "p53", "PreoperativeGrade", "MRI_MI"], 
                ["CA125", "PR", "p53", "L1CAM", "PreoperativeGrade", "MRI_MI"], 
                ["CA125", "p53", "L1CAM", "ER", "PreoperativeGrade", "MRI_MI"]]
                
CA_one_IHC_POLE_MSI  = [["CA125", "ER", "PR", "p53", "PreoperativeGrade", "MSI", "POLE"], 
                ["CA125", "PR", "p53", "L1CAM", "PreoperativeGrade", "MSI", "POLE"], 
                ["CA125", "p53", "L1CAM", "ER", "PreoperativeGrade", "MSI", "POLE"]]

CA_MSI_POLE_MRI = [["CA125", "MSI", "POLE", "PreoperativeGrade", "MRI_MI"], 
                ["CA125", "MSI", "POLE", "ER", "PreoperativeGrade", "MRI_MI"], 
                ["CA125", "MSI", "POLE", "PR", "PreoperativeGrade", "MRI_MI"]]

IHC_PreopGrade = ["ER", "PR", "p53", "L1CAM", "PreoperativeGrade"]
CAPlatMRIPOLE = ["CA125", "Platelets", "MRI_MI", "MSI", "POLE"]

Define a function that takes a model and a set of evidence as input and returns the results

Pseudocode:
- Create a function that takes a model and a set of evidence as input
    - Create an empty list for the results
    - Check if the evidence set is a list of lists, if not, make it a list of lists
    - For each evidence set in the list
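        - Select the evidence columns of the set from the LNM data and from the Survival data
        - For every row: set the evidence in the model, make the inference and get the posteriors (getProbabilities)
        - Extract the probabilities for LNM and for Survival (getProbResults)
        - Append the pair of results to the list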
    - Return the list

In [None]:
# Function to get a set of evidences and return the results
def evaluateSets(model, evidenceSet):
    results = []
    # Check if the evidence set is a list of lists, if not, make it a list of lists
    if not isinstance(evidenceSet[0], list):
        evidenceSet = [evidenceSet]
        
    for i in range(len(evidenceSet)):
        print(evidenceSet[i])
        evidence_LNM = df_brno_LNM[evidenceSet[i]]
        evidenceSurv = df_brno_surv[evidenceSet[i]]
        resultsLNM, resultsSurvival = getProbabilities(model, evidence_LNM, evidenceSurv)
        results.append([getProbResults(resultsLNM, 1), getProbResults(resultsSurvival, 1)])
    return results

def getLNMResults(results):
    res = []
    for i in range(len(results)):
        res.append(results[i][0])
    return res

def getSurvResults(results):
    res = []
    for i in range(len(results)):
        res.append(results[i][1])
    return res

In [None]:
# Get the results for the different evidence sets
fullSetResults = evaluateSets(net, fullSet)
CAbaseSetResults = evaluateSets(net, CAbaseSet)
PlatBaseSetResults = evaluateSets(net, PlatBaseSet)
CAbaseSet_w_MSI_POLE_Results = evaluateSets(net, CAbaseSet_w_MSI_POLE)
PlatBaseSet_w_MSI_POLE_Results = evaluateSets(net, PlatBaseSet_w_MSI_POLE)
CAbaseSet_w_MRI_Results = evaluateSets(net, CAbaseSet_w_MRI)
CA_one_IHC_MRI_Results = evaluateSets(net, CA_one_IHC_MRI)
CA_one_IHC_POLE_MSI_Results = evaluateSets(net, CA_one_IHC_POLE_MSI)
CA_MSI_POLE_MRI_Results = evaluateSets(net, CA_MSI_POLE_MRI)
IHC_PreopGrade_Results = evaluateSets(net, IHC_PreopGrade)
CAPlatMRIPOLE_Results = evaluateSets(net, CAPlatMRIPOLE)


In [None]:
# Get the LNM results for the different evidence sets
fullSetLNM = getLNMResults(fullSetResults)
CAbaseSetLNM = getLNMResults(CAbaseSetResults)
PlatBaseSetLNM = getLNMResults(PlatBaseSetResults)
CAbaseSet_w_MSI_POLE_LNM = getLNMResults(CAbaseSet_w_MSI_POLE_Results)
PlatBaseSet_w_MSI_POLE_LNM = getLNMResults(PlatBaseSet_w_MSI_POLE_Results)
CAbaseSet_w_MRI_LNM = getLNMResults(CAbaseSet_w_MRI_Results)
CA_one_IHC_MRI_LNM = getLNMResults(CA_one_IHC_MRI_Results)
CA_one_IHC_POLE_MSI_LNM = getLNMResults(CA_one_IHC_POLE_MSI_Results)
CA_MSI_POLE_MRI_LNM = getLNMResults(CA_MSI_POLE_MRI_Results)
IHC_PreopGrade_LNM = getLNMResults(IHC_PreopGrade_Results)
CAPlatMRIPOLE_LNM = getLNMResults(CAPlatMRIPOLE_Results)


Define a function that takes the results and the targets as input and returns the average ROC curve and AUC score

Pseudocode:
- Create a function that takes the results and the targets as input
    - Set the style and figure settings
    - Create a figure
    - Calculate the ROC curve and AUC for each model, plot them
    - Calculate the mean and std deviation of tpr
    - Plot the average ROC curve
    - Fill the area between the mean + std and mean - std
    - Plot the chance line
    - Set the limits and labels
    - Show the plot

In [None]:
from matplotlib_inline.backend_inline import set_matplotlib_formats
# Function to plot the average ROC curve and spread as an area around the average curve, AUC with interval in the title
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_curve, auc
from itertools import cycle

def PlotMeanROC(y_scores, y_true, title = "Average ROC Curve"):
    # Set the style
    sns.set_style("whitegrid")
    # Set the output as png
    set_matplotlib_formats('png', 'pdf')
    plt.rcParams['figure.dpi'] = 150
    
    plt.figure(figsize=(5, 5))

    # Calculate ROC curve and AUC for each model
    tprs = []
    base_fpr = np.linspace(0, 1, 101)
    roc_aucs = []
    for i in range(len(y_scores)):
        fpr, tpr, _ = roc_curve(y_true, y_scores[i])
        roc_auc = auc(fpr, tpr)
        roc_aucs.append(roc_auc)
        plt.plot(fpr, tpr, 'b', alpha=0.15)  # Plot each ROC curve faintly
        tpr = np.interp(base_fpr, fpr, tpr)
        tpr[0] = 0.0
        tprs.append(tpr)
    
    # Calculate mean and std deviation of tpr
    mean_tprs = np.mean(tprs, axis=0)
    std = np.std(tprs, axis=0)
    tprs_upper = np.minimum(mean_tprs + std, 1)
    tprs_lower = np.maximum(mean_tprs - std, 0)
    mean_auc = np.mean(roc_aucs)
    std_auc = np.std(roc_aucs)
    
    # if y_scores is not a list of lists change the label to ROC
    if len(y_scores) == 1:
        label = "ROC (AUC = {:.2f})".format(mean_auc)
    else:
        label = 'Mean ROC (AUC = {:.2f} $\pm$ {:.2f})'.format(mean_auc, std_auc)
    # Plot the average ROC curve
    plt.plot(base_fpr, mean_tprs, 'k', label=label)
    
    # Fill the area between the mean + std and mean - std
    plt.fill_between(base_fpr, tprs_lower, tprs_upper, color='grey', alpha=0.3)
    
    plt.plot([0, 1], [0, 1], 'r--')  # Plot the chance line
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(title)
    plt.legend(loc='lower right')
    plt.show()

In [None]:
# Plot the average ROC curve for the different evidence sets
PlotMeanROC(fullSetLNM, LNM_target_brno, "Full Set \n Average ROC Curve")
PlotMeanROC(CAbaseSetLNM, LNM_target_brno, "CA125 - 3 IHC -  PreoperativeGrade \n Average ROC Curve")
PlotMeanROC(PlatBaseSetLNM, LNM_target_brno, "Platelets - 3 IHC -  PreoperativeGrade \n Average ROC Curve")
PlotMeanROC(CAbaseSet_w_MSI_POLE_LNM, LNM_target_brno, "CA125 - 3 IHC - MMRd - POLE \n Average ROC Curve")
PlotMeanROC(PlatBaseSet_w_MSI_POLE_LNM, LNM_target_brno, "Platelets - 3 IHC - MMRd - POLE \n Average ROC Curve")
PlotMeanROC(CAbaseSet_w_MRI_LNM, LNM_target_brno, "CA125 - 3 IHC - Image_MI \n Average ROC Curve")
PlotMeanROC(CA_one_IHC_MRI_LNM, LNM_target_brno, "CA125 - 1 IHC - Image_MI \n Average ROC Curve")
PlotMeanROC(CA_one_IHC_POLE_MSI_LNM, LNM_target_brno, "CA125 - 1 IHC - MMRd - POLE \n Average ROC Curve")
PlotMeanROC(CA_MSI_POLE_MRI_LNM, LNM_target_brno, "CA125 - 1IHC - MMRd - POLE - Image_MI \n Average ROC Curve")
PlotMeanROC(IHC_PreopGrade_LNM, LNM_target_brno, "IHC - PreoperativeGrade \n Average ROC Curve")
PlotMeanROC(CAPlatMRIPOLE_LNM, LNM_target_brno, "CA125 - Platelets - Image_MI - MMRd - POLE \n Average ROC Curve")


# A few more ad-hoc evidence sets to consider

In [None]:
# Geen Preopgraad bekend, wel MRI, MSI, POLE en base set
# Base sets zonder preopgrade
set1 = [["CA125", "ER", "p53", "L1CAM"], 
        ["CA125", "PR", "p53", "L1CAM"],
        ["CA125", "p53", "L1CAM", "ER"]]
# met MSI en POLE en met MRI
set2 = [["CA125", "ER", "p53", "L1CAM", "MRI_MI", "MSI", "POLE"], 
        ["CA125", "PR", "p53", "L1CAM", "MRI_MI", "MSI", "POLE"],
        ["CA125", "p53", "L1CAM", "ER", "MRI_MI", "MSI", "POLE"]]

# Resultaten
set1Results = evaluateSets(net, set1)
set2Results = evaluateSets(net, set2)

# LNM resultaten
set1LNM = getLNMResults(set1Results)
set2LNM = getLNMResults(set2Results)


In [None]:
# Plot de resultaten
PlotMeanROC(set1LNM, LNM_target_brno, "CA125 - 3 IHC \n Average ROC Curve")
PlotMeanROC(set2LNM, LNM_target_brno, "CA125 - 3 IHC - Image-MI - MMRd - POLE \n Average ROC Curve")

In [None]:
# base sets but replace CA125 with MRI_MI
set3 = [["MRI_MI", "ER", "p53", "L1CAM", "PreoperativeGrade"], 
        ["MRI_MI", "PR", "p53", "L1CAM", "PreoperativeGrade"],
        ["MRI_MI", "p53", "L1CAM", "ER", "PreoperativeGrade"]]

set4 = [["MRI_MI", "ER", "p53", "L1CAM", "MSI", "POLE", "PreoperativeGrade"], 
        ["MRI_MI", "PR", "p53", "L1CAM", "MSI", "POLE", "PreoperativeGrade"],
        ["MRI_MI", "p53", "L1CAM", "ER", "MSI", "POLE", "PreoperativeGrade"]]

set3Results = evaluateSets(net, set3)
set4Results = evaluateSets(net, set4)

set3LNM = getLNMResults(set3Results)
set4LNM = getLNMResults(set4Results)




In [None]:
# Plot de resultaten
PlotMeanROC(set3LNM, LNM_target_brno, "Imaging MI - 3 IHC \n Average ROC Curve")
PlotMeanROC(set4LNM, LNM_target_brno, "Imaging MI - 3 IHC - MMRd - POLE \n Average ROC Curve")   


In [None]:
# only CA125, MRI_MI, MSI, POLE

set5 = ["CA125", "MRI_MI", "MSI", "POLE"]

set5Results = evaluateSets(net, set5)

set5LNM = getLNMResults(set5Results)


In [None]:
# Plot de resultaten
PlotMeanROC(set5LNM, LNM_target_brno, "CA125 - Imaging MI - MMRd - POLE \n ROC Curve")

In [None]:
# CA 125 Platelets Imaging MI MSI POLE
set6 = ["CA125", "Platelets", "MRI_MI", "MSI", "POLE"]

set6Results = evaluateSets(net, set6)

set6LNM = getLNMResults(set6Results)


In [None]:
PlotMeanROC(set6LNM, LNM_target_brno, "CA125 - Platelets - Imaging MI - MMRd - POLE \n ROC Curve")