### Author: Ally Sprik
### Last-updated: 25-02-2024

The goal of this notebook is to compute ROC AUC values with confidence intervals for the nodes 'LNM' and 'Survival5yr' on both datasets (Brno and Mayo).
It was created for a conference abstract.

In [None]:
import pandas as pd
import numpy as np
import pyAgrum as gum
import pyAgrum.lib.notebook as gnb
from collections import Counter

# Load the two fitted Bayesian networks that are evaluated in this notebook.
casper_model = gum.loadBN("../../0. Source_files/0.4. Original_Casper_files/Results/Casper_fitted_952.net")
WPlat_all = gum.loadBN("../1.3. Model/Fitted_Networks/R_WP_all_952.net")

# Load the two cleaned cohorts (Brno and Mayo) used for validation.
df_brno = pd.read_csv("../../0. Source_files/0.2. Cleaned_data/Cleaned_Brno_model_complete.csv")
df_mayo = pd.read_csv("../../0. Source_files/0.2. Cleaned_data/MAYO_cleaned_model.csv")

pd.options.mode.copy_on_write = True  # This will allow the code to run faster and keep Pandas happy. Technical detail: https://pandas.pydata.org/pandas-docs/stable/user_guide/copy_on_write.html#

# Fix labels in MAYO
# NOTE(review): this replace is applied to the whole frame, so every 1/0 in ANY
# column of df_mayo becomes "yes"/"no" -- confirm no numeric column is hit by accident.
df_mayo = df_mayo.replace({1:"yes", 0:"no", "<50%":"lt_50", ">50%":"ge_50", "No invasion":"lt_50"})
# Cytology uses benign/malignant labels, so translate the yes/no values produced above.
df_mayo["Cytology"] = df_mayo["Cytology"].replace({"no":"benign", "yes":"malignant"})

# Define evidence columns
# These are the only columns that are later passed to the networks as evidence.
evidence_columns = ["ER", "PR", "p53", "L1CAM", "Platelets", "Cytology", "MRI_MI", "MSI", "POLE", "PreoperativeGrade", "CA125"]

# Create subsets on which validation can be performed
# Each subset keeps only the rows where the named target column is not missing.
df_brno_LNM = df_brno.dropna(subset=["LNM"])
df_brno_Survival = df_brno.dropna(subset=["Survival5yr"])

df_mayo_LNM = df_mayo.dropna(subset=["LNM"])
df_mayo_Survival = df_mayo.dropna(subset=["Survival5yr"])



Define the function to go through the data and generate results

Pseudocode:
- Create a function to get the probabilities
    - Create a list to store the results
    - For each row in the evidence set
        - Set the evidence
        - Make inference
        - Get the posterior
        - Append the result to the list
    - Return the list

In [None]:
def _posteriorsPerRow(net, model, evidence, target, label):
    """Return one posterior of `target` for every row of `evidence`.

    Parameters
    ----------
    net : inference engine built on `model` (its evidence is replaced for every row)
    model : the network, used to look up the number of states of `target`
    evidence : DataFrame with one row of evidence per case; missing values are skipped
    target : name of the node whose posterior is requested
    label : short text used in the error message printed for a failing row

    Returns
    -------
    list with exactly ``len(evidence)`` entries, in row order.
    """
    results = []
    for j in range(len(evidence)):
        # Missing values are simply not entered as evidence.
        evidencerow = evidence.iloc[j].dropna().to_dict()
        try:
            net.setEvidence(evidencerow)
            net.makeInference()
            results.append(net.posterior(target))
        except Exception as error:
            print("Error at row regarding " + label, j)
            print(error)
            # Keep one entry per row so the results stay aligned with the true
            # labels, but never reuse another row's posterior: store NaN for
            # every state so the failure stays visible downstream.
            n_states = model.variableFromName(target).domainSize()
            results.append(np.full(n_states, np.nan))
    return results


def getProbabilities(model, evidence_LNM, evidence_Surv, Surv = "Survival5yr", samples = 100):
    """Run inference row by row and collect the posteriors of 'LNM' and of `Surv`.

    Parameters
    ----------
    model : network handed to ``gum.LazyPropagation``
    evidence_LNM : DataFrame of evidence rows used for the 'LNM' posterior
    evidence_Surv : DataFrame of evidence rows used for the survival posterior
    Surv : name of the survival node to query (default "Survival5yr")
    samples : unused; kept so existing calls keep working

    Returns
    -------
    (resultsLNM, resultsSurvival) : two lists with one entry per evidence row.
    A row whose inference raised is reported on stdout and represented by an
    all-NaN array, so callers have to drop or handle those rows before scoring.
    """
    net = gum.LazyPropagation(model)
    net.setNumberOfThreads(10)

    resultsLNM = _posteriorsPerRow(net, model, evidence_LNM, "LNM", "LNM")
    resultsSurvival = _posteriorsPerRow(net, model, evidence_Surv, Surv, "Survival")

    return resultsLNM, resultsSurvival

Define function to unpack the results

In [None]:
def getProbResults(results, target):
    """Pick entry `target` out of every element of `results`.

    Returns a one-column DataFrame (default 0..n-1 index) holding the picked
    values in the same order as `results`.
    """
    picked = [results[pos][target] for pos in range(len(results))]
    return pd.DataFrame(picked)

Create evidence sets

Pseudocode:
- Copy the LNM and Survival columns
- Replace the values with 1 and 0
- Create a new dataframe with only the evidence columns
- Repeat for both datasets

In [None]:
# For every subset: first pull out the true outcome as 1/0, then reduce the frame
# to the evidence columns. The order matters, because the evidence columns do not
# contain the outcome columns; for the same reason this cell cannot be re-run
# without first re-running the cell that creates the subsets.
brno_LNM_true = df_brno_LNM['LNM'].replace({"yes":1, "no":0})
df_brno_LNM = df_brno_LNM[evidence_columns]

brno_Surv_true = df_brno_Survival['Survival5yr'].replace({"yes":1, "no":0})
df_brno_Survival = df_brno_Survival[evidence_columns]

# NOTE(review): the Mayo LNM subset was filtered on non-missing 'LNM', but the
# true label is read from 'LNM_micromacro' -- confirm this is intended and that
# 'LNM_micromacro' has no missing values on these rows.
mayo_LNM_true = df_mayo_LNM['LNM_micromacro'].replace({"yes":1, "no":0})
df_mayo_LNM = df_mayo_LNM[evidence_columns]

mayo_Surv_true = df_mayo_Survival['Survival5yr'].replace({"yes":1, "no":0})
df_mayo_Survival = df_mayo_Survival[evidence_columns]

Get the results for the networks and both datasets

In [None]:
# Each getProbabilities call returns the LNM and the survival posteriors for one
# dataset; the printed label only marks which dataset/model is being processed.
print("Brno LNM")
brno_results_LNM, brno_results_Survival = getProbabilities(WPlat_all, df_brno_LNM, df_brno_Survival)

print("Mayo LNM")
mayo_results_LNM, mayo_results_Survival = getProbabilities(WPlat_all, df_mayo_LNM, df_mayo_Survival)


# Evidence for the Casper network: same rows without MRI_MI, MSI and POLE.
# NOTE(review): presumably the Casper network has no such nodes -- confirm.
Casper_df_brno_LNM = df_brno_LNM.copy().drop(columns=["MRI_MI", "MSI", "POLE"])
Casper_df_brno_Survival = df_brno_Survival.copy().drop(columns=["MRI_MI", "MSI", "POLE"])

Casper_df_mayo_LNM = df_mayo_LNM.copy().drop(columns=["MRI_MI", "MSI", "POLE"])
Casper_df_mayo_Survival = df_mayo_Survival.copy().drop(columns=["MRI_MI", "MSI", "POLE"])

print("Brno LNM, Casper")
brno_results_LNM_casper, brno_results_Survival_casper = getProbabilities(casper_model, Casper_df_brno_LNM, Casper_df_brno_Survival)

print("Mayo LNM, Casper")
mayo_results_LNM_casper, mayo_results_Survival_casper = getProbabilities(casper_model, Casper_df_mayo_LNM, Casper_df_mayo_Survival)
    

Unpack the results

In [None]:
# Turn every list of posteriors into a one-column DataFrame holding entry 1 of
# each posterior.
# NOTE(review): index 1 is presumably the "yes" state of the node -- confirm
# against the state order of 'LNM' and 'Survival5yr' in both .net files.
brno_LNM_prob = getProbResults(brno_results_LNM, 1)
brno_Survival_prob = getProbResults(brno_results_Survival, 1)

mayo_LNM_prob = getProbResults(mayo_results_LNM, 1)
mayo_Survival_prob = getProbResults(mayo_results_Survival, 1)

# Same for the Casper network.
brno_LNM_prob_casper = getProbResults(brno_results_LNM_casper, 1)
brno_Survival_prob_casper = getProbResults(brno_results_Survival_casper, 1)

mayo_LNM_prob_casper = getProbResults(mayo_results_LNM_casper, 1)
mayo_Survival_prob_casper = getProbResults(mayo_results_Survival_casper, 1)

Bootstrap the results to determine a confidence interval

Pseudocode, line by line:
- Create a list to store the results
- For each bootstrap sample
    - Sample the data
    - Get the true values
    - Get the AUC
    - Append the AUC to the list
- Return the list

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score

# Bootstrap the ROC AUC of every model / dataset / target combination.
samples = 200             # rows drawn with replacement in each bootstrap replicate
bootstrap_samples = 1000  # number of bootstrap replicates
alpha = 0.05              # NOTE(review): not read anywhere in this cell
bootstrap_seed = 0        # fixed so that Restart & Run All reproduces the same AUC lists


def bootstrapAUC(prob, true, n_replicates, n_rows, seed):
    """Return a list with one ROC AUC per bootstrap replicate.

    Parameters
    ----------
    prob : one-column DataFrame of predicted probabilities with a 0..n-1 index
    true : Series of 1/0 outcomes in the same row order as `prob`
    n_replicates : number of bootstrap replicates to draw
    n_rows : number of rows drawn with replacement per replicate
    seed : seed of the random generator used for all replicates of this call

    Raises
    ------
    ValueError : raised by roc_auc_score when a replicate contains only one class.
    """
    rng = np.random.default_rng(seed)
    aucs = []
    for _ in range(n_replicates):
        prob_temp = prob.sample(n=n_rows, replace=True, random_state=rng)
        # `prob` carries a default 0..n-1 index, so the sampled index labels are
        # also the row positions of the matching outcomes in `true`.
        true_temp = true.iloc[prob_temp.index]
        aucs.append(roc_auc_score(true_temp, prob_temp))
    return aucs


brno_LNM_ROC = bootstrapAUC(brno_LNM_prob, brno_LNM_true, bootstrap_samples, samples, bootstrap_seed)
brno_Survival_ROC = bootstrapAUC(brno_Survival_prob, brno_Surv_true, bootstrap_samples, samples, bootstrap_seed)

mayo_LNM_ROC = bootstrapAUC(mayo_LNM_prob, mayo_LNM_true, bootstrap_samples, samples, bootstrap_seed)
mayo_Survival_ROC = bootstrapAUC(mayo_Survival_prob, mayo_Surv_true, bootstrap_samples, samples, bootstrap_seed)

# Casper model

brno_LNM_ROC_casper = bootstrapAUC(brno_LNM_prob_casper, brno_LNM_true, bootstrap_samples, samples, bootstrap_seed)
brno_Survival_ROC_casper = bootstrapAUC(brno_Survival_prob_casper, brno_Surv_true, bootstrap_samples, samples, bootstrap_seed)

mayo_LNM_ROC_casper = bootstrapAUC(mayo_LNM_prob_casper, mayo_LNM_true, bootstrap_samples, samples, bootstrap_seed)
mayo_Survival_ROC_casper = bootstrapAUC(mayo_Survival_prob_casper, mayo_Surv_true, bootstrap_samples, samples, bootstrap_seed)
    

Create a function to define the confidence interval and print it

Pseudocode:
- Define a function to calculate the mean and confidence interval
    - Calculate the mean
    - Calculate the standard error
    - Calculate the confidence interval
    - Return the mean and confidence interval

In [None]:
# Calculate average AUC and 95% confidence interval
from scipy.stats import sem, t

def mean_confidence_interval(data, confidence=0.95):
    """Return ``(mean, lower, upper)`` for the values in `data`.

    The bounds are a two-sided Student-t interval around the mean, built from
    the standard error of the mean with ``len(data) - 1`` degrees of freedom.

    NOTE(review): this is an interval for the *mean* of `data`, so it narrows as
    more values are supplied; it is not a percentile interval of the values
    themselves -- confirm this is what is wanted for bootstrap replicates.
    """
    values = 1.0 * np.array(data)
    centre = np.mean(values)
    critical = t.ppf((1 + confidence) / 2., len(values) - 1)
    half_width = sem(values) * critical

    return centre, centre - half_width, centre + half_width

Print the results

Pseudocode:
- For each dataset and each network
    - Calculate the mean and confidence interval through the function
    - Print the result

In [None]:
# Summarise every list of bootstrapped AUCs as (mean, lower bound, upper bound).
brno_LNM_mean, brno_LNM_lower, brno_LNM_upper = mean_confidence_interval(brno_LNM_ROC)
brno_Survival_mean, brno_Survival_lower, brno_Survival_upper = mean_confidence_interval(brno_Survival_ROC)

mayo_LNM_mean, mayo_LNM_lower, mayo_LNM_upper = mean_confidence_interval(mayo_LNM_ROC)
mayo_Survival_mean, mayo_Survival_lower, mayo_Survival_upper = mean_confidence_interval(mayo_Survival_ROC)

# Same summaries for the Casper network.
brno_LNM_mean_casper, brno_LNM_lower_casper, brno_LNM_upper_casper = mean_confidence_interval(brno_LNM_ROC_casper)
brno_Survival_mean_casper, brno_Survival_lower_casper, brno_Survival_upper_casper = mean_confidence_interval(brno_Survival_ROC_casper)

mayo_LNM_mean_casper, mayo_LNM_lower_casper, mayo_LNM_upper_casper = mean_confidence_interval(mayo_LNM_ROC_casper)
mayo_Survival_mean_casper, mayo_Survival_lower_casper, mayo_Survival_upper_casper = mean_confidence_interval(mayo_Survival_ROC_casper)

# Print each result as: mean [lower, upper], rounded to 3 decimals.
# The interval uses the default confidence of mean_confidence_interval (0.95).
print("Brno LNM: ", round(brno_LNM_mean, 3), " [",round(brno_LNM_lower, 3),",", round(brno_LNM_upper, 3),"]")
print("Brno Survival: ", round(brno_Survival_mean, 3), " [",round(brno_Survival_lower, 3),",", round(brno_Survival_upper, 3),"]\n")

print("Mayo LNM: ", round(mayo_LNM_mean, 3), " [",round(mayo_LNM_lower, 3),",", round(mayo_LNM_upper, 3),"]")
print("Mayo Survival: ", round(mayo_Survival_mean, 3), " [",round(mayo_Survival_lower, 3),",", round(mayo_Survival_upper, 3),"]\n")

print("Brno LNM, Casper: ", round(brno_LNM_mean_casper, 3), " [",round(brno_LNM_lower_casper, 3),",", round(brno_LNM_upper_casper, 3),"]")
print("Brno Survival, Casper: ", round(brno_Survival_mean_casper, 3), " [",round(brno_Survival_lower_casper, 3),",", round(brno_Survival_upper_casper, 3),"]\n")

print("Mayo LNM, Casper: ", round(mayo_LNM_mean_casper, 3), " [",round(mayo_LNM_lower_casper, 3),",", round(mayo_LNM_upper_casper, 3),"]")
print("Mayo Survival, Casper: ", round(mayo_Survival_mean_casper, 3), " [",round(mayo_Survival_lower_casper, 3),",", round(mayo_Survival_upper_casper, 3),"]\n")

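Create a function that computes the bootstrapped false negative rate (FNR) at a given probability threshold, and print the results at a threshold of 0.1.
scikit-learn may report that the metric is ill-defined if a bootstrap sample does not contain both classes.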

Pseudocode:
- Create a function to get the FNR
    - Create a list to store the results
    - For each bootstrap sample
        - Sample the data
        - Get the true values
        - Get the FNR
        - Append the FNR to the list
    - Return the mean and confidence interval

In [None]:
# Import fnr
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
# Compute the FNR at a 10% threshold to diagnose LNM, and bootstrap it as well.
# NOTE(review): `samples` is not read by getFNR below, which loops over the
# global `bootstrap_samples` and draws `sample_size` rows per replicate.
samples = 1000 
sample_size = 50
def getFNR(true, pred, threshold):
    """Bootstrap the false negative rate (1 - recall) at `threshold`.

    Parameters
    ----------
    true : Series of 1/0 outcomes, in the same row order as `pred`
    pred : one-column DataFrame of predicted probabilities with a 0..n-1 index
    threshold : a probability >= threshold counts as a positive prediction

    Returns
    -------
    str formatted as "mean [lower, upper]", each value rounded to 3 decimals.

    Reads the globals `bootstrap_samples` (number of replicates) and
    `sample_size` (rows drawn per replicate). Replicate k is drawn with
    ``random_state=k``, so repeated calls give the same result.
    """
    fnrs = []
    for replicate in range(bootstrap_samples):
        pred_temp = pred.sample(n=sample_size, replace=True, random_state=replicate)
        # `pred` carries a default 0..n-1 index, so the sampled index labels are
        # also the row positions of the matching outcomes in `true`.
        true_temp = true.iloc[pred_temp.index]

        # Threshold the whole column at once instead of row by row.
        pred_labels = (pred_temp.iloc[:, 0] >= threshold).astype(int)

        fnr = 1 - recall_score(y_true=true_temp, y_pred=pred_labels)
        fnrs.append(fnr)

    fnr, lower, upper = mean_confidence_interval(fnrs)

    fnr_string = str(round(fnr, 3)) + " [" + str(round(lower, 3)) + ", " + str(round(upper, 3)) + "]"

    return fnr_string
    
# Bootstrapped FNR at a threshold of 0.1 for every dataset/target, first for the
# WPlat_all predictions and then for the Casper predictions.
brno_LNM_FNR = getFNR(brno_LNM_true, brno_LNM_prob, 0.1)
brno_Survival_FNR = getFNR(brno_Surv_true, brno_Survival_prob, 0.1)

mayo_LNM_FNR = getFNR(mayo_LNM_true, mayo_LNM_prob,0.1)
mayo_Survival_FNR = getFNR(mayo_Surv_true, mayo_Survival_prob, 0.1)

brno_LNM_FNR_Casper = getFNR(brno_LNM_true, brno_LNM_prob_casper, 0.1)
brno_Survival_FNR_Casper = getFNR(brno_Surv_true, brno_Survival_prob_casper, 0.1)

mayo_LNM_FNR_Casper = getFNR(mayo_LNM_true, mayo_LNM_prob_casper, 0.1)
mayo_Survival_FNR_Casper = getFNR(mayo_Surv_true, mayo_Survival_prob_casper, 0.1)
print("\n")

# Each value is the already formatted "mean [lower, upper]" string from getFNR.
print("Brno LNM: ", brno_LNM_FNR)
print("Brno Survival: ", brno_Survival_FNR)
print("\n")

print("Mayo LNM: ", mayo_LNM_FNR)
print("Mayo Survival: ", mayo_Survival_FNR)
print("\n")

print("Brno LNM, Casper: ", brno_LNM_FNR_Casper)    
print("Brno Survival, Casper: ", brno_Survival_FNR_Casper)
print("\n")

print("Mayo LNM, Casper: ", mayo_LNM_FNR_Casper)
print("Mayo Survival, Casper: ", mayo_Survival_FNR_Casper)


Create a function that computes the false positive rate (FPR) at a given probability threshold, then compute and print it at a threshold of 0.1

Pseudocode:
- Create a function to get the FPR
      - loop through the data
      - threshold the data
      - get the confusion matrix
      - calculate the FPR
      - return the FPR
- Print the result

In [None]:
# Import fpr
from sklearn.metrics import precision_score
# Compute the FPR at a 10% threshold to diagnose LNM
def getFPR(true, pred, threshold):
    """Return the false positive rate, FP / (FP + TN), at `threshold`.

    Parameters
    ----------
    true : 1/0 outcomes, in the same row order as `pred`
    pred : predicted probabilities (one-column DataFrame or Series); not modified
    threshold : a probability >= threshold counts as a positive prediction

    Returns
    -------
    float in [0, 1], or NaN when `true` contains no negative (0) cases.
    """
    # Work on plain arrays so rows are matched by position and the caller's
    # probabilities are left untouched for later cells.
    column = pred.iloc[:, 0] if getattr(pred, "ndim", 1) == 2 else pred
    scores = np.asarray(column, dtype=float)
    actual = np.asarray(true).ravel()

    predicted_positive = scores >= threshold
    negatives = actual == 0

    # Confusion-matrix cells of the negative class.
    fp = int(np.sum(predicted_positive & negatives))
    tn = int(np.sum(~predicted_positive & negatives))

    if fp + tn == 0:
        # FPR is undefined without any negative case.
        return float("nan")

    fpr = fp / (fp + tn)

    return fpr

# getFPR at a threshold of 0.1 for every dataset/target, first for the WPlat_all
# predictions and then for the Casper predictions.
brno_LNM_FPR = getFPR(brno_LNM_true, brno_LNM_prob, 0.1)
brno_Survival_FPR = getFPR(brno_Surv_true, brno_Survival_prob, 0.1)

mayo_LNM_FPR = getFPR(mayo_LNM_true, mayo_LNM_prob,0.1)
mayo_Survival_FPR = getFPR(mayo_Surv_true, mayo_Survival_prob, 0.1)

brno_LNM_FPR_Casper = getFPR(brno_LNM_true, brno_LNM_prob_casper, 0.1)
brno_Survival_FPR_Casper = getFPR(brno_Surv_true, brno_Survival_prob_casper, 0.1)

mayo_LNM_FPR_Casper = getFPR(mayo_LNM_true, mayo_LNM_prob_casper, 0.1)
mayo_Survival_FPR_Casper = getFPR(mayo_Surv_true, mayo_Survival_prob_casper, 0.1)

# Print every value rounded to 3 decimals.
print("Brno LNM: ", round(brno_LNM_FPR, 3))
print("Brno Survival: ", round(brno_Survival_FPR, 3))

print("Mayo LNM: ", round(mayo_LNM_FPR, 3))
print("Mayo Survival: ", round(mayo_Survival_FPR, 3))
print("\n")
print("Brno LNM, Casper: ", round(brno_LNM_FPR_Casper, 3))
print("Brno Survival, Casper: ", round(brno_Survival_FPR_Casper, 3))
print("\n")

print("Mayo LNM, Casper: ", round(mayo_LNM_FPR_Casper, 3))
print("Mayo Survival, Casper: ", round(mayo_Survival_FPR_Casper, 3))

Create a function that computes the positive predictive value (PPV) at a given probability threshold, then compute and print it at a threshold of 0.1

Pseudocode:
- Create a function to get the PPV
    - Threshold the data
    - Get the confusion matrix
    - Calculate the PPV
    - Return the PPV
- Print the result

In [None]:
# Get PPV
def getPPV(data, result, threshold = 0.1):
    """Return the positive predictive value, TP / (TP + FP), at `threshold`.

    Parameters
    ----------
    data : predicted probabilities (one-column DataFrame or Series); not modified
    result : 1/0 outcomes, in the same row order as `data`
    threshold : a probability strictly greater than threshold counts as positive

    Returns
    -------
    float in [0, 1], or NaN when no row is predicted positive.

    NOTE(review): the comparison is strict (>) here, whereas getFNR and getFPR
    in this notebook use >= -- confirm which one is intended.
    """
    # Work on plain arrays so rows are matched by position and the caller's
    # probabilities are left untouched for later cells.
    column = data.iloc[:, 0] if getattr(data, "ndim", 1) == 2 else data
    scores = np.asarray(column, dtype=float)
    actual = np.asarray(result).ravel()

    predicted_positive = scores > threshold

    # Confusion-matrix cells of the predicted-positive rows.
    tp = int(np.sum(predicted_positive & (actual == 1)))
    fp = int(np.sum(predicted_positive & (actual == 0)))

    if tp + fp == 0:
        # PPV is undefined when nothing is predicted positive.
        return float("nan")

    return tp/(tp+fp)

# getPPV at a threshold of 0.1 for every dataset/target, first for the WPlat_all
# predictions and then for the Casper predictions.
# Note the argument order: probabilities first, true outcomes second.
brno_LNM_PPV = getPPV(brno_LNM_prob, brno_LNM_true, 0.1)
brno_Survival_PPV = getPPV(brno_Survival_prob, brno_Surv_true, 0.1)

mayo_LNM_PPV = getPPV(mayo_LNM_prob, mayo_LNM_true, 0.1)
mayo_Survival_PPV = getPPV(mayo_Survival_prob, mayo_Surv_true, 0.1)

brno_LNM_PPV_Casper = getPPV(brno_LNM_prob_casper, brno_LNM_true, 0.1)
brno_Survival_PPV_Casper = getPPV(brno_Survival_prob_casper, brno_Surv_true, 0.1)

mayo_LNM_PPV_Casper = getPPV(mayo_LNM_prob_casper, mayo_LNM_true, 0.1)
mayo_Survival_PPV_Casper = getPPV(mayo_Survival_prob_casper, mayo_Surv_true, 0.1)

# Print every value rounded to 3 decimals.
print("Brno LNM: ", round(brno_LNM_PPV, 3))
print("Brno Survival: ", round(brno_Survival_PPV, 3))
print("\n")

print("Mayo LNM: ", round(mayo_LNM_PPV, 3))
print("Mayo Survival: ", round(mayo_Survival_PPV, 3))
print("\n")

print("Brno LNM, Casper: ", round(brno_LNM_PPV_Casper, 3))
print("Brno Survival, Casper: ", round(brno_Survival_PPV_Casper, 3))
print("\n")

print("Mayo LNM, Casper: ", round(mayo_LNM_PPV_Casper, 3))
print("Mayo Survival, Casper: ", round(mayo_Survival_PPV_Casper, 3))
print("\n")
