### Author: Ally Sprik
### Last-updated: 25-02-2024

The goal of this notebook is to validate the generated sets with the PIPENDO set.



In [None]:
import pandas as pd
import numpy as np
import pyAgrum as gum

# Opt in to pandas copy-on-write before any frame is created (original note: this lets the
# code run faster and keeps pandas happy).
# Technical detail: https://pandas.pydata.org/pandas-docs/stable/user_guide/copy_on_write.html#
pd.options.mode.copy_on_write = True

# Cleaned PIPENDO data used as evidence and ground truth in the rest of the notebook.
df = pd.read_csv("../../0. Source_files/0.2. Cleaned_data/Casper_PIPENDO_Cleaned.csv")

# Fitted networks, loaded in the same order as the names they are bound to.
(orNet, WOP_MRI, WOP_TCGA, WOP_all, WP_none, WP_all) = [
    gum.loadBN(net_path)
    for net_path in (
        "../../0. Source_files/0.4. Original_Casper_files/Results/Casper_fitted_952.net",
        "../1.3. Model/Fitted_Networks/R_WOP_AddMRIMI_fitted_952.net",
        "../1.3. Model/Fitted_Networks/R_WOP_AddTCGA_1_fitted_952.net",
        "../1.3. Model/Fitted_Networks/R_WOP_original_952.net",
        "../1.3. Model/Fitted_Networks/R_WP_952.net",
        "../1.3. Model/Fitted_Networks/R_WP_all_952.net",
    )
]

Define the targets

In [None]:
# Encode the binary outcomes as integers (no -> 0, yes -> 1) for the metric functions below.
# `.map` is used instead of `.replace`: replacing strings with ints relies on pandas silently
# downcasting the object column, which is deprecated (FutureWarning in pandas 2.2).
# NOTE: any value other than 'no'/'yes' (including missing values) becomes NaN.
_OUTCOME_CODES = {'no': 0, 'yes': 1}
target_Surv = df['Survival5yr'].map(_OUTCOME_CODES)
target_LNM = df['LNM'].map(_OUTCOME_CODES)

# Remove the two outcome columns and the 'Unnamed: 0' column in a single, non-mutating step
# so only candidate evidence columns remain in `df`.
df = df.drop(columns=['Survival5yr', 'LNM', 'Unnamed: 0'])


Define the function to get the probabilities from the model.

Pseudocode:
- Create an empty list for the results
- Loop over the evidence
    - Get the evidence row
    - Get the probabilities for the LNM and the Survival
    - Append the results to the list
- Return the list

In [None]:
def getProbabilities(model, evidence, Surv="Survival5yr"):
    """Query ``model`` for the LNM and survival posteriors of every evidence row.

    Parameters
    ----------
    model
        Network handed to ``gum.getPosterior``.
    evidence : pandas.DataFrame
        One row per case. Missing values (NaN) in a row are left out of that
        row's evidence.
    Surv : str
        Name of the survival target node.

    Returns
    -------
    tuple of (list, list)
        The posteriors for ``"LNM"`` and for ``Surv``, both in row order.
    """
    lnm_posteriors, survival_posteriors = [], []

    for row_pos in range(len(evidence)):
        # Only the observed (non-missing) values of this row are used as evidence.
        observed = evidence.iloc[row_pos].dropna().to_dict()

        lnm_posteriors.append(gum.getPosterior(model, evs=observed, target="LNM"))
        survival_posteriors.append(gum.getPosterior(model, evs=observed, target=Surv))

    return lnm_posteriors, survival_posteriors

Define the function to unpack the results

In [None]:
def getProbResults(results, target):
    """Pick entry ``target`` out of every item in ``results``.

    Parameters
    ----------
    results : sequence
        Items that can be indexed with ``target`` (here: one posterior per case).
    target
        Index/key to extract from each item.

    Returns
    -------
    pandas.DataFrame
        One row per item in ``results``, holding the extracted values.
    """
    picked = [posterior[target] for posterior in results]
    return pd.DataFrame(picked)

Define the functions to retrieve the metrics

Pseudocode:
- Define getMetrics function:
    - Get the confusion matrix
    - Get the accuracy
    - Get the ROC AUC
    - Get the precision
    - Get the recall
    - Get the specificity
    - Get the F1 score
    - Get the Brier score
    - Get the log loss
    - Return the metrics
- Define getSlimMetrics function:
    - Get the ROC AUC
    - Get the Log Loss
    - Get the Brier score
    - Get the N Predicted/N Observed
    - Get the Ratio
    - Return the metrics

In [None]:
from sklearn.metrics import roc_curve, auc
# Find the accuracy, roc auc, precision and recall for the results and the targets data
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, confusion_matrix, log_loss, brier_score_loss

def getMetrics(results, targets):
    """Compute a table of classification metrics for binary predictions.

    Parameters
    ----------
    results : array-like
        Predicted class labels. NOTE(review): the confusion matrix, accuracy,
        precision and recall need hard 0/1 labels, so ROC AUC, Brier and log
        loss are also computed on labels here rather than on probabilities.
    targets : array-like
        Observed class labels (0/1).

    Returns
    -------
    pandas.DataFrame
        Single-column frame indexed by metric name.
    """
    # Build the confusion matrix once; rows are true classes, columns are predicted classes.
    cm = confusion_matrix(targets, results)
    TN = cm[0][0]
    FP = cm[0][1]

    accuracy = accuracy_score(targets, results)
    roc_auc = roc_auc_score(targets, results)
    precision = precision_score(targets, results)
    TPR = recall_score(targets, results)
    TNR = TN / (TN + FP)

    # With no true positives both precision and recall are 0, so the harmonic mean is 0/0
    # (NaN with NumPy floats, ZeroDivisionError with Python floats). Report 0.0 instead,
    # which is the convention sklearn's own f1_score uses.
    pr_sum = precision + TPR
    f1 = 2 * (precision * TPR) / pr_sum if pr_sum > 0 else 0.0

    brier = brier_score_loss(targets, results)
    loglike = log_loss(targets, results)

    x = pd.DataFrame(
        [accuracy, roc_auc, precision, TPR, TNR, f1, brier, loglike],
        index=["Accuracy", "ROC AUC", "Precision (PPV)", "TPR (Recall/Sens)", "TNR (Spec)", "F1", "Brier", "Log Loss"],
    )
    return x

def getSlimMetrics(results, targets):
    """Compute a compact set of discrimination/calibration metrics.

    Parameters
    ----------
    results : pandas.DataFrame, pandas.Series or array-like
        Predicted scores, one per case. For 2-D input only the first column is
        summed for the predicted count. Presumably these are probabilities of
        the positive class - confirm against the caller.
    targets : array-like
        Observed binary outcomes (0/1); must expose ``.sum()``.

    Returns
    -------
    pandas.DataFrame
        Single-column frame with ROC AUC, log loss, Brier score (each rounded
        to 4 decimals), the "predicted/observed" count string and their ratio.
    """
    ROC = round(roc_auc_score(targets, results), 4)
    LL = round(log_loss(targets, results), 4)
    Brier = round(brier_score_loss(targets, results), 4)

    # Sum of the predicted scores = expected number of positives.
    # Select the first column by position so this does not depend on the column
    # being labelled 0, and also accept a Series or a plain array.
    if isinstance(results, pd.DataFrame):
        N_pred = results.iloc[:, 0].sum()
    else:
        values = np.asarray(results, dtype=float)
        N_pred = (values[:, 0] if values.ndim == 2 else values).sum()

    N_obs = int(targets.sum())
    N_br = f"{int(N_pred)}/{int(N_obs)}"
    Ratio = round(N_pred / N_obs, 4)

    x = pd.DataFrame(
        [ROC, LL, Brier, N_br, Ratio],
        index=["ROC AUC", "Log Loss", "Brier", "N Predicted/N Observed", "Ratio"],
    )
    return x


Get the results from the models

In [None]:
# Candidate evidence variables.
# Without MRI_MI, POLE, and MSI since they are not in the dataset.
evidence_columns = [
    "ER",
    "PR",
    "p53",
    "L1CAM",
    "CA125",
    "CTMRI",
    "Platelets",
    "Cytology",
    "PreoperativeGrade",
]

In [None]:
def _posteriors_for(network, step):
    """Run one fitted network on the evidence columns it actually contains.

    Prints the progress marker for ``step``, restricts ``df`` to the evidence
    columns that are also nodes of ``network`` (kept in ``evidence_columns``
    order so the selection is deterministic) and returns the
    ``(LNM posteriors, survival posteriors)`` pair from ``getProbabilities``.
    """
    print(f"Started {step}")
    node_names = set(network.names())
    overlap = [column for column in evidence_columns if column in node_names]
    return getProbabilities(network, df[overlap], "Survival5yr")


# One call per network; each assignment is kept separate so that results already
# computed stay available if a later network fails.
Or_Prob_LNM, Or_Prob_Survival = _posteriors_for(orNet, 1)
WOP_MRI_Prob_LNM, WOP_MRI_Prob_Survival = _posteriors_for(WOP_MRI, 2)
WOP_TCGA_Prob_LNM, WOP_TCGA_Prob_Survival = _posteriors_for(WOP_TCGA, 3)
WOP_all_Prob_LNM, WOP_all_Prob_Survival = _posteriors_for(WOP_all, 4)
WP_none_Prob_LNM, WP_none_Prob_Survival = _posteriors_for(WP_none, 5)
WP_all_Prob_LNM, WP_all_Prob_Survival = _posteriors_for(WP_all, 6)


Unpack the results

In [None]:
# Extract entry 1 of every survival posterior, one frame per network.
# NOTE(review): entry 1 is presumably the "yes" state of Survival5yr - confirm the state order.
# The LNM posteriors are computed above but are not unpacked in this notebook.
(
    OrResultsSurv,
    WOP_MRI_ResultsSurv,
    WOP_TCGA_ResultsSurv,
    WOP_all_ResultsSurv,
    WP_none_ResultsSurv,
    WP_all_ResultsSurv,
) = [
    getProbResults(survival_posteriors, 1)
    for survival_posteriors in (
        Or_Prob_Survival,
        WOP_MRI_Prob_Survival,
        WOP_TCGA_Prob_Survival,
        WOP_all_Prob_Survival,
        WP_none_Prob_Survival,
        WP_all_Prob_Survival,
    )
]


Retrieve the metrics for the models

In [None]:
# Slim metric table for 5-year survival, one per network, all scored against target_Surv.
# LNM metrics are not computed in this notebook.
(
    OrMetricsSurv,
    WOP_MRI_MetricsSurv,
    WOP_TCGA_MetricsSurv,
    WOP_all_MetricsSurv,
    WP_none_MetricsSurv,
    WP_all_MetricsSurv,
) = [
    getSlimMetrics(predicted_survival, target_Surv)
    for predicted_survival in (
        OrResultsSurv,
        WOP_MRI_ResultsSurv,
        WOP_TCGA_ResultsSurv,
        WOP_all_ResultsSurv,
        WP_none_ResultsSurv,
        WP_all_ResultsSurv,
    )
]

Create a dataframe of the metrics for comparison

In [None]:
# Put the per-network metric tables side by side and label each column with its network variant.
metricsSurv = pd.concat(
    [
        OrMetricsSurv,
        WOP_MRI_MetricsSurv,
        WOP_TCGA_MetricsSurv,
        WOP_all_MetricsSurv,
        WP_none_MetricsSurv,
        WP_all_MetricsSurv,
    ],
    axis=1,
).set_axis(
    ["Original", "-Plat +MRI", "-Plat +TCGA", "-Plat+MRI+TCGA", "+Plat -MRI -TCGA", "+Plat +TCGA +MRI"],
    axis=1,
)

In [None]:
# Show the survival metrics comparison (rows: metrics, columns: network variants).
metricsSurv