### Author: Ally Sprik
### Last-updated: 25-02-2024

The goal of this notebook is to compare the validation on the pure MAYO data with the validation on the CA125-imputed MAYO data and on the completely imputed MAYO data.



In [None]:
import pyAgrum as gum
import pyAgrum.lib.notebook as gnb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# Alternative network file, currently disabled (presumably an earlier fit -- confirm before removing):
#net = gum.loadBN("../0.3. Original_Casper_files/Results/Casper_fitted_763.net")
# Load the fitted Bayesian network that is queried for posteriors in the cells below.
net = gum.loadBN("../3. Model/Fitted_Networks/R_WP_all_train_952.net")

Load the data, keep only the rows that have an LNM value in the original dataset, and select the same rows in the imputed datasets.

In [None]:
# Original MAYO data: keep only the rows where the LNM outcome is known.
df_or = pd.read_csv("../0.1. Cleaned_data/MAYO_subdag.csv").dropna(subset=["LNM"])

# Row labels that survived the filter. With the default integer index produced
# by read_csv these labels double as row positions, which is what .iloc needs.
index = df_or.index

# Imputed variants: pick the same rows by position and renumber from 0.
df_CA125 = (
    pd.read_csv("../0.2. Imputed_data/Informed_imputation_CA125.csv", sep=",")
    .iloc[index]
    .reset_index(drop=True)
)
df_compl = (
    pd.read_csv("../0.2. Imputed_data/MAYO-imputed-complete.csv", sep=";")
    .iloc[index]
    .reset_index(drop=True)
)

# Renumber the original frame last, after its index has been used above.
df_or = df_or.reset_index(drop=True)


Fix labels

In [None]:
# Map the yes/no Cytology coding onto the benign/malignant labels.
# The result is assigned back to the column instead of calling
# `df["Cytology"].replace(..., inplace=True)`: that chained in-place form acts on
# a temporary Series, raises a FutureWarning on recent pandas 2.x and silently
# leaves the DataFrame unchanged when Copy-on-Write is enabled.
cytology_labels = {"no": "benign", "yes": "malignant"}

for frame in (df_or, df_CA125, df_compl):
    frame["Cytology"] = frame["Cytology"].replace(cytology_labels)

Create the evidence sets and the target sets

In [None]:
# Columns that are entered as evidence into the network.
evidence_columns = ["ER", "PR", "p53", "L1CAM", "CA125", "Platelets", "POLE", "MSI", "PreoperativeGrade", "Cytology"]

# Evidence frames for the three dataset variants.
df_or_evi = df_or[evidence_columns]
df_CA125_evi = df_CA125[evidence_columns]
df_compl_evi = df_compl[evidence_columns]

# Observed outcomes, always taken from the original data. The rename is done
# without `inplace=True` so that `targets` is a new, independent frame;
# mutating a selection of `df_or` in place is what triggers pandas'
# SettingWithCopyWarning when columns of `targets` are assigned later on.
targets = df_or[["LNM_micromacro", "Survival5yr"]].rename(columns={"LNM_micromacro": "LNM"})

Create a function to loop through the evidence and get the results

In [None]:
# Query the network once per evidence row for both targets

def getProbabilities(model, evidence, Surv="Survival5yr"):
    """Compute the posteriors of "LNM" and of the survival node for every row.

    Parameters
    ----------
    model : network object handed unchanged to ``gum.getPosterior``.
    evidence : pandas.DataFrame
        One row per case; missing values in a row are dropped before the row
        is passed on as evidence.
    Surv : str, default "Survival5yr"
        Name of the second target to query.

    Returns
    -------
    tuple of (list, list)
        The "LNM" posteriors and the ``Surv`` posteriors, in row order.
    """
    lnm_posteriors = []
    surv_posteriors = []

    for _, row in evidence.iterrows():
        # Only the observed (non-missing) values of this row are used as evidence.
        observed = row.dropna().to_dict()

        lnm_posteriors.append(gum.getPosterior(model, evs=observed, target="LNM"))
        surv_posteriors.append(gum.getPosterior(model, evs=observed, target=Surv))

    return lnm_posteriors, surv_posteriors

Get the probabilities for the original dataset, the CA125 dataset and the complete dataset

In [None]:
# Posteriors given the evidence of the original (non-imputed) dataset
print("Original dataset")
or_LNM_res, or_Surv_res = getProbabilities(model=net, evidence=df_or_evi)

# Posteriors given the evidence of the CA125-imputed dataset
print("CA125 dataset")
CA125_LNM_res, CA125_Surv_res = getProbabilities(model=net, evidence=df_CA125_evi)

# Posteriors given the evidence of the completely imputed dataset
print("Complete dataset")
compl_LNM_res, compl_Surv_res = getProbabilities(model=net, evidence=df_compl_evi)


Define a function to get the results based on a threshold for the probability

In [None]:
# Turn posteriors into 0/1 predictions using a probability threshold
def getResultsLNM(results, threshold, target):
    """Binarise a sequence of posteriors.

    A case is labelled 1 when the most probable state of ``target`` (as
    reported by the posterior's ``argmax()``) equals 1 and that state's
    probability is strictly greater than ``threshold``; otherwise it is 0.

    Returns a single-column ``pandas.DataFrame`` with one row per posterior.
    """
    def _predict(posterior):
        # argmax() is indexed as (list of state assignments, probability).
        best = posterior.argmax()
        return int(best[0][0][target] == 1 and best[1] > threshold)

    return pd.DataFrame([_predict(posterior) for posterior in results])

def getResultsSurv(results, threshold, target, Surv_tar = 1):
    """Binarise a sequence of posteriors against a chosen target state.

    A case is labelled 1 when the most probable state of ``target`` (as
    reported by the posterior's ``argmax()``) equals ``Surv_tar`` and that
    state's probability is strictly greater than ``threshold``; otherwise 0.

    Returns a single-column ``pandas.DataFrame`` with one row per posterior.
    """
    def _predict(posterior):
        # argmax() is indexed as (list of state assignments, probability).
        best = posterior.argmax()
        return int(best[0][0][target] == Surv_tar and best[1] > threshold)

    return pd.DataFrame([_predict(posterior) for posterior in results])

Define a function to get the probability results

In [None]:
def getProbResults(results, target):
    """Collect entry ``target`` of every posterior in ``results``.

    Each element of ``results`` is indexed with ``target`` (the callers in this
    notebook pass the integer 1), and the extracted values are returned as a
    single-column ``pandas.DataFrame`` in the original order.
    """
    return pd.DataFrame([posterior[target] for posterior in results])

Get the results for the original dataset, the CA125 dataset and the complete dataset

In [None]:
# Every call extracts entry 1 of each posterior
# (presumably the "yes" state of the node -- TODO confirm against the network).

# Original dataset
org_LNM_res_prob = getProbResults(results=or_LNM_res, target=1)
org_Surv_res_prob = getProbResults(results=or_Surv_res, target=1)

# CA125-imputed dataset
CA125_LNM_res_prob = getProbResults(results=CA125_LNM_res, target=1)
CA125_Surv_res_prob = getProbResults(results=CA125_Surv_res, target=1)

# Completely imputed dataset
compl_LNM_res_prob = getProbResults(results=compl_LNM_res, target=1)
compl_Surv_res_prob = getProbResults(results=compl_Surv_res, target=1)

Define a function to get the full metrics and a function to get the slim (non-threshold) metrics

In [None]:
from sklearn.metrics import roc_curve, brier_score_loss
# Find the accuracy, roc auc, precision and recall for the results and the targets data
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, confusion_matrix, log_loss

def getMetrics(results, targets):
    """Compute classification metrics for binary (0/1) predictions.

    Parameters
    ----------
    results : array-like
        Predicted labels (0/1). May be a list, array, Series or a
        single-column DataFrame such as the output of ``getResultsLNM``.
    targets : array-like
        Observed labels (0/1), same length as ``results``.

    Returns
    -------
    pandas.DataFrame
        One column, indexed by "Accuracy", "ROC AUC", "Precision (PPV)",
        "TPR (Recall/Sens)", "TNR (Spec)", "F1", "Brier" and "Log Loss".
        F1 is 0.0 when precision and recall are both 0; TNR is NaN when there
        are no observed negatives.
    """
    # Flatten both inputs to 1-D arrays. Subtracting a Series from a
    # one-column DataFrame aligns the Series index with the DataFrame columns,
    # so the Brier term would otherwise be computed on misaligned data.
    y_pred = np.asarray(results).ravel()
    y_true = np.asarray(targets).ravel()

    # Build the confusion matrix once; fixing the labels keeps it 2x2 even
    # when one of the classes is absent.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    TN, FP = cm[0]

    accuracy = accuracy_score(y_true, y_pred)
    roc_auc = roc_auc_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    TPR = recall_score(y_true, y_pred)

    negatives = TN + FP
    TNR = TN / negatives if negatives else float("nan")

    pr_sum = precision + TPR
    f1 = 2 * (precision * TPR) / pr_sum if pr_sum else 0.0

    brier = float(np.mean((y_pred.astype(float) - y_true.astype(float)) ** 2))
    loglike = log_loss(y_true, y_pred)

    x = pd.DataFrame(
        [accuracy, roc_auc, precision, TPR, TNR, f1, brier, loglike],
        index=["Accuracy", "ROC AUC", "Precision (PPV)", "TPR (Recall/Sens)", "TNR (Spec)", "F1", "Brier", "Log Loss"],
    )
    return x

def getSlimMetrics(results, targets):
    """Compute threshold-free metrics for predicted probabilities.

    Parameters
    ----------
    results : array-like
        Predicted probabilities of the positive class. May be a list, array,
        Series or a single-column DataFrame such as the output of
        ``getProbResults``.
    targets : array-like
        Observed labels (0/1), same length as ``results``.

    Returns
    -------
    pandas.DataFrame
        One column, indexed by "ROC AUC", "Log Loss", "Brier",
        "N Predicted/N Observed" and "Ratio". "N Predicted" is the sum of the
        predicted probabilities (truncated to an integer in the text label);
        "Ratio" is that sum divided by the observed count, or NaN when no
        positives were observed.
    """
    # Work on flat 1-D arrays so that any array-like input is accepted and the
    # sum does not depend on a DataFrame column being labelled 0.
    y_prob = np.asarray(results, dtype=float).ravel()
    y_true = np.asarray(targets).ravel()

    ROC = round(roc_auc_score(y_true, y_prob), 4)
    LL = round(log_loss(y_true, y_prob), 4)
    Brier = round(brier_score_loss(y_true, y_prob), 4)

    N_pred = float(y_prob.sum())
    N_obs = int(y_true.sum())
    N_br = f"{int(N_pred)}/{N_obs}"
    # Guard against division by zero when there are no observed positives.
    Ratio = round(N_pred / N_obs, 4) if N_obs else float("nan")

    x = pd.DataFrame([ROC, LL, Brier, N_br, Ratio], index=["ROC AUC", "Log Loss", "Brier", "N Predicted/N Observed", "Ratio"])
    return x


Encode the targets

In [None]:
# Encode targets: "yes" -> 1, "no" -> 0.
# `assign` builds a new frame instead of writing columns into `targets` in
# place; `targets` was derived by selecting columns from `df_or`, and in-place
# column assignment on such a frame triggers pandas' SettingWithCopyWarning.
target_encoding = {"yes": 1, "no": 0}
targets = targets.assign(
    LNM=targets["LNM"].replace(target_encoding),
    Survival5yr=targets["Survival5yr"].replace(target_encoding),
)



Get the metrics for the original data, the CA125 data and the complete data

In [None]:
# Slim metrics on the original data
org_LNM_metrics = getSlimMetrics(results=org_LNM_res_prob, targets=targets["LNM"])
org_Surv_metrics = getSlimMetrics(results=org_Surv_res_prob, targets=targets["Survival5yr"])

# Slim metrics on the CA125-imputed data
CA125_LNM_metrics = getSlimMetrics(results=CA125_LNM_res_prob, targets=targets["LNM"])
CA125_Surv_metrics = getSlimMetrics(results=CA125_Surv_res_prob, targets=targets["Survival5yr"])

# Slim metrics on the completely imputed data
compl_LNM_metrics = getSlimMetrics(results=compl_LNM_res_prob, targets=targets["LNM"])
compl_Surv_metrics = getSlimMetrics(results=compl_Surv_res_prob, targets=targets["Survival5yr"])



Concatenate the LNM metrics to compare them across the three datasets

In [None]:
# Side-by-side table of the LNM metrics, one column per dataset variant.
LNM_metrics = (
    pd.concat([org_LNM_metrics, CA125_LNM_metrics, compl_LNM_metrics], axis=1)
    .round(3)
    .set_axis(["Original data", "CA125", "Imputed data"], axis=1)
)
LNM_metrics


Concatenate the survival metrics to compare them across the three datasets

In [None]:
# Side-by-side table of the survival metrics, one column per dataset variant.
Surv_metrics = (
    pd.concat([org_Surv_metrics, CA125_Surv_metrics, compl_Surv_metrics], axis=1)
    .round(3)
    .set_axis(["Original data", "CA125", "Imputed data"], axis=1)
)
Surv_metrics

Load the other datasets to compare the distribution of the CA125 values

In [None]:
# Reference cohorts whose CA125 distribution is compared with the imputed one
df_PIP = pd.read_csv("../0.1. Cleaned_data/Pipendo_with_risk_levels.csv")
df_Training = pd.read_csv("../0.1. Cleaned_data/Training_TCGA_Risk_levels.csv")
df_Tub = pd.read_csv("../0.1. Cleaned_data/Tubingen_risk_groups.csv")

# Every cohort codes CA125 differently; map them all onto "<35" / ">=35".
ca125_mayo_labels = {"lt_35": "<35", "ge_35": ">=35"}

# MAYO, original (non-imputed) and completely imputed
CA125_orig = df_or["CA125"].replace(ca125_mayo_labels)
CA125_imputed = df_compl["CA125"].replace(ca125_mayo_labels)

# PIPENDO
CA125_PIP = df_PIP["CA125_PREOP_bi"].replace(
    {"<35 U/mL (=normal)": "<35", "=/>35 U/mL (=abnormal)": ">=35"}
)

# Training
CA125_Training = df_Training["CA125_PREOP_bi"].replace({0: "<35", 1: ">=35"})

# Tubingen
CA125_Tub = df_Tub["CA125_bi"].replace({"<=35": "<35", ">35": ">=35"})


Create a plot to compare the distribution of the CA125 values

In [None]:
# Number of non-missing CA125 values per dataset (shown as N in the plot labels)
N_or = int(CA125_orig.count())
N_imputed = int(CA125_imputed.count())
N_PIP = int(CA125_PIP.count())
N_Training = int(CA125_Training.count())
N_Tub = int(CA125_Tub.count())

import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib_inline.backend_inline import set_matplotlib_formats

sns.set_style("whitegrid")
set_matplotlib_formats("png")

# One row per dataset, one column per CA125 category; values are the
# relative frequencies among the non-missing entries of that dataset.
CA125_counts = pd.DataFrame(
    [
        CA125_orig.value_counts(normalize=True),
        CA125_imputed.value_counts(normalize=True),
        CA125_PIP.value_counts(normalize=True),
        CA125_Training.value_counts(normalize=True),
        CA125_Tub.value_counts(normalize=True),
    ],
    index=[
        f"Original N={N_or}",
        f"Imputed N={N_imputed}",
        f"PIPENDO N={N_PIP}",
        f"Training N={N_Training}",
        f"Tubingen N={N_Tub}",
    ],
)

# Draw on an explicit Axes instead of relying on pyplot's implicit "current
# figure", and finish with plt.show() so that no return value (previously the
# tick tuple from plt.xticks) is echoed as cell output.
fig, ax = plt.subplots(figsize=(10, 5))
CA125_counts.plot(kind="bar", ax=ax, rot=0)
ax.set(title="CA125 distribution", ylabel="Normalized frequency", xlabel="Datasets")
plt.show()