### Author: Ally Sprik
### Last-updated: 25-02-2024

The goal of this notebook is to load the results of the K-fold cross-validation that was run in R and to plot the mean ROC curves for the different models.



In [None]:
import numpy as np
import pandas as pd
import os

# os.listdir returns entries in arbitrary order; sorting makes the load order
# (and with it the column order of the result frames) the same on every run.
filenameList = sorted(
    entry for entry in os.listdir('Transfer_results') if entry.endswith('.csv')
)

The following code loads the per-fold cross-validation results and the corresponding test sets.

In [None]:
# One initially empty frame per model variant; the loading loop that follows
# adds one column per fold and target ('LNM_<fold>' / 'surv_<fold>').
(
    noPlat_MRI,
    noPlat_noRest,
    noPlat_TCGA,
    noPlat_TCGA_MRI,
    Or_nets,
    Plat_NoRest,
    Plat_all,
) = (pd.DataFrame() for _ in range(7))

# Frame that collects the folds of each model, keyed by the model name as it
# appears inside the result file names.
frames_by_model = {
    'NoPlat_MRI': noPlat_MRI,
    'NoPlat_NoRest': noPlat_noRest,
    'NoPlat_TCGA': noPlat_TCGA,
    'NoPlat_TCGA_MRI': noPlat_TCGA_MRI,
    'Or': Or_nets,
    'Plat_NoRest': Plat_NoRest,
    'Plat_TCGA_MRI': Plat_all,
}

for filename in filenameList:
    # The model name sits between a fixed 8-character prefix and the
    # '_LNM_<fold>.csv' (10 characters) or '_Surv_<fold>.csv' (11 characters)
    # suffix; the fold number is assumed to be a single digit.
    if '_LNM' in filename:
        node = 'LNM'
        modelType = filename[8:-10]
    elif 'Testset' in filename:
        # Test sets are read separately after this loop. Sending them through
        # the model lookup only produced a misleading "Non matching" message.
        continue
    else:
        node = 'surv' if '_Surv' in filename else 'test'
        modelType = filename[8:-11]

    try:
        number = int(filename[-5])
    except (ValueError, IndexError):
        # A CSV that does not follow the '<...>_<fold>.csv' pattern is reported
        # and skipped instead of aborting the whole load.
        print('Error: Non matching - ' + filename + " - " + modelType)
        continue

    if number == 1:  # First is all NaN in LNM
        continue

    frame = frames_by_model.get(modelType)
    if frame is None or node == 'test':
        print('Error: Non matching - ' + filename + " - " + modelType)
        continue

    # Read only the files that are actually stored.
    # NOTE(review): presumably each file holds a single row of predictions, so
    # the transpose yields one column -- confirm against the R export.
    data = pd.read_csv('Transfer_results/' + filename, index_col=0).transpose().reset_index(drop=True)
    col = node + '_' + str(number)
    try:
        # Column assignment aligns on the frame's existing index, which is set
        # by the first column stored.
        # NOTE(review): assumes every fold has the same number of rows;
        # otherwise longer folds would be cut off -- confirm.
        frame[col] = data
    except Exception as e:
        print(e)
        print(filename)
        
            
        
# Held-out test set of every fold that is evaluated (folds 2-6), read in fold order.
Testset_2, Testset_3, Testset_4, Testset_5, Testset_6 = (
    pd.read_csv(path, index_col=0)
    for path in (
        'Transfer_results/Testset_2.csv',
        'Transfer_results/Testset_3.csv',
        'Transfer_results/Testset_4.csv',
        'Transfer_results/Testset_5.csv',
        'Transfer_results/Testset_6.csv',
    )
)

# One row per fold: the fold number in 'idx' and its test frame in 'sets'.
TestSets = pd.DataFrame({
    'idx': list(range(2, 7)),
    'sets': [Testset_2, Testset_3, Testset_4, Testset_5, Testset_6],
})
        

The following code defines two functions: one that computes the per-fold and mean ROC curves for a model, and one that plots them.

In [None]:
from matplotlib_inline.backend_inline import set_matplotlib_formats
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_curve, auc
from itertools import cycle

def getmeanROC(modelData, TestSets, target):
    """Compute the ROC curve of every fold and their macro-average for one model.

    Parameters
    ----------
    modelData : DataFrame (or any mapping of column name -> scores)
        Predicted scores, one column per fold, named 'LNM_<fold>' or
        'surv_<fold>'.
    TestSets : DataFrame
        One row per fold. Column 'sets' holds the fold's test frame; column
        'idx', when present, holds the fold number. Without 'idx' the folds are
        numbered consecutively from 2.
    target : str
        Column of the test frames that holds the true labels, either
        'yes'/'no' or already numeric 0/1. 'Survival5yr' selects the 'surv_*'
        score columns; any other value selects the 'LNM_*' columns.

    Returns
    -------
    fpr, tpr, roc_auc : dict
        Each is keyed by fold number plus the key "macro" for the mean curve.
    """
    targetx = 'surv' if target == 'Survival5yr' else 'LNM'
    label_codes = {'yes': 1, 'no': 0}

    # Take the folds from TestSets itself so that the loop, the positional
    # lookup and the divisor of the mean can never disagree.
    testsets = list(TestSets['sets'])
    if 'idx' in TestSets:
        folds = [int(k) for k in TestSets['idx']]
    else:
        folds = list(range(2, 2 + len(testsets)))

    fpr = dict()
    tpr = dict()
    roc_auc = dict()

    for i, testset in zip(folds, testsets):
        # Map 'yes'/'no' to 1/0, leave other values untouched, and force an
        # integer dtype: roc_curve does not accept an object-dtype label vector.
        labels = testset[target].map(lambda v: label_codes.get(v, v)).astype(int)
        model = modelData[targetx + '_' + str(i)]

        fpr[i], tpr[i], _ = roc_curve(labels, model)
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Common grid: every false-positive rate that occurs in any fold.
    all_fpr = np.unique(np.concatenate([fpr[i] for i in folds]))

    # Interpolate each fold onto the grid and average.
    mean_tpr = np.zeros_like(all_fpr)
    for i in folds:
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
    mean_tpr /= len(folds)

    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr

    # The mean AUC is taken before the first point is pinned to 0 below, so
    # that anchor only affects the drawn curve, not the reported area.
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    tpr["macro"][0] = 0.0

    return fpr, tpr, roc_auc

def plotMeanROC(fpr, tpr, roc_auc, target, title):
    """Plot the ROC curve of every fold together with the mean ("macro") curve.

    Parameters
    ----------
    fpr, tpr, roc_auc : dict
        Keyed by fold number plus "macro", as returned by getmeanROC.
    target : str
        Not used by the plot; kept so existing calls keep working.
    title : str
        Figure title.
    """
    # Draw whatever folds are present instead of assuming folds 2-6.
    fold_ids = [key for key in fpr if key != "macro"]

    fig, ax = plt.subplots(figsize=(6, 6))
    ax.plot(fpr["macro"], tpr["macro"],
            label='Mean ROC curve (AUC = {0:0.2f})'
                  ''.format(roc_auc["macro"]),
            color='navy', linestyle='-', linewidth=2)

    colors = cycle(['aqua', 'darkorange', 'cornflowerblue', 'red', 'green', 'yellow'])

    for i, color in zip(fold_ids, colors):
        ax.plot(fpr[i], tpr[i], color=color, linestyle=':', lw=2, alpha=0.5,
                label='Set {0} (AUC = {1:0.2f})'
                      ''.format(i, roc_auc[i]))

    # Chance-level diagonal for reference.
    ax.plot([0, 1], [0, 1], 'k--', lw=2)
    ax.set_xlim([-0.05, 1.0])
    ax.set_ylim([-0.05, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')

    ax.set_title(title)
    ax.legend(loc="lower right")
    plt.show()
    
    
    

Get the mean ROC curves for the different models on LNM

In [None]:
# LNM
def _lnm_mean_roc(frame, title):
    """Compute the mean LNM ROC of one model frame, plot it and return the curves."""
    curves = getmeanROC(frame, TestSets, 'LNM')
    plotMeanROC(*curves, 'LNM', title)
    return curves


fpr_noPlat_norest, tpr_noPlat_norest, roc_auc_noPlat_norest = _lnm_mean_roc(
    noPlat_noRest, 'NoPlat_NoRest')

fpr_noPlat_MRI, tpr_noPlat_MRI, roc_auc_noPlat_MRI = _lnm_mean_roc(
    noPlat_MRI, 'NoPlat_MRI')

fpr_noPlat_TCGA, tpr_noPlat_TCGA, roc_auc_noPlat_TCGA = _lnm_mean_roc(
    noPlat_TCGA, 'NoPlat_TCGA')

fpr_noPlat_TCGA_MRI, tpr_noPlat_TCGA_MRI, roc_auc_noPlat_TCGA_MRI = _lnm_mean_roc(
    noPlat_TCGA_MRI, 'NoPlat_TCGA_MRI')

fpr_Or_nets, tpr_Or_nets, roc_auc_Or_nets = _lnm_mean_roc(
    Or_nets, 'Or')

fpr_Plat_NoRest, tpr_Plat_NoRest, roc_auc_Plat_NoRest = _lnm_mean_roc(
    Plat_NoRest, 'Plat_NoRest')

fpr_Plat_all, tpr_Plat_all, roc_auc_Plat_all = _lnm_mean_roc(
    Plat_all, 'Brno cross-validation on LNM \n New model with additions')






Get the mean ROC curve for the model that includes all additions (Plat_all) on Survival5yr

In [None]:
# Keep the survival curves under their own names. Reusing fpr_Plat_all /
# tpr_Plat_all / roc_auc_Plat_all here would replace the LNM curves, and the
# LNM comparison figure further down would then draw the survival curve as
# "Plat_all".
fpr_Plat_all_surv, tpr_Plat_all_surv, roc_auc_Plat_all_surv = getmeanROC(Plat_all, TestSets, 'Survival5yr')
plotMeanROC(fpr_Plat_all_surv, tpr_Plat_all_surv, roc_auc_Plat_all_surv, 'Survival5yr', 'Brno cross-validation on five-year-survival \n New model with additions')


Plot all the mean ROC curves in one plot

In [None]:
# Comparison of all the mean ROC curves in one plot
plt.figure(figsize=(7,7))

# (legend template, fpr dict, tpr dict, AUC dict) for every model, in drawing order.
mean_curves = [
    ('NoPlat_NoRest (area = {0:0.2f})', fpr_noPlat_norest, tpr_noPlat_norest, roc_auc_noPlat_norest),
    ('NoPlat_MRI (area = {0:0.2f})', fpr_noPlat_MRI, tpr_noPlat_MRI, roc_auc_noPlat_MRI),
    ('NoPlat_TCGA (area = {0:0.2f})', fpr_noPlat_TCGA, tpr_noPlat_TCGA, roc_auc_noPlat_TCGA),
    ('NoPlat_TCGA_MRI (area = {0:0.2f})', fpr_noPlat_TCGA_MRI, tpr_noPlat_TCGA_MRI, roc_auc_noPlat_TCGA_MRI),
    ('Or_nets (area = {0:0.2f})', fpr_Or_nets, tpr_Or_nets, roc_auc_Or_nets),
    ('Plat_NoRest (area = {0:0.2f})', fpr_Plat_NoRest, tpr_Plat_NoRest, roc_auc_Plat_NoRest),
    ('Plat_all (area = {0:0.2f})', fpr_Plat_all, tpr_Plat_all, roc_auc_Plat_all),
]

for legend_template, model_fpr, model_tpr, model_auc in mean_curves:
    plt.plot(model_fpr["macro"], model_tpr["macro"],
             label=legend_template.format(model_auc["macro"]),
             linewidth=2)

# Chance-level diagonal for reference.
plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([-0.05, 1.0])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')

plt.title('ROC curve for LNM prediction using all models')
plt.legend(loc="lower right")
plt.show()

