## Plot the ROC curves with different random seeds in one plot.

In [None]:
import os
import numpy as np
import pandas as pd
from sklearn import metrics
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors


import sys
sys.path.append("../")
from utils.VisualizationUtils import get_color_list
from mySettings import get_plot_ROC_setting_dict

In [None]:
def plot_mutiple_ROCs(data_df_dict, task_name, save_results_path):
    """
    Plot one ROC curve per entry of ``data_df_dict`` in a single figure,
    together with the macro- and micro-averaged ROC curves.

    Parameters
    ----------
    data_df_dict : dict
        Maps a curve label (shown in the legend) to a data frame that holds
        the columns ``task_name + "-true"`` and
        ``task_name + "-predicted_prob"``.
    task_name : str
        Prefix of the two columns read from every data frame; also the prefix
        of the saved file name.
    save_results_path : str
        Directory in which ``<task_name>-ROC_curves_all.jpeg`` is written.
        It is created when it does not exist yet.

    Raises
    ------
    ValueError
        If ``data_df_dict`` is empty.
    """
    # Fail early with a clear message; otherwise the macro average below would
    # fail inside numpy with "need at least one array to concatenate".
    if not data_df_dict:
        raise ValueError(
            "data_df_dict is empty: at least one data frame is needed to plot a ROC curve."
        )

    true_column = task_name + "-true"
    prob_column = task_name + "-predicted_prob"

    fpr_dict = dict()
    tpr_dict = dict()
    roc_auc_dict = dict()

    ## --- calculate the ROC curve for each label (e.g. random seed). -------
    # The per-label arrays are collected and concatenated once afterwards for
    # the micro average, instead of re-copying the pooled data in every
    # iteration.
    y_true_parts = []
    y_prob_parts = []
    for label, data_df in data_df_dict.items():
        y_true_i = data_df[true_column].values
        y_pred_i = data_df[prob_column].values
        y_true_parts.append(y_true_i)
        y_prob_parts.append(y_pred_i)

        # calculate the fpr/tpr values and the AUC, and save them in the dicts.
        fpr, tpr, _ = metrics.roc_curve(y_true_i, y_pred_i)
        fpr_dict[label] = fpr
        tpr_dict[label] = tpr
        roc_auc_dict[label] = metrics.auc(fpr, tpr)

    ## --- calculate the macro average. ----------
    # aggregate all false positive rates
    all_fpr = np.unique(np.concatenate(list(fpr_dict.values())))

    # then interpolate every ROC curve at these points and average the TPRs
    mean_tpr = np.zeros_like(all_fpr)
    for label in data_df_dict:
        mean_tpr += np.interp(all_fpr, fpr_dict[label], tpr_dict[label])
    mean_tpr /= len(data_df_dict)

    fpr_dict["macro"] = all_fpr
    tpr_dict["macro"] = mean_tpr
    roc_auc_dict["macro"] = metrics.auc(all_fpr, mean_tpr)

    ## ---- calculate the micro average. --------
    # one ROC curve over the pooled samples of all labels
    y_true = np.concatenate(y_true_parts)
    y_predicted_prob = np.concatenate(y_prob_parts)
    fpr_dict["micro"], tpr_dict["micro"], _ = metrics.roc_curve(y_true, y_predicted_prob)
    roc_auc_dict["micro"] = metrics.auc(fpr_dict["micro"], tpr_dict["micro"])

    ## ----- Plot the ROC curves. -------------
    # NOTE(review): the first CSS4 colors (aliceblue, antiquewhite, ...) are
    # very light on a white background -- consider a higher-contrast palette.
    colors = list(mcolors.CSS4_COLORS)
    fig = plt.figure(figsize=(6, 5))
    for i, (label, roc_auc) in enumerate(roc_auc_dict.items()):
        # wrap around instead of failing if there are more curves than colors
        color = colors[i % len(colors)]
        if label in ["micro", "macro"]:
            plt.plot(fpr_dict[label], tpr_dict[label], color=color, linestyle=":", lw=4,
                     label=label+"-average ROC curve (area = {0:0.4f})".format(roc_auc))
        else:
            plt.plot(fpr_dict[label], tpr_dict[label], color=color, lw=2,
                     label="ROC curve for {0} (area = {1:0.4f})".format(label, roc_auc))

    # diagonal of a random classifier as reference
    plt.plot([0, 1], [0, 1], "k--", lw=2)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.legend(loc="lower right")
    plt.tight_layout()

    # make sure the output directory exists before saving
    if save_results_path:
        os.makedirs(save_results_path, exist_ok=True)
    plt.savefig(os.path.join(save_results_path, task_name+"-ROC_curves_all.jpeg"))
    plt.show()
    # release the figure so repeated calls do not accumulate open figures
    plt.close(fig)
        

### Main

In [None]:
## For every configured setting, draw the ROC curves of all random seeds in one plot per task.
plot_ROC_setting_dict = get_plot_ROC_setting_dict()
for setting_name, plot_ROC_setting in plot_ROC_setting_dict.items():
    # read the configuration of this setting
    base_dataPath = plot_ROC_setting["base_dataPath"]
    random_seed_list = plot_ROC_setting["random_seed_list"]
    save_results_path = plot_ROC_setting["save_results_basepath"]
    data_excel_name = plot_ROC_setting["data_excel_name"]
    task_list = plot_ROC_setting["task_list"]

    # load the result sheet of each random seed, keyed by its legend label
    data_df_dict = dict()
    for random_seed in random_seed_list:
        data_excel_path = os.path.join(
            base_dataPath,
            "seed{}".format(random_seed),
            data_excel_name,
        )
        data_df = pd.read_excel(data_excel_path, index_col=0)
        data_df_dict["seed {}".format(random_seed)] = data_df

    # one figure per task, each containing the curves of all seeds
    for task_name in task_list:
        plot_mutiple_ROCs(data_df_dict, task_name, save_results_path)