### Arrange the results
- Compare different normalization methods.
- Compare different features.
- Compare different image filters.


In [None]:
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patheffects as path_effects
from mySettings import get_arrange_results_settings_dict


import sys
sys.path.append("../")
from utils.myUtils import traversalDir_FirstDir

In [None]:
"""
Arrange and save all the experiment results into an excel file.
"""
def arrange_results_to_excel(experiment_results_bathpath):
    """
    Arrange all the experiment results into one data frame and save it as an excel file.

    The function walks a four-level folder hierarchy below ``experiment_results_bathpath``
    and reads one ``AUC_results_all_models.txt`` per leaf folder:

        folder 1: image filter, optionally followed by "_<additional description>"
                  (for example, {original, exponential, ...});
        folder 2: experiment description; the text after the last "-" is used as the normalization method;
        folder 3: "<ComBat method>-<data imbalance strategy>";
        folder 4: task, for example "TCGA_1.101_isGBM_base"; the 3rd and 4th "_"-separated
                  fields are used as the base task and the task description.

    NOTE(review): the original example names for folder 2 (e.g. "TCGA_IDH_extracted_features_zscore")
    contain no "-"; in that case the whole folder name ends up as the normalization method -- confirm
    the real naming scheme.

    Parameters
    ----------
    experiment_results_bathpath : str
        Root folder of the experiment results; "arranged_results.xlsx" is written into it.

    Returns
    -------
    (pandas.DataFrame, str)
        The arranged results (with a fresh 0..n-1 index) and the path of the saved excel file.

    Raises
    ------
    ValueError
        If no result folder is found below ``experiment_results_bathpath``.
    """
    # Collect the per-task frames and concatenate them once at the end;
    # concatenating inside the loops would copy the accumulated data over and over again.
    collected_results = []

    for image_filter in traversalDir_FirstDir(experiment_results_bathpath):
        # folder 1: image_type, for example, {original, exponential, ...}

        # add some additional descriptions for the experiments;
        image_filter_split = image_filter.split("_")
        if len(image_filter_split) > 1:
            image_filter_name = image_filter_split[0]
            additional_description = image_filter_split[1]
        else:
            image_filter_name = image_filter
            additional_description = ""

        basePath_image_filter = os.path.join(experiment_results_bathpath, image_filter)

        for experiment_description in traversalDir_FirstDir(basePath_image_filter):
            # folder 2: description of the experiments.
            normalization_method = experiment_description.split("-")[-1]

            basePath_experiment_description = os.path.join(basePath_image_filter, experiment_description)

            for harmonization_method in traversalDir_FirstDir(basePath_experiment_description):
                # folder 3: Harmonization method and data imbalance strategy;
                harmonization_split = harmonization_method.split("-")
                ComBat_method = harmonization_split[0]
                Data_imblance_strategy = harmonization_split[1]

                basePath_harmonization = os.path.join(basePath_experiment_description, harmonization_method)

                for task in traversalDir_FirstDir(basePath_harmonization):
                    # folder 4: Task list, for example,
                    # ["TCGA_1.101_isGBM_base", "TCGA_2.101_isIDHMutant_base", "TCGA_3.101_is1p19qCodeleted_base"]
                    task_split = task.split("_")
                    base_task = task_split[2]
                    task_additional_description = task_split[3]

                    basePath_task = os.path.join(basePath_harmonization, task)
                    AUC_results_txt_file = os.path.join(basePath_task, "AUC_results_all_models.txt")

                    AUC_results = pd.read_csv(AUC_results_txt_file, header=0, index_col=0)
                    # Prepend the experiment descriptors, in a fixed column order.
                    AUC_results.insert(0, "image_filter", image_filter_name)
                    AUC_results.insert(1, "normalization_method", normalization_method)
                    AUC_results.insert(2, "ComBat_method", ComBat_method)
                    AUC_results.insert(3, "Data_imblance_strategy", Data_imblance_strategy)
                    AUC_results.insert(4, "task", task)
                    AUC_results.insert(5, "base_task", base_task)
                    AUC_results.insert(6, "task_additional_description", additional_description+" "+task_additional_description)
                    AUC_results.insert(7, "additional_description", additional_description)

                    collected_results.append(AUC_results)

    if not collected_results:
        # Fail with a clear message instead of crashing later on a None data frame.
        raise ValueError("No experiment results found under: {}".format(experiment_results_bathpath))

    # ignore_index=True gives the same fresh 0..n-1 index a later reset_index(drop=True) would.
    Arranged_Results = pd.concat(collected_results, axis=0, ignore_index=True)

    # add a column to tell feature selection method and classifier;
    # "model_name" is expected to look like "<feature selection>_<classifier>".
    Arranged_Results["feature_selection"] = Arranged_Results["model_name"].apply(lambda x: x.split("_")[0])
    Arranged_Results["classifier"] = Arranged_Results["model_name"].apply(lambda x: x.split("_")[1])

    # save the results
    save_arranged_excel_path = os.path.join(experiment_results_bathpath, "arranged_results.xlsx")
    Arranged_Results.to_excel(save_arranged_excel_path)

    return Arranged_Results, save_arranged_excel_path


def convert_list_in_dataframe(data_df):
    """
    Expand the stringified AUC lists of a data frame into one row per AUC value.

    In the data frame, "AUC" corresponds to a list of AUC cross validation values stored as text,
    e.g. "[0.91 0.88 0.93]" or "[0.91, 0.88, 0.93]". One record is expanded to len(AUC) records
    (all other columns are repeated), so that the mean/median and std of the AUC values can be drawn.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Data frame with a string column "AUC". It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new data frame with an additional numeric column "AUC_values"; the index label of every
        original record is repeated for each of its AUC values.

    Raises
    ------
    ValueError
        If a token of an AUC list cannot be converted to a number.
    """
    # Remove the surrounding brackets and padding, then split on any run of commas and/or whitespace.
    # Splitting on runs avoids the empty tokens (and thus spurious NaN rows) that ", " separators
    # or column-aligned double spaces would produce when every single space becomes a separator.
    auc_tokens = (
        data_df["AUC"]
        .str.strip()
        .str.strip("[]")
        .str.strip()
        .str.split(r"[,\s]+")
    )

    # One row per token; the original index labels are kept so the join below repeats the records.
    auc_values = auc_tokens.explode().rename("AUC_values")

    data_df = data_df.join(auc_values)
    data_df["AUC_values"] = pd.to_numeric(data_df["AUC_values"], errors='raise')

    return data_df


def add_median_labels(ax, precision='.4f'):
    """
    Write the median value onto every box of a box plot.

    Parameters
    ----------
    ax : axes object holding the box plot (as returned by seaborn.boxplot).
    precision : format specification used to print the median value.

    References: https://stackoverflow.com/questions/38649501/labeling-boxplot-in-seaborn-with-median-value.
    """
    all_lines = ax.get_lines()
    box_patches = [child for child in ax.get_children() if type(child).__name__ == 'PathPatch']
    # every box owns the same number of lines (whiskers, caps, median, ...)
    stride = int(len(all_lines) / len(box_patches))

    # the line at offset 4 of each box's group of lines is treated as the median line
    for median_line in all_lines[4::stride]:
        x_data, y_data = median_line.get_data()
        x = x_data.mean()
        y = y_data.mean()

        # a median line with constant x belongs to a horizontal plot (value on the x axis),
        # otherwise the plot is vertical and the value is read from the y axis
        line_xdata = median_line.get_xdata()
        if (line_xdata[1] - line_xdata[0]) == 0:
            value = x
        else:
            value = y

        label = ax.text(x, y, f'{value:{precision}}', ha='center', va='center',
                        fontweight='normal', fontsize=5, color='white')

        # white text gets a border in the color of the median line for contrast
        border = path_effects.Stroke(linewidth=3, foreground=median_line.get_color())
        label.set_path_effects([border, path_effects.Normal()])
        
        
def visualize_rranged_results(arranged_excel_path, groupby_column, plot_setting, plot_type="boxplot"):
    """
    Plot the arranged results (one figure per group) and save the per-group tables to an excel file.

    For every distinct value of ``groupby_column`` a bar plot (median AUC) or a box plot
    (all cross validation AUC values) is drawn, saved as a jpeg next to ``arranged_excel_path``
    and shown. The records of each group, sorted by "median_AUC" in descending order, are written
    to one sheet of "arranged_results_<x_column>-<hue_column>.xlsx" in the same folder.

    Parameters
    ----------
    arranged_excel_path : str
        Excel file to read; it must provide the columns "classifier", "median_AUC",
        the x/hue columns named in ``plot_setting`` and, for box plots, "AUC".
    groupby_column : str
        Name of the column used to split the results into separate figures/sheets.
    plot_setting : dict
        Settings with the keys "x_column", "hue_column", "rename_hue_values" (dict mapping the
        hue values to display names; its values also define the hue order),
        "exclude_hue_value" (hue values that are not plotted) and "ncol" (legend columns).
    plot_type : str
        Either "boxplot" (default) or "barplot".

    Raises
    ------
    ValueError
        If ``plot_type`` is neither "barplot" nor "boxplot".
    """
    if plot_type not in ("barplot", "boxplot"):
        # fail early instead of hitting an undefined axes object in the middle of the loop
        raise ValueError("Unsupported plot_type: {!r}; expected 'barplot' or 'boxplot'.".format(plot_type))

    ## read plot settings
    x_column = plot_setting["x_column"]
    hue_column = plot_setting["hue_column"]
    rename_hue_values = plot_setting["rename_hue_values"]
    exclude_hue_value_list = plot_setting["exclude_hue_value"]
    ncol = plot_setting["ncol"]

    renamed_hue_column = hue_column + "_renamed"
    # pass a real list (not a dict view) as the hue order
    hue_order = list(rename_hue_values.values())

    ## read data
    Arranged_Results = pd.read_excel(arranged_excel_path, index_col=0)
    Arranged_Results[renamed_hue_column] = Arranged_Results[hue_column].map(rename_hue_values)

    ## do not show XGBClassifier results because XGBClassifier are not totally reproducible for now.
    keep_mask = Arranged_Results["classifier"] != "XGBClassifier"

    ## do not plot some experiments;
    if len(exclude_hue_value_list) > 0:
        keep_mask &= ~Arranged_Results[hue_column].isin(list(exclude_hue_value_list))

    Arranged_Results = Arranged_Results[keep_mask]

    ## save the results
    save_bathpath = os.path.dirname(arranged_excel_path)
    save_excel_path = os.path.join(save_bathpath, "arranged_results_"+x_column+"-"+hue_column+".xlsx")

    # The context manager writes and closes the file even if plotting fails
    # (ExcelWriter.save() is no longer available in pandas >= 2.0).
    with pd.ExcelWriter(save_excel_path) as writer:

        ## Analyze the results.
        # Group by the column itself (not a one-element list) so that the group key is a scalar.
        for task, task_results_df in Arranged_Results.groupby(groupby_column, sort=True):
            task_name = str(task)
            print("\n\n ***********  task={} ******************".format(task))

            ## plot
            fig = plt.figure(figsize=(15, 5))
            plt.xticks(size=12)
            plt.yticks(size=12)
            plt.xlabel(x_column, size=16)

            if plot_type == "barplot":
                # bar plot
                ax = sns.barplot(x=x_column, y="median_AUC", hue=renamed_hue_column, hue_order=hue_order,
                                 data=task_results_df, palette="Paired")

                # add text on the bar plots
                for p in ax.patches:
                    color = p.get_facecolor()
                    box = p.get_bbox()
                    ax.annotate("%.4f" % p.get_height(), xy=((box.x0 + box.x1)/2-0.02, p.get_height()+0.02),
                                color=color, rotation=90, fontsize=8, weight='bold')

                # some settings
                plt.ylim((0.5, 1.1))
                plt.ylabel('median AUC', size=16)
                save_fig_name = os.path.join(save_bathpath, task_name.replace(".", "-")+"_barplot.jpeg")

            else:
                # box plot: one row per cross validation AUC value is needed
                converted_task_results_df = convert_list_in_dataframe(task_results_df)
                ax = sns.boxplot(x=x_column, y="AUC_values", hue=renamed_hue_column, hue_order=hue_order,
                                 showmeans=True, data=converted_task_results_df, palette="Paired")

                # add text next to the box.
                add_median_labels(ax)

                # some settings
                plt.ylabel('AUC', size=16)
                save_fig_name = os.path.join(save_bathpath, task_name.replace(".", "-")+"_boxplot.jpeg")

            # settings of the plots: shrink the axes a bit and put the legend above them
            box = ax.get_position()
            ax.set_position([box.x0, box.y0, box.width, box.height * 0.8])
            ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.2), ncol=ncol)
            plt.subplots_adjust(left=0.07, bottom=0.2, right=0.98, top=0.85, wspace=0, hspace=0)
            plt.xticks(rotation=15)
            plt.grid()
            plt.savefig(save_fig_name, dpi=300)
            plt.show()
            # release the figure; otherwise one figure per task stays in memory
            plt.close(fig)

            # save to the excel; sort a copy instead of sorting the group slice in place
            sorted_task_results_df = task_results_df.sort_values("median_AUC", ascending=False)
            sorted_task_results_df.to_excel(writer, sheet_name=task_name)

### Main

In [None]:
# Every entry of the settings dict describes one set of experiments to arrange and plot.
arrange_results_settings_dict = get_arrange_results_settings_dict()

for arrange_name in arrange_results_settings_dict:
    arrange_results_settings = arrange_results_settings_dict[arrange_name]

    # where the results live, how to group them and how to plot them
    results_basepath = arrange_results_settings["results_basepath"]
    groupby_column = arrange_results_settings["groupby_column"]
    plot_setting = arrange_results_settings["plot_setting"]

    # Step 1: collect all the results into one data frame / excel file and preview it.
    Arranged_Results, save_arranged_excel_path = arrange_results_to_excel(results_basepath)
    print("\n\n =========================== Arranged Results ====================================")
    display(Arranged_Results.head())

    # Step 2: draw the plots (box plots by default) from the saved excel file.
    visualize_rranged_results(save_arranged_excel_path, groupby_column, plot_setting)
    