### Compare the mean CV AUC for the 51 random seeds before and after pipeline tuning.

In [None]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib 
import plotly.graph_objects as go
from palettable.colorbrewer.diverging import Spectral_8

from mySettings import get_compare_pipeline_tuning_dict
import sys
sys.path.append("../")
from utils.VisualizationUtils import plot_crosstab, get_color_list

In [None]:

                        
def visualize_violin_plots(dataframe, x_column, y_column, hue_column, save_figure_path):
    """
    Draw split violins, box plots and raw points for the scores before and after pipeline tuning.

    For every category of ``x_column`` the "before tuning" and "after tuning" groups of
    ``hue_column`` are drawn as two half violins that are pulled apart horizontally, with
    unfilled box plots and the individual observations overlaid. The figure is written to
    ``save_figure_path``, shown, and then closed.

    Ref:https://juejin.cn/post/6972831447035953160
    https://seaborn.pydata.org/generated/seaborn.violinplot.html

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Long-form data containing ``x_column``, ``y_column`` and ``hue_column``.
    x_column : str
        Column holding the categories placed along the x axis.
    y_column : str
        Column holding the numeric values to plot.
    hue_column : str
        Column whose values are "before tuning" / "after tuning".
    save_figure_path : str
        Path of the image file handed to ``plt.savefig``.

    Returns
    -------
    None
    """
    palette = "Paired"
    hue_order = ["before tuning", "after tuning"]
    violin_shift = 0.15  # how far each half violin is pushed away from the category centre
    dot_shift = 0.12     # how far each dodged group of dots is pulled back towards the centre
    x_margin = 0.2       # extra room on both sides so the shifted violins are not clipped

    figure = plt.figure(figsize=(9, 3.5))

    # Split violin plot: one half per hue level, drawn without inner annotations.
    ax = sns.violinplot(x=x_column, y=y_column, hue=hue_column, hue_order=hue_order, data=dataframe, dodge=False,
                        palette=palette, split=True, inner=None, scale="count", width=0.6)
    old_len_collections = len(ax.collections)
    print("old_len_collections={}".format(old_len_collections))
    # Remember the limits chosen for the violins; later layers must not change them.
    xlim = ax.get_xlim()
    ylim = ax.get_ylim()

    # Pull the two halves of every violin apart. With inner=None the axis only holds the
    # violin bodies, alternating between the two hue levels: even index -> left, odd -> right.
    for ii, item in enumerate(ax.collections):
        if not isinstance(item, matplotlib.collections.PolyCollection):
            continue
        path, = item.get_paths()
        vertices = path.vertices
        if ii % 2:
            vertices[:, 0] += violin_shift
        else:
            vertices[:, 0] -= violin_shift

    # Unfilled box plots on top of the violins.
    sns.boxplot(x=x_column, y=y_column, hue=hue_column, hue_order=hue_order, data=dataframe, saturation=1,
                showfliers=False, width=0.6, boxprops={'zorder': 3, 'facecolor': 'none'}, dodge=True, ax=ax)

    # Collections present before the dots are added; everything after this index is a dot group.
    old_len_collections = len(ax.collections)
    print("old_len_collections={}".format(old_len_collections))

    # Individual observations. The original call passed ``dodge=False, split=True``; older
    # seaborn releases treat ``split`` as a deprecated alias that overrides ``dodge``, so the
    # effective setting has always been ``dodge=True``. It is passed directly here so the call
    # does not depend on that alias.
    sns.stripplot(x=x_column, y=y_column, hue=hue_column, hue_order=hue_order, data=dataframe, palette=palette,
                  dodge=True, ax=ax)

    # Move the dodged dot groups back towards the category centre (even -> right, odd -> left).
    for i, dots in enumerate(ax.collections[old_len_collections:]):
        direction = -1 if i % 2 else 1
        dots.set_offsets(dots.get_offsets() + np.array([direction * dot_shift, 0]))

    print(xlim)
    # Widen the limits captured from the violin plot instead of hard-coding them for seven
    # categories; for seven categories this yields the former (-0.7, 6.7).
    ax.set_xlim((xlim[0] - x_margin, xlim[1] + x_margin))
    ax.set_ylim(ylim)

    # NOTE(review): the label says "std" although the docstring reference and the notebook
    # title talk about the mean CV AUC -- confirm which statistic y_column holds.
    ax.set_ylabel("std cross-validation AUC")

    # Replace the automatically generated legend by one with only the first two entries.
    if ax.legend_ is not None:
        ax.legend_.remove()
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(handles[:2], labels[:2], loc='upper left')  # 'lower left'

    plt.xticks(rotation=15)
    plt.tight_layout()
    plt.savefig(save_figure_path)
    plt.show()
    # The function is called once per task; release the figure so figures do not pile up.
    plt.close(figure)

In [None]:
# Build one violin figure per binary task for every comparison setting.
compare_pipeline_tuning_dict = get_compare_pipeline_tuning_dict()
for setting_name, compare_pipeline_tuning_setting in compare_pipeline_tuning_dict.items():
    base_dataPath = compare_pipeline_tuning_setting["base_dataPath"]
    compare_dict = compare_pipeline_tuning_setting["compare_dict"]
    plot_setting = compare_pipeline_tuning_setting["plot_setting"]
    x_column = plot_setting["x_column"]
    y_column = plot_setting["y_column"]
    hue_column = plot_setting["hue_column"]

    # Load every result sheet and tag its rows with the comparison item it belongs to.
    dataframe_list = []
    for compare_item, excel_path in compare_dict.items():
        dataframe = pd.read_excel(excel_path, index_col=0)
        dataframe.insert(0, "compare_item", compare_item)
        dataframe_list.append(dataframe)

    ArrangedResults = pd.concat(dataframe_list, axis=0, join="outer")
    # The classifier name is the second "_"-separated token of model_name.
    ArrangedResults["classifier"] = ArrangedResults["model_name"].map(lambda x: x.split("_")[1])

    # Group by the scalar column name: grouping by a one-element list makes recent pandas
    # yield 1-tuples as keys, which cannot be concatenated into the file name below.
    for binary_task_name, binary_task_data_df in ArrangedResults.groupby('binary_task_name', sort=False):
        print("======================= {} ===============".format(binary_task_name))
        figure_file_name = "violin_plots_{}-{}.jpeg".format(binary_task_name, y_column)
        save_figure_path = os.path.join(base_dataPath, figure_file_name)
        visualize_violin_plots(binary_task_data_df, x_column, y_column, hue_column, save_figure_path)