## Statistics of the selected features and the corresponding feature importances

In [None]:
import os
import pandas as pd

## For plots
import seaborn as sns
import matplotlib.pyplot as plt
from palettable.colorbrewer.diverging import Spectral_8

import sys
sys.path.append("../")
from utils.VisualizationUtils import plot_crosstab, get_color_list
from mySettings import get_random_seed_list, get_convert_binary_to_multiclass_setting_dict

In [None]:
def statistics_by_pieplots(data_df, column_name, save_pieplot_path):
    """
    Count the distinct values of one column and show them as a table and a pie plot.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Data holding the column to summarise.
    column_name : str
        Name of the column whose distinct values are counted.
    save_pieplot_path : str
        File path the pie plot is saved to.

    The statistics table (counts and percentages) is shown with display(),
    which is provided by the IPython/Jupyter session; the figure is saved
    and then shown. Nothing is returned.
    """
    ## data statistics: one row per distinct value with its count and percentage.
    statistics = pd.DataFrame(data_df[column_name].value_counts(normalize=False))
    statistics.index.name = column_name
    statistics.columns = ['counts']
    statistics = statistics.sort_values(by='counts', ascending=False)
    num_features = statistics["counts"].sum()
    statistics["percent"] = 100.*statistics["counts"]/num_features
    display(statistics)

    ## visualize the data regarding the interested "column_name" by pieplots.
    plt.subplots()
    # reversed copy of the palette; slicing avoids mutating the list in place.
    colors = Spectral_8.hex_colors[::-1]
    patches, _, _ = plt.pie(statistics['counts'], startangle=90, radius=1.2, colors=colors[1:],
                            autopct=lambda p: '{:.0f}%({:.0f})'.format(p, (p/100)*num_features),
                            textprops={'color':"dimgray", 'weight':'bold', 'fontsize':9})

    #edge color of the pieplots
    for wedge in patches:
        wedge.set_linewidth(2)
        wedge.set_edgecolor('white')

    #legend settings
    # The legend text colour is applied through a temporary rc context, so the
    # global matplotlib settings are not changed for figures drawn afterwards.
    with plt.rc_context({'text.color': 'dimgray'}):
        plt.legend(patches, statistics.index, loc='upper left', bbox_to_anchor=(-0.3, 1.),
                   prop={'weight':'bold', 'size':9})

    #save the plots
    plt.tight_layout()
    plt.savefig(save_pieplot_path)
    plt.show()


def analyze_feature_importances(task_name, task_basepath, save_results_basepath, top_n=20, column_name="feature_type"):
    """
    Summarise the top-scoring radiomics features of one task with a pie plot.

    Parameters
    ----------
    task_name : str
        Name of the task; used in the file name of the saved pie plot.
    task_basepath : str
        Folder containing "feature_supports.xlsx".
    save_results_basepath : str
        Folder the pie plot is saved to.
    top_n : int or None, optional
        Number of highest-scoring features included in the statistics
        (default 20, the previously hard-coded value). None keeps all of them.
    column_name : str, optional
        Derived column to summarise, e.g. "feature_type" (default),
        "MRI_sequences", "tumor_region", "feature_extraction_region",
        "image_filter" or "feature_name".
    """
    ## read the feature importances from the excel
    feature_importance_excel_path = os.path.join(task_basepath, "feature_supports.xlsx")
    feature_importances_df = pd.read_excel(feature_importance_excel_path, index_col=0)
    feature_importances_df.reset_index(inplace=True)

    ## drop the non-radiomics features: names with fewer than 5 "_"-separated parts.
    name_splits = feature_importances_df["feature_names"].apply(lambda x: x.split("_"))
    feature_importances_df["feature_name_splits"] = name_splits
    feature_importances_df = feature_importances_df[name_splits.map(len) >= 5].copy()

    ## analyze the MRI sequences, gliomas tumor subregion and feature types.
    name_splits = feature_importances_df["feature_name_splits"]
    feature_importances_df["MRI_sequences"] = name_splits.apply(lambda x: x[0])
    feature_importances_df["tumor_region"] = name_splits.apply(lambda x: x[1])
    feature_importances_df["feature_extraction_region"] = name_splits.apply(lambda x: x[0]+"_"+x[1])
    feature_importances_df["image_filter"] = name_splits.apply(lambda x: x[2])
    feature_importances_df["feature_type"] = name_splits.apply(lambda x: x[3])
    feature_importances_df["feature_name"] = name_splits.apply(lambda x: x[4])

    ## keep only the top_n features with the highest scores.
    sorted_feature_importances_df = feature_importances_df.sort_values("scores", ascending=False)
    filtered_feature_importances_df = sorted_feature_importances_df.iloc[:top_n]

    ## Statistics
    save_pieplot_path = os.path.join(save_results_basepath, "pieplot-"+task_name+"-"+column_name+".jpeg")
    statistics_by_pieplots(filtered_feature_importances_df, column_name, save_pieplot_path)

In [None]:
def visualize_feature_importance(task_name, task_basepath, save_results_basepath):
    """
    Plot the importances of the selected features of one task as a bar chart.

    Reads "feature_supports.xlsx" from task_basepath, keeps the rows whose
    "support" flag is True, saves a bar plot of their scores (sorted from
    high to low) into save_results_basepath and returns those rows.
    """
    ## load the per-feature scores and support flags.
    excel_path = os.path.join(task_basepath, "feature_supports.xlsx")
    importance_df = pd.read_excel(excel_path, index_col=0)

    ## shorten the names of the is_GBM_CC and is_IDH_mutant_CC features.
    def shorten(name):
        return name.replace("is_GBM_CC", "is_GBM").replace("is_IDH_mutant_CC", "is_IDH_mutant")

    importance_df["feature_names"] = importance_df["feature_names"].map(shorten)

    ## features that survived the feature selection.
    selected_df = importance_df.loc[importance_df['support'] == True]
    print("\n In total, {} features are used for the the final model.".format(selected_df.shape[0]))

    ## bar plot, highest score first.
    ranked_df = selected_df.sort_values("scores", ascending=False)
    fig, ax = plt.subplots(figsize=(18, 9))
    sns.barplot(x="feature_names", y="scores", data=ranked_df)

    ## write each bar's value just above it, in the colour of the bar.
    _, y_upper = ax.get_ylim()
    label_offset = 0.025*y_upper
    for bar in ax.patches:
        bar_height = bar.get_height()
        bar_box = bar.get_bbox()
        label_x = (bar_box.x0 + bar_box.x1)/2-0.08
        ax.annotate("%.4f" % bar_height, xy=(label_x, bar_height+label_offset),
                    color=bar.get_facecolor(), rotation=90, fontsize=10, weight='bold')

    plt.xticks(rotation=90)
    plt.xlabel("Feature Name", fontsize=10)
    plt.ylabel("Feature Importance", fontsize=10)
    plt.tight_layout()
    figure_path = os.path.join(save_results_basepath, task_name+"-FeatureImportance.jpeg")
    plt.savefig(figure_path)
    plt.show()

    return selected_df

## Main: plot the feature importance for each random seed, and save the feature importances from all random seeds into one Excel file.

In [None]:
def arrange_feature_importance_from_different_seeds():
    """
    Plot the feature importance for every random seed / setting / binary task
    and collect the selected features of all of them into one Excel file.

    The collected table gets three leading columns ("setting_name",
    "random_seed", "binary_task_name") and is written to
    "feature_importance.xlsx" in the parent folder of the last
    "save_results_basepath" that was processed. If no task was found at all,
    a message is printed and nothing is written.
    """
    random_seed_list = get_random_seed_list()
    num_random_seeds = len(random_seed_list)

    dataframe_list = []
    # remembered explicitly so the code after the loops does not rely on a loop variable.
    last_save_results_basepath = None
    for i, random_seed in enumerate(random_seed_list):
        print("\n\n ====================== {}/{}: random_seed={}===================".format(i+1, num_random_seeds, random_seed))
        ## read the settings
        setting_dict = get_convert_binary_to_multiclass_setting_dict(random_seed)

        ## process every binary task of every setting.
        for setting_name, settings in setting_dict.items():
            binary_task_path_dict = settings["binary_task_path_dict"]
            save_results_basepath = settings["save_results_basepath"]
            last_save_results_basepath = save_results_basepath

            for binary_task_name, binary_task_path in binary_task_path_dict.items():
                print("============ {}  ========".format(binary_task_name))

                # visualize the feature_importance
                supported_features = visualize_feature_importance(binary_task_name, binary_task_path, save_results_basepath)

                # tag the rows with their origin and keep them for the final table.
                supported_features.insert(0, "setting_name", setting_name)
                supported_features.insert(1, "random_seed", random_seed)
                supported_features.insert(2, "binary_task_name", binary_task_name)
                dataframe_list.append(supported_features)

    ## Nothing collected: pd.concat would fail on an empty list and there is no folder to save to.
    if not dataframe_list:
        print("No feature importance data was found; nothing is saved.")
        return

    ## Connect the results from different random seeds;
    SupportedFeaturesDF = pd.concat(dataframe_list, axis=0, join="outer")

    ## save the results to excel
    save_feature_importance_excel = os.path.join(os.path.dirname(last_save_results_basepath), "feature_importance.xlsx")
    SupportedFeaturesDF.to_excel(save_feature_importance_excel)
    display(SupportedFeaturesDF.head())

In [None]:
# Run the aggregation: plots the feature importance per seed/task and writes "feature_importance.xlsx".
arrange_feature_importance_from_different_seeds()

### Main: statistics about the feature importances based on the 51 random seeds.

In [None]:
def statistic_number_of_selected_features(SupportedFeaturesDF, save_basepath):
    """
    Visualize, for each binary task, how often each number of selected
    radiomics features occurs across the random seeds.

    Parameters
    ----------
    SupportedFeaturesDF : pandas.DataFrame
        Selected features of all seeds; must contain the columns
        "binary_task_name", "random_seed", "support" and "is_radiomics_feature".
    save_basepath : str
        Folder the bar plots are saved to (one plot per binary task).

    Note from the original author: the statistics are based on 51 random seeds
    and the number of selected radiomics features lies in [20, 40, 60, 80, 100].
    """
    ## only statistics on the radiomics features
    SupportedFeaturesDF = SupportedFeaturesDF[SupportedFeaturesDF["is_radiomics_feature"]==True]

    ## number of selected features per (task, seed); columns become (name, "count") pairs.
    groupby_df = SupportedFeaturesDF.groupby(["binary_task_name", "random_seed"]).agg(['count'])
    groupby_df.reset_index(inplace=True)
    display(groupby_df)

    ## show the statistics
    binary_task_list = SupportedFeaturesDF["binary_task_name"].value_counts().keys()
    for binary_task_name in binary_task_list:
        print("======  {} ======".format(binary_task_name))

        ## index: number of selected features; values: how many seeds gave that number.
        binary_task_data = groupby_df[groupby_df["binary_task_name"]==binary_task_name]
        value_counts = binary_task_data[("support", "count")].value_counts()
        print(value_counts)

        # plot the histogram of the number of selected features;
        fig, ax = plt.subplots(figsize=(8, 5))
        sns.barplot(x=value_counts.index, y=value_counts.values)
        # x holds the number of selected features and y its frequency, so label them accordingly.
        ax.set(xlabel="Number of features", ylabel="Counts")
        plt.savefig(os.path.join(save_basepath, "statistics_NumOfSelectedFeatures-"+binary_task_name+".jpeg"))
        plt.show()
        
def statistics_selected_features(SupportedFeaturesDF, interested_column, show_first_n_radiomics_features, statistic_feature_type, save_basepath):
    """
    Plot per-feature statistics over all random seeds for each binary task.

    Parameters
    ----------
    SupportedFeaturesDF : pandas.DataFrame
        Selected features of all seeds; must contain "binary_task_name",
        "feature_names", "is_radiomics_feature" and the column named in
        interested_column[0].
    interested_column : tuple
        (column, aggregation) pair to rank and plot:
        - ("scores", "mean"): mean feature importance over the seeds;
        - ("support", "count"): number of seeds in which the feature was selected.
    show_first_n_radiomics_features : int
        Number of top-ranked radiomics features to plot; all non-radiomics
        features are always plotted.
    statistic_feature_type : str
        Derived column summarised in the pie plot (e.g. "feature_type").
    save_basepath : str
        Folder the bar plots and pie plots are saved to.
    """
    # Aggregate only numeric/boolean columns: 'mean' is undefined for text or
    # list columns and raises a TypeError on pandas >= 2.0.
    group_keys = ["binary_task_name", "feature_names"]
    value_columns = [
        column for column in SupportedFeaturesDF.select_dtypes(include=["number", "bool"]).columns
        if column not in group_keys
    ]
    groupby_df = SupportedFeaturesDF.groupby(group_keys)[value_columns].agg(['mean', 'count'])
    groupby_df.reset_index(inplace=True)
    display(groupby_df)

    # y label name; fall back to the column itself so an unknown choice cannot leave it undefined.
    if interested_column==("scores", "mean"):
        y_label="Mean Feature Importance"
    elif interested_column==("support", "count"):
        y_label="Count"
    else:
        y_label=str(interested_column)

    binary_task_list=SupportedFeaturesDF["binary_task_name"].value_counts().keys()
    for binary_task_name in binary_task_list:
        print("======== {} =====".format(binary_task_name))

        ## filter the data for this binary classification task.
        binary_task_data=groupby_df[groupby_df["binary_task_name"]==binary_task_name]
        binary_task_data=binary_task_data.sort_values(interested_column, ascending=False)
        display(binary_task_data)

        ## only analysis the first n radiomics features and other features.
        ## (the mean of the boolean flag is 1.0 for radiomics features and 0.0 otherwise)
        radiomics_feature_data=binary_task_data[binary_task_data[("is_radiomics_feature", "mean")]==True]
        non_radiomics_feature_data=binary_task_data[binary_task_data[("is_radiomics_feature", "mean")]==False]
        radiomics_feature_data=radiomics_feature_data[:show_first_n_radiomics_features]
        feature_data_df=pd.concat([radiomics_feature_data, non_radiomics_feature_data], axis=0, join="outer")
        feature_data_df=feature_data_df.sort_values(interested_column, ascending=False)
        print("The feature importance of {} features are ploted!".format(feature_data_df.shape[0]))

        ##===== Visualize the statistics  ===============
        fig, ax = plt.subplots(figsize=(18, 9))
        sns.barplot(x="feature_names", y=interested_column,  data=feature_data_df, dodge=False)

        # add text on the bar plots
        ymin, ymax = ax.get_ylim()
        for p in ax.patches:
            color=p.get_facecolor()
            box = p.get_bbox()
            ax.annotate("%.2f" % p.get_height(), xy=((box.x0 + box.x1)/2-0.08, p.get_height()+0.02*ymax), color=color,
                        rotation=90, fontsize=10, weight='bold')

        plt.xticks(rotation=90)
        plt.xlabel("Feature Name", fontsize=10)
        plt.ylabel(y_label, fontsize=10)
        plt.tight_layout()
        plt.savefig(os.path.join(save_basepath, binary_task_name+"-SelectedFeatures.jpeg"))
        plt.show()

        ## === Statistics of the feature types of the selected features ===
        save_pieplot_path=os.path.join(save_basepath, "pieplot-"+binary_task_name+"-"+statistic_feature_type+".jpeg")
        analyze_selected_feature_types(feature_data_df, statistic_feature_type, save_pieplot_path)
        
        
def analyze_selected_feature_types(binary_task_data, statistic_feature_type, save_pieplot_path):
    """
    Break the radiomics feature names into their components and draw a pie
    plot of the component named by statistic_feature_type.

    binary_task_data must provide the columns "feature_names" and
    ("is_radiomics_feature", "mean"); the pie plot is saved to save_pieplot_path.
    """
    ## keep the radiomics features only (work on a copy so the input stays untouched).
    radiomics_mask = binary_task_data[("is_radiomics_feature", "mean")]==True
    radiomics_df = binary_task_data[radiomics_mask].copy()

    ## split every feature name at "_" into its parts.
    name_parts = radiomics_df["feature_names"].apply(lambda name: name.split("_"))
    radiomics_df["feature_name_splits"] = name_parts

    ## derived columns: MRI sequence, tumor subregion, their combination,
    ## image filter, feature type and feature name.
    derived_columns = (
        ("MRI_sequences", lambda parts: parts[0]),
        ("tumor_region", lambda parts: parts[1]),
        ("feature_extraction_region", lambda parts: parts[0]+"_"+parts[1]),
        ("image_filter", lambda parts: parts[2]),
        ("feature_type", lambda parts: parts[3]),
        ("feature_name", lambda parts: parts[4]),
    )
    for derived_name, extract in derived_columns:
        radiomics_df[derived_name] = name_parts.apply(extract)

    ## Statistics
    statistics_by_pieplots(radiomics_df, statistic_feature_type, save_pieplot_path)
        
def main_analyze_features_from_different_seeds():
    """
    Run the statistics on the features selected with the different random seeds:
    1. count the number of selected features for each task;
    2. visualize the per-feature statistic over the seeds (here the mean
       feature importance; the selection count is the alternative);
    3. analyze the feature types of the first n radiomics features.
    """
    ## Only the first random seed is used, to locate the folder holding the
    ## feature importances collected from all random seeds.
    first_seed = get_random_seed_list()[0]
    setting_dict = get_convert_binary_to_multiclass_setting_dict(first_seed)

    ## analysis options.
    interested_column = ("scores", "mean")   # alternative: ("support", "count")
    show_first_n_radiomics_features = 20
    statistic_feature_type = "feature_type"  # alternative: "feature_extraction_region"

    for settings in setting_dict.values():
        # the collected excel lives in the parent folder of "save_results_basepath".
        results_folder = os.path.dirname(settings["save_results_basepath"])
        excel_path = os.path.join(results_folder, "feature_importance.xlsx")

        # load the table and renumber its rows.
        features_df = pd.read_excel(excel_path, index_col=0)
        features_df.reset_index(drop=True, inplace=True)

        # radiomics features are those whose name has at least 5 "_"-separated parts.
        features_df["feature_name_splits"] = features_df["feature_names"].apply(lambda name: name.split("_"))
        features_df["is_radiomics_feature"] = features_df["feature_name_splits"].map(len)>=5
        display(features_df)

        # number of selected features per binary classification task.
        statistic_number_of_selected_features(features_df, results_folder)

        # per-feature statistics per binary classification task.
        statistics_selected_features(features_df, interested_column, show_first_n_radiomics_features,
                                     statistic_feature_type, results_folder)

In [None]:
# Run the statistics over the feature importances collected from all random seeds.
main_analyze_features_from_different_seeds()