## Statistics of the selected features and the corresponding feature importances

In [None]:
import os
import pandas as pd

## For plots
import seaborn as sns
import matplotlib.pyplot as plt
from palettable.colorbrewer.diverging import Spectral_8

import sys
sys.path.append("../")
from utils.VisualizationUtils import plot_crosstab, get_color_list
from mySettings import get_convert_binary_to_multiclass_setting_dict

In [None]:
def statistics_by_pieplots(data_df, column_name, save_pieplot_path):
    """
    Tabulate how often each value of one column occurs and draw it as a pie chart.

    The counts/percent table is shown with ``display``; the pie chart is saved
    to ``save_pieplot_path`` and then shown.

    Args:
        data_df: dataframe that contains ``column_name``.
        column_name: name of the column whose values are counted.
        save_pieplot_path: path of the image file the figure is written to.

    Note:
        Sets the global ``plt.rcParams['text.color']`` to 'dimgray', which
        stays in effect after this function returns.
    """
    ## count table: one row per distinct value, most frequent first
    count_table = data_df[column_name].value_counts(normalize=False).to_frame(name='counts')
    count_table.index.name = column_name
    count_table = count_table.sort_values(by='counts', ascending=False)
    total = count_table["counts"].sum()
    count_table["percent"] = 100.*count_table["counts"]/total
    display(count_table)

    def wedge_label(pct):
        ## matplotlib passes the wedge percentage; convert it back to a count
        return '{:.2f}%({:.0f})'.format(pct, (pct/100)*total)

    ## pie chart of the counts; the first color of the reversed palette is skipped
    fig, ax = plt.subplots()
    palette = Spectral_8.hex_colors
    palette.reverse()
    label_style = {'color':"dimgray", 'weight':'bold', 'fontsize':11}
    patches, texts, autotexts = plt.pie(count_table['counts'], startangle=90, radius=1.2,
                                        colors=palette[1:], autopct=wedge_label,
                                        textprops=label_style)

    ## white borders between the wedges
    for wedge in patches:
        wedge.set_edgecolor('white')
        wedge.set_linewidth(2)

    ## legend: one entry per distinct value, placed left of the pie
    plt.rcParams['text.color'] = 'dimgray'
    plt.legend(patches, count_table.index, loc='upper left', bbox_to_anchor=(-0.3, 1.),
               prop={'weight':'bold', 'size':11})

    ## save and show the figure
    plt.tight_layout()
    plt.savefig(save_pieplot_path)
    plt.show()


def analyze_feature_importances(task_name, task_basepath, save_results_basepath, top_n=20):
    """
    Summarize the feature types of the highest-scoring features as a pie chart.

    Reads "feature_supports.xlsx" from ``task_basepath``, keeps only features
    whose name has at least five "_"-separated parts, ranks them by the
    "scores" column and passes the ``top_n`` best ones to
    ``statistics_by_pieplots`` for the "feature_type" column.

    Args:
        task_name: task identifier; used in the name of the saved pie chart.
        task_basepath: folder that contains "feature_supports.xlsx".
        save_results_basepath: folder the pie chart is written to.
        top_n: number of highest-scoring features included in the statistics
            (default 20, the previously hard-coded value).
    """
    ## read the feature importances from the excel
    excel_path = os.path.join(task_basepath, "feature_supports.xlsx")
    importances_df = pd.read_excel(excel_path, index_col=0)
    importances_df.reset_index(inplace=True)

    ## keep only names with all five parts; shorter names are treated as
    ## non-radiomics features and are excluded from the statistics.
    importances_df["feature_name_splits"] = importances_df["feature_names"].apply(lambda x: x.split("_"))
    has_all_parts = importances_df["feature_name_splits"].map(len) >= 5
    importances_df = importances_df[has_all_parts].copy()

    ## name parts by position: sequence, region, filter, type, name
    name_parts = importances_df["feature_name_splits"]
    importances_df["MRI_sequences"] = name_parts.apply(lambda x: x[0])
    importances_df["tumor_region"] = name_parts.apply(lambda x: x[1])
    importances_df["feature_extraction_region"] = name_parts.apply(lambda x: x[0]+"_"+x[1])
    importances_df["image_filter"] = name_parts.apply(lambda x: x[2])
    importances_df["feature_type"] = name_parts.apply(lambda x: x[3])
    importances_df["feature_name"] = name_parts.apply(lambda x: x[4])

    ## the top_n features with the highest scores
    top_features_df = importances_df.sort_values("scores", ascending=False).head(top_n)

    ## Statistics
    column_name = "feature_type"
    save_pieplot_path = os.path.join(save_results_basepath, "pieplot-"+task_name+"-"+column_name+".jpeg")
    statistics_by_pieplots(top_features_df, column_name, save_pieplot_path)

In [None]:
def visualize_feature_importance(task_name, task_basepath, save_results_basepath):
    """
    Draw the scores of the selected features as an annotated bar chart.

    Reads "feature_supports.xlsx" from ``task_basepath``, keeps the rows whose
    'support' column equals True, plots their 'scores' in descending order and
    saves the figure as "<task_name>-FeatureImportance.jpeg" in
    ``save_results_basepath`` before showing it.
    """
    def shorten(name):
        ## rename is_GBM_CC and is_IDH_mutant_CC
        return name.replace("is_GBM_CC", "is_GBM").replace("is_IDH_mutant_CC", "is_IDH_mutant")

    ## read the feature importances from the excel
    importance_table = pd.read_excel(os.path.join(task_basepath, "feature_supports.xlsx"), index_col=0)
    importance_table["feature_names"] = importance_table["feature_names"].map(shorten)

    ## Final features used in the model after feature selection.
    selected = importance_table.loc[importance_table['support'] == True]
    print("\n In total, {} features are used for the the final model.".format(len(selected)))

    ## Plot the feature importance, highest score first
    fig, ax = plt.subplots(figsize=(18, 9))
    ranked = selected.sort_values("scores", ascending=False)
    sns.barplot(x="feature_names", y="scores", data=ranked)

    ## write each score above its bar, in the bar's own color
    _, y_top = ax.get_ylim()
    label_offset = 0.025*y_top
    for bar in ax.patches:
        bar_height = bar.get_height()
        extent = bar.get_bbox()
        x_text = (extent.x0 + extent.x1)/2-0.08
        ax.annotate("{:.4f}".format(bar_height), xy=(x_text, bar_height+label_offset),
                    color=bar.get_facecolor(), rotation=90, fontsize=10, weight='bold')

    plt.xticks(rotation=90)
    plt.xlabel("Feature Name", fontsize=10)
    plt.ylabel("Feature Importance", fontsize=10)
    plt.tight_layout()
    plt.savefig(os.path.join(save_results_basepath, task_name+"-FeatureImportance.jpeg"))
    plt.show()


## Main

In [None]:
# read the settings: a dict mapping each setting name to its own settings dict
setting_dict=get_convert_binary_to_multiclass_setting_dict()

# plot and summarize the feature importances of every binary task in each setting.
for setting_name, settings in setting_dict.items():

    # per setting: {binary task name -> task folder} and the shared output folder
    binary_task_path_dict=settings["binary_task_path_dict"]
    save_results_basepath=settings["save_results_basepath"]
    
    for binary_task_name, binary_task_path in binary_task_path_dict.items():
        print("===========================  {}  =================".format(binary_task_name))
        # bar chart of the scores of the selected features
        visualize_feature_importance(binary_task_name, binary_task_path, save_results_basepath)
        # pie chart of the feature types among the highest-scoring features
        analyze_feature_importances(binary_task_name, binary_task_path, save_results_basepath)
    