### Gliomas project: transform the 3 binary classification tasks into 1 multiclass classification task.

**Three binary classification tasks:**
- Tumor grade: GBM vs. LGG;
- IDH mutant vs. IDH wildtype;
- 1p/19q codeleted vs. 1p/19q intact.


**One multiclass classification problem:**
- LGG, IDH mutant, 1p/19q codeleted: 1
- LGG, IDH mutant, 1p/19q non-codeleted: 2
- LGG, IDH wildtype: 3
- GBM, IDH mutant: 4
- GBM, IDH wildtype: 5

In [None]:
import os
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report

## For plots
import seaborn as sns
import matplotlib.pyplot as plt

import sys
sys.path.append("../")
from utils.myUtils import traversalDir_FirstDir, save_dict, load_dict

from mySettings import get_convert_binary_to_multiclass_setting_dict

#### Functions to convert three binary labels to one multi-class label.

In [None]:
def caculate_tumor_subtype(data):
    """
    Define the tumor type according to the different combinations of tumor grade, IDH mutant and 1p/19q codeleted status.

    Parameters
    ----------
    data: object indexable by key (e.g. a dict or a DataFrame row) holding the binary labels
        "is_GBM", "is_IDH_mutant" and "is_1p19q_codeleted". The values are compared against 0 and 1.
        "is_1p19q_codeleted" is only read for the "LGG, IDH mutant" cases.

    Returns
    -------
    (tumor_subtype_description, tumor_subtype): the subtype described by words, and the
        subtype number in {1,2,3,4,5}.

    Raises
    ------
    ValueError: if the labels do not match any of the five subtypes, e.g. a label is
        missing (NaN) or is neither 0 nor 1.
    """

    is_gbm=data["is_GBM"]

    if is_gbm==0:
        is_idh_mutant=data["is_IDH_mutant"]
        if is_idh_mutant==1:
            # the 1p/19q status only matters for LGG, IDH mutant tumors.
            is_codeleted=data["is_1p19q_codeleted"]
            if is_codeleted==1:
                return "LGG, IDH mutant, 1p/19q codeleted", 1
            if is_codeleted==0:
                return "LGG, IDH mutant, 1p/19q non-codeleted", 2
        elif is_idh_mutant==0:
            return "LGG, IDH wildtype", 3

    elif is_gbm==1:
        is_idh_mutant=data["is_IDH_mutant"]
        if is_idh_mutant==1:
            return "GBM, IDH mutant", 4
        if is_idh_mutant==0:
            return "GBM, IDH wildtype", 5

    # No combination matched: fail with an explicit message instead of an UnboundLocalError.
    label_names=("is_GBM", "is_IDH_mutant", "is_1p19q_codeleted")
    labels={name: data[name] for name in label_names if name in data}
    raise ValueError("Cannot determine the tumor subtype, each binary label must be 0 or 1; got: {}".format(labels))

def get_tumor_subtype_description(data):
    """
    Return the tumor subtype of one sample described by words.

    Intended to be applied row by row to build a new description column;
    it keeps the first item of the pair returned by caculate_tumor_subtype.
    """
    return caculate_tumor_subtype(data)[0]

def get_tumor_subtype(data):
    """
    Return the tumor subtype of one sample as a number in {1,2,3,4,5}.

    Intended to be applied row by row to build a new numeric label column;
    it keeps the second item of the pair returned by caculate_tumor_subtype.
    """
    return caculate_tumor_subtype(data)[1]

#### Functions to perform the conversion.

In [None]:
def calculate_metrics(y_true, y_predicted):
    """
    Calculate the evaluation metrics of the predictions.

    Returns a dictionary with a single entry, "accuracy", computed by
    metrics.accuracy_score from the true and the predicted labels.
    """
    return {"accuracy": metrics.accuracy_score(y_true, y_predicted)}

def plot_confusion_matrix(y_true, predicted, save_results_path):
    """
    Plot the confusion matrix, save it to a file and show it.

    Parameters
    ----------
    y_true: ground truth labels.
    predicted: predicted labels.
    save_results_path: path of the image file the figure is saved to.
    """
    # Use the labels found in the ground truth OR in the predictions, so that
    # predictions of a class that never occurs in y_true are still counted
    # instead of being silently dropped from the matrix.
    classes = np.union1d(y_true, predicted)
    cm = metrics.confusion_matrix(y_true, predicted, labels=classes)

    # plot the confusion matrix.
    fig, ax = plt.subplots()
    sns.heatmap(cm, annot=True, fmt='d', cmap=plt.cm.Blues, cbar=False, ax=ax)
    ax.set(xlabel="Pred", ylabel="True", title="Confusion matrix")
    ax.set_xticklabels(labels=classes)
    ax.set_yticklabels(labels=classes)
    plt.savefig(save_results_path)
    plt.show()

    # release the figure: this function is called once per setting and data folder.
    plt.close(fig)
    
def main_convert_binary_to_multiclass(binary_task_path_dict, save_results_basepath, ground_truth_target_excel_dict, data_folder):
    """
    Main function to perform the conversion: convert from the 3 binary classification problems to 1 multi-class classification problem.

    Parameters
    ----------
    binary_task_path_dict: dictionary {binary task name: result folder of that task}. The task name
        becomes the name of the column holding the predictions of that task.
        NOTE(review): the conversion reads the columns "is_GBM", "is_IDH_mutant" and
        "is_1p19q_codeleted", so the task names are presumably exactly these -- confirm in the settings.
    save_results_basepath: folder where the excel file, the metrics, the classification report
        and the confusion matrix image are saved.
    ground_truth_target_excel_dict: dictionary {data folder name: excel file with the ground truth};
        the excel must contain the columns "tumor_subtype_description" and "tumor_subtype".
    data_folder: name of the data split to process (the notebook uses "train_data" and "test_data").

    Raises
    ------
    ValueError: if traversalDir_FirstDir does not return exactly one entry for a binary task folder.
    """
    print("\n\n==============  {}   =================".format(data_folder))
    save_multiclass_results_excel=os.path.join(save_results_basepath, "multiclass_predicted_results-"+data_folder+".xlsx")

    ## read the results from the three binary classification tasks into a data frame list, to prepare for data frame connection.
    dataframe_list=[]
    for binary_task_name, binary_task_path in binary_task_path_dict.items():
        # presumably the sub-folders of the task folder, one per model -- TODO confirm in utils.myUtils.
        best_model_name=traversalDir_FirstDir(binary_task_path)
        if len(best_model_name)!=1:
            # explicit check instead of an assert, which is stripped when running with -O.
            raise ValueError("Expected exactly one best model folder in {}, but found {}: {}".format(
                binary_task_path, len(best_model_name), best_model_name))

        # read the results;
        predicted_results_file=os.path.join(binary_task_path, best_model_name[0], data_folder, "predicted.csv")
        data_df=pd.read_csv(predicted_results_file, index_col=0)

        # record the model name and rename the prediction column after the binary task.
        data_df[binary_task_name+"-best_model_name"]=best_model_name[0]
        data_df.rename(columns={"predicted": binary_task_name}, inplace=True)

        # append the dataframe to the list
        dataframe_list.append(data_df)

    ## Connect the results from the 3 binary classification tasks;
    ## the outer join keeps every index value; samples missing from one task get NaN there.
    ArrangedData=pd.concat(dataframe_list, axis=1, join="outer")

    ## convert from 3 binary classification results to 1 multiclass results;
    ArrangedData["predicted_tumor_subtype_description"]=ArrangedData.apply(get_tumor_subtype_description, axis=1)
    ArrangedData["predicted_tumor_subtype"]=ArrangedData.apply(get_tumor_subtype, axis=1)

    ## read the ground truth data to get the ground truth label.
    GT_data=pd.read_excel(ground_truth_target_excel_dict[data_folder], index_col=0)
    GT_data=GT_data.loc[:, ["tumor_subtype_description", "tumor_subtype"]]

    ## Connect the predicted multiclass label with the ground truth label.
    ArrangedData=pd.concat([ArrangedData, GT_data], axis=1, join="outer")

    ## save the results to excel
    ArrangedData.to_excel(save_multiclass_results_excel)
    display(ArrangedData.head())

    ## calculate the accuracy
    y_true=ArrangedData["tumor_subtype"]
    y_predicted=ArrangedData["predicted_tumor_subtype"]
    result_metrics=calculate_metrics(y_true, y_predicted)
    classification_reports=classification_report(y_true, y_predicted, output_dict=True)
    classification_reports=pd.DataFrame(classification_reports)
    print("\n\n Multiclass classification metrics: \n {}.".format(result_metrics))
    print("\n\n classification reports:\n{}.".format(classification_reports))

    ## save the metrics
    save_dict(result_metrics, os.path.join(save_results_basepath, "metrics-"+data_folder+".txt"))
    classification_reports.to_csv(os.path.join(save_results_basepath, "classification_reports-"+data_folder+".txt"))

    ## plot confusion matrix
    save_confusion_matrix_path=os.path.join(save_results_basepath, "confusion_matrix-"+data_folder+".jpeg")
    plot_confusion_matrix(y_true, y_predicted, save_confusion_matrix_path)
    

### Main

In [None]:
# Load every conversion setting defined in mySettings.
convert_binary_to_multiclass_setting_dict=get_convert_binary_to_multiclass_setting_dict()

# Run the conversion for each setting, first on the train data and then on the test data.
for setting_name, convert_binary_to_multiclass_setting in convert_binary_to_multiclass_setting_dict.items():

    binary_task_path_dict=convert_binary_to_multiclass_setting["binary_task_path_dict"]
    save_results_basepath=convert_binary_to_multiclass_setting["save_results_basepath"]
    ground_truth_target_excel_dict=convert_binary_to_multiclass_setting["ground_truth_target_excel_dict"]

    for data_folder in ("train_data", "test_data"):
        main_convert_binary_to_multiclass(binary_task_path_dict, save_results_basepath,
                                          ground_truth_target_excel_dict, data_folder=data_folder)