### Perform feature selection and classification to predict the gene status of glioma patients.

For example, we can use this code and the radiomic features to predict:
- 1. LGG vs. GBM;
- 2. IDH mutant vs. IDH wildtype;
- 3. 1p/19q codeleted vs. 1p/19q intact;
- 4. MGMT methylated vs. MGMT unmethylated.

In [None]:
import pandas as pd
import os
 
from sklearn.model_selection import train_test_split
from mySettings import get_basic_settings
from classificationUtils import perform_binary_classification

In [None]:
def get_classification_tasks_dict(experiment_class, task_name):
    """
    Dispatch to the task-settings builder that belongs to an experiment class.

    Parameters
    ----------
    experiment_class : str
        "BraTS2021", "TCGA_IDH" or "TCGA_MGMT".
    task_name : str or None
        Forwarded unchanged to the dataset-specific builder.

    Returns
    -------
    dict
        The dictionary produced by the dataset-specific builder
        (task name -> classification settings).

    Raises
    ------
    Exception
        If ``experiment_class`` is none of the supported values.
    """
    if experiment_class == "BraTS2021":
        return get_classification_tasks_dict_BraTS2021(task_name)

    if experiment_class in ("TCGA_IDH", "TCGA_MGMT"):
        return get_classification_tasks_dict_TCGA(experiment_class, task_name)

    raise Exception("Undefined experiment class: {}.".format(experiment_class))

### Define classification settings for BraTS2021 dataset.

In [None]:

def convert_complex_to_real(datadf, feature_namelist):
    """
    Return a copy of ``datadf`` in which the feature columns hold real numbers only.

    Every value of the selected columns is passed through ``complex(value).real``,
    so complex numbers (also when stored as strings such as "(1+2j)") are replaced
    by their real part, and plain numbers are converted to ``float``.

    Parameters
    ----------
    datadf : pandas.DataFrame
        Table containing the feature columns; it is not modified.
    feature_namelist : list of str
        Names of the columns to convert.

    Returns
    -------
    pandas.DataFrame
        A copy of ``datadf`` with the converted feature columns.
    """
    datadf_converted = datadf.copy()

    # Nothing to convert: skip the assignment through an empty column selection.
    if len(feature_namelist) == 0:
        return datadf_converted

    # Column-wise Series.map is used instead of DataFrame.applymap, which is
    # deprecated since pandas 2.1; this form works on old and new versions.
    datadf_converted[feature_namelist] = datadf[feature_namelist].apply(
        lambda column: column.map(lambda value: complex(value).real)
    )

    return datadf_converted


def get_feature_columns(train_data, feature_filter_dict):
    """
    Filter features according to modalities, image types, and tumor subregions.
    Note that, for the same subregion, the shape features should be the same for different modalities and image types.

    Column names are expected to look like
    ``<modality>_<subregion>_<imageType>_<featureClass>_...``; columns that are not
    strings or have fewer than four "_"-separated tokens are ignored.

    Feature_filter_dict has three keys:
    - modality_list: ["t1", "t1ce"/"t1Gd", "t2", "flair"];
    - imageType_list: ["original", "gradient", "log-sigma-1-0-mm-3D", "log-sigma-3-0-mm-3D"];
    - tumor_subregion_list: ["NCR", "ED", "ET", "TC", "wholeTumor"]

    Parameters
    ----------
    train_data : pandas.DataFrame
        Table whose column names are filtered.
    feature_filter_dict : dict
        Filter settings with the three keys described above.

    Returns
    -------
    list of str
        The selected shape feature columns followed by the other selected
        feature columns, each group in the column order of ``train_data``.
    """
    modality_list = feature_filter_dict["modality_list"]
    imageType_list = feature_filter_dict["imageType_list"]
    tumor_subregion_list = feature_filter_dict["tumor_subregion_list"]

    # Shape features are taken from the first modality only, so that they are not
    # duplicated per modality. With no modality configured, nothing can match.
    reference_modality = modality_list[0] if len(modality_list) > 0 else None

    shape_feature_columns = []
    other_feature_columns = []
    for column in train_data.columns:
        # Skip labels that cannot follow the feature naming scheme
        # (e.g. clinical columns, label columns, non-string labels).
        if not isinstance(column, str):
            continue
        name_parts = column.split("_")
        if len(name_parts) < 4:
            continue

        modality, subregion, image_type, feature_class = name_parts[:4]
        if subregion not in tumor_subregion_list:
            continue

        if feature_class == "shape":
            # 1. shape features: only the "original" image type of the reference modality.
            if modality == reference_modality and image_type == "original":
                shape_feature_columns.append(column)
        elif modality in modality_list and image_type in imageType_list:
            # 2. all other features of the selected modalities and image types.
            other_feature_columns.append(column)

    # connect the shape features and other features.
    final_feature_columns = shape_feature_columns + other_feature_columns
    print("There are {} radiomic features in total, including {} shape features and {} other features!".format(len(final_feature_columns), len(shape_feature_columns), len(other_feature_columns)))

    return final_feature_columns




#======================================= BraTS2021 classification task settings =======================================================
def get_classification_tasks_dict_BraTS2021(task_name):
    """
    Perform MGMT classification for BraTS2021 competition.

    Build the dictionary of BraTS2021 classification tasks. Every task starts out
    with its train/test Excel paths only. When ``task_name`` is given, the Excel
    files of that task are loaded and preprocessed, and the task entry is extended
    with "train_data", "test_data_dict", "label_column", "base_results_path" and
    "feature_columns".

    Parameters
    ----------
    task_name : str or None
        Name of the task to load; ``None`` returns the path-only task dictionary.

    Returns
    -------
    dict
        Task name -> classification settings.

    Raises
    ------
    Exception
        If ``task_name`` is not ``None`` and not a defined task.
    """
    basepath = "G://PhDProjects/RadiogenomicsProjects/BraTS2021"
    basic_settings = get_basic_settings()
    feature_filter_dict = basic_settings["feature_filter_dict"]

    # Only the first configured image type is used to name the results folder.
    first_image_type = feature_filter_dict["imageType_list"][0]

    # Folder-name fragment that describes the harmonization setup.
    if basic_settings["harmonization_method"] == "withoutComBat":
        harmonization_detail = "withoutComBat"
    else:
        harmonization_detail = (
            basic_settings["harmonization_method"]
            + "_" + basic_settings["harmonization_label"]
            + "_refbat-" + str(basic_settings["harmonization_ref_batch"])
        )

    base_results_path = (
        basepath + "/Results/" + first_image_type + "/Results_BraTS2021_MGMT/"
        + harmonization_detail + "-" + basic_settings["imbalanced_data_strategy"]
    )

    ## basic excel path settings
    classification_tasks_dict = {
        "BraTS2021_5.101_segNiiData_base": {
            "train_excel_path": basepath+"/Features/final_metadata/features_BraTS2021_train.xlsx",
            "test_excel_path_dict": {"validation_data": basepath+"/Features/final_metadata/features_BraTS2021_validation.xlsx"},
        },
        # Disabled alternative task (features from the dcm-to-nii converted data):
        # "BraTS2021_5.201_dcmToNiiData_base": {
        #     "train_excel_path": basepath+"/Features/final_metadata/features_BraTS2021_train_dcm_to_nii.xlsx",
        #     "test_excel_path_dict": {"validation_data": basepath+"/Features/final_metadata/features_BraTS2021_validation_dcm_to_nii.xlsx"},
        # },
    }

    # Without a task name, only the path settings are returned.
    if task_name is None:
        return classification_tasks_dict

    if task_name not in classification_tasks_dict:
        raise Exception("Task {} not defined!".format(task_name))

    selected_settings = classification_tasks_dict[task_name]

    # preprocessing train data; the feature columns are chosen from the train table
    train_data = pd.read_excel(selected_settings["train_excel_path"], index_col=0)
    feature_columns = get_feature_columns(train_data, feature_filter_dict)
    train_data = preprocessing_data(train_data, feature_columns)

    # preprocessing test data with the same feature columns
    test_data_dict = {
        description: preprocessing_data(pd.read_excel(test_excel_path, index_col=0), feature_columns)
        for description, test_excel_path in selected_settings["test_excel_path_dict"].items()
    }

    # set and save the settings
    selected_settings.update({
        "train_data": train_data,
        "test_data_dict": test_data_dict,
        "label_column": "MGMT_value",
        "base_results_path": base_results_path,
        "feature_columns": feature_columns,
    })

    return classification_tasks_dict
 

### Define settings for TCGA dataset, for predicting tumor grade, IDH and 1p/19q, and for predicting MGMT.

In [None]:

def get_column_list_for_TCGA(columns):
    """
    For the public features of TCGA dataset, classify the columns into features,
    interested clinical info and the classification targets.

    Parameters
    ----------
    columns : iterable of str
        Column names to classify.

    Returns
    -------
    tuple of three lists
        - feature columns: columns whose first "_"-separated token is one of the
          known feature prefixes, followed by the "ET_related_feature_*" columns;
        - clinical columns ("age", "is_female");
        - classification label columns.
        Columns matching none of these groups are dropped.
    """
    clinical_names = ("age", "is_female")
    label_names = ("is_GBM", "is_IDH_mutant", "is_1p19q_codeleted", "is_MGMT_Methylated")
    feature_prefixes = ("VOLUME", "DIST", "INTENSITY", "HISTO", "SPATIAL", "ECCENTRICITY", "SOLIDITY", "TEXTURE", "TGM")

    prefixed_features = []
    et_related_features = []
    clinical_namelist = []
    classification_label_namelist = []

    for name in columns:
        if name.startswith("ET_related_feature_"):
            et_related_features.append(name)
            continue
        if name in clinical_names:
            clinical_namelist.append(name)
            continue
        if name in label_names:
            classification_label_namelist.append(name)
            continue
        # Remaining columns count as features only if their prefix is a known one.
        if name.partition("_")[0] in feature_prefixes:
            prefixed_features.append(name)

    feature_column_list = prefixed_features + et_related_features
    print("\n len(feature_column_list)={}".format(len(feature_column_list)))
    print("\n clinical_namelist={}".format(clinical_namelist))
    print("\n classification_label_namelist={}".format(classification_label_namelist))

    return feature_column_list, clinical_namelist, classification_label_namelist


#========== classification task settings for TCGA dataset: GBM, IDH and 1p/19q, and MGMT =========================
def _tcga_original_excel_paths(basepath, experiment_class, features_for_TCGA, normalization_method):
    """Return the (train, test) Excel paths of the original TCGA feature split."""
    if (experiment_class not in ("TCGA_IDH", "TCGA_MGMT")
            or features_for_TCGA not in ("extracted_features", "public_features")):
        raise Exception("Undefined experiment class ({}) or feature type ({}) for TCGA dataset!".format(experiment_class, features_for_TCGA))

    metadata_dir = os.path.join(basepath, "Features", "final_metadata")
    if features_for_TCGA == "extracted_features":
        # The extracted features are looked up in one sub-folder per normalization method.
        metadata_dir = os.path.join(metadata_dir, normalization_method)

    target = experiment_class[len("TCGA_"):]  # "IDH" or "MGMT"
    file_stem = "TCGA_{}_{}".format(features_for_TCGA, target)
    return (os.path.join(metadata_dir, file_stem + "_train.xlsx"),
            os.path.join(metadata_dir, file_stem + "_test.xlsx"))


def _append_to_file_stem(file_path, suffix):
    """Insert ``suffix`` between the file name stem and its extension."""
    stem, extension = os.path.splitext(file_path)
    return stem + suffix + extension


def _split_LGG_and_GBM(datadf):
    """Return the (LGG, GBM) rows of ``datadf`` according to its "is_GBM" column."""
    return datadf[datadf["is_GBM"] == 0], datadf[datadf["is_GBM"] == 1]


def _tcga_task_specs(experiment_class):
    """Return (base task, with-clinical-info task, cohort, label column) per task pair."""
    if experiment_class == "TCGA_IDH":
        return [
            # predict LGG vs. GBM
            ("TCGA_1.101_isGBM_base", "TCGA_1.102_isGBM_withClinicalInfo", "all", "is_GBM"),
            # predict IDH mutation status
            ("TCGA_2.101_isIDHMutant_base", "TCGA_2.102_isIDHMutant_withClinicalInfo", "all", "is_IDH_mutant"),
            # predict 1p/19q codeletion status, on all patients and on the LGG patients only
            ("TCGA_3.101_is1p19qCodeleted_base", "TCGA_3.102_is1p19qCodeleted_withClinicalInfo", "all", "is_1p19q_codeleted"),
            ("TCGA-LGG_3.201_is1p19qCodeleted_base", "TCGA-LGG_3.202_is1p19qCodeleted_withClinicalInfo", "LGG", "is_1p19q_codeleted"),
        ]
    if experiment_class == "TCGA_MGMT":
        return [
            # predict MGMT methylated vs. unmethylated for LGG and GBM data
            ("TCGA_4.101_isMGMTMethylated_base", "TCGA_4.102_isMGMTMethylated_withClinicalInfo", "all", "is_MGMT_Methylated"),
            # ... for GBM data only
            ("TCGA-GBM_4.201_isMGMTMethylated_base", "TCGA-GBM_4.202_isMGMTMethylated_withClinicalInfo", "GBM", "is_MGMT_Methylated"),
            # ... for LGG data only
            ("TCGA-LGG_4.301_isMGMTMethylated_base", "TCGA-LGG_4.302_isMGMTMethylated_withClinicalInfo", "LGG", "is_MGMT_Methylated"),
        ]
    raise Exception("Undefined experiments: {}!".format(experiment_class))


def get_classification_tasks_dict_TCGA(experiment_class, task_name):
    """
    Build the settings of all TCGA classification tasks of one experiment class.

    The original train and test Excel tables are merged and split again (70% / 30%)
    with the configured random seed, stratified by "is_1p19q_codeleted" for
    "TCGA_IDH" and by "is_MGMT_Methylated" for "TCGA_MGMT". The re-split tables are
    written next to the original files (suffix "_resplited_randomseed_<seed>") and
    read back from there, so every call rewrites these two Excel files.

    Parameters
    ----------
    experiment_class : str
        "TCGA_IDH" or "TCGA_MGMT".
    task_name : str or None
        Not used: the settings of all tasks are always built. The parameter is
        kept so the signature matches the other task-settings builders.

    Returns
    -------
    dict
        Task name -> settings with the keys "train_data", "test_data_dict",
        "feature_columns", "keep_feature_columns", "label_column",
        "base_results_path", "train_excel_path" and "test_excel_path_dict".

    Raises
    ------
    Exception
        If the experiment class or the configured feature type is not supported.
    """
    # basic paths
    basepath = "G://PhDProjects/RadiogenomicsProjects/GliomasSubtypes"
    basic_settings = get_basic_settings()
    features_for_TCGA = basic_settings["features_for_TCGA"]
    normalization_method = "" if features_for_TCGA == "public_features" else basic_settings["normalization_method"]
    feature_filter_dict = basic_settings["feature_filter_dict"]
    random_seed = basic_settings["random_seed"]

    # Only the first configured image type is used to name the results folder.
    first_image_type = feature_filter_dict["imageType_list"][0]
    if basic_settings["harmonization_method"] == "withoutComBat":
        harmonization_detail = "withoutComBat"
    else:
        harmonization_detail = (
            basic_settings["harmonization_method"]
            + "_" + basic_settings["harmonization_label"]
            + "_refbat-" + str(basic_settings["harmonization_ref_batch"])
        )
    base_results_path = (
        basepath + "/Results_randomseed" + str(random_seed) + "/" + first_image_type
        + "/" + experiment_class + "_" + features_for_TCGA + "_" + normalization_method
        + "/" + harmonization_detail + "-" + basic_settings["imbalanced_data_strategy"]
    )

    ##--------------------- Prepare the data -------------------
    # Validates experiment_class / features_for_TCGA before any file is touched.
    original_train_path, original_test_path = _tcga_original_excel_paths(
        basepath, experiment_class, features_for_TCGA, normalization_method)
    task_specs = _tcga_task_specs(experiment_class)

    ## ==================== Resplit the data, stratified by the main label of the experiment. ====================
    stratify_column = "is_1p19q_codeleted" if experiment_class == "TCGA_IDH" else "is_MGMT_Methylated"

    data = pd.concat(
        [pd.read_excel(original_train_path, index_col=0), pd.read_excel(original_test_path, index_col=0)],
        axis=0)
    train_data, test_data = train_test_split(
        data, stratify=data[stratify_column], test_size=0.3, random_state=random_seed)

    resplit_suffix = "_resplited_randomseed_" + str(random_seed)
    train_data_excel_path = _append_to_file_stem(original_train_path, resplit_suffix)
    test_data_excel_path = _append_to_file_stem(original_test_path, resplit_suffix)
    train_data.to_excel(train_data_excel_path)
    test_data.to_excel(test_data_excel_path)
    print("\n\n======Resplit the train and test data, stratified by {}, random_seed={}=======".format(stratify_column, random_seed))
    print("-Before split: data.shape={}; After split, train_data.shape={}, test_data.shape={}".format(data.shape, train_data.shape, test_data.shape))

    # test data
    test_excel_path_dict = {"test_data": test_data_excel_path}

    ## === preprocess the train data ===
    # The re-split table is read back from disk, as the test tables are below.
    train_data = pd.read_excel(train_data_excel_path, index_col=0)

    # get different kinds of feature columns
    public_feature_namelist, clinical_namelist, _ = get_column_list_for_TCGA(train_data.columns)
    if features_for_TCGA == "public_features":
        feature_namelist = public_feature_namelist
    else:
        feature_namelist = get_feature_columns(train_data, feature_filter_dict)

    train_data = preprocessing_data(train_data, feature_namelist)
    train_data_LGG, train_data_GBM = _split_LGG_and_GBM(train_data)
    train_data_by_cohort = {"all": train_data, "LGG": train_data_LGG, "GBM": train_data_GBM}

    ## ==== preprocess the test dataset ===
    test_data_dict_by_cohort = {"all": {}, "LGG": {}, "GBM": {}}
    for description, test_excel_path in test_excel_path_dict.items():
        test_data = preprocessing_data(pd.read_excel(test_excel_path, index_col=0), feature_namelist)
        test_data_LGG, test_data_GBM = _split_LGG_and_GBM(test_data)
        test_data_dict_by_cohort["all"][description] = test_data
        test_data_dict_by_cohort["LGG"][description] = test_data_LGG
        test_data_dict_by_cohort["GBM"][description] = test_data_GBM

    #-------------------------- Define the classification tasks ----------------------------------
    classification_tasks_dict = {}
    for base_task, clinical_task, cohort, label_column in task_specs:
        # Each target is predicted twice: from the image features alone, and with
        # the clinical columns always kept in addition.
        for task, keep_feature_columns in ((base_task, []), (clinical_task, clinical_namelist)):
            classification_tasks_dict[task] = {
                "train_data": train_data_by_cohort[cohort],
                "test_data_dict": test_data_dict_by_cohort[cohort],
                "feature_columns": feature_namelist,
                "keep_feature_columns": keep_feature_columns,
                "label_column": label_column,
                "base_results_path": base_results_path,
                # save the data excel path
                "train_excel_path": train_data_excel_path,
                "test_excel_path_dict": test_excel_path_dict,
            }

    return classification_tasks_dict
   
    

#### Data preprocessing: Important!!! 

In [None]:
def preprocessing_data(datadf, feature_columns):
    """
    Preprocess a feature table before classification.

    The only active step replaces every value of the feature columns by its
    real part (see ``convert_complex_to_real``).

    Parameters
    ----------
    datadf : pandas.DataFrame
        Table that contains the feature columns.
    feature_columns : list of str
        Names of the feature columns to convert.

    Returns
    -------
    pandas.DataFrame
        The converted copy returned by ``convert_complex_to_real``.
    """
    # NaN values are not filled here; the zero-filling step below is disabled.
    # datadf.fillna(value=0, inplace=True)
    return convert_complex_to_real(datadf, feature_columns)

### Main 

In [None]:
def main():
    """
    Run the binary classification experiments selected in the basic settings.

    The experiment class and the task list are read from ``get_basic_settings()``.
    An empty or missing task list runs every task of the experiment class; a
    single task name given as a plain string is treated as a one-element list.

    Raises
    ------
    KeyError
        If a requested task is not defined for the experiment class.
    """
    basic_settings = get_basic_settings()
    print("\n === basic_settings={} =======".format(basic_settings))

    # choose the experiment settings.
    experiment_class = basic_settings["experiment_class"]
    task_list = basic_settings["task_list"]

    # A bare string would otherwise be iterated character by character.
    if isinstance(task_list, str):
        task_list = [task_list]

    # if the task list in the settings is empty (or None), then all the tasks
    # in this experiment class will be done!
    if not task_list:
        task_list = list(get_classification_tasks_dict(experiment_class, task_name=None))

    # Perform the classification task one by one!
    for task in task_list:
        # The settings are rebuilt per task with the task name passed to the builder.
        classification_tasks_dict = get_classification_tasks_dict(experiment_class, task)
        if task not in classification_tasks_dict:
            raise KeyError("Task {} not defined for experiment class {}!".format(task, experiment_class))

        # Perform classification
        perform_binary_classification(task, classification_tasks_dict[task], basic_settings)

In [None]:
# Start the experiments only when this file is executed directly (as a script or
# in a notebook kernel), not when it is imported as a module.
if __name__ == "__main__":
    main()