### Perform feature selection and classification to predict the gene status of glioma patients.

For example, we can use this code and the radiomic features to predict:
- 1. LGG vs. GBM;
- 2. IDH mutant vs. IDH wildtype;
- 3. 1p/19q codeleted vs. 1p/19q intact;
- 4. MGMT methylated vs. MGMT unmethylated.

In [None]:
import pandas as pd
import os

from mySettings import get_basic_settings
from classificationUtils import perform_binary_classification

In [None]:
def get_classification_task_settings():
    """
    Read the basic settings and build the settings of the selected task.

    The experiment class and task name are both taken from
    ``get_basic_settings()``. The task-specific settings are produced by the
    dataset builder that matches the experiment class.

    Returns:
        tuple: ``(task_name, classification_task_settings)``.

    Raises:
        Exception: if the configured experiment class is not one of
            "BraTS2021", "TCGA_IDH" or "TCGA_MGMT".
    """
    basic_settings = get_basic_settings()
    experiment_class = basic_settings["experiment_class"]
    task_name = basic_settings["task_name"]

    # The dataset builders already return the settings of the single
    # requested task, so the result must not be indexed by task_name again.
    if experiment_class == "BraTS2021":
        classification_task_settings = get_classification_tasks_dict_BraTS2021(task_name)

    elif experiment_class in ("TCGA_IDH", "TCGA_MGMT"):
        classification_task_settings = get_classification_tasks_dict_TCGA(experiment_class, task_name)

    else:
        # Fail with a clear message instead of an UnboundLocalError below.
        raise Exception("Experiment class {} not defined!".format(experiment_class))

    return task_name, classification_task_settings

### Define classification settings for BraTS2021 dataset.

In [None]:

def convert_complex_to_real(datadf, feature_namelist):
    """
    Return a copy of ``datadf`` whose feature columns hold only real parts.

    Every value in the selected columns is passed through ``complex(...)``,
    so both complex numbers and their string form (e.g. "(1+2j)") are
    accepted, and only the real part is kept. The input frame is not modified.

    Args:
        datadf: pandas DataFrame containing the feature columns.
        feature_namelist: list of column names to convert.

    Returns:
        A new DataFrame with the selected columns converted; all other
        columns are copied unchanged.
    """
    def _real_part(value):
        return complex(value).real

    datadf_converted = datadf.copy()
    # Column-wise Series.map is used instead of DataFrame.applymap, which is
    # deprecated in recent pandas releases; the element-wise result is the same.
    datadf_converted[feature_namelist] = datadf[feature_namelist].apply(
        lambda column: column.map(_real_part)
    )

    return datadf_converted

def preprocessing_data(datadf, feature_columns):
    """
    Clean a feature table before classification.

    The feature columns are first reduced to their real parts (on a copy of
    the input), then every missing value in the whole table is replaced by 0.

    Args:
        datadf: pandas DataFrame with the raw data.
        feature_columns: list of feature column names to convert to real values.

    Returns:
        The cleaned DataFrame; the input frame is left untouched.
    """
    real_valued = convert_complex_to_real(datadf, feature_columns)

    # NOTE(review): the fill is applied to every column, not only the feature
    # columns, so missing entries in other columns also become 0.
    return real_valued.fillna(value=0)



def get_feature_columns(train_data, modality_list=("t1", "t1ce", "t2", "flair"), tumor_subregion_list=None):
    """
    Select the radiomic feature columns of a feature table.

    A column is kept when:
      1. its name starts with "<modality>_" for one of ``modality_list``;
      2. ``tumor_subregion_list`` is None, or the second "_"-separated token
         of the name is in ``tumor_subregion_list``;
      3. it is not a shape feature (fourth "_"-separated token equal to
         "shape") of any modality except the last one in ``modality_list``.
         Shape features are the same for all modalities, so they are kept
         for one modality only.

    The result follows the column order of ``train_data``, so it is the same
    on every run.

    Args:
        train_data: object exposing ``.columns`` (e.g. a pandas DataFrame).
        modality_list: sequence of modality name prefixes.
        tumor_subregion_list: optional collection of subregion names.

    Returns:
        list of selected column names, without duplicates.
    """
    modality_prefixes = tuple(modality_name + "_" for modality_name in modality_list)
    # Shape features of these modalities duplicate those of the last modality.
    redundant_shape_prefixes = modality_prefixes[:-1]

    final_feature_columns = []
    already_selected = set()
    for column in train_data.columns:
        # Non-string labels cannot match a modality prefix.
        if not isinstance(column, str) or column in already_selected:
            continue
        if not column.startswith(modality_prefixes):
            continue

        name_tokens = column.split("_")
        if tumor_subregion_list is not None and name_tokens[1] not in tumor_subregion_list:
            continue

        # Guard the index: short names simply are not shape features.
        is_shape_feature = len(name_tokens) > 3 and name_tokens[3] == "shape"
        if is_shape_feature and column.startswith(redundant_shape_prefixes):
            continue

        already_selected.add(column)
        final_feature_columns.append(column)

    print("There are {} radiomic features in total!".format(len(final_feature_columns)))

    return final_feature_columns




#======================================= BraTS2021 classification task settings =======================================================
def get_classification_tasks_dict_BraTS2021(task_name):
    """
    Perform MGMT classification for BraTS2021 competition.

    Build the complete settings of one BraTS2021 classification task: load
    and preprocess the train/test excel files of the task, select the feature
    columns from the train data, and make sure the results folder exists.

    Args:
        task_name: key of the task, e.g. "BraTS2021_501.01_segNiiData_base".

    Returns:
        dict with the keys "train_excel_path", "test_excel_path_dict",
        "train_data", "test_data_dict", "label_column", "base_results_path"
        and "feature_columns".

    Raises:
        Exception: if ``task_name`` is not one of the defined tasks.
    """
    basepath = "G://PhDProjects/RadiogenomicsProjects/BraTS2021"
    basic_settings = get_basic_settings()
    base_results_path = basepath+"/Results/Results_BraTS2021_MGMT/"+basic_settings["harmonization_method"]+"_"+basic_settings["harmonization_label"]

    ## basic excel path settings
    classification_tasks_dict = {
        "BraTS2021_501.01_segNiiData_base": {
            "train_excel_path": basepath+"/Features/final_metadata/features_BraTS2021_train.xlsx",
            "test_excel_path_dict": {"test_data": basepath+"/Features/final_metadata/features_BraTS2021_validation.xlsx"},
        },
        "BraTS2021_601.01_dcmToNiiData_base": {
            "train_excel_path": basepath+"/Features/final_metadata/features_BraTS2021_train_dcm_to_nii.xlsx",
            "test_excel_path_dict": {"test_data": basepath+"/Features/final_metadata/features_BraTS2021_validation_dcm_to_nii.xlsx"},
        },
        # Disabled task, kept for reference:
        # "BraTS2021_701.01_segNiiData-zscore_base": {
        #     "train_excel_path": basepath+"/Features/final_metadata/features_BraTS2021_train_zscore.xlsx",
        #     "test_excel_path_dict": {"test_data": basepath+"/Features/final_metadata/features_BraTS2021_validation_zscore.xlsx"},
        # },
    }

    # Reject an unknown task before any excel file is read.
    if task_name not in classification_tasks_dict:
        raise Exception("Task {} not defined!".format(task_name))

    classification_settings = classification_tasks_dict[task_name]

    # preprocessing train data; the feature columns are derived from it
    train_data = pd.read_excel(classification_settings["train_excel_path"], index_col=0)
    feature_columns = get_feature_columns(train_data)
    train_data = preprocessing_data(train_data, feature_columns)

    # preprocessing test data with the same feature columns as the train data
    test_data_dict = {}
    for description, test_excel_path in classification_settings["test_excel_path_dict"].items():
        test_data = pd.read_excel(test_excel_path, index_col=0)
        test_data_dict[description] = preprocessing_data(test_data, feature_columns)

    # Other settings like "train_data", "test_data_dict", "label_column", "base_results_path", "feature_columns"
    classification_settings["train_data"] = train_data
    classification_settings["test_data_dict"] = test_data_dict
    classification_settings["label_column"] = "MGMT_value"
    classification_settings["base_results_path"] = base_results_path
    classification_settings["feature_columns"] = feature_columns

    # exist_ok avoids failing when the folder appears between check and creation.
    os.makedirs(base_results_path, exist_ok=True)

    return classification_settings
 

### Define settings for TCGA dataset, for predicting tumor grade, IDH and 1p/19q.

In [None]:

def get_column_list_for_TCGA(columns):
    """
    For TCGA dataset, classify the columns into features, interested clinical info and the classification targets.

    Each name is assigned to at most one group, checked in this order:
    names starting with "ET_related_feature_", clinical names, classification
    label names, then names whose first "_"-separated token is a known
    feature prefix. Names matching none of these are ignored.

    Args:
        columns: iterable of column names.

    Returns:
        tuple of four lists, each in input order:
        (feature names, ET-related feature names, clinical names, label names).
    """
    et_marker = "ET_related_feature_"
    clinical_names = ("age", "is_female")
    label_names = ("is_GBM", "is_IDH_mutant", "is_1p19q_codeleted", "is_MGMT_Methylated")
    feature_prefixes = ("VOLUME", "DIST", "INTENSITY", "HISTO", "SPATIAL", "ECCENTRICITY", "SOLIDITY", "TEXTURE", "TGM")

    features, et_features, clinical, labels = [], [], [], []

    for name in columns:
        # Pick the destination group first, then append once.
        if name.startswith(et_marker):
            destination = et_features
        elif name in clinical_names:
            destination = clinical
        elif name in label_names:
            destination = labels
        elif name.split("_")[0] in feature_prefixes:
            destination = features
        else:
            continue
        destination.append(name)

    # Report the result of the grouping.
    summary_lines = (
        "\n len(feature_namelist)={}".format(len(features)),
        "\n len(ET_related_feature_namelist)={}".format(len(et_features)),
        "\n clinical_namelist={}".format(clinical),
        "\n classification_label_namelist={}".format(labels),
    )
    for line in summary_lines:
        print(line)

    return features, et_features, clinical, labels


#=====================  classification task settings for TCGA dataset: GBM, IDH, 1p/19q and MGMT ========================================
def get_classification_tasks_dict_TCGA(experiment_class, task_name):
    """
    Build the complete settings of one TCGA classification task.

    The train/test excel files are chosen from ``experiment_class`` and the
    "features_for_TCGA" basic setting, then loaded and preprocessed. The data
    are additionally split into LGG (``is_GBM == 0``) and GBM
    (``is_GBM == 1``) subsets, and the tasks of the experiment class are
    defined on top of them.

    Args:
        experiment_class: "TCGA_IDH" or "TCGA_MGMT".
        task_name: key of the task, e.g. "TCGA_2.01_isIDHMutant_base".

    Returns:
        dict with the keys "train_data", "test_data", "feature_columns",
        "label_column", "base_results_path", "train_excel_path",
        "test_excel_path_dict" and "test_data_dict".

    Raises:
        Exception: if the experiment class / feature type combination is
            unknown, or if ``task_name`` is not defined for the experiment.
    """
    # basic paths
    basepath = "G://PhDProjects/RadiogenomicsProjects/GliomasSubtypes"
    basic_settings = get_basic_settings()
    # The experiment_class argument is used as passed by the caller; it is no
    # longer overwritten from the basic settings.
    features_for_TCGA = basic_settings["features_for_TCGA"]
    base_results_path = basepath+"/Results/"+experiment_class+"_"+features_for_TCGA+"/"+basic_settings["harmonization_method"]+"_"+basic_settings["harmonization_label"]

    ##--------------------- Prepare the data -------------------
    # The excel files are named TCGA_<feature type>_<target>_<train|test>.xlsx.
    target_by_experiment = {"TCGA_IDH": "IDH", "TCGA_MGMT": "MGMT"}
    if experiment_class not in target_by_experiment or features_for_TCGA not in ("extracted_features", "public_features"):
        raise Exception("Unfined feature type for TCGA dataset!")

    excel_stem = "TCGA_" + features_for_TCGA + "_" + target_by_experiment[experiment_class]
    train_data_excel_path = os.path.join(basepath, "Features", "final_metadata", excel_stem + "_train.xlsx")
    test_data_excel_path = os.path.join(basepath, "Features", "final_metadata", excel_stem + "_test.xlsx")

    ## reading the train data and test data.
    train_data = pd.read_excel(train_data_excel_path, index_col=0)
    test_data = pd.read_excel(test_data_excel_path, index_col=0)
    print("\n****Train and test set split!**** \nTrain: {} patients; \nTest: {} patients.".format(train_data.shape[0], test_data.shape[0]))

    # get different kinds of feature columns;
    feature_namelist, ET_related_feature_namelist, clinical_namelist, _ = get_column_list_for_TCGA(train_data)
    if features_for_TCGA != "public_features":
        # Extracted features are selected by modality prefix instead.
        feature_namelist = get_feature_columns(train_data, modality_list=["t1", "t1Gd", "t2", "flair"], tumor_subregion_list=None)

    ## preprocessing data
    train_data = preprocessing_data(train_data, feature_namelist)
    test_data = preprocessing_data(test_data, feature_namelist)

    ## Filter the LGG data for predicting 1p/19q status.
    train_data_LGG = train_data[train_data["is_GBM"] == 0]
    test_data_LGG = test_data[test_data["is_GBM"] == 0]
    print("\n****Train and test set split for LGG patients!**** \nTrain: {} patients; \nTest: {} patients.".format(train_data_LGG.shape[0], test_data_LGG.shape[0]))

    ## Filter the GBM data for predicting MGMT methylation status.
    train_data_GBM = train_data[train_data["is_GBM"] == 1]
    test_data_GBM = test_data[test_data["is_GBM"] == 1]
    print("\n****Train and test set split for GBM patients!**** \nTrain: {} patients; \nTest: {} patients.".format(train_data_GBM.shape[0], test_data_GBM.shape[0]))

    #-------------------------- Define the classification tasks ----------------------------------
    all_patients = (train_data, test_data)
    lgg_patients = (train_data_LGG, test_data_LGG)
    gbm_patients = (train_data_GBM, test_data_GBM)

    # Each entry: (task name, (train, test) data, feature columns, label column).
    if experiment_class == "TCGA_IDH":
        task_specs = [
            #========= predict LGG vs. GBM  ========
            ("TCGA_1.01_isGBM_base", all_patients, feature_namelist, "is_GBM"),
            ("TCGA_1.02_isGBM_with_clinicalInfo", all_patients, feature_namelist+clinical_namelist, "is_GBM"),
            #========= predict IDH mutation status  =========
            ("TCGA_2.01_isIDHMutant_base", all_patients, feature_namelist, "is_IDH_mutant"),
            ("TCGA_2.02_isIDHMutant_with_clinicalInfo", all_patients, feature_namelist+clinical_namelist, "is_IDH_mutant"),
            #========= predict 1p/19q codeletion status =========
            ("TCGA_3.01_is1p19qCodeleted_base", lgg_patients, feature_namelist, "is_1p19q_codeleted"),
            ("TCGA_3.02_is1p19qCodeleted_with_clinicalInfo", lgg_patients, feature_namelist+clinical_namelist, "is_1p19q_codeleted"),
        ]

    elif experiment_class == "TCGA_MGMT":
        task_specs = [
            #========= predict MGMT methylated vs. unmethylated for LGG and GBM data ========
            ("TCGA_4.01_isMGMTMethylated_base", all_patients, feature_namelist, "is_MGMT_Methylated"),
            ("TCGA_4.02_isMGMTMethylated_with_clinicalInfo", all_patients, feature_namelist+clinical_namelist, "is_MGMT_Methylated"),
            #========= predict MGMT methylated vs. unmethylated for GBM data ========
            ("TCGA_5.01_GBM-isMGMTMethylated_base", gbm_patients, feature_namelist, "is_MGMT_Methylated"),
            ("TCGA_5.02_GBM-isMGMTMethylated_with_clinicalInfo", gbm_patients, feature_namelist+clinical_namelist, "is_MGMT_Methylated"),
            #========= predict MGMT methylated vs. unmethylated for GBM data, with ET features ========
            ("TCGA_6.01_GBM-isMGMTMethylated-withETFeatures_base", gbm_patients, feature_namelist+ET_related_feature_namelist, "is_MGMT_Methylated"),
            ("TCGA_6.02_GBM-isMGMTMethylated-withETFeatures_with_clinicalInfo", gbm_patients, feature_namelist+ET_related_feature_namelist+clinical_namelist, "is_MGMT_Methylated"),
        ]

    else:
        raise Exception("Unfined experiments!")

    classification_tasks_dict = {}
    for name, (task_train_data, task_test_data), feature_columns, label_column in task_specs:
        classification_tasks_dict[name] = _make_TCGA_task_settings(
            task_train_data, task_test_data, feature_columns, label_column, base_results_path
        )

    if task_name not in classification_tasks_dict:
        raise Exception("Task {} not defined! Possible task is {}.".format(task_name, classification_tasks_dict.keys()))

    # set and save the settings
    classification_settings = classification_tasks_dict[task_name]
    classification_settings["train_excel_path"] = train_data_excel_path
    classification_settings["test_excel_path_dict"] = {"test_data": test_data_excel_path}
    classification_settings["test_data_dict"] = {"test_data": classification_settings["test_data"]}

    # exist_ok avoids failing when the folder appears between check and creation.
    os.makedirs(base_results_path, exist_ok=True)

    return classification_settings


def _make_TCGA_task_settings(train_data, test_data, feature_columns, label_column, base_results_path):
    """Bundle the data, feature columns, label column and results path of one TCGA task into a dict."""
    return {
        "train_data": train_data,
        "test_data": test_data,
        "feature_columns": feature_columns,
        "label_column": label_column,
        "base_results_path": base_results_path,
    }
                              

### Define settings for TCGA dataset, for predicting MGMT.

### Main 

In [None]:
def main():
    """
    Run the classification experiment selected in the basic settings.

    Reads the basic settings, builds the settings of the configured task
    with the builder that matches the experiment class, and passes them to
    ``perform_binary_classification``.

    Raises:
        Exception: if the configured experiment class is not one of
            "BraTS2021", "TCGA_IDH" or "TCGA_MGMT".
    """
    basic_settings = get_basic_settings()
    print("\n === basic_settings={} =======".format(basic_settings))

    # choose the experiment settings.
    experiment_class = basic_settings["experiment_class"]
    task_name = basic_settings["task_name"]

    if experiment_class == "BraTS2021":
        classification_task_settings = get_classification_tasks_dict_BraTS2021(task_name)

    elif experiment_class in ("TCGA_IDH", "TCGA_MGMT"):
        classification_task_settings = get_classification_tasks_dict_TCGA(experiment_class, task_name)

    else:
        # Without this branch an unknown class would surface as an
        # UnboundLocalError at the call below.
        raise Exception("Experiment class {} not defined!".format(experiment_class))

    # Perform classification
    perform_binary_classification(task_name, classification_task_settings, basic_settings)

In [None]:
# Run the experiment configured in the basic settings when this cell is executed.
main()