### Task 1: Encode category data.

- For the specified columns, drop the rows with NaN values;
- Encode the category data as one-hot (dummy) columns with `pd.get_dummies`;
- Rename the columns for easy reference;

In [None]:
import pandas as pd

from mySettings import get_category_data_encoder_setting_dict

In [None]:
def encode_category_data(data, category_column_list):
    """
    One-hot encode the selected category columns and append the result to the data.

    The indicator (dummy) columns are produced by ``pd.get_dummies`` with
    ``drop_first=False``, so every category level gets its own column. The
    original category columns are kept; the indicator columns are added to
    the right of the existing ones.

    Args:
        data: DataFrame holding the columns to encode.
        category_column_list: names of the columns in ``data`` to encode.

    Returns:
        A new DataFrame made of ``data`` followed by the indicator columns.
    """
    category_frame = data[category_column_list]
    indicator_frame = pd.get_dummies(category_frame, drop_first=False)

    # Column-wise concatenation: rows are aligned on the index.
    return pd.concat([data, indicator_frame], axis=1)

def count_null_values(data_df):
    """
    Print a summary of the columns that contain null values.

    The report states the total number of columns, how many of them contain
    at least one null value, and the per-column null counts of those columns.
    Nothing is returned.
    """
    nulls_per_column = data_df.isnull().sum()

    # Keep only the columns that actually have missing entries.
    columns_with_nulls = nulls_per_column[nulls_per_column > 0]

    total_columns = data_df.shape[1]
    affected_columns = len(columns_with_nulls.keys())
    report = "\n Among the {} features, there are {} features have null values, that is, \n - {}".format(
        total_columns, affected_columns, columns_with_nulls
    )
    print(report)
    
def main_encode_category_data(category_data_encoder_setting):
    """
    Run the encoding pipeline for one task and save the result to Excel.

    Steps:
    1) Read the original Excel file, using its first column as the index;
    2) If "drop_patient_id_list" is a list, drop the rows with those index labels;
    3) If "drop_nan_column_list" is a list, drop the rows that have null values in those columns;
    4) Fill every remaining null value with the string "unknown";
    5) Append one-hot (dummy) columns for the category columns via encode_category_data;
    6) Rename the columns for easy reference and write the result to Excel.

    Args:
        category_data_encoder_setting: mapping with the keys
            "original_data_excel", "save_encoded_data_excel_path",
            "drop_patient_id_list", "category_column_list",
            "drop_nan_column_list" and "column_rename_dict".
    """
    # Unpack the settings.
    settings = category_data_encoder_setting
    source_excel = settings["original_data_excel"]
    target_excel = settings["save_encoded_data_excel_path"]
    excluded_patient_ids = settings["drop_patient_id_list"]
    categorical_columns = settings["category_column_list"]
    required_columns = settings["drop_nan_column_list"]
    rename_map = settings["column_rename_dict"]

    # Load the raw table; the first column becomes the row index.
    frame = pd.read_excel(source_excel, index_col=0)
    print("\n- Before processing: data.shape={}.".format(frame.shape))

    # Optionally remove specific patients (e.g. those with bad image quality in TCGA data).
    # A non-list setting skips this step.
    if isinstance(excluded_patient_ids, list):
        frame.drop(excluded_patient_ids, axis=0, inplace=True)
        print("After droping data of {}, data.shape={}".format(excluded_patient_ids, frame.shape))

    # Report the missing values before any row is removed for them.
    count_null_values(frame)

    # Optionally remove the rows that miss a value in one of the required columns,
    # then report what is still missing. A non-list setting skips this step.
    if isinstance(required_columns, list):
        print("\n Droping null values of these columns {}...".format(required_columns))
        frame.dropna(subset=required_columns, inplace=True)
        count_null_values(frame)

    # Remaining nulls (in any column) become the literal "unknown" before encoding.
    frame.fillna(value="unknown", inplace=True)

    # Append the one-hot columns, then apply the column renaming.
    encoded = encode_category_data(frame, categorical_columns).rename(columns=rename_map)

    # Persist the encoded table.
    encoded.to_excel(target_excel)
    print("\n- After processing: data.shape={}.".format(encoded.shape))


#### Main

In [None]:
# Fetch the per-task encoder settings and run the encoding pipeline once for each task.
category_data_encoder_setting_dict=get_category_data_encoder_setting_dict()
for task_name, category_data_encoder_setting in category_data_encoder_setting_dict.items():
    # Announce which task's settings are being processed.
    print("\n===== Encode category data for {}. ======".format(task_name))
    main_encode_category_data(category_data_encoder_setting)


### Task 2: After encoding the data, add subtype columns for TCGA-IDH:
- LGG, IDH mutant, 1p/19q codeleted:1
- LGG, IDH mutant, 1p/19q non-codeleted:2
- LGG, IDH wildtype: 3
- GBM, IDH mutant: 4
- GBM, IDH wildtype: 5

In [None]:
def caculate_tumor_subtype(data):
    """
    Define the tumor subtype from the tumor grade, IDH mutation and 1p/19q codeletion flags.

    Mapping (flag values are compared against 0 and 1):
    - LGG, IDH mutant, 1p/19q codeleted: 1
    - LGG, IDH mutant, 1p/19q non-codeleted: 2
    - LGG, IDH wildtype: 3
    - GBM, IDH mutant: 4
    - GBM, IDH wildtype: 5

    Args:
        data: mapping-like row (e.g. a DataFrame row) providing "is_GBM" and
            "is_IDH_mutant", plus "is_1p19q_codeleted" for IDH-mutant LGG rows.

    Returns:
        Tuple ``(tumor_subtype_description, tumor_subtype)``.

    Raises:
        ValueError: if the flags do not match any of the five subtypes, e.g.
            a flag is missing (NaN) or is neither 0 nor 1.
    """
    is_gbm = data["is_GBM"]
    is_idh_mutant = data["is_IDH_mutant"]

    if is_gbm == 0 and is_idh_mutant == 1:
        # The 1p/19q status only matters (and is only read) for IDH-mutant LGG.
        is_codeleted = data["is_1p19q_codeleted"]
        if is_codeleted == 1:
            return "LGG, IDH mutant, 1p/19q codeleted", 1
        if is_codeleted == 0:
            return "LGG, IDH mutant, 1p/19q non-codeleted", 2
        # Fail loudly instead of reaching the end with no subtype assigned.
        raise ValueError(
            "Cannot determine tumor subtype: is_GBM={!r}, is_IDH_mutant={!r}, "
            "is_1p19q_codeleted={!r}.".format(is_gbm, is_idh_mutant, is_codeleted)
        )

    if is_gbm == 0 and is_idh_mutant == 0:
        return "LGG, IDH wildtype", 3

    if is_gbm == 1 and is_idh_mutant == 1:
        return "GBM, IDH mutant", 4

    if is_gbm == 1 and is_idh_mutant == 0:
        return "GBM, IDH wildtype", 5

    # No combination matched: the grade or IDH flag is outside {0, 1}.
    raise ValueError(
        "Cannot determine tumor subtype: is_GBM={!r}, is_IDH_mutant={!r}.".format(
            is_gbm, is_idh_mutant
        )
    )

def get_tumor_subtype_description(data):
    """
    Return the textual tumor subtype of one row.

    Row-wise helper for DataFrame.apply: keeps only the description part of
    the pair computed by caculate_tumor_subtype.
    """
    return caculate_tumor_subtype(data)[0]

def get_tumor_subtype(data):
    """
    Return the numeric tumor subtype (a value in {1,2,3,4,5}) of one row.

    Row-wise helper for DataFrame.apply: keeps only the numeric part of the
    pair computed by caculate_tumor_subtype.
    """
    return caculate_tumor_subtype(data)[1]
    
    
def add_tumor_subtype_columns(data_excel_path):
    """
    Add the tumor subtype columns to an Excel file, overwriting it in place.

    Reads the table at ``data_excel_path``, adds "tumor_subtype_description"
    (text) and "tumor_subtype" (number) computed row by row from the tumor
    grade, IDH mutation and 1p/19q codeletion columns, and writes the table
    back to the same path. The head of the table is displayed before and
    after the change.

    Args:
        data_excel_path: path of the Excel file to read and overwrite.
    """
    subtype_frame = pd.read_excel(data_excel_path)
    # NOTE(review): the file is read without index_col and written back with the
    # default index, so the saved file presumably gains an extra index column on
    # each run -- confirm this is what downstream readers expect.

    print("\n Before adding subtype columns.....")
    display(subtype_frame.head())

    # Reclassify every row; the description column is added first, then the numeric code.
    new_columns = (
        ("tumor_subtype_description", get_tumor_subtype_description),
        ("tumor_subtype", get_tumor_subtype),
    )
    for column_name, row_function in new_columns:
        subtype_frame[column_name] = subtype_frame.apply(row_function, axis=1)

    subtype_frame.to_excel(data_excel_path)

    print("\n After adding subtype columns.....")
    display(subtype_frame.head())
    



#### Main

In [None]:
# Look up the settings of the "encode_category_data_TCGA-IDH" task and add the
# subtype columns to the Excel file found at its "save_encoded_data_excel_path".
category_data_encoder_setting_dict=get_category_data_encoder_setting_dict()
category_data_encoder_setting=category_data_encoder_setting_dict["encode_category_data_TCGA-IDH"]
encoded_data_excel_path=category_data_encoder_setting["save_encoded_data_excel_path"]
add_tumor_subtype_columns(encoded_data_excel_path)