### Encode category data.

- For the specified columns, drop the rows with NaN values;
- One-hot encode the category columns (via `pandas.get_dummies`) and append the indicator columns to the data;
- Rename the columns for easy reference.

In [None]:
import pandas as pd

from mySettings import get_category_data_encoder_setting_dict

In [None]:
def encode_category_data(data, category_column_list):
    """
    One-hot encode the given category columns and append the result to the data.

    Every column named in `category_column_list` is expanded into indicator
    columns called "<column>_<value>", one per distinct value (no level is
    dropped). The original columns are kept; the indicator columns are
    appended on the right.

    Parameters
    ----------
    data : pandas.DataFrame
        Input table. It is not modified.
    category_column_list : list
        Labels of the columns in `data` to encode.

    Returns
    -------
    pandas.DataFrame
        A new frame made of `data` followed by the indicator columns.
    """
    # `columns=` forces every listed column to be encoded. Without it,
    # get_dummies only converts object/string/category columns and passes
    # numeric ones through unchanged, which would leave duplicated column
    # labels after the concat below.
    dummy = pd.get_dummies(
        data[category_column_list],
        columns=category_column_list,
        drop_first=False,
    )
    return pd.concat([data, dummy], axis=1)

def main_encode_category_data(category_data_encoder_setting):
    """
    Run the encoding pipeline described by one settings mapping.

    Steps:
    1) Load the Excel file, using its first column as the index;
    2) If "drop_nan_column_list" is a list, drop the rows with NaN in those columns;
    3) Replace every remaining NaN with the string "unknown";
    4) Encode the category columns with `encode_category_data`;
    5) Rename the columns and write the result to the output Excel file.

    `category_data_encoder_setting` must provide the keys
    "original_data_excel", "save_encoded_data_excel_path",
    "category_column_list", "drop_nan_column_list" and "column_rename_dict".
    The function returns nothing; it prints the shape and head of the data
    before and after processing.
    """
    # Pull the five settings out in one go (same lookup order as listed).
    source_path, target_path, category_columns, nan_check_columns, rename_map = (
        category_data_encoder_setting[key]
        for key in (
            "original_data_excel",
            "save_encoded_data_excel_path",
            "category_column_list",
            "drop_nan_column_list",
            "column_rename_dict",
        )
    )

    # Load the raw table and report what we start with.
    table = pd.read_excel(source_path, index_col=0)
    before_report = "\n====Before processing: data.shape={}==== \n{}.".format(table.shape, table.head())
    print(before_report)

    # Row filtering only applies when the setting is an actual list.
    if isinstance(nan_check_columns, list):
        table.dropna(subset=nan_check_columns, inplace=True)

    # Any NaN left over becomes the placeholder "unknown" before encoding.
    table.fillna(value="unknown", inplace=True)
    encoded = encode_category_data(table, category_columns).rename(columns=rename_map)

    # Persist the result and report what we end with.
    encoded.to_excel(target_path)
    after_report = "\n====After processing: data.shape={}===== \n{}.".format(encoded.shape, encoded.head())
    print(after_report)


### Main

In [None]:
# Driver cell: fetch the settings for every task from mySettings, then run the
# full encoding pipeline once per (task name, settings) pair.
category_data_encoder_setting_dict=get_category_data_encoder_setting_dict()
for task_name, category_data_encoder_setting in category_data_encoder_setting_dict.items():
    # Announce the task so each run's before/after printout can be told apart.
    print("\n===== Encode category data for {}. ======".format(task_name))
    main_encode_category_data(category_data_encoder_setting)
