# Preprocessing Step


In [None]:
! pip install -U pip
! pip install -U clearml==0.16.2rc0
! pip install -U pandas==1.0.4
! pip install -U numpy==1.18.4

In [None]:
import pandas as pd
import numpy as np
from collections import Counter

from clearml import Task

## Configure Task
Instantiate a ClearML Task using `Task.init`. 

A configuration dictionary is connected to the task using `Task.connect`. This enables the [pipeline controller](https://github.com/allegroai/clearml/blob/master/examples/frameworks/pytorch/notebooks/table/tabular_ml_pipeline.ipynb) to access this task's configuration and override its values when the pipeline is executed. 

In [None]:
task = Task.init(project_name="Tabular Example", task_name="tabular preprocessing")
logger = task.get_logger()

# Connecting the defaults to the task enables configuration override by clearml.
configuration_dict = task.connect(
    {
        "data_task_id": "39fbf86fc4a341359ac6df4aa70ff91b",
        "fill_categorical_NA": True,
        "fill_numerical_NA": True,
    }
)

# Print the actual configuration (after override in remote mode).
print(configuration_dict)

## Get Data

ClearML retrieves the data that will be processed. First, the data task is fetched using `Task.get_task`, passing the task's ID from the configuration dictionary. Then the data task's artifacts are accessed in order to retrieve the training and validation sets. 


In [None]:
# Look up the data task by the ID stored in the configuration.
data_task = Task.get_task(configuration_dict.get("data_task_id"))

# Load each split from the data task's artifacts, then discard the
# "Unnamed: 0" column.
train_set = data_task.artifacts["train_data"].get()
train_set = train_set.drop(columns=["Unnamed: 0"])
val_set = data_task.artifacts["val_data"].get()
val_set = val_set.drop(columns=["Unnamed: 0"])

# Log the first rows of the raw training set.
raw_preview = train_set.head()
logger.report_table(
    title="Trainset - raw",
    series="pandas DataFrame",
    iteration=0,
    table_plot=raw_preview,
)

## Preprocess Data

In [None]:
# Remove hour and year from DateTime data
def change_time_format(data_frame):
    """Replace the ``DateTime`` column with a ``Month`` column.

    Args:
        data_frame: DataFrame with a ``DateTime`` column parseable by
            ``pd.to_datetime``. It is modified in place.

    Returns:
        The same DataFrame object, with ``Month`` (month number, object dtype)
        added and ``DateTime`` dropped.
    """
    timestamp = pd.to_datetime(data_frame["DateTime"])
    # `.dt.month` keeps the frame's own index. Building a fresh DataFrame from
    # a list would get a 0..n-1 index and misalign (producing NaN) whenever
    # the input frame is not indexed 0..n-1.
    data_frame["Month"] = timestamp.dt.month.astype("object")
    data_frame.drop(columns=["DateTime"], inplace=True)
    return data_frame


# Convert DateTime to Month on both splits (train first, then validation).
train_set, val_set = change_time_format(train_set), change_time_format(val_set)

In [None]:
def change_age_format(data_frame):
    """Replace ``AgeuponOutcome`` with a numeric ``Age`` column in months.

    Each non-null value must have the form ``"<integer> <unit>"`` where the
    unit contains ``day``, ``week``, ``month`` or ``year`` (e.g. ``"2 years"``).
    Null values stay null.

    Args:
        data_frame: DataFrame with an ``AgeuponOutcome`` column. It is
            modified in place.

    Returns:
        The same DataFrame object, with ``Age`` (float32) added and
        ``AgeuponOutcome`` dropped.

    Raises:
        ValueError: If a non-null value has an unrecognised time unit
            (previously this silently reused the multiplier of the prior row).
    """
    # Number of months represented by one of each unit.
    months_per_unit = {
        "day": 1.0 / 30,
        "week": 1.0 / 4,
        "month": 1.0,
        "year": 12.0,
    }

    months_age = []
    for val in data_frame["AgeuponOutcome"]:
        if pd.isnull(val):
            months_age.append(np.nan)
            continue
        amount, time_type = val.split(" ")
        # Substring match so both singular and plural unit names are accepted.
        mult = next(
            (factor for unit, factor in months_per_unit.items() if unit in time_type),
            None,
        )
        if mult is None:
            raise ValueError('Unrecognized age unit in "{}"'.format(val))
        months_age.append(int(amount) * mult)

    # Reuse the frame's index so values stay on their own rows even when the
    # frame is not indexed 0..n-1.
    data_frame["Age"] = pd.Series(
        months_age, index=data_frame.index, dtype=np.float32
    )
    data_frame.drop(columns=["AgeuponOutcome"], inplace=True)
    return data_frame


# Convert AgeuponOutcome to Age on both splits (train first, then validation).
train_set, val_set = change_age_format(train_set), change_age_format(val_set)

In [None]:
def change_sex_format(data_frame):
    """Split ``SexuponOutcome`` into separate ``Sex`` and ``Neutered`` columns.

    A value such as ``"Neutered Male"`` becomes ``Sex="Male"`` and
    ``Neutered="Yes"``; a first word other than ``Neutered``/``Spayed`` gives
    ``Neutered="No"``. Null values and values containing ``"Unknown"`` become
    missing in both new columns.

    Args:
        data_frame: DataFrame with a ``SexuponOutcome`` column. It is
            modified in place.

    Returns:
        The same DataFrame object, with ``Sex`` and ``Neutered`` added and
        ``SexuponOutcome`` dropped.
    """
    sex = []
    neutered = []
    for val in data_frame["SexuponOutcome"]:
        if pd.isnull(val) or "Unknown" in val:
            sex.append(np.nan)
            neutered.append(np.nan)
            continue
        status, animal_sex = val.split(" ")
        neutered.append("Yes" if status in ("Neutered", "Spayed") else "No")
        sex.append(animal_sex)

    # Reuse the frame's index so values stay on their own rows even when the
    # frame is not indexed 0..n-1.
    data_frame["Sex"] = pd.Series(sex, index=data_frame.index)
    data_frame["Neutered"] = pd.Series(neutered, index=data_frame.index)
    data_frame.drop(columns=["SexuponOutcome"], inplace=True)
    return data_frame


# Split SexuponOutcome on both splits (train first, then validation).
train_set, val_set = change_sex_format(train_set), change_sex_format(val_set)

In [None]:
# Remove irrelevant columns
def remove_columns(data_frame, list_columns_names=None):
    """Drop the named columns from ``data_frame`` in place.

    Args:
        data_frame: DataFrame to modify.
        list_columns_names: Column names to drop; ``None`` leaves the frame
            untouched.

    Returns:
        The same DataFrame object that was passed in.
    """
    if list_columns_names is None:
        return data_frame
    data_frame.drop(columns=list_columns_names, inplace=True)
    return data_frame


# Drop the same set of columns from both splits.
train_set, val_set = (
    remove_columns(train_set, ["Name", "OutcomeSubtype", "AnimalID"]),
    remove_columns(val_set, ["Name", "OutcomeSubtype", "AnimalID"]),
)

# Log the first rows of the training set after the column transformations.
preprocessed_preview = train_set.head()
logger.report_table(
    title="Trainset - after preprocessing",
    series="pandas DataFrame",
    iteration=0,
    table_plot=preprocessed_preview,
)

## *Fill NA Values*

In [None]:
# Independent copies of the training set's object-dtype and numeric columns.
object_columns = train_set.select_dtypes(include="object").copy()
numerical_columns = train_set.select_dtypes(include="number").copy()

Notice that the configuration dictionary is read below to get the value of `fill_categorical_NA`. This value can be overridden by the pipeline controller. 

In [None]:
# Fill missing categorical values with the most frequent value observed in
# the training set (the same value is applied to the validation set).
if configuration_dict.get("fill_categorical_NA", True):
    for col in object_columns.columns:
        observed = object_columns[col].dropna()
        # Skip columns with nothing missing, and columns with no observed
        # value to impute from.
        if observed.empty or len(observed) == len(object_columns[col]):
            continue
        # Count only non-null values so NaN itself can never be chosen as the
        # replacement.
        most_common = Counter(observed).most_common(1)[0][0]
        print(
            'Column "{}": replacing null values with "{}"'.format(col, most_common)
        )
        # Assign the result back: an in-place fillna on `frame[col]` may act
        # on a temporary copy and leave the frame unchanged.
        train_set[col] = train_set[col].fillna(most_common)
        val_set[col] = val_set[col].fillna(most_common)

Notice that the configuration dictionary is read below to get the value of `fill_numerical_NA`. This value can be overridden by the pipeline controller. 

In [None]:
# Fill missing numerical values with the training-set median of the column
# (the same value is applied to the validation set).
if configuration_dict.get("fill_numerical_NA", True):
    for col in numerical_columns.columns:
        if numerical_columns[col].isnull().sum() == 0:
            continue
        median_val = numerical_columns[col].median()
        # An all-null column has no median; there is nothing to fill with.
        if pd.isnull(median_val):
            continue
        print(
            'Column "{}": replacing null values with "{}"'.format(col, median_val)
        )
        # Assign the result back: an in-place fillna on `frame[col]` may act
        # on a temporary copy and leave the frame unchanged.
        train_set[col] = train_set[col].fillna(median_val)
        val_set[col] = val_set[col].fillna(median_val)

In [None]:
# Drop any rows that still contain NA values (e.g. when filling was disabled).
train_set.dropna(inplace=True)
val_set.dropna(inplace=True)

# Report the table only if at least one of the fill options is enabled.
any_fill_enabled = any(
    configuration_dict.get(flag, True)
    for flag in ("fill_categorical_NA", "fill_numerical_NA")
)
if any_fill_enabled:
    filled_preview = train_set.head()
    logger.report_table(
        title="Trainset - after filling missing values",
        series="pandas DataFrame",
        iteration=0,
        table_plot=filled_preview,
    )

## *Labels Encoding*

In [None]:
# Stack both splits so the category encoding is shared between them.
all_data = pd.concat([train_set, val_set])

# Map each OutcomeType category to its position in the category list.
outcome_as_category = all_data["OutcomeType"].astype("category")
outcome_categories = outcome_as_category.cat.categories
outcome_dict = dict(zip(outcome_categories, range(len(outcome_categories))))

task.upload_artifact("Outcome dictionary", outcome_dict)

In [None]:
# Remember where the training rows end before `train_set` is rebound below.
n_train_rows = len(train_set.index)

# Replace every object column with its integer category codes.
for col in object_columns.columns:
    as_category = all_data[col].astype("category")
    all_data[col] = as_category.cat.codes

# Split the combined frame back into its two parts by position.
train_set = all_data.iloc[:n_train_rows, :]
val_set = all_data.iloc[n_train_rows:, :]

encoded_preview = train_set.head()
logger.report_table(
    title="Trainset - after labels encoding",
    series="pandas DataFrame",
    iteration=0,
    table_plot=encoded_preview,
)

In [None]:
# Make every object column except the OutcomeType target categorical, and
# record how many categories each one has.
object_columns_names = object_columns.drop(columns=["OutcomeType"]).columns
columns_categories = {}
for col in object_columns_names:
    all_data[col] = all_data[col].astype("category")
    columns_categories[col] = len(all_data[col].cat.categories)
task.upload_artifact("Categories per column", columns_categories)

In [None]:
# Upload the processed splits as artifacts of this task.
for artifact_name, processed_split in (
    ("train_data", train_set),
    ("val_data", val_set),
):
    task.upload_artifact(artifact_name, artifact_object=processed_split)