In [None]:
! pip install -U pip
! pip install -U torch==1.5.1
! pip install -U clearml>=0.15.1
! pip install -U pandas==1.0.4
! pip install -U numpy==1.18.4
! pip install -U pathlib2==2.3.5
! pip install -U scikit-learn==0.23.1

In [None]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
import torch
from datetime import datetime
from pathlib2 import Path
from clearml import Task

In [None]:
# Register this run with ClearML and grab its logger for the table reports below.
task = Task.init(project_name="Table Example", task_name="tabular preprocessing")
logger = task.get_logger()

# Default split settings; connecting them to the task enables configuration
# override by clearml.
configuration_dict = task.connect({"test_size": 0.1, "split_random_state": 0})

# Show the configuration actually in effect (after override in remote mode).
print(configuration_dict)

In [None]:
# Download the shelter-animal-outcomes dataset (https://www.kaggle.com/c/shelter-animal-outcomes).
# The dataset aims to improve the understanding of trends in animal outcomes,
# which could help shelters focus their energy on specific animals that need extra help finding a new home.
# "train.csv" is read from this directory, and the processed CSV files are written back to it.
path_to_ShelterAnimal = "./data"

In [None]:
# Load the raw training data and log its first rows to the ClearML task.
raw_csv_path = Path(path_to_ShelterAnimal) / "train.csv"
train_set = pd.read_csv(raw_csv_path)

raw_preview = train_set.head()
logger.report_table(
    title="Trainset - raw",
    series="pandas DataFrame",
    iteration=0,
    table_plot=raw_preview,
)

# **Pre-processing**

In [None]:
# Keep only the month of each "DateTime" entry (hour, day and year are discarded)
# and store it as an object-typed "Month" column.
timestamp = pd.to_datetime(train_set["DateTime"])
months = []
for moment in timestamp:
    months.append(moment.month)
train_set["Month"] = pd.DataFrame(months).astype("object")

In [None]:
# Convert the "AgeuponOutcome" strings ("<amount> <unit>") into a numeric age in months.

# Substring of the unit -> multiplier that converts the amount into months.
_MONTHS_PER_UNIT = (
    ("day", 1.0 / 30),
    ("week", 1.0 / 4),
    ("month", 1.0),
    ("year", 12.0),
)


def _age_to_months(val):
    """Return the age described by ``val`` in months.

    ``val`` is expected to be "<integer amount> <unit>", where the unit contains
    one of "day", "week", "month" or "year". Null values and values with an
    unrecognised unit yield NaN, so that the missing-value step later in the
    notebook can fill them. An unrecognised unit previously reused the
    multiplier left over from the preceding row (or raised NameError on the
    first row).

    Raises:
        ValueError: if ``val`` does not split into exactly two space-separated
            parts, or if the amount is not an integer.
    """
    if pd.isnull(val):
        return np.nan
    amount, time_type = val.split(" ")
    for unit, mult in _MONTHS_PER_UNIT:
        if unit in time_type:
            return int(amount) * mult
    # Unknown unit: do not guess a multiplier.
    return np.nan


age = train_set["AgeuponOutcome"]
months_age = [_age_to_months(val) for val in age]
train_set["Age"] = pd.DataFrame(months_age).astype(np.float32)

In [None]:
# Split "SexuponOutcome" ("<status> <sex>") into two separate columns:
# "Sex" and a Yes/No "Neutered" flag.
sex_neutered = train_set["SexuponOutcome"]
sex, neutered = [], []
for entry in sex_neutered:
    if pd.isnull(entry):
        # Missing entries stay missing in both derived columns.
        sex_value = neutered_value = entry
    elif "Unknown" in entry:
        # "Unknown" carries no information, so treat it as missing.
        sex_value = neutered_value = np.nan
    else:
        status, sex_value = entry.split(" ")
        neutered_value = "Yes" if status in ["Neutered", "Spayed"] else "No"
    neutered.append(neutered_value)
    sex.append(sex_value)

train_set["Sex"] = pd.DataFrame(sex)
train_set["Neutered"] = pd.DataFrame(neutered)

In [None]:
# Drop the columns that are not used as features: identifiers, the outcome
# subtype, and the raw columns already replaced by derived features.
irrelevant_columns = [
    "Name",
    "OutcomeSubtype",
    "AnimalID",
    "DateTime",
    "AgeuponOutcome",
    "SexuponOutcome",
]
train_set.drop(columns=irrelevant_columns, inplace=True)

preprocessed_preview = train_set.head()
logger.report_table(
    title="Trainset - after preprocessing",
    series="pandas DataFrame",
    iteration=0,
    table_plot=preprocessed_preview,
)

## *Fill NA Values*

In [None]:
# Independent snapshots (copies) of the numeric and the object-typed columns;
# they are used below to decide how each column is filled and encoded.
numerical_columns = train_set.select_dtypes(include="number").copy()
object_columns = train_set.select_dtypes(include="object").copy()

In [None]:
# Fill the missing values of every object column with that column's most
# frequent non-null value.
for col in object_columns.columns:
    if object_columns[col].isnull().sum() > 0:
        # Count only non-null entries: counting the raw column lets NaN itself
        # win as the "most common" value, which would make the fill a no-op.
        value_counts = Counter(object_columns[col].dropna())
        if not value_counts:
            # The column holds nothing but nulls, so there is no value to fill with.
            continue
        most_common = value_counts.most_common(1)[0][0]
        print('Column "{}": replacing null values with "{}"'.format(col, most_common))
        # Assign the result back instead of calling fillna(inplace=True) on the
        # selected column: the chained in-place form may operate on a temporary
        # object and leave train_set unchanged on newer pandas versions.
        train_set[col] = train_set[col].fillna(most_common)

In [None]:
# Fill the missing values of every numeric column with that column's median.
for col in numerical_columns.columns:
    if numerical_columns[col].isnull().sum() > 0:
        median_val = numerical_columns[col].median()
        print('Column "{}": replacing null values with "{}"'.format(col, median_val))
        # Assign the result back instead of calling fillna(inplace=True) on the
        # selected column: the chained in-place form may operate on a temporary
        # object and leave train_set unchanged on newer pandas versions.
        train_set[col] = train_set[col].fillna(median_val)

In [None]:
# Log a preview of the data after the missing values were filled.
filled_preview = train_set.head()
logger.report_table(
    title="Trainset - after filling missing values",
    series="pandas DataFrame",
    iteration=0,
    table_plot=filled_preview,
)

## *Labels Encoding*

In [None]:
# Map every outcome label to its position among the column's categories and
# store the mapping as a task artifact.
out_encoding = train_set["OutcomeType"].astype("category").cat.categories
outcome_dict = dict(zip(out_encoding, range(len(out_encoding))))
task.upload_artifact("Outcome dictionary", outcome_dict)

In [None]:
# Replace each object column by the integer codes of its categories.
for col in object_columns.columns:
    as_category = train_set[col].astype("category")
    train_set[col] = as_category.cat.codes

encoded_preview = train_set.head()
logger.report_table(
    title="Trainset - after labels encoding",
    series="pandas DataFrame",
    iteration=0,
    table_plot=encoded_preview,
)

## *Splitting dataset*

In [None]:
# Separate the target column from the features, then split both into training
# and validation parts using the (possibly overridden) configuration values.
Y = train_set["OutcomeType"]
X = train_set.drop(columns=["OutcomeType"])

split_fraction = configuration_dict.get("test_size", 0.1)
split_seed = configuration_dict.get("split_random_state", 0)
X_train, X_val, Y_train, Y_val = train_test_split(
    X, Y, test_size=split_fraction, random_state=split_seed
)

In [None]:
# Convert the encoded feature columns of X to the categorical dtype and record
# how many categories each one has. Only X is assigned to here.
object_columns_names = object_columns.drop(columns=["OutcomeType"]).columns
columns_categories = {}
for col in object_columns_names:
    X[col] = X[col].astype("category")
    columns_categories[col] = len(X[col].cat.categories)
# NOTE(review): the artifact name is misspelled ("Categries"); it is kept
# unchanged because consumers presumably look it up by this exact name - confirm.
task.upload_artifact("Categries per column", columns_categories)

In [None]:
# Re-attach the target column to each split and write both splits as CSV files
# (without the index) into the data directory.
output_dir = Path(path_to_ShelterAnimal)

train_df = X_train.join(Y_train)
train_df.to_csv(output_dir / "train_processed.csv", index=False)

val_df = X_val.join(Y_val)
val_df.to_csv(output_dir / "val_processed.csv", index=False)

In [None]:
# Publish the locations of the processed CSV files as a task artifact.
processed_dir = Path(path_to_ShelterAnimal)
paths = dict(
    train_data=str(processed_dir / "train_processed.csv"),
    val_data=str(processed_dir / "val_processed.csv"),
)
task.upload_artifact("Processed data", paths)