In [None]:
! pip install -U pip
! pip install -U clearml==0.16.2rc0
! pip install -U pandas==1.0.4
! pip install -U scikit-learn==0.23.1
! pip install -U pathlib2==2.3.5

In [None]:
import os

import pandas as pd
from pathlib2 import Path
from sklearn.model_selection import train_test_split

from clearml import Task

In [None]:
# Initialise the ClearML task for this notebook and grab its logger.
task = Task.init(
    project_name="Tabular Example",
    task_name="Download and split tabular dataset",
)
logger = task.get_logger()

# Default split parameters. Passing them through task.connect enables
# configuration override by clearml, so the returned object is the one to read from.
configuration_dict = task.connect({"test_size": 0.1, "split_random_state": 0})

# Show the configuration actually in effect (after override in remote mode).
print(configuration_dict)

# **Downloading**

In [None]:
# Download the shelter-animal-outcomes dataset (https://www.kaggle.com/c/shelter-animal-outcomes)
# and save it to your cloud storage or your mounted local storage.
# If the data is on your cloud storage, you can use clearml's StorageManager to get a local copy of it.
# NOTE: the URL in the example below is only a placeholder (it points at the UrbanSound8K archive);
# replace it with the location of your own shelter-animal-outcomes archive:
#    from clearml.storage import StorageManager
#    path_to_ShelterAnimal = StorageManager.get_local_copy("https://allegro-datasets.s3.amazonaws.com/clearml/UrbanSound8K.zip",
#                                                          extract_archive=True)
#
# The dataset directory can be overridden with the SHELTER_ANIMAL_DATA_DIR environment
# variable. The previous hard-coded location is kept as the fallback (also used when the
# variable is set but empty) so existing setups keep working unchanged.
path_to_ShelterAnimal = (
    os.environ.get("SHELTER_ANIMAL_DATA_DIR")
    or "/home/sam/Datasets/shelter-animal-outcomes"
)

In [None]:
# Load the raw training CSV from the dataset directory.
train_set = pd.read_csv(Path(path_to_ShelterAnimal).joinpath("train.csv"))

# Report the head of the raw frame to the task logger as a table.
logger.report_table(
    "Trainset - raw",
    "pandas DataFrame",
    iteration=0,
    table_plot=train_set.head(),
)

# **Splitting into train and validation sets**

In [None]:
# Separate the label column ("OutcomeType") from the remaining columns.
Y = train_set["OutcomeType"]
X = train_set.drop("OutcomeType", axis=1)

# Split into train and validation parts. The validation fraction and the
# random seed are read from the connected configuration, falling back to
# 0.1 and 0 respectively when a key is missing.
X_train, X_val, Y_train, Y_val = train_test_split(
    X,
    Y,
    random_state=configuration_dict.get("split_random_state", 0),
    test_size=configuration_dict.get("test_size", 0.1),
)

In [None]:
# Re-attach each label Series to its feature frame (index-aligned left join,
# which is DataFrame.join's default) so every split is a single DataFrame.
train_df = X_train.join(Y_train, how="left")
val_df = X_val.join(Y_val, how="left")

In [None]:
# Attach both splits to the task as named artifacts.
task.upload_artifact(name="train_data", artifact_object=train_df)
task.upload_artifact(name="val_data", artifact_object=val_df)