# Training Step


In [None]:
! pip install -U pip
! pip install -U torch==1.5.1
! pip install -U clearml==0.16.2rc0
! pip install -U pandas==1.0.4
! pip install -U numpy==1.18.4
! pip install -U tensorboard==2.2.1

In [None]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.tensorboard import SummaryWriter

from clearml import Task

## Configure Task
Instantiate a ClearML Task using `Task.init`. 

A configuration dictionary is connected to the task using `Task.connect`. This enables the pipeline controller to access this task's configuration and override its values when the pipeline is executed.

Notice in the [pipeline controller script](https://github.com/allegroai/clearml/blob/master/examples/frameworks/pytorch/notebooks/table/tabular_ml_pipeline.ipynb) that when this task is added as a step in the pipeline, the value of `data_task_id` is overridden with the ID of another task in the pipeline. 

In [None]:
# Register this notebook run as a ClearML task and grab its logger.
task = Task.init(project_name="Tabular Example", task_name="tabular prediction")
logger = task.get_logger()

# Default hyper-parameters. When this task runs as a pipeline step the
# controller can override them (most notably `data_task_id`).
configuration_dict = dict(
    data_task_id="b605d76398f941e69fc91b43420151d2",
    number_of_epochs=15,
    batch_size=100,
    dropout=0.3,
    base_lr=0.1,
)

# Connecting the dictionary enables configuration override by ClearML; keep
# using the object returned by `connect` from here on.
configuration_dict = task.connect(configuration_dict)

# Show the actual configuration (after override in remote mode).
print(configuration_dict)

In [None]:
# Look up the task whose artifacts provide the input data for this step.
data_task = Task.get_task(task_id=configuration_dict.get("data_task_id"))

In [None]:
# Fetch the training and validation tables from the data task's artifacts and
# discard the "Unnamed: 0" column from each of them.
train_set, test_set = (
    data_task.artifacts[artifact_name].get().drop(columns=["Unnamed: 0"])
    for artifact_name in ("train_data", "val_data")
)

In [None]:
# Per-column category information published by the data task (the artifact
# name is spelled exactly as the producing task stores it).
columns_categories = data_task.artifacts["Categries per column"].get()

# Categorical columns that exist in the training table, in the table's own
# column order.
columns_categories_ordered = {
    column_name: columns_categories[column_name]
    for column_name in train_set.columns
    if column_name in columns_categories
}

# Every remaining feature column (label and categorical columns removed).
columns_numerical = list(
    train_set.drop(columns=["OutcomeType"])
    .drop(columns=columns_categories_ordered)
    .columns
)

# One (num_embeddings, embedding_dim) pair per categorical column; the
# embedding width is half the category count (rounded up), capped at 32.
embedding_sizes = [
    (n_categories, min(32, (n_categories + 1) // 2))
    for n_categories in columns_categories_ordered.values()
]

In [None]:
# Outcome mapping published by the data task, plus its inverse (value -> key).
# The inverse is what turns the integer labels/predictions back into the
# dictionary's keys when building the debug table.
outcome_dict = data_task.artifacts["Outcome dictionary"].get()
reveresed_outcome_dict = dict(zip(outcome_dict.values(), outcome_dict.keys()))

## Train Model

In [None]:
class ShelterDataset(Dataset):
    """Map-style dataset splitting a feature table into categorical and numerical parts.

    Args:
        X: pandas DataFrame with the feature columns (label column excluded).
        Y: labels, one per row of ``X`` and in the same row order (pandas
            Series, list or array).
        embedded_col_names: iterable of column names of ``X`` that hold the
            categorical features; every other column is treated as numerical.

    Raises:
        ValueError: if ``X`` and ``Y`` do not have the same number of rows.

    Each item is a ``(categorical, numerical, label)`` triple where
    ``categorical`` is an int64 array and ``numerical`` a float32 array.
    """

    def __init__(self, X, Y, embedded_col_names):
        # Callers pass a ``dict.keys()`` view; materialise it once so it can be
        # used safely for both the selection and the drop below.
        embedded_col_names = list(embedded_col_names)
        # ``astype`` already returns fresh arrays, so no explicit copies of the
        # caller's frame are needed.
        self.X1 = X.loc[:, embedded_col_names].values.astype(np.int64)  # categorical columns
        self.X2 = X.drop(columns=embedded_col_names).values.astype(np.float32)  # numerical columns
        # Store labels as a plain array: indexing a pandas Series with an int
        # is label-based and breaks (or returns the wrong row) whenever the
        # index is not the default 0..n-1 range.
        self.y = np.asarray(Y)
        if len(self.y) != len(self.X1):
            raise ValueError(
                "X and Y must have the same number of rows, got {} and {}".format(
                    len(self.X1), len(self.y)
                )
            )

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        # Purely positional lookup in all three arrays.
        return self.X1[idx], self.X2[idx], self.y[idx]


def _make_shelter_dataset(frame):
    """Wrap a table in a ShelterDataset, using "OutcomeType" as the label column."""
    features = frame.drop(columns=["OutcomeType"])
    labels = frame["OutcomeType"]
    return ShelterDataset(features, labels, columns_categories_ordered.keys())


# Training and validation datasets are built the same way from their tables.
train_ds = _make_shelter_dataset(train_set)
valid_ds = _make_shelter_dataset(test_set)

In [None]:
class ShelterModel(nn.Module):
    """Feed-forward classifier over embedded categorical and numerical features.

    Args:
        embedding_sizes: iterable of ``(num_categories, embedding_dim)`` pairs,
            one per categorical column, in the column order of ``x_cat``.
        n_cont: number of numerical (continuous) input features.
        n_classes: number of output logits. Defaults to 5, the value that was
            previously hard-coded.
        dropout: dropout probability applied after each hidden layer. When
            ``None`` (default) the rate is read from the notebook-level
            ``configuration_dict`` ("dropout" key, falling back to 0.25), which
            is the original behaviour.
        emb_dropout: dropout probability applied to the concatenated
            embeddings. Defaults to 0.6, the value that was previously
            hard-coded.
    """

    def __init__(self, embedding_sizes, n_cont, n_classes=5, dropout=None, emb_dropout=0.6):
        super().__init__()
        if dropout is None:
            # Backward-compatible default: take the rate from the task
            # configuration so existing two-argument calls behave as before.
            dropout = configuration_dict.get("dropout", 0.25)
        self.embeddings = nn.ModuleList(
            [nn.Embedding(categories, size) for categories, size in embedding_sizes]
        )
        n_emb = sum(e.embedding_dim for e in self.embeddings)
        self.n_emb, self.n_cont = n_emb, n_cont
        # Attribute names are kept as-is so saved state dicts stay loadable.
        self.lin1 = nn.Linear(self.n_emb + self.n_cont, 200)
        self.lin2 = nn.Linear(200, 70)
        self.lin3 = nn.Linear(70, n_classes)
        self.bn1 = nn.BatchNorm1d(self.n_cont)
        self.bn2 = nn.BatchNorm1d(200)
        self.bn3 = nn.BatchNorm1d(70)
        self.emb_drop = nn.Dropout(emb_dropout)
        self.drops = nn.Dropout(dropout)

    def forward(self, x_cat, x_cont):
        """Return class logits for a batch.

        Args:
            x_cat: integer tensor whose column ``i`` indexes embedding ``i``.
            x_cont: float tensor with ``n_cont`` columns.
        """
        # Embed each categorical column separately, then concatenate.
        x = [e(x_cat[:, i]) for i, e in enumerate(self.embeddings)]
        x = torch.cat(x, 1)
        x = self.emb_drop(x)
        # Numerical features are batch-normalised before joining the embeddings.
        x2 = self.bn1(x_cont)
        x = torch.cat([x, x2], 1)
        x = F.relu(self.lin1(x))
        x = self.drops(x)
        x = self.bn2(x)
        x = F.relu(self.lin2(x))
        x = self.drops(x)
        x = self.bn3(x)
        x = self.lin3(x)
        return x


# The number of continuous inputs is derived from the numerical column list
# instead of being hard-coded to 1, so the model keeps matching the width of
# the dataset's numerical block if the data task adds or removes columns.
model = ShelterModel(embedding_sizes, len(columns_numerical))

In [None]:
# SGD with momentum; the base learning rate comes from the task configuration.
optimizer = torch.optim.SGD(
    model.parameters(), lr=configuration_dict.get("base_lr", 0.1), momentum=0.9
)
# Decay the learning rate by 10x every third of the run. The step size is
# clamped to at least 1: with fewer than 3 epochs the integer division yields
# 0, which makes StepLR fail with a modulo-by-zero on `scheduler.step()`.
scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer,
    step_size=max(1, configuration_dict.get("number_of_epochs", 15) // 3),
    gamma=0.1,
)

In [None]:
# Always hold a `torch.device` (previously a bare int index on CUDA and a
# `torch.device` on CPU), so the value has one type and prints as e.g. "cuda:0".
device = (
    torch.device("cuda", torch.cuda.current_device())
    if torch.cuda.is_available()
    else torch.device("cpu")
)
print("Device to use: {}".format(device))
model.to(device)

In [None]:
# TensorBoard event files for this run are written under ./tensorboard_logs.
tensorboard_writer = SummaryWriter(log_dir="./tensorboard_logs")

In [None]:
def train_model(model, optim, train_dl):
    """Run one optimisation pass over `train_dl`.

    Puts `model` in training mode, performs one optimiser step per batch using
    the cross-entropy loss, and returns the loss averaged over all samples
    (each batch weighted by its size). Batches are moved to the notebook-level
    `device` before the forward pass.
    """
    model.train()
    samples_seen = 0
    weighted_loss = 0
    for cat_inputs, cont_inputs, targets in train_dl:
        n_samples = targets.shape[0]
        logits = model(cat_inputs.to(device), cont_inputs.to(device))
        batch_loss = F.cross_entropy(logits, targets.to(device))
        # Clear gradients left over from the previous batch before backprop.
        optim.zero_grad()
        batch_loss.backward()
        optim.step()
        samples_seen += n_samples
        weighted_loss += n_samples * batch_loss.item()
    return weighted_loss / samples_seen

In [None]:
def _build_debug_table(x_cat, x_cont, targets, predictions):
    """Pair one batch's inputs with its ground-truth and predicted outcomes in a DataFrame."""
    debug_categories = pd.DataFrame(
        x_cat.numpy(), columns=list(columns_categories_ordered)
    )
    debug_numerical = pd.DataFrame(x_cont.numpy(), columns=columns_numerical)
    # Map integer class values back through the reversed outcome dictionary.
    debug_gt = pd.DataFrame(
        np.array([reveresed_outcome_dict[int(e)] for e in targets]), columns=["GT"]
    )
    debug_pred = pd.DataFrame(
        np.array([reveresed_outcome_dict[int(e)] for e in predictions]),
        columns=["Pred"],
    )
    return debug_categories.join([debug_numerical, debug_gt, debug_pred])


def val_loss(model, valid_dl, epoch):
    """Evaluate `model` on `valid_dl` and report the results.

    Computes the sample-weighted mean cross-entropy loss and the accuracy,
    prints both, logs the accuracy to TensorBoard under "accuracy/total", and
    reports the first rows of the last validation batch (inputs, ground truth,
    prediction) as a ClearML table.

    Args:
        model: network to evaluate; it is switched to eval mode.
        valid_dl: iterable yielding ``(x_cat, x_cont, y)`` batches.
        epoch: iteration number used for the TensorBoard and ClearML reports.

    Returns:
        Tuple ``(mean_loss, accuracy)``.

    Raises:
        ValueError: if `valid_dl` yields no batches.
    """
    model.eval()
    total = 0
    sum_loss = 0
    correct = 0
    # Keep the last batch explicitly instead of relying on loop variables
    # leaking out of the `for` statement.
    last_batch = None
    with torch.no_grad():
        for x1, x2, y in valid_dl:
            current_batch_size = y.shape[0]
            out = model(x1.to(device), x2.to(device))
            loss = F.cross_entropy(out, y.to(device))
            sum_loss += current_batch_size * (loss.item())
            total += current_batch_size
            # Index of the largest logit per row, moved to CPU to compare with `y`.
            pred = torch.max(out, 1)[1].cpu()
            correct += (pred == y).float().sum().item()
            last_batch = (x1, x2, y, pred)

    if last_batch is None:
        # Previously this surfaced as an unexplained ZeroDivisionError.
        raise ValueError("valid_dl yielded no batches; cannot compute validation metrics")

    mean_loss = sum_loss / total
    accuracy = correct / total
    print("\t valid loss %.3f and accuracy %.3f" % (mean_loss, accuracy))
    tensorboard_writer.add_scalar("accuracy/total", accuracy, epoch)

    # The table shows validation samples with their predictions; the former
    # title ("Trainset - after labels encoding") described a different table.
    logger.report_table(
        title="Validation samples - ground truth vs prediction",
        series="pandas DataFrame",
        iteration=epoch,
        table_plot=_build_debug_table(*last_batch).head(),
    )
    return mean_loss, accuracy

In [None]:
def train_loop(model, epochs):
    """Train and validate `model` for `epochs` epochs.

    Each epoch runs one training pass, prints and logs the training loss and
    the current learning rate to TensorBoard, evaluates on the validation
    loader, and finally advances the learning-rate scheduler. The optimizer,
    scheduler, data loaders and writer are the notebook-level objects.
    """
    for epoch in range(epochs):
        epoch_loss = train_model(model, optimizer, train_dl)
        print("Epoch {}: training loss {}".format(epoch, epoch_loss))

        # Learning rate in effect during this epoch (first parameter group).
        current_lr = optimizer.param_groups[0]["lr"]
        tensorboard_writer.add_scalar("training loss/loss", epoch_loss, epoch)
        tensorboard_writer.add_scalar("learning rate/lr", current_lr, epoch)

        val_loss(model, valid_dl, epoch)
        scheduler.step()

In [None]:
# Settings shared by both loaders; only shuffling differs between them.
_loader_options = dict(
    batch_size=configuration_dict.get("batch_size", 100),
    pin_memory=True,
    num_workers=1,
)

# Training batches are reshuffled every epoch; validation keeps a fixed order.
train_dl = torch.utils.data.DataLoader(train_ds, shuffle=True, **_loader_options)
valid_dl = torch.utils.data.DataLoader(valid_ds, shuffle=False, **_loader_options)

In [None]:
# The fallback of 15 epochs matches the default configuration and the fallback
# used for the scheduler's step size (previously 30 here, which would leave the
# learning-rate schedule out of step with the run length if the key were missing).
train_loop(model, epochs=configuration_dict.get("number_of_epochs", 15))

## Save Model

ClearML automatically captures the model saved with `torch.save` and stores it as an artifact.

In [None]:
# Persist only the trained weights (the state dict) to a local checkpoint file.
PATH = "./model_checkpoint.pth"
trained_weights = model.state_dict()
torch.save(trained_weights, PATH)

# Training is finished: close the TensorBoard writer.
tensorboard_writer.close()