In [None]:
!pip install git+https://github.com/catalyst-team/catalyst --upgrade

In [None]:
!pip install cloud-tpu-client==0.10 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl

In [3]:
from torch import __version__
print(__version__)

1.9.0+cu102


In [4]:
from catalyst import __version__
print(__version__)



21.08


In [5]:
from catalyst import SETTINGS
print(SETTINGS.xla_required)

True


In [6]:
import os
from datetime import datetime

import torch
from torch import nn, optim
from torch.utils.data import DataLoader

from catalyst import dl
from catalyst.contrib.datasets import CIFAR10
from catalyst.contrib.nn import ResidualBlock
from catalyst.data import transforms

def conv_block(in_channels, out_channels, pool=False):
    layers = [
        nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
        nn.BatchNorm2d(out_channels),
        nn.ReLU(inplace=True),
    ]
    if pool:
        layers.append(nn.MaxPool2d(2))
    return nn.Sequential(*layers)


def resnet9(in_channels: int, num_classes: int, size: int = 16):
    sz, sz2, sz4, sz8 = size, size * 2, size * 4, size * 8
    return nn.Sequential(
        conv_block(in_channels, sz),
        conv_block(sz, sz2, pool=True),
        ResidualBlock(nn.Sequential(conv_block(sz2, sz2), conv_block(sz2, sz2))),
        conv_block(sz2, sz4, pool=True),
        conv_block(sz4, sz8, pool=True),
        ResidualBlock(nn.Sequential(conv_block(sz8, sz8), conv_block(sz8, sz8))),
        nn.Sequential(
            nn.MaxPool2d(4), nn.Flatten(), nn.Dropout(0.2), nn.Linear(sz8, num_classes)
        ),
    )

class CustomRunner(dl.IRunner):
    def __init__(self, logdir):
        super().__init__()
        self._logdir = logdir

    def get_engine(self):
        return dl.XLAEngine()

    def get_loggers(self):
        return {
            "console": dl.ConsoleLogger(),
            "csv": dl.CSVLogger(logdir=self._logdir),
            "tensorboard": dl.TensorboardLogger(logdir=self._logdir),
        }

    @property
    def stages(self):
        return ["train"]

    def get_stage_len(self, stage: str) -> int:
        return 3

    def get_loaders(self, stage: str):
        transform = transforms.Compose(
            [transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]
        )
        train_data = CIFAR10(os.getcwd(), train=False, download=True, transform=transform)
        valid_data = CIFAR10(os.getcwd(), train=False, download=True, transform=transform)

        if self.engine.is_ddp:
            train_sampler = torch.utils.data.distributed.DistributedSampler(
                train_data,
                num_replicas=self.engine.world_size,
                rank=self.engine.rank,
                shuffle=True
            )
            valid_sampler = torch.utils.data.distributed.DistributedSampler(
                valid_data,
                num_replicas=self.engine.world_size,
                rank=self.engine.rank,
                shuffle=False
            )
        else:
            train_sampler = valid_sampler = None

        return {
            "train": DataLoader(train_data, batch_size=32, sampler=train_sampler),
            "valid": DataLoader(valid_data, batch_size=32, sampler=valid_sampler),
        }

    def get_model(self, stage: str):
        model = self.model \
            if self.model is not None \
            else resnet9(in_channels=3, num_classes=10)
        return model

    def get_criterion(self, stage: str):
        return nn.CrossEntropyLoss()

    def get_optimizer(self, stage: str, model):
        return optim.Adam(model.parameters(), lr=1e-3)

    def get_scheduler(self, stage: str, optimizer):
        return optim.lr_scheduler.MultiStepLR(optimizer, [5, 8], gamma=0.3)

    def get_callbacks(self, stage: str):
        return {
            "criterion": dl.CriterionCallback(
                metric_key="loss", input_key="logits", target_key="targets"
            ),
            "optimizer": dl.OptimizerCallback(metric_key="loss"),
            "scheduler": dl.SchedulerCallback(loader_key="valid", metric_key="loss"),
            "accuracy": dl.AccuracyCallback(
                input_key="logits", target_key="targets", topk_args=(1, 3, 5)
            ),
            "checkpoint": dl.CheckpointCallback(
                self._logdir,
                loader_key="valid",
                metric_key="accuracy",
                minimize=False,
                save_n_best=1,
            ),
            "tqdm": dl.TqdmCallback(),
        }

    def handle_batch(self, batch):
        x, y = batch
        logits = self.model(x)

        self.batch = {
            "features": x,
            "targets": y,
            "logits": logits,
        }

logdir = f"logs/{datetime.now().strftime('%Y%m%d-%H%M%S')}"
runner = CustomRunner(logdir)
runner.run()

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to /content/cifar-10-python.tar.gz


0it [00:00, ?it/s]

Extracting /content/cifar-10-python.tar.gz to /content
Files already downloaded and verified


1/3 * Epoch (train):   0%|          | 0/313 [00:00<?, ?it/s]

train (1/3) accuracy: 0.3689999999999998 | accuracy/std: 0.11205302578716389 | accuracy01: 0.3689999999999998 | accuracy01/std: 0.11205302578716389 | accuracy03: 0.7078999999999995 | accuracy03/std: 0.12159489517185017 | accuracy05: 0.8602000000000005 | accuracy05/std: 0.08752057463875833 | loss: 1.7721755317687973 | loss/mean: 1.7721755317687973 | loss/std: 0.36059528424989634 | lr: 0.001 | momentum: 0.9


1/3 * Epoch (valid):   0%|          | 0/313 [00:00<?, ?it/s]

valid (1/3) accuracy: 0.4573000000000003 | accuracy/std: 0.09570201611036604 | accuracy01: 0.4573000000000003 | accuracy01/std: 0.09570201611036604 | accuracy03: 0.7743000000000005 | accuracy03/std: 0.07628543076298817 | accuracy05: 0.9137000000000008 | accuracy05/std: 0.04882415774837269 | loss: 1.5131483758926392 | loss/mean: 1.5131483758926392 | loss/std: 0.20366340870537172 | lr: 0.001 | momentum: 0.9
* Epoch (1/3) lr: 0.001 | momentum: 0.9


2/3 * Epoch (train):   0%|          | 0/313 [00:00<?, ?it/s]

train (2/3) accuracy: 0.5222999999999997 | accuracy/std: 0.09766288345057397 | accuracy01: 0.5222999999999997 | accuracy01/std: 0.09766288345057397 | accuracy03: 0.8346000000000001 | accuracy03/std: 0.06868996892693374 | accuracy05: 0.9380999999999998 | accuracy05/std: 0.041905436479777396 | loss: 1.3249054693222038 | loss/mean: 1.3249054693222038 | loss/std: 0.2278588062510951 | lr: 0.001 | momentum: 0.9


2/3 * Epoch (valid):   0%|          | 0/313 [00:00<?, ?it/s]

valid (2/3) accuracy: 0.5515000000000002 | accuracy/std: 0.08952193260671801 | accuracy01: 0.5515000000000002 | accuracy01/std: 0.08952193260671801 | accuracy03: 0.8529 | accuracy03/std: 0.06592722988237863 | accuracy05: 0.9566000000000001 | accuracy05/std: 0.03658481713613569 | loss: 1.2049140336990358 | loss/mean: 1.2049140336990358 | loss/std: 0.1951876894090425 | lr: 0.001 | momentum: 0.9
* Epoch (2/3) lr: 0.001 | momentum: 0.9


3/3 * Epoch (train):   0%|          | 0/313 [00:00<?, ?it/s]

train (3/3) accuracy: 0.6092999999999997 | accuracy/std: 0.09225297318301515 | accuracy01: 0.6092999999999997 | accuracy01/std: 0.09225297318301515 | accuracy03: 0.8772999999999997 | accuracy03/std: 0.05699262948822065 | accuracy05: 0.9611 | accuracy05/std: 0.0339949344102405 | loss: 1.0982439191818238 | loss/mean: 1.0982439191818238 | loss/std: 0.21103708709494978 | lr: 0.001 | momentum: 0.9


3/3 * Epoch (valid):   0%|          | 0/313 [00:00<?, ?it/s]

valid (3/3) accuracy: 0.5342000000000003 | accuracy/std: 0.09046714005838272 | accuracy01: 0.5342000000000003 | accuracy01/std: 0.09046714005838272 | accuracy03: 0.8727 | accuracy03/std: 0.06215984543608892 | accuracy05: 0.9690000000000004 | accuracy05/std: 0.030746455640503382 | loss: 1.2388727598190308 | loss/mean: 1.2388727598190308 | loss/std: 0.2180870976661938 | lr: 0.001 | momentum: 0.9
* Epoch (3/3) lr: 0.001 | momentum: 0.9
Top best models:
logs/20210820-054145/train.2.pth	0.5515


<__main__.CustomRunner at 0x7f80edb49d10>