In [1]:
# Show the GPU attached to this runtime (model, driver/CUDA versions, memory in use)
# before any training starts.
!nvidia-smi

Wed Jun 12 06:07:25 2019       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.67       Driver Version: 410.79       CUDA Version: 10.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   68C    P8    17W /  70W |      0MiB / 15079MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

In [2]:
# Environment setup for a fresh runtime.
# NOTE(review): every clone/install below is unpinned (no commit hash or version,
# except pytorch_helper_bot 0.1.3), so re-running later may pull different code.

# Fetch NVIDIA apex and install it from source, passing --cpp_ext and --cuda_ext
# to its setup.py to request the C++ and CUDA extensions.
!git clone https://github.com/NVIDIA/apex /tmp/apex
!pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" /tmp/apex/.
# Fetch the experiment repository and copy its Python modules into the working
# directory so they can be imported directly
# (presumably the source of `baseline` and `telegram_sender` — confirm).
!git clone https://github.com/ceshine/apex_pytorch_cifar_experiment /tmp/src
!cp /tmp/src/*.py .
# Remaining Python dependencies; the last archive presumably provides `helperbot` — confirm.
!pip install python-telegram-bot pretrainedmodels
!pip install https://github.com/ceshine/pytorch_helper_bot/archive/0.1.3.zip

Cloning into '/tmp/apex'...
remote: Enumerating objects: 4, done.[K
remote: Counting objects: 100% (4/4), done.[K
remote: Compressing objects: 100% (4/4), done.[K
remote: Total 4640 (delta 0), reused 1 (delta 0), pack-reused 4636[K
Receiving objects: 100% (4640/4640), 8.69 MiB | 26.56 MiB/s, done.
Resolving deltas: 100% (3012/3012), done.
  cmdoptions.check_install_build_global(options)
Created temporary directory: /tmp/pip-ephem-wheel-cache-sa18vuzu
Created temporary directory: /tmp/pip-req-tracker-4ihl7oqz
Created requirements tracker '/tmp/pip-req-tracker-4ihl7oqz'
Created temporary directory: /tmp/pip-install-pmdk7jja
Processing /tmp/apex
  Created temporary directory: /tmp/pip-req-build-ioo3hb6z
  Added file:///tmp/apex to build tracker '/tmp/pip-req-tracker-4ihl7oqz'
    Running setup.py (path:/tmp/pip-req-build-ioo3hb6z/setup.py) egg_info for package from file:///tmp/apex
    Running command python setup.py egg_info
    torch.__version__  =  1.1.0
    running egg_info
    cr

In [3]:
# Upload telegram tokens
from google.colab import files

# Open the Colab upload prompt; the result is iterated below as a
# mapping of file name -> uploaded contents.
uploaded = files.upload()

# Echo the name and size of every uploaded file as a confirmation.
for fn, payload in uploaded.items():
    message = 'User uploaded file "{name}" with length {length} bytes'.format(
        name=fn, length=len(payload))
    print(message)

Saving telegram_tokens.py to telegram_tokens.py
User uploaded file "telegram_tokens.py" with length 82 bytes


In [0]:
from pathlib import Path

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as transforms
import torchvision
from tqdm import tqdm
from helperbot import (
    TriangularLR, BaseBot, WeightDecayOptimizerWrapper,
    LearningRateSchedulerCallback
)
from helperbot.metrics import SoftmaxAccuracy
from apex import amp
from apex.optimizers import FusedAdam

from baseline import (
    CifarBot, get_cifar10_dataset,
    get_wide_resnet, get_se_resnext,
    get_gpu_memory_map
)
from telegram_tokens import BOT_TOKEN, CHAT_ID
from telegram_sender import telegram_sender

# Run configuration read by train_apex().
# Device handed to CifarBot; a CUDA GPU is required.
DEVICE = torch.device("cuda")
# Number of epochs; multiplied by the batches per epoch to get the total step count.
EPOCHS = 10
# Zero-argument factory called to build the network
# (get_se_resnext is imported as well, presumably as the alternative — confirm).
MODEL_FUNC = get_wide_resnet

@telegram_sender(token=BOT_TOKEN, chat_id=CHAT_ID)
def train_apex(level):
    """Train the model built by MODEL_FUNC on CIFAR-10 and save it to disk.

    The run lasts EPOCHS epochs with SGD and a TriangularLR schedule driven
    through CifarBot. Afterwards the checkpoint referenced by
    ``bot.best_performers[0][1]`` is reloaded (presumably the best one by
    validation loss -- confirm against helperbot), all checkpoints are
    removed, and the weights are copied into a fresh CPU model that is
    written to ``"<level>.pth"`` in the working directory.

    Args:
        level: apex ``opt_level`` string passed to ``amp.initialize``
            (the notebook calls this with ``"O2"``). The special value
            ``"O0"`` skips ``amp.initialize`` entirely and runs the bot
            with ``use_amp=False``.

    Returns:
        None. Results are the saved ``.pth`` file plus the bot's logs.
    """
    # Anything other than "O0" goes through apex mixed precision.
    mixed_precision = level != "O0"

    train_dl, valid_dl = get_cifar10_dataset(batch_size=128)
    batches_per_epoch = len(train_dl)
    total_steps = EPOCHS * batches_per_epoch
    # Training loss is averaged and logged three times per epoch.
    report_every = batches_per_epoch // 3

    net = MODEL_FUNC()
    sgd = optim.SGD(
        net.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)
    if mixed_precision:
        # amp returns patched versions of both objects; use those from here on.
        net, sgd = amp.initialize(net, sgd, opt_level=level)

    # A single schedule cycle spans the whole run (steps_per_cycle == total_steps).
    lr_schedule = TriangularLR(
        sgd, 1000, ratio=5, steps_per_cycle=total_steps)

    bot = CifarBot(
        log_dir=Path("."),
        checkpoint_dir=Path("/tmp/"),
        model=net,
        train_loader=train_dl,
        val_loader=valid_dl,
        optimizer=sgd,
        echo=True,
        avg_window=report_every,
        criterion=nn.CrossEntropyLoss(),
        device=DEVICE,
        clip_grad=10.,
        callbacks=[LearningRateSchedulerCallback(lr_schedule)],
        metrics=[SoftmaxAccuracy()],
        pbar=True,
        use_amp=mixed_precision,
    )
    # Snapshot (validate + checkpoint) once per epoch, keeping one checkpoint on disk.
    bot.train(
        total_steps,
        snapshot_interval=batches_per_epoch,
        log_interval=report_every,
        keep_n_snapshots=1,
    )
    print(f"GPU Memory Used: {get_gpu_memory_map()} MB")

    # Reload the selected checkpoint, then clean up every checkpoint file.
    best_checkpoint = bot.best_performers[0][1]
    bot.load_model(best_checkpoint)
    bot.remove_checkpoints(keep=0)

    # Copy the trained weights into a freshly built CPU model and save that
    # fresh model rather than the one that went through amp.
    export_model = MODEL_FUNC().cpu()
    trained_weights = bot.model.cpu().state_dict()
    export_model.load_state_dict(trained_weights)
    torch.save(export_model, f"{level}.pth")

In [2]:
%%time
# Train with apex opt level "O2" (anything other than "O0" enables amp in train_apex);
# the %%time cell magic reports CPU and wall time for the whole run.
train_apex("O2")

Files already downloaded and verified
Files already downloaded and verified
| Wide-Resnet 28x10


[[06/12/2019 07:19:37 AM]] SEED: 9293
[[06/12/2019 07:19:37 AM]] # of parameters: 36,489,290
[[06/12/2019 07:19:37 AM]] # of trainable parameters: 36,489,290
[[06/12/2019 07:19:37 AM]] Optimizer SGD (
Parameter Group 0
    dampening: 0
    initial_lr: 0.1
    lr: 0.0001
    momentum: 0.9
    nesterov: False
    weight_decay: 0.0005
)
[[06/12/2019 07:19:37 AM]] Batches per epoch: 390


Selected optimization level O2:  FP16 training with FP32 batchnorm and FP32 master weights.

Defaults for this optimization level are:
enabled                : True
opt_level              : O2
cast_model_type        : torch.float16
patch_torch_functions  : False
keep_batchnorm_fp32    : True
master_weights         : True
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O2
cast_model_type        : torch.float16
patch_torch_functions  : False
keep_batchnorm_fp32    : True
master_weights         : True
loss_scale             : dynamic


[[06/12/2019 07:20:30 AM]] Step 130: train 1.978501 lr: 1.977e-02
[[06/12/2019 07:21:25 AM]] Step 260: train 1.575762 lr: 3.975e-02
[[06/12/2019 07:22:21 AM]] Step 390: train 1.305088 lr: 5.973e-02
100%|██████████| 40/40 [00:09<00:00,  4.19it/s]
[[06/12/2019 07:22:30 AM]] Criterion loss: 1.69305051
[[06/12/2019 07:22:30 AM]] accuracy: 47.39%
[[06/12/2019 07:22:30 AM]] Snapshot metric 1.69305051
[[06/12/2019 07:22:30 AM]] Saving checkpoint /tmp/snapshot_basebot_1.69305051_390.pth...
[[06/12/2019 07:22:31 AM]] New low

[[06/12/2019 07:23:26 AM]] Step 520: train 1.130289 lr: 7.971e-02
[[06/12/2019 07:24:22 AM]] Step 650: train 1.066960 lr: 9.969e-02
[[06/12/2019 07:25:18 AM]] Step 780: train 0.995653 lr: 9.607e-02
100%|██████████| 40/40 [00:09<00:00,  4.19it/s]
[[06/12/2019 07:25:27 AM]] Criterion loss: 1.30126018
[[06/12/2019 07:25:27 AM]] accuracy: 60.27%
[[06/12/2019 07:25:27 AM]] Snapshot metric 1.30126018
[[06/12/2019 07:25:27 AM]] Saving checkpoint /tmp/snapshot_basebot_1.30126018_7

GPU Memory Used: 4383 MB
| Wide-Resnet 28x10
CPU times: user 18min, sys: 12min 24s, total: 30min 25s
Wall time: 29min 31s
