In [1]:
# Print the GPU model, driver/CUDA versions and current memory usage of this runtime.
!nvidia-smi

Tue Jun 11 23:47:48 2019       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.67       Driver Version: 410.79       CUDA Version: 10.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   46C    P8    16W /  70W |      0MiB / 15079MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

In [2]:
# Build NVIDIA Apex from a fresh clone with its C++ and CUDA extensions enabled.
# NOTE(review): `git clone` into a fixed /tmp path is not idempotent — re-running this
# cell in the same runtime will presumably fail because the target directory exists.
!git clone https://github.com/NVIDIA/apex /tmp/apex
!pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" /tmp/apex/.
# Fetch the experiment repository and copy its top-level Python modules into the
# working directory (presumably the source of the `baseline` and `telegram_sender`
# modules imported by a later cell — TODO confirm).
!git clone https://github.com/ceshine/apex_pytorch_cifar_experiment /tmp/src
!cp /tmp/src/*.py .
# Remaining Python dependencies.
# NOTE(review): these two packages and the Apex checkout above are not version-pinned;
# only pytorch_helper_bot is fixed (0.1.3), so results may not be reproducible later.
!pip install python-telegram-bot pretrainedmodels
!pip install https://github.com/ceshine/pytorch_helper_bot/archive/0.1.3.zip

Cloning into '/tmp/apex'...
remote: Enumerating objects: 4, done.[K
remote: Counting objects:  25% (1/4)   [Kremote: Counting objects:  50% (2/4)   [Kremote: Counting objects:  75% (3/4)   [Kremote: Counting objects: 100% (4/4)   [Kremote: Counting objects: 100% (4/4), done.[K
remote: Compressing objects:  25% (1/4)   [Kremote: Compressing objects:  50% (2/4)   [Kremote: Compressing objects:  75% (3/4)   [Kremote: Compressing objects: 100% (4/4)   [Kremote: Compressing objects: 100% (4/4), done.[K
Receiving objects:   0% (1/4640)   Receiving objects:   1% (47/4640)   Receiving objects:   2% (93/4640)   Receiving objects:   3% (140/4640)   Receiving objects:   4% (186/4640)   Receiving objects:   5% (232/4640)   Receiving objects:   6% (279/4640)   Receiving objects:   7% (325/4640)   Receiving objects:   8% (372/4640)   Receiving objects:   9% (418/4640)   Receiving objects:  10% (464/4640)   Receiving objects:  11% (511/4640)   Receiving objects:  12% (5

In [3]:
# Upload the Telegram credentials module (telegram_tokens.py) into the runtime's
# working directory so it can be imported by the training cell.
from google.colab import files

# Opens the Colab upload widget; the result maps each uploaded filename to its content.
uploaded = files.upload()

# Echo a one-line confirmation per uploaded file.
for filename, payload in uploaded.items():
    size_in_bytes = len(payload)
    print(
        'User uploaded file "{name}" with length {length} bytes'.format(
            name=filename, length=size_in_bytes
        )
    )

Saving telegram_tokens.py to telegram_tokens.py
User uploaded file "telegram_tokens.py" with length 82 bytes


In [0]:
from pathlib import Path

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as transforms
import torchvision
from tqdm import tqdm
from helperbot import (
    TriangularLR, BaseBot, WeightDecayOptimizerWrapper,
    LearningRateSchedulerCallback
)
from helperbot.metrics import SoftmaxAccuracy
from apex import amp
from apex.optimizers import FusedAdam

from baseline import (
    CifarBot, get_cifar10_dataset,
    get_wide_resnet, get_se_resnext,
    get_gpu_memory_map
)
from telegram_tokens import BOT_TOKEN, CHAT_ID
from telegram_sender import telegram_sender

# Device handed to CifarBot in train_apex; hard-coded to CUDA.
DEVICE = torch.device("cuda")
# Number of epochs; train_apex multiplies this by the number of batches per epoch
# to obtain the total number of training steps.
EPOCHS = 10
# Zero-argument factory that train_apex calls to build the network
# (get_se_resnext is imported from baseline as well and could be swapped in here).
MODEL_FUNC = get_wide_resnet

@telegram_sender(token=BOT_TOKEN, chat_id=CHAT_ID)
def train_apex(level):
    """Train the MODEL_FUNC network on CIFAR-10 at a given Apex AMP opt level.

    The run uses SGD with momentum, a TriangularLR schedule spanning the whole
    run, and lasts ``EPOCHS`` epochs. When ``level`` is anything other than
    ``"O0"`` the model and optimizer are passed through ``amp.initialize`` and
    the bot is told to use AMP; for ``"O0"`` Apex is not involved at all.

    After training, the best checkpoint is loaded back, all checkpoints are
    removed, and the weights are copied into a freshly built CPU model that is
    saved to ``"<level>.pth"`` in the working directory.

    Args:
        level: Apex opt level string (e.g. ``"O0"``, ``"O1"``); forwarded as
            ``opt_level`` to ``amp.initialize`` and used as the output file stem.

    Returns:
        None. (The function is wrapped by ``telegram_sender``; presumably that
        decorator reports start/finish to the configured chat — TODO confirm.)
    """
    mixed_precision = level != "O0"

    train_loader, val_loader = get_cifar10_dataset(batch_size=128)
    batches_per_epoch = len(train_loader)
    # Log three times per epoch and average the training loss over the same span.
    third_of_epoch = batches_per_epoch // 3

    net = MODEL_FUNC()
    # Alternatives left disabled by the author:
    #   * Adam(lr=1.5e-3) wrapped in WeightDecayOptimizerWrapper(..., 0.1)
    #   * Adam(lr=1.5e-3, weight_decay=1e-4)
    sgd = optim.SGD(
        net.parameters(),
        lr=0.1,
        momentum=0.9,
        weight_decay=5e-4,
    )
    if mixed_precision:
        # Apex returns the (possibly patched/cast) model and optimizer to use from here on.
        net, sgd = amp.initialize(net, sgd, opt_level=level)

    total_steps = EPOCHS * batches_per_epoch
    # A single cycle covers the entire run.
    lr_schedule = TriangularLR(
        sgd, 1000, ratio=5, steps_per_cycle=total_steps
    )
    bot = CifarBot(
        log_dir=Path("."),
        checkpoint_dir=Path("/tmp/"),
        model=net,
        train_loader=train_loader,
        val_loader=val_loader,
        optimizer=sgd,
        echo=True,
        avg_window=third_of_epoch,
        criterion=nn.CrossEntropyLoss(),
        device=DEVICE,
        clip_grad=10.,
        callbacks=[LearningRateSchedulerCallback(lr_schedule)],
        metrics=[SoftmaxAccuracy()],
        pbar=True,
        use_amp=mixed_precision,
    )

    # Snapshot (evaluate + checkpoint) once per epoch, keeping only the latest one on disk.
    bot.train(
        total_steps,
        snapshot_interval=batches_per_epoch,
        log_interval=third_of_epoch,
        keep_n_snapshots=1,
    )
    print(f"GPU Memory Used: {get_gpu_memory_map()} MB")

    # Restore the top entry of best_performers (presumably (metric, checkpoint path)
    # pairs sorted best-first — TODO confirm), then clean up every checkpoint file.
    bot.load_model(bot.best_performers[0][1])
    bot.remove_checkpoints(keep=0)

    # Export through a freshly constructed model rather than the object that went
    # through amp.initialize; the fresh model is built before the trained one is
    # moved to the CPU.
    export_model = MODEL_FUNC().cpu()
    trained_weights = bot.model.cpu().state_dict()
    export_model.load_state_dict(trained_weights)
    torch.save(export_model, f"{level}.pth")

In [2]:
%%time
# Run the full training with Apex opt level "O1" (passed as opt_level to amp.initialize)
# and time the whole cell; the exported model is written to "O1.pth".
train_apex("O1")

Files already downloaded and verified
Files already downloaded and verified
| Wide-Resnet 28x10


[[06/12/2019 12:09:06 AM]] SEED: 9293
[[06/12/2019 12:09:06 AM]] # of parameters: 36,489,290
[[06/12/2019 12:09:06 AM]] # of trainable parameters: 36,489,290
[[06/12/2019 12:09:06 AM]] Optimizer SGD (
Parameter Group 0
    dampening: 0
    initial_lr: 0.1
    lr: 0.0001
    momentum: 0.9
    nesterov: False
    weight_decay: 0.0005
)
[[06/12/2019 12:09:06 AM]] Batches per epoch: 390


Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic


[[06/12/2019 12:10:03 AM]] Step 130: train 1.976669 lr: 1.977e-02
[[06/12/2019 12:11:02 AM]] Step 260: train 1.572628 lr: 3.975e-02
[[06/12/2019 12:12:01 AM]] Step 390: train 1.309087 lr: 5.973e-02
100%|██████████| 40/40 [00:10<00:00,  3.98it/s]
[[06/12/2019 12:12:11 AM]] Criterion loss: 1.53847215
[[06/12/2019 12:12:11 AM]] accuracy: 49.55%
[[06/12/2019 12:12:11 AM]] Snapshot metric 1.53847215
[[06/12/2019 12:12:11 AM]] Saving checkpoint /tmp/snapshot_basebot_1.53847215_390.pth...
[[06/12/2019 12:12:11 AM]] New low

[[06/12/2019 12:13:11 AM]] Step 520: train 1.134857 lr: 7.971e-02
[[06/12/2019 12:14:10 AM]] Step 650: train 1.075486 lr: 9.969e-02
[[06/12/2019 12:15:08 AM]] Step 780: train 1.001067 lr: 9.607e-02
100%|██████████| 40/40 [00:10<00:00,  4.00it/s]
[[06/12/2019 12:15:18 AM]] Criterion loss: 1.24838937
[[06/12/2019 12:15:18 AM]] accuracy: 59.44%
[[06/12/2019 12:15:18 AM]] Snapshot metric 1.24838937
[[06/12/2019 12:15:18 AM]] Saving checkpoint /tmp/snapshot_basebot_1.24838937_7

GPU Memory Used: 4433 MB
| Wide-Resnet 28x10
CPU times: user 19min 14s, sys: 12min 54s, total: 32min 8s
Wall time: 31min 14s
