In [1]:
# Show the GPU, driver and CUDA versions of this runtime before benchmarking.
!nvidia-smi

Wed Jun 12 04:02:06 2019       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.67       Driver Version: 410.79       CUDA Version: 10.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P8    30W / 149W |      0MiB / 11441MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

In [2]:
# Build and install NVIDIA Apex from a fresh clone. The two --global-option
# flags forward --cpp_ext and --cuda_ext to Apex's setup.py.
# NOTE(review): the clone is not pinned to a commit/tag, so a re-run may build
# a different Apex revision than the one that produced the logged results.
!git clone https://github.com/NVIDIA/apex /tmp/apex
!pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" /tmp/apex/.
# Fetch the experiment's helper modules and copy the .py files into the working
# directory so later cells can import them (e.g. `baseline`, `telegram_sender`).
!git clone https://github.com/ceshine/apex_pytorch_cifar_experiment /tmp/src
!cp /tmp/src/*.py .
# Remaining Python dependencies. Only pytorch_helper_bot is pinned (0.1.3 archive);
# NOTE(review): python-telegram-bot and pretrainedmodels are installed unpinned.
!pip install python-telegram-bot pretrainedmodels
!pip install https://github.com/ceshine/pytorch_helper_bot/archive/0.1.3.zip

Cloning into '/tmp/apex'...
remote: Enumerating objects: 4, done.
remote: Counting objects: 100% (4/4), done.
remote: Compressing objects: 100% (4/4), done.
remote: Total 4640 (delta 0), reused 1 (delta 0), pack-reused 4636
Receiving objects: 100% (4640/4640), 8.69 MiB | 9.38 MiB/s, done.
Resolving deltas: 100% (3012/3012), done.
  cmdoptions.check_install_build_global(options)
Created temporary directory: /tmp/pip-ephem-wheel-cache-5emvzj75
Created temporary directory: /tmp/pip-req-tracker-6ftfv19y
Created requirements tracker '/tmp/pip-req-tracker-6ftfv19y'
Created temporary directory: /tmp/pip-install-zer_fp0u
Processing /tmp/apex
  Created temporary directory: /tmp/pip-req-build-ve4v3i9u
  Added file:///tmp/apex to build tracker '/tmp/pip-req-tracker-6ftfv19y'
    Running setup.py (path:/tmp/pip-req-b

In [3]:
# Upload telegram tokens
from google.colab import files

# The upload result is used as a mapping of file name -> uploaded content;
# report each entry's name and size so the user can confirm what was received.
uploaded = files.upload()

for fn, payload in uploaded.items():
    print(f'User uploaded file "{fn}" with length {len(payload)} bytes')

Saving telegram_tokens.py to telegram_tokens.py
User uploaded file "telegram_tokens.py" with length 82 bytes


In [0]:
from pathlib import Path

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as transforms
import torchvision
from tqdm import tqdm
from helperbot import (
    TriangularLR, BaseBot, WeightDecayOptimizerWrapper,
    LearningRateSchedulerCallback
)
from helperbot.metrics import SoftmaxAccuracy
from apex import amp
from apex.optimizers import FusedAdam

from baseline import (
    CifarBot, get_cifar10_dataset,
    get_wide_resnet, get_se_resnext,
    get_gpu_memory_map
)
from telegram_tokens import BOT_TOKEN, CHAT_ID
from telegram_sender import telegram_sender

# Device handed to CifarBot by train_apex.
DEVICE = torch.device("cuda")
# Number of epochs; train_apex trains for EPOCHS * len(train_dl) steps.
EPOCHS = 10
# Zero-argument model factory called by train_apex (get_se_resnext is the
# other factory imported from `baseline`).
MODEL_FUNC = get_wide_resnet

@telegram_sender(token=BOT_TOKEN, chat_id=CHAT_ID)
def train_apex(level):
    """Train ``MODEL_FUNC()`` for ``EPOCHS`` epochs and save it to ``{level}.pth``.

    Args:
        level: Apex AMP ``opt_level`` string. ``"O0"`` is treated as the
            no-AMP path: ``amp.initialize`` is skipped and the bot is created
            with ``use_amp=False``. Any other value is forwarded unchanged to
            ``amp.initialize(..., opt_level=level)``.

    Returns:
        None.

    Side effects:
        * Writes logs to the current directory and checkpoints to ``/tmp/``
          while training; all checkpoints are removed at the end.
        * Prints the value reported by ``get_gpu_memory_map()``.
        * Pickles the final CPU model object to ``{level}.pth`` in the
          current directory.
    """
    use_amp = level != "O0"

    train_dl, valid_dl = get_cifar10_dataset(batch_size=128)
    steps_per_epoch = len(train_dl)
    n_steps = EPOCHS * steps_per_epoch
    # Training loss is averaged and logged three times per epoch.
    log_interval = steps_per_epoch // 3

    model = MODEL_FUNC()
    # Inactive alternatives kept for reference: Adam at lr=1.5e-3, either
    # wrapped as WeightDecayOptimizerWrapper(optim.Adam(...), 0.1) or with
    # weight_decay=1e-4.
    optimizer = optim.SGD(
        model.parameters(), lr=0.1,
        momentum=0.9, weight_decay=5e-4)
    if use_amp:
        model, optimizer = amp.initialize(
            model, optimizer, opt_level=level
        )

    # NOTE(review): the meaning of the positional 1000 and ratio=5 is defined
    # by helperbot's TriangularLR; the logged starting lr (1e-4 for a base lr
    # of 0.1) suggests 1000 is the max/min lr ratio -- confirm before tuning.
    lr_schedule = TriangularLR(
        optimizer, 1000, ratio=5, steps_per_cycle=n_steps
    )
    bot = CifarBot(
        log_dir=Path("."), checkpoint_dir=Path("/tmp/"),
        model=model, train_loader=train_dl, val_loader=valid_dl,
        optimizer=optimizer, echo=True,
        avg_window=log_interval,
        criterion=nn.CrossEntropyLoss(),
        device=DEVICE, clip_grad=10.,
        callbacks=[LearningRateSchedulerCallback(lr_schedule)],
        metrics=[SoftmaxAccuracy()],
        pbar=True,
        use_amp=use_amp
    )
    # Evaluate and snapshot once per epoch, keeping only one snapshot on disk.
    bot.train(
        n_steps,
        snapshot_interval=steps_per_epoch,
        log_interval=log_interval,
        keep_n_snapshots=1
    )
    print(f"GPU Memory Used: {get_gpu_memory_map()} MB")

    # Restore the checkpoint listed first in best_performers (presumably the
    # best-scoring snapshot -- confirm the ordering in helperbot), then clean
    # up every checkpoint file.
    bot.load_model(bot.best_performers[0][1])
    bot.remove_checkpoints(keep=0)

    # Copy the trained weights into a freshly built CPU model and save that
    # object instead of the trained instance (presumably so the pickle does
    # not carry AMP-modified state -- TODO confirm).
    # torch.save on a module pickles the whole object, so loading the file
    # later requires the model class to be importable.
    cpu_model = MODEL_FUNC().cpu()
    cpu_model.load_state_dict(bot.model.cpu().state_dict())
    torch.save(cpu_model, f"{level}.pth")

In [5]:
%%time
# Run with level "O0", which train_apex handles as the no-AMP path
# (amp.initialize skipped, use_amp=False); %%time reports the cell's run time.
train_apex("O0")

0it [00:00, ?it/s]

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


170500096it [00:05, 28469542.76it/s]                               


Files already downloaded and verified
| Wide-Resnet 28x10


[[06/12/2019 04:06:45 AM]] SEED: 9293
[[06/12/2019 04:06:45 AM]] # of parameters: 36,489,290
[[06/12/2019 04:06:45 AM]] # of trainable parameters: 36,489,290
[[06/12/2019 04:06:45 AM]] Optimizer SGD (
Parameter Group 0
    dampening: 0
    initial_lr: 0.1
    lr: 0.0001
    momentum: 0.9
    nesterov: False
    weight_decay: 0.0005
)
[[06/12/2019 04:06:45 AM]] Batches per epoch: 390
[[06/12/2019 04:09:50 AM]] Step 130: train 1.979096 lr: 1.977e-02
[[06/12/2019 04:12:54 AM]] Step 260: train 1.575693 lr: 3.975e-02
[[06/12/2019 04:15:59 AM]] Step 390: train 1.296550 lr: 5.973e-02
100%|██████████| 40/40 [01:05<00:00,  1.63s/it]
[[06/12/2019 04:17:04 AM]] Criterion loss: 1.70243775
[[06/12/2019 04:17:04 AM]] accuracy: 46.49%
[[06/12/2019 04:17:04 AM]] Snapshot metric 1.70243775
[[06/12/2019 04:17:04 AM]] Saving checkpoint /tmp/snapshot_basebot_1.70243775_390.pth...
[[06/12/2019 04:17:05 AM]] New low

[[06/12/2019 04:20:09 AM]] Step 520: train 1.128377 lr: 7.971e-02
[[06/12/2019 04:23:14 AM]

GPU Memory Used: 6786 MB
| Wide-Resnet 28x10
CPU times: user 56min 50s, sys: 46min 13s, total: 1h 43min 3s
Wall time: 1h 43min 7s
