In [1]:
import torch
import numpy as np
import torch.nn as nn
import math
from torch.optim.optimizer import Optimizer
from google.colab import files

In [None]:
%history

import torch
import numpy as np
import torch.nn as nn
import math
from torch.optim.optimizer import Optimizer
from google.colab import files
version_higher = ( torch.__version__ >= "1.5.0" )

class NestAdam(Optimizer):
    def __init__(self, params, lr=1e-03, betas=(0.9, 0.999), eps=1e-16,
                 weight_decay=0, amsgrad=False, weight_decouple=True, fixed_decay=False,
                 rectify=True, degenerated_to_sgd=True):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        if not 0.0 <= weight_decay:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
 

In [50]:
def _torch_at_least(major, minor):
    """Return True when the installed torch is at least ``major.minor``.

    The comparison is done on integers.  Comparing the raw version strings
    would order "1.10.0" before "1.5.0".
    """
    numbers = []
    # Drop any local build suffix ("+cu101") and look at major/minor only.
    for piece in str(torch.__version__).split('+')[0].split('.')[:2]:
        digits = ''
        for ch in piece:
            if not ch.isdigit():
                break
            digits += ch
        numbers.append(int(digits) if digits else 0)
    return tuple(numbers) >= (major, minor)


# Gate for passing ``memory_format`` to ``torch.zeros_like``.
version_higher = _torch_at_least(1, 5)


def _zeros_like_param(tensor):
    """Allocate a zero state buffer shaped like ``tensor``."""
    if version_higher:
        return torch.zeros_like(tensor, memory_format=torch.preserve_format)
    return torch.zeros_like(tensor)


class NAdam(Optimizer):
    """Adam-style optimizer with a Nesterov-like look-ahead on the gradient.

    For every parameter the optimizer keeps an exponential moving average of
    the gradient (``exp_avg``) and an exponential moving average of the squared
    difference between the gradient and that average (``exp_avg_var``).  The
    update direction is ``(1 - beta1) * grad + beta1 * exp_avg`` divided by the
    bias-corrected square root of ``exp_avg_var`` (plus ``eps``).

    Arguments:
        params: iterable of parameters or list of parameter-group dicts.
        lr (float): learning rate; must be >= 0.
        betas (tuple): ``(beta1, beta2)`` decay rates, each in ``[0, 1)``.
        eps (float): term added to the denominator; must be >= 0.
        weight_decay (float): weight-decay coefficient; must be >= 0.
        amsgrad (bool): normalise with the running maximum of ``exp_avg_var``
            instead of its current value.
        weight_decouple (bool): if True the parameter is scaled by
            ``1 - lr * weight_decay`` (or ``1 - weight_decay`` when
            ``fixed_decay`` is True) before the update; otherwise
            ``weight_decay * p`` is added to the update direction.
        fixed_decay (bool): see ``weight_decouple``.

    Raises:
        ValueError: if ``lr``, ``eps``, ``betas`` or ``weight_decay`` is
            outside the ranges given above.
    """

    def __init__(self, params, lr=1e-03, betas=(0.9, 0.999), eps=1e-16,
                 weight_decay=0, amsgrad=False, weight_decouple=True, fixed_decay=False):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        if not 0.0 <= weight_decay:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))

        # Groups that override ``betas`` receive their own ``buffer`` entry.
        # ``step`` below never reads ``buffer``; it is kept so the contents of
        # ``param_groups`` stay as they were.
        if isinstance(params, (list, tuple)) and len(params) > 0 and isinstance(params[0], dict):
            for param in params:
                if 'betas' in param and (param['betas'][0] != betas[0] or param['betas'][1] != betas[1]):
                    param['buffer'] = [[None, None, None] for _ in range(10)]

        defaults = dict(lr=lr, betas=betas, eps=eps,
                        weight_decay=weight_decay, amsgrad=amsgrad, buffer=[[None, None, None] for _ in range(10)])
        super(NAdam, self).__init__(params, defaults)

        # Optimizer-wide switches (not stored per parameter group).
        self.weight_decouple = weight_decouple
        self.fixed_decay = fixed_decay

    def __setstate__(self, state):
        """Restore optimizer state, defaulting ``amsgrad`` to False per group."""
        super(NAdam, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('amsgrad', False)

    def step(self, closure=None):
        """Performs a single optimization step.
        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        Returns:
            The value returned by ``closure``, or None when no closure is given.
        Raises:
            RuntimeError: if a parameter has a sparse gradient.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue

                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError(
                        'NAdam does not support sparse gradients, please consider SparseAdam instead')
                amsgrad = group['amsgrad']

                state = self.state[p]

                beta1, beta2 = group['betas']

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = _zeros_like_param(p.data)
                    # Exponential moving average of the squared deviation of
                    # the gradient from exp_avg
                    state['exp_avg_var'] = _zeros_like_param(p.data)

                    if amsgrad:
                        # Maintains max of all exp_avg_var values seen so far
                        state['max_exp_avg_var'] = _zeros_like_param(p.data)

                # get current state variable
                exp_avg, exp_avg_var = state['exp_avg'], state['exp_avg_var']

                state['step'] += 1
                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']

                # Approximate Nesterov's Accelerated Gradient
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                grad_diff = grad - exp_avg
                exp_avg_var.mul_(beta2).addcmul_(grad_diff, grad_diff, value=1 - beta2)
                # Look-ahead direction.  Built out of place so that p.grad
                # still holds the gradient produced by backward() afterwards.
                update = grad.mul(1 - beta1).add_(exp_avg, alpha=beta1)

                if amsgrad:
                    max_exp_avg_var = state['max_exp_avg_var']
                    # Maintains the maximum of all 2nd moment running avg. till now
                    torch.max(max_exp_avg_var, exp_avg_var, out=max_exp_avg_var)

                    # Use the max. for normalizing running avg. of gradient
                    denom = (max_exp_avg_var.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])

                else:
                    denom = (exp_avg_var.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])

                # perform weight decay, check if decoupled weight decay
                if self.weight_decouple:
                    if not self.fixed_decay:
                        p.data.mul_(1.0 - group['lr'] * group['weight_decay'])
                    else:
                        p.data.mul_(1.0 - group['weight_decay'])
                else:
                    if group['weight_decay'] != 0:
                        # Coupled decay is added after the moment estimates
                        # were updated, so it only affects this step's update.
                        update.add_(p.data, alpha=group['weight_decay'])

                # Update
                step_size = group['lr'] / bias_correction1
                p.data.addcdiv_(update, denom, value=-step_size)

        return loss

In [4]:
import torchvision
import torchvision.transforms as transforms

In [5]:
# Per-channel normalisation shared by both pipelines.
# NOTE(review): presumably the commonly used ImageNet channel statistics - confirm.
normalize = transforms.Normalize(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225],
)

# Training pipeline: resize, random 224 crop, random horizontal flip,
# tensor conversion, normalisation.
train_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    normalize,
])

# Evaluation pipeline: resize, centre 224 crop, tensor conversion, normalisation.
test_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    normalize,
])

In [6]:
# Mini-batch size used by both loaders.
batch_size = 100

# CIFAR-10 training split (downloaded to ./data when missing), reshuffled
# by the loader.
trainset = torchvision.datasets.CIFAR10(
    root='./data',
    train=True,
    download=True,
    transform=train_transform,
)
trainloader = torch.utils.data.DataLoader(
    trainset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=2,
)

# CIFAR-10 test split, iterated in a fixed order.
testset = torchvision.datasets.CIFAR10(
    root='./data',
    train=False,
    download=True,
    transform=test_transform,
)
testloader = torch.utils.data.DataLoader(
    testset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=2,
)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified


In [7]:
import time

def timeSince(since):
    """Format the time elapsed since ``since`` as ``'<minutes>m <seconds>s'``.

    ``since`` is a timestamp as returned by ``time.time()``.  Both fields
    are rendered as integers.
    """
    elapsed = time.time() - since
    minutes, seconds = divmod(elapsed, 60)
    return '%dm %ds' % (minutes, seconds)

In [8]:
import torchvision.models as models

In [9]:
def evaluateImageModel(model, data, criterion, device):
    """Evaluate ``model`` on every batch of ``data`` without tracking gradients.

    The model is switched to eval mode and is left in that mode on return.

    Arguments:
        model: callable module mapping an image batch to per-class scores.
        data: sized iterable of ``(image, labels)`` batches.
        criterion: loss taking ``(output, labels)`` and returning a scalar tensor.
        device: device the batches are moved to before the forward pass.

    Returns:
        ``(mean_loss, error_rate)``: the sum of the per-batch losses divided
        by the number of batches, and ``1 - correct / total`` over all samples.
    """
    model.eval()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0
    with torch.no_grad():
        for batch_images, batch_labels in data:
            batch_images = batch_images.to(device)
            batch_labels = batch_labels.to(device)
            scores = model(batch_images)
            # Predicted class = index of the largest score along dim 1.
            predicted = torch.argmax(scores, dim=1)
            n_correct += (predicted == batch_labels).sum().item()
            n_seen += len(batch_labels)
            loss_sum += criterion(scores, batch_labels).item()

    mean_loss = loss_sum / len(data)
    error_rate = 1 - n_correct / n_seen
    return mean_loss, error_rate

def trainImageModel(model, train_data, val_data, n_epochs, criterion, optimizer, device, path):
    """Train ``model`` for ``n_epochs`` passes over ``train_data``.

    Every ``len(train_data)`` optimisation steps the model is evaluated on
    ``val_data`` with ``evaluateImageModel``, a progress line is printed, and
    the running training statistics are reset.  Whenever the validation error
    is the first one measured or strictly lower than the best so far, the
    whole model object is written to ``path`` with ``torch.save``.

    Arguments:
        model: network to train.
        train_data: sized iterable of ``(image, labels)`` training batches.
        val_data: batches passed to ``evaluateImageModel``.
        n_epochs: number of passes over ``train_data``.
        criterion: loss taking ``(output, labels)``.
        optimizer: optimizer stepping the model's parameters.
        device: device the batches are moved to.
        path: file the best model so far is saved to.

    Returns:
        ``(train_error_list, val_error_list)`` with one entry per report.
    """
    model.train()
    train_errors, val_errors = [], []
    best_val_error = None
    batches_done = 0
    report_interval = len(train_data)
    started_at = time.time()
    for epoch_idx in range(n_epochs):
        loss_acc, hits, seen = 0.0, 0, 0
        for image, labels in train_data:
            optimizer.zero_grad()
            batches_done += 1
            image = image.to(device)
            labels = labels.to(device)
            output = model(image)
            hits += (torch.argmax(output, dim=1) == labels).sum().item()
            seen += len(labels)
            loss = criterion(output, labels)
            loss_acc += loss.item()
            loss.backward()
            optimizer.step()

            # Nothing more to do until a reporting boundary is reached.
            if batches_done % report_interval != 0:
                continue

            val_loss, val_error = evaluateImageModel(model, val_data, criterion, device)
            train_error = 1 - hits / seen
            print(('%d/%d (%s) train loss: %.3f, train error: %.2f%%, val loss: %.3f, val error: %.2f%%') %
                  (epoch_idx + 1, n_epochs, timeSince(started_at), loss_acc / len(train_data),
                   100 * train_error, val_loss, 100 * val_error))
            train_errors.append(train_error)
            val_errors.append(val_error)

            if best_val_error is None:
                print(('Validation error rate in first epoch: %.2f%%') % (100 * val_error))
                is_new_best = True
            elif best_val_error > val_error:
                print(('Validation error rate is decreasing: %.2f%% --> %.2f%%') %
                      (100 * best_val_error, 100 * val_error))
                is_new_best = True
            else:
                is_new_best = False

            if is_new_best:
                best_val_error = val_error
                print('Saving model...')
                torch.save(model, path)

            # Evaluation left the model in eval mode; return to training mode
            # and start a fresh window of running statistics.
            model.train()
            loss_acc, hits, seen = 0.0, 0, 0

    return train_errors, val_errors

In [44]:
# ResNet-34 run trained with the NAdam optimizer.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = models.resnet34()
# Place the model on its device before the optimizer is built from its
# parameters, the order recommended by the torch.optim documentation.
model.to(device)
criterion = nn.CrossEntropyLoss()
n_epochs = 50
# Clear any lists left over from an earlier execution of this cell.
nadam_train_list = None
nadam_test_list = None
optimizer = NAdam(model.parameters())
path = 'nadamRes.pt'
nadam_train_list, nadam_test_list = trainImageModel(model, trainloader, testloader,
                                                    n_epochs, criterion, optimizer,
                                                    device, path)
# Persist the training / validation error lists returned by the run.
torch.save(nadam_train_list, 'nadamres_train.pt')
torch.save(nadam_test_list, 'nadamres_test.pt')

1/50 (1m 51s) train loss: 1.811, train error: 65.89%, val loss: 1.389, val error: 50.26%
Validation error rate in first epoch: 50.26%
Saving model...
2/50 (3m 43s) train loss: 1.417, train error: 51.06%, val loss: 1.057, val error: 37.07%
Validation error rate is decreasing: 50.26% --> 37.07%
Saving model...
3/50 (5m 35s) train loss: 1.205, train error: 43.01%, val loss: 0.983, val error: 34.37%
Validation error rate is decreasing: 37.07% --> 34.37%
Saving model...
4/50 (7m 28s) train loss: 1.068, train error: 37.89%, val loss: 0.837, val error: 28.64%
Validation error rate is decreasing: 34.37% --> 28.64%
Saving model...
5/50 (9m 21s) train loss: 0.971, train error: 34.19%, val loss: 0.661, val error: 23.64%
Validation error rate is decreasing: 28.64% --> 23.64%
Saving model...
6/50 (11m 14s) train loss: 0.907, train error: 31.90%, val loss: 0.694, val error: 23.95%
7/50 (13m 7s) train loss: 0.859, train error: 30.23%, val loss: 0.566, val error: 19.69%
Validation error rate is decrea

In [None]:
# ShuffleNet V2 (x1.0) run.
# NOTE(review): NestAdam is not defined by any code cell visible in this
# notebook - confirm it is defined in the kernel before running this cell.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = models.shufflenet_v2_x1_0()
# Place the model on its device before the optimizer is built from its
# parameters, the order recommended by the torch.optim documentation.
model.to(device)
criterion = nn.CrossEntropyLoss()
n_epochs = 50
# Clear any lists left over from an earlier execution of this cell.
nadam_train_list = None
nadam_test_list = None
optimizer = NestAdam(model.parameters())
# File-name stem for this run; spelling kept so the checkpoint name
# ('nadamshffule.pt') is unchanged.
name = 'nadamshffule'
path = name + '.pt'
nadam_train_list, nadam_test_list = trainImageModel(model, trainloader, testloader,
                                                    n_epochs, criterion, optimizer,
                                                    device, path)
# Save the error lists under this run's own stem; writing them to
# 'nadamres_*.pt' would overwrite the ResNet-34 results.
torch.save(nadam_train_list, name + '_train.pt')
torch.save(nadam_test_list, name + '_test.pt')

1/50 (1m 38s) train loss: 1.724, train error: 62.83%, val loss: 1.336, val error: 48.49%
Validation error rate in first epoch: 48.49%
Saving model...
2/50 (3m 15s) train loss: 1.385, train error: 49.66%, val loss: 1.076, val error: 38.44%
Validation error rate is decreasing: 48.49% --> 38.44%
Saving model...
3/50 (4m 53s) train loss: 1.206, train error: 42.85%, val loss: 0.936, val error: 32.11%
Validation error rate is decreasing: 38.44% --> 32.11%
Saving model...
4/50 (6m 30s) train loss: 1.090, train error: 38.65%, val loss: 0.754, val error: 26.63%
Validation error rate is decreasing: 32.11% --> 26.63%
Saving model...
5/50 (8m 9s) train loss: 0.992, train error: 35.12%, val loss: 0.708, val error: 24.24%
Validation error rate is decreasing: 26.63% --> 24.24%
Saving model...
6/50 (9m 48s) train loss: 0.931, train error: 32.55%, val loss: 0.705, val error: 24.38%
7/50 (11m 29s) train loss: 0.876, train error: 30.60%, val loss: 0.585, val error: 19.93%
Validation error rate is decreas

In [None]:
# MobileNetV2 run.
# NOTE(review): NestAdam is not defined by any code cell visible in this
# notebook - confirm it is defined in the kernel before running this cell.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = models.mobilenet_v2()
# Place the model on its device before the optimizer is built from its
# parameters, the order recommended by the torch.optim documentation.
model.to(device)
criterion = nn.CrossEntropyLoss()
n_epochs = 50
# Clear any lists left over from an earlier execution of this cell.
nadam_train_list = None
nadam_test_list = None
optimizer = NestAdam(model.parameters())
# File-name stem shared by the checkpoint and the two result files.
name = 'nadammobile'
path = name + '.pt'
nadam_train_list, nadam_test_list = trainImageModel(model, trainloader, testloader,
                                                    n_epochs, criterion, optimizer,
                                                    device, path)
# Save the error lists, then hand both files to the Colab download helper.
torch.save(nadam_train_list, name + '_train.pt')
torch.save(nadam_test_list, name + '_test.pt')
files.download(name + '_train.pt')
files.download(name + '_test.pt')

1/50 (1m 46s) train loss: 1.680, train error: 61.48%, val loss: 1.416, val error: 49.14%
Validation error rate in first epoch: 49.14%
Saving model...
2/50 (3m 31s) train loss: 1.363, train error: 48.80%, val loss: 1.098, val error: 39.37%
Validation error rate is decreasing: 49.14% --> 39.37%
Saving model...
3/50 (5m 17s) train loss: 1.197, train error: 42.66%, val loss: 0.857, val error: 30.29%
Validation error rate is decreasing: 39.37% --> 30.29%
Saving model...
4/50 (7m 3s) train loss: 1.080, train error: 38.18%, val loss: 0.764, val error: 26.89%
Validation error rate is decreasing: 30.29% --> 26.89%
Saving model...
5/50 (8m 48s) train loss: 1.000, train error: 35.42%, val loss: 0.713, val error: 24.21%
Validation error rate is decreasing: 26.89% --> 24.21%
Saving model...
6/50 (10m 32s) train loss: 0.939, train error: 33.15%, val loss: 0.609, val error: 21.11%
Validation error rate is decreasing: 24.21% --> 21.11%
Saving model...
7/50 (12m 17s) train loss: 0.889, train error: 31.

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# DenseNet-121 run.
# NOTE(review): NestAdam is not defined by any code cell visible in this
# notebook - confirm it is defined in the kernel before running this cell.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = models.densenet121()
# Place the model on its device before the optimizer is built from its
# parameters, the order recommended by the torch.optim documentation.
model.to(device)
criterion = nn.CrossEntropyLoss()
n_epochs = 50
# Clear any lists left over from an earlier execution of this cell.
nadam_train_list = None
nadam_test_list = None
optimizer = NestAdam(model.parameters())
# File-name stem shared by the checkpoint and the two result files.
name = 'nadamdense'
path = name + '.pt'
nadam_train_list, nadam_test_list = trainImageModel(model, trainloader, testloader,
                                                    n_epochs, criterion, optimizer,
                                                    device, path)
# Save the error lists, then hand both files to the Colab download helper.
torch.save(nadam_train_list, name + '_train.pt')
torch.save(nadam_test_list, name + '_test.pt')
files.download(name + '_train.pt')
files.download(name + '_test.pt')

1/50 (3m 33s) train loss: 1.834, train error: 64.05%, val loss: 1.518, val error: 51.73%
Validation error rate in first epoch: 51.73%
Saving model...
2/50 (7m 5s) train loss: 1.348, train error: 48.44%, val loss: 1.137, val error: 40.16%
Validation error rate is decreasing: 51.73% --> 40.16%
Saving model...
3/50 (10m 38s) train loss: 1.148, train error: 40.95%, val loss: 0.827, val error: 28.91%
Validation error rate is decreasing: 40.16% --> 28.91%
Saving model...
4/50 (14m 10s) train loss: 1.015, train error: 35.71%, val loss: 0.720, val error: 24.51%
Validation error rate is decreasing: 28.91% --> 24.51%
Saving model...
5/50 (17m 42s) train loss: 0.924, train error: 32.40%, val loss: 0.605, val error: 21.36%
Validation error rate is decreasing: 24.51% --> 21.36%
Saving model...
6/50 (21m 14s) train loss: 0.855, train error: 30.05%, val loss: 0.567, val error: 19.41%
Validation error rate is decreasing: 21.36% --> 19.41%
Saving model...
7/50 (24m 46s) train loss: 0.803, train error: 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [53]:
# Shell escape: print the NVIDIA driver's view of the GPU (memory in use, utilisation).
!nvidia-smi

Tue Dec 15 08:23:00 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.45.01    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   49C    P0    54W / 300W |  15619MiB / 16130MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Shell escape: list the processes holding the NVIDIA device files open.
!lsof /dev/nvidia*

COMMAND  PID USER   FD   TYPE  DEVICE SIZE/OFF  NODE NAME
python3 3224 root  mem    CHR 195,255          17360 /dev/nvidiactl
python3 3224 root  mem    CHR   195,0          17359 /dev/nvidia0
python3 3224 root  mem    CHR   246,0          17357 /dev/nvidia-uvm
python3 3224 root   59u   CHR 195,255      0t0 17360 /dev/nvidiactl
python3 3224 root   62u   CHR   246,0      0t0 17357 /dev/nvidia-uvm
python3 3224 root   63u   CHR   195,0      0t0 17359 /dev/nvidia0
python3 3224 root   64u   CHR   195,0      0t0 17359 /dev/nvidia0
python3 3224 root   65u   CHR   195,0      0t0 17359 /dev/nvidia0
python3 3224 root   70u   CHR   195,0      0t0 17359 /dev/nvidia0
python3 3224 root   71u   CHR   195,0      0t0 17359 /dev/nvidia0
python3 3224 root   72u   CHR   195,0      0t0 17359 /dev/nvidia0
python3 3224 root   75u   CHR   195,0      0t0 17359 /dev/nvidia0
python3 3224 root   76u   CHR   195,0      0t0 17359 /dev/nvidia0
python3 3224 root   77u   CHR   195,0      0t0 17359 /dev/nvidia0
python3 

In [None]:
# Shell escape: send SIGKILL to process id 64.
# NOTE(review): in the lsof listing captured by the previous cell the PID column
# reads 3224, while 64 only appears as a file-descriptor entry (64u) - confirm
# that PID 64 is really the intended target.
!kill -9 64