### Imports

In [1]:
# https://github.com/ildoonet/pytorch-randaugment
# !pip install git+https://github.com/ildoonet/pytorch-randaugment

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import torchvision
import torchvision.transforms as transforms

from RandAugment import RandAugment

import os
import time

In [3]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

### Data

In [4]:
_CIFAR_MEAN, _CIFAR_STD = (0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)

transform = transforms.Compose([
#     transforms.RandomCrop(32, padding=4),
#     transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(_CIFAR_MEAN, _CIFAR_STD),
])

# Add RandAugment with N, M(hyperparameter)
transform.transforms.insert(0, RandAugment(n=3, m=2))

batch_size = 128

trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size,
                                          shuffle=True, num_workers=2)

testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                       download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size,
                                         shuffle=False, num_workers=2)

classes = ('plane', 'car', 'bird', 'cat',
           'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

Files already downloaded and verified
Files already downloaded and verified


### Model

In [5]:
def initialize_weights(module):
    if isinstance(module, nn.Conv2d):
        nn.init.kaiming_normal_(module.weight.data, mode='fan_out')
    elif isinstance(module, nn.BatchNorm2d):
        module.weight.data.fill_(1)
        module.bias.data.zero_()
    elif isinstance(module, nn.Linear):
        module.bias.data.zero_()


class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self,
                 in_channels,
                 out_channels,
                 stride,
                 remove_first_relu,
                 add_last_bn,
                 preact=False):
        super(BasicBlock, self).__init__()

        self._remove_first_relu = remove_first_relu
        self._add_last_bn = add_last_bn
        self._preact = preact

        self.bn1 = nn.BatchNorm2d(in_channels)
        self.conv1 = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size=3,
            stride=stride,  # downsample with first conv
            padding=1,
            bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(
            out_channels,
            out_channels,
            kernel_size=3,
            stride=1,
            padding=1,
            bias=False)

        if add_last_bn:
            self.bn3 = nn.BatchNorm2d(out_channels)

        self.shortcut = nn.Sequential()
        if in_channels != out_channels:
            self.shortcut.add_module(
                'conv',
                nn.Conv2d(
                    in_channels,
                    out_channels,
                    kernel_size=1,
                    stride=stride,  # downsample
                    padding=0,
                    bias=False))

    def forward(self, x):
        if self._preact:
            x = F.relu(
                self.bn1(x), inplace=True)  # shortcut after preactivation
            y = self.conv1(x)
        else:
            # preactivation only for residual path
            y = self.bn1(x)
            if not self._remove_first_relu:
                y = F.relu(y, inplace=True)
            y = self.conv1(y)

        y = F.relu(self.bn2(y), inplace=True)
        y = self.conv2(y)

        if self._add_last_bn:
            y = self.bn3(y)

        y += self.shortcut(x)
        return y


class BottleneckBlock(nn.Module):
    expansion = 4

    def __init__(self,
                 in_channels,
                 out_channels,
                 stride,
                 remove_first_relu,
                 add_last_bn,
                 preact=False):
        super(BottleneckBlock, self).__init__()

        self._remove_first_relu = remove_first_relu
        self._add_last_bn = add_last_bn
        self._preact = preact

        bottleneck_channels = out_channels // self.expansion

        self.bn1 = nn.BatchNorm2d(in_channels)
        self.conv1 = nn.Conv2d(
            in_channels,
            bottleneck_channels,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=False)
        self.bn2 = nn.BatchNorm2d(bottleneck_channels)
        self.conv2 = nn.Conv2d(
            bottleneck_channels,
            bottleneck_channels,
            kernel_size=3,
            stride=stride,  # downsample with 3x3 conv
            padding=1,
            bias=False)
        self.bn3 = nn.BatchNorm2d(bottleneck_channels)
        self.conv3 = nn.Conv2d(
            bottleneck_channels,
            out_channels,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=False)

        if add_last_bn:
            self.bn4 = nn.BatchNorm2d(out_channels)

        self.shortcut = nn.Sequential()  # identity
        if in_channels != out_channels:
            self.shortcut.add_module(
                'conv',
                nn.Conv2d(
                    in_channels,
                    out_channels,
                    kernel_size=1,
                    stride=stride,  # downsample
                    padding=0,
                    bias=False))

    def forward(self, x):
        if self._preact:
            x = F.relu(
                self.bn1(x), inplace=True)  # shortcut after preactivation
            y = self.conv1(x)
        else:
            # preactivation only for residual path
            y = self.bn1(x)
            if not self._remove_first_relu:
                y = F.relu(y, inplace=True)
            y = self.conv1(y)

        y = F.relu(self.bn2(y), inplace=True)
        y = self.conv2(y)
        y = F.relu(self.bn3(y), inplace=True)
        y = self.conv3(y)

        if self._add_last_bn:
            y = self.bn4(y)

        y += self.shortcut(x)
        return y

class Network(nn.Module):
    def __init__(self, config):
        super(Network, self).__init__()

        input_shape = config['input_shape']
        n_classes = config['n_classes']

        base_channels = config['base_channels']
        self._remove_first_relu = False
        self._add_last_bn = False
        block_type = config['block_type']
        depth = config['depth']
        preact_stage = [True, True, True]

        assert block_type in ['basic', 'bottleneck']
        if block_type == 'basic':
            block = BasicBlock
            n_blocks_per_stage = (depth - 2) // 6
            assert n_blocks_per_stage * 6 + 2 == depth
        else:
            block = BottleneckBlock
            n_blocks_per_stage = (depth - 2) // 9
            assert n_blocks_per_stage * 9 + 2 == depth

        n_channels = [
            base_channels,
            base_channels * 2 * block.expansion,
            base_channels * 4 * block.expansion,
        ]

        self.conv = nn.Conv2d(
            input_shape[1],
            n_channels[0],
            kernel_size=(3, 3),
            stride=1,
            padding=1,
            bias=False)

        self.stage1 = self._make_stage(
            n_channels[0],
            n_channels[0],
            n_blocks_per_stage,
            block,
            stride=1,
            preact=preact_stage[0])
        self.stage2 = self._make_stage(
            n_channels[0],
            n_channels[1],
            n_blocks_per_stage,
            block,
            stride=2,
            preact=preact_stage[1])
        self.stage3 = self._make_stage(
            n_channels[1],
            n_channels[2],
            n_blocks_per_stage,
            block,
            stride=2,
            preact=preact_stage[2])
        self.bn = nn.BatchNorm2d(n_channels[2])

        # compute conv feature size
        with torch.no_grad():
            self.feature_size = self._forward_conv(
                torch.zeros(*input_shape)).view(-1).shape[0]

        self.fc = nn.Linear(self.feature_size, n_classes)

        # initialize weights
        self.apply(initialize_weights)

    def _make_stage(self, in_channels, out_channels, n_blocks, block, stride,
                    preact):
        stage = nn.Sequential()
        for index in range(n_blocks):
            block_name = 'block{}'.format(index + 1)
            if index == 0:
                stage.add_module(
                    block_name,
                    block(
                        in_channels,
                        out_channels,
                        stride=stride,
                        remove_first_relu=self._remove_first_relu,
                        add_last_bn=self._add_last_bn,
                        preact=preact))
            else:
                stage.add_module(
                    block_name,
                    block(
                        out_channels,
                        out_channels,
                        stride=1,
                        remove_first_relu=self._remove_first_relu,
                        add_last_bn=self._add_last_bn,
                        preact=False))
        return stage

    def _forward_conv(self, x):
        x = self.conv(x)
        x = self.stage1(x)
        x = self.stage2(x)
        x = self.stage3(x)
        x = F.relu(
            self.bn(x),
            inplace=True)  # apply BN and ReLU before average pooling
        x = F.adaptive_avg_pool2d(x, output_size=1)
        return x

    def forward(self, x):
        x = self._forward_conv(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return x

### Training Functions

In [6]:
class AverageMeter:
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, num):
        self.val = val
        self.sum += val * num
        self.count += num
        self.avg = self.sum / self.count

def train(epoch, model, optimizer, criterion, train_loader, device):
    global global_step

    print('Train {}'.format(epoch))

    model.train()

    loss_meter = AverageMeter()
    accuracy_meter = AverageMeter()
    start = time.time()
    for step, (data, targets) in enumerate(train_loader):
        global_step += 1

        data = data.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()

        outputs = model(data)
        loss = criterion(outputs, targets)
        loss.backward()

        optimizer.step()

        num = data.size(0)

        loss_ = loss.item()
        loss_meter.update(loss_, num)

        if len(targets.shape) > 1:
            k=2
            y_weights, y_idx = torch.topk(targets, k=k, dim=1)
            out_weights, out_idx = torch.topk(outputs, k=k, dim=1)
            correct_ = torch.sum(torch.eq(y_idx, out_idx) * y_weights)
            accuracy = correct_ / num
        else:
            _, preds = torch.max(outputs, dim=1)
            correct_ = preds.eq(targets).sum().item()
            accuracy = correct_ / num

        accuracy_meter.update(accuracy, num)

        if step % 100 == 0:
            print('Epoch {} Step {}/{} '
                        'Loss {:.4f} ({:.4f}) '
                        'Accuracy {:.4f} ({:.4f})'.format(
                            epoch,
                            step,
                            len(train_loader),
                            loss_meter.val,
                            loss_meter.avg,
                            accuracy_meter.val,
                            accuracy_meter.avg,
                        ))

    elapsed = time.time() - start
    print('Elapsed {:.2f}'.format(elapsed))

    train_log = {
        'epoch': epoch,
        'train': {
            'loss': loss_meter.avg,
            'accuracy': accuracy,
            'time': elapsed,
        }
    }

    return train_log

def test(epoch, model, criterion, test_loader, device):
    print('Test {}'.format(epoch))

    model.eval()

    loss_meter = AverageMeter()
    correct_meter = AverageMeter()
    start = time.time()
    with torch.no_grad():
        for step, (data, targets) in enumerate(test_loader):

            data = data.to(device)
            targets = targets.to(device)

            outputs = model(data)
            loss = criterion(outputs, targets)

            _, preds = torch.max(outputs, dim=1)

            correct_ = preds.eq(targets).sum().item()
            correct_meter.update(correct_, 1)

            num = data.size(0)

            loss_ = loss.item()
            loss_meter.update(loss_, num)

    accuracy = correct_meter.sum / len(test_loader.dataset)

    print('Epoch {} Loss {:.4f} Accuracy {:.4f}'.format(
        epoch, loss_meter.avg, accuracy))

    elapsed = time.time() - start
    print('Elapsed {:.2f}'.format(elapsed))

    test_log = {
        'epoch': epoch,
        'test': {
            'loss': loss_meter.avg,
            'accuracy': accuracy,
            'time': elapsed
        }
    }

    return test_log

### Training Loop

In [7]:
save_dir = "./pretrained/"

epochs = 300
early_stopping_tolerance = 10

base_lr = 0.2
weight_decay = 1e-4
momentum = 0.9
nesterov = True

In [8]:
model = Network({
    'block_type': 'basic',
    'depth': 20,
    'base_channels': 64,
    'input_shape': [1,3,32,32],
    'n_classes': 10}).to(device)

optimizer = torch.optim.SGD(
    model.parameters(),
    lr=base_lr,
    momentum=momentum,
    weight_decay=weight_decay,
    nesterov=nesterov)

scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, epochs, 0)

criterion = nn.CrossEntropyLoss()

In [9]:
global_step = 0
early_stopping_steps = 0

epoch_logs = []
best_acc = 0
for epoch in range(1, epochs+1):        
    
    train_log = train(epoch, model, optimizer, criterion, trainloader, device)
    test_log  = test(epoch, model, criterion, testloader, device)

    early_stopping_steps += 1
    
    scheduler.step()

    epoch_logs.append(train_log)
    epoch_logs.append(test_log)

    if test_log['test']['accuracy'] > best_acc:
        early_stopping_steps = 0
        for item in os.listdir(save_dir):
            if item.endswith(".pth"):
                os.remove(os.path.join(save_dir, item))
        best_acc = test_log['test']['accuracy']
        model_path = os.path.join(save_dir, '.CIFAR10', 'resnet_cifar10_' + str(round(best_acc, 2)) + '.pth')
        torch.save(model, model_path)

    if early_stopping_steps > early_stopping_tolerance:
        break
        
print('best_acc', best_acc)

Train 1
Epoch 1 Step 0/391 Loss 2.3368 (2.3368) Accuracy 0.1172 (0.1172)
Epoch 1 Step 100/391 Loss 2.2068 (2.2969) Accuracy 0.1797 (0.1315)
Epoch 1 Step 200/391 Loss 2.0690 (2.2282) Accuracy 0.1953 (0.1560)
Epoch 1 Step 300/391 Loss 1.9651 (2.1554) Accuracy 0.2500 (0.1859)
Elapsed 77.17
Test 1
Epoch 1 Loss 1.8590 Accuracy 0.3124
Elapsed 4.49
Train 2
Epoch 2 Step 0/391 Loss 1.8246 (1.8246) Accuracy 0.3672 (0.3672)
Epoch 2 Step 100/391 Loss 1.4798 (1.6674) Accuracy 0.4688 (0.3987)
Epoch 2 Step 200/391 Loss 1.2563 (1.6162) Accuracy 0.5859 (0.4161)
Epoch 2 Step 300/391 Loss 1.3605 (1.5516) Accuracy 0.5078 (0.4415)
Elapsed 71.09
Test 2
Epoch 2 Loss 1.4482 Accuracy 0.5056
Elapsed 4.49
Train 3
Epoch 3 Step 0/391 Loss 1.4472 (1.4472) Accuracy 0.5469 (0.5469)
Epoch 3 Step 100/391 Loss 1.3016 (1.2738) Accuracy 0.5312 (0.5496)
Epoch 3 Step 200/391 Loss 1.0039 (1.2436) Accuracy 0.5859 (0.5613)
Epoch 3 Step 300/391 Loss 1.2156 (1.2088) Accuracy 0.5781 (0.5751)
Elapsed 70.98
Test 3
Epoch 3 Loss 1.23

Epoch 24 Step 300/391 Loss 0.5762 (0.5685) Accuracy 0.8203 (0.8015)
Elapsed 71.00
Test 24
Epoch 24 Loss 0.7706 Accuracy 0.7354
Elapsed 4.41
Train 25
Epoch 25 Step 0/391 Loss 0.6155 (0.6155) Accuracy 0.7891 (0.7891)
Epoch 25 Step 100/391 Loss 0.4602 (0.5685) Accuracy 0.8281 (0.7999)
Epoch 25 Step 200/391 Loss 0.5899 (0.5613) Accuracy 0.7969 (0.8042)
Epoch 25 Step 300/391 Loss 0.6046 (0.5673) Accuracy 0.7656 (0.8035)
Elapsed 71.02
Test 25
Epoch 25 Loss 0.7745 Accuracy 0.7328
Elapsed 4.44
Train 26
Epoch 26 Step 0/391 Loss 0.7939 (0.7939) Accuracy 0.7109 (0.7109)
Epoch 26 Step 100/391 Loss 0.4551 (0.5373) Accuracy 0.8516 (0.8117)
Epoch 26 Step 200/391 Loss 0.6789 (0.5473) Accuracy 0.7734 (0.8103)
Epoch 26 Step 300/391 Loss 0.4378 (0.5542) Accuracy 0.8672 (0.8070)
Elapsed 71.02
Test 26
Epoch 26 Loss 0.7475 Accuracy 0.7489
Elapsed 4.45
Train 27
Epoch 27 Step 0/391 Loss 0.6102 (0.6102) Accuracy 0.8047 (0.8047)
Epoch 27 Step 100/391 Loss 0.6300 (0.5332) Accuracy 0.7969 (0.8146)
Epoch 27 Step 2

Epoch 48 Step 0/391 Loss 0.4230 (0.4230) Accuracy 0.8281 (0.8281)
Epoch 48 Step 100/391 Loss 0.4582 (0.4593) Accuracy 0.8438 (0.8356)
Epoch 48 Step 200/391 Loss 0.5487 (0.4711) Accuracy 0.8203 (0.8322)
Epoch 48 Step 300/391 Loss 0.4767 (0.4762) Accuracy 0.8125 (0.8319)
Elapsed 71.07
Test 48
Epoch 48 Loss 0.7796 Accuracy 0.7457
Elapsed 4.46
Train 49
Epoch 49 Step 0/391 Loss 0.4024 (0.4024) Accuracy 0.8750 (0.8750)
Epoch 49 Step 100/391 Loss 0.4410 (0.4584) Accuracy 0.8594 (0.8388)
Epoch 49 Step 200/391 Loss 0.4610 (0.4635) Accuracy 0.8516 (0.8364)
Epoch 49 Step 300/391 Loss 0.6127 (0.4691) Accuracy 0.8125 (0.8347)
Elapsed 70.78
Test 49
Epoch 49 Loss 0.7619 Accuracy 0.7473
Elapsed 4.45
Train 50
Epoch 50 Step 0/391 Loss 0.5566 (0.5566) Accuracy 0.8125 (0.8125)
Epoch 50 Step 100/391 Loss 0.4960 (0.4637) Accuracy 0.8125 (0.8366)
Epoch 50 Step 200/391 Loss 0.5557 (0.4681) Accuracy 0.8281 (0.8358)
Epoch 50 Step 300/391 Loss 0.4382 (0.4676) Accuracy 0.8359 (0.8363)
Elapsed 70.96
Test 50
Epoch 

In [10]:
# RUN 1: best_acc 0.8519 - NO AUG
# RUN 2: best_acc 0.8773 - RandomCrop, HorizontalFlip
# RUN 3: best_acc 0.7431 - RandAug(N=2,M=5), RandomCrop, HorizontalFlip 
# RUN 4: best_acc 0.6213 - RandAug(N=3,M=7), RandomCrop, HorizontalFlip 
# RUN 5: best_acc 0.7890 - RandAug(N=2,M=3), RandomCrop, HorizontalFlip 
# RUN 6: best_acc 0.7975 - RandAug(N=2,M=3)
# RUN 7: best_acc 0.7853 - RandAug(N=2,M=3)
# RUN 8: best_acc 0.7632 - RandAug(N=3,M=2)
# RUN 9: best_acc 0.7655 - RandAug(N=3,M=2)

### Evaluation

In [None]:
correct = 0
total = 0
# since we're not training, we don't need to calculate the gradients for our outputs
with torch.no_grad():
    for batch in testloader:
        data, targets = batch
        data = data.to(device)
        targets = targets.to(device)
        # calculate outputs by running images through the network
        outputs = model(data)
        # the class with the highest energy is what we choose as prediction
        _, predicted = torch.max(outputs.data, 1)
        total += targets.size(0)
        correct += (predicted == targets).sum().item()

print('Accuracy of the network on the 10000 test images: %d %%' % (
    100 * correct / total))

In [None]:
# prepare to count predictions for each class
correct_pred = {classname: 0 for classname in classes}
total_pred = {classname: 0 for classname in classes}

# again no gradients needed
with torch.no_grad():
    for batch in testloader:
        data, targets = batch
        data = data.to(device)
        targets = targets.to(device)
        outputs = model(data)
        _, predictions = torch.max(outputs, 1)
        # collect the correct predictions for each class
        for label, prediction in zip(targets, predictions):
            if label == prediction:
                correct_pred[classes[label]] += 1
            total_pred[classes[label]] += 1


# print accuracy for each class
for classname, correct_count in correct_pred.items():
    accuracy = 100 * float(correct_count) / total_pred[classname]
    print("Accuracy for class {:5s} is: {:.1f} %".format(classname,
                                                   accuracy))