In [1]:
from math import pi
from math import cos

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms

from torch.utils.data import DataLoader, random_split

import visdom
import numpy as np

from collections import Counter

import pickle

import copy


In [2]:
# Run on the GPU whenever PyTorch can see one.
cuda = torch.cuda.is_available()
batch_size = 64

# Preprocessing: convert each image to a tensor, then normalise every RGB
# channel with mean 0.5 and std 0.5.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

train = datasets.CIFAR10('data/cifar10', train=True, download=True,
                         transform=transform)
test = datasets.CIFAR10('data/cifar10', train=False, transform=transform)

# Keep a fraction p of the official training set for training and hold the
# remainder out for validation.
p = 0.9
train_size = int(p * len(train))
val_size = len(train) - train_size

# Seed before splitting so the train/validation partition is reproducible.
torch.manual_seed(0)
train, val = random_split(train, [train_size, val_size])

# One shuffling loader per split, all sharing the same batch size.
train_loader, val_loader, test_loader = (
    DataLoader(split, batch_size=batch_size, shuffle=True)
    for split in (train, val, test)
)

# Phase-keyed lookups used by the training loops that track validation loss.
data_loaders = {"train": train_loader, "val": val_loader}
data_lengths = {"train": train_size, "val": val_size}


Files already downloaded and verified


In [3]:
class Net(nn.Module):
    """Small CNN mapping 3-channel images to log-probabilities over 10 classes.

    Three convolution stages, each followed by 2x2 max-pooling and ReLU (the
    second and third with batch normalisation before pooling), feed a
    two-layer dense head. The flattened feature size is hard-coded to
    ``4 * 64``, which corresponds to 32x32 inputs.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=5, stride=1)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3)
        self.conv2_bn = nn.BatchNorm2d(64)
        self.conv3 = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3)
        self.conv3_bn = nn.BatchNorm2d(64)
        self.dense1 = nn.Linear(4 * 64, 128)
        self.dense1_bn = nn.BatchNorm1d(128)
        self.dense2 = nn.Linear(in_features=128, out_features=10)

    def forward(self, x):
        # Stage 1 has no batch norm: conv -> pool -> ReLU.
        x = F.relu(F.max_pool2d(self.conv1(x), 2))

        # Stages 2 and 3 share the same shape: conv -> BN -> pool -> ReLU.
        normalised_stages = (
            (self.conv2, self.conv2_bn),
            (self.conv3, self.conv3_bn),
        )
        for conv, norm in normalised_stages:
            x = F.relu(F.max_pool2d(norm(conv(x)), 2))

        # Flatten to (N, 256) for the dense head.
        x = x.view(-1, 4 * 64)
        hidden = F.relu(self.dense1_bn(self.dense1(x)))
        logits = self.dense2(hidden)
        # NLL loss expects log-probabilities, hence log_softmax.
        return F.log_softmax(logits, dim=1)


In [4]:
def proposed_lr(initial_lr, iteration, epoch_per_cycle):
    """Cosine-annealed learning rate for one snapshot-ensemble cycle.

    Returns ``initial_lr`` when ``iteration`` is 0 and decays along half a
    cosine wave towards 0 as ``iteration`` approaches ``epoch_per_cycle``.
    ``iteration`` may be fractional (epoch index plus progress within it).
    """
    cosine = cos(pi * iteration / epoch_per_cycle)
    return initial_lr * (cosine + 1) / 2


In [5]:
def proposed_lr_new(initial_lr, minimum_lr, iteration, epoch_per_cycle):
    """Cosine-annealed learning rate with a non-zero floor.

    Starts at ``initial_lr`` for ``iteration == 0`` and decays along half a
    cosine wave towards ``minimum_lr`` as ``iteration`` approaches
    ``epoch_per_cycle``. ``iteration`` may be fractional.
    """
    cosine = cos(pi * iteration / epoch_per_cycle)
    return minimum_lr + (initial_lr - minimum_lr) * (cosine + 1) / 2


In [6]:
def train_se(model, epochs, cycles, initial_lr, vis=None):
    """Train ``model`` with a cyclic cosine learning rate, snapshotting each cycle.

    An iteration is one forward/backward pass over a single batch; an epoch is
    one pass over every batch of the notebook-level ``train_loader``. The
    learning rate is recomputed before every batch via ``proposed_lr``, so it
    decays within a cycle and returns to ``initial_lr`` when the next cycle
    starts.

    Args:
        model: network optimised in place with plain SGD.
        epochs: total epoch budget; each cycle runs ``epochs // cycles``
            epochs, so a remainder is not trained.
        cycles: number of learning-rate cycles, i.e. number of snapshots.
        initial_lr: learning rate at the start of every cycle.
        vis: optional plotting client exposing ``line`` (a ``visdom.Visdom``
            in this notebook); curves are redrawn on every 5th epoch of a
            cycle.

    Returns:
        List of deep-copied ``state_dict`` objects, one per cycle, in
        training order.
    """
    snapshots = []
    _lr_list, _loss_list = [], []
    count = 0
    epochs_per_cycle = epochs // cycles
    optimizer = optim.SGD(model.parameters(), lr=initial_lr)
    n_batches = len(train_loader)

    # Do not rely on the caller for the mode: a model left in eval mode would
    # otherwise be trained with frozen BatchNorm statistics.
    model.train()

    for _ in range(cycles):

        for j in range(epochs_per_cycle):
            _epoch_loss = 0

            for batch_idx, (data, target) in enumerate(train_loader):
                # Fractional epoch position inside the current cycle.
                lr = proposed_lr(initial_lr, j + batch_idx / n_batches,
                                 epochs_per_cycle)
                optimizer.param_groups[0]['lr'] = lr
                if cuda:
                    data, target = data.cuda(), target.cuda()

                optimizer.zero_grad()
                output = model(data)
                loss = F.nll_loss(output, target)
                # Running mean of the batch losses, kept out of the graph.
                _epoch_loss += loss.detach() / n_batches
                loss.backward()
                optimizer.step()

            _lr_list.append(optimizer.param_groups[0]['lr'])
            _loss_list.append(_epoch_loss.cpu())
            count += 1

            if vis is not None and j % 5 == 4:
                vis.line(np.array(_lr_list), np.arange(count), win="lr",
                         opts=dict(title="learning rate",
                                   xlabel="epochs",
                                   ylabel="learning rate (s.e.)"))
                vis.line(np.array(_loss_list), np.arange(count),  win="loss",
                         opts=dict(title="loss",
                                   xlabel="epochs",
                                   ylabel="training loss (s.e.)"))
                vis.line(np.array(_loss_list), np.arange(count),  win="loss (log)",
                         opts=dict(title="loss (log)",
                                   xlabel="epochs",
                                   ylabel="training loss (s.e.)",
                                   ytype="log"))

        # End of cycle: freeze a copy of the weights for the ensemble.
        snapshots.append(copy.deepcopy(model.state_dict()))
    return snapshots


In [7]:
def train_se_per_epoch(model, epochs, cycles, initial_lr, vis=None):
    """Snapshot-ensemble training with the learning rate updated once per epoch.

    Same loop as the per-batch variant, except that ``proposed_lr`` is
    evaluated with the integer epoch index at the start of each epoch and the
    resulting rate is held for every batch of the notebook-level
    ``train_loader``.

    Args:
        model: network optimised in place with plain SGD.
        epochs: total epoch budget; each cycle runs ``epochs // cycles``
            epochs, so a remainder is not trained.
        cycles: number of learning-rate cycles, i.e. number of snapshots.
        initial_lr: learning rate at the start of every cycle.
        vis: optional plotting client exposing ``line``; curves are redrawn
            on every 5th epoch of a cycle.

    Returns:
        List of deep-copied ``state_dict`` objects, one per cycle, in
        training order.
    """
    snapshots = []
    _lr_list, _loss_list = [], []
    count = 0
    epochs_per_cycle = epochs // cycles
    optimizer = optim.SGD(model.parameters(), lr=initial_lr)
    n_batches = len(train_loader)

    # Do not rely on the caller for the mode: a model left in eval mode would
    # otherwise be trained with frozen BatchNorm statistics.
    model.train()

    for _ in range(cycles):

        for j in range(epochs_per_cycle):
            _epoch_loss = 0

            # One learning-rate step per epoch.
            lr = proposed_lr(initial_lr, j, epochs_per_cycle)
            optimizer.param_groups[0]['lr'] = lr

            for data, target in train_loader:
                if cuda:
                    data, target = data.cuda(), target.cuda()

                optimizer.zero_grad()
                output = model(data)
                loss = F.nll_loss(output, target)
                # Running mean of the batch losses, kept out of the graph.
                _epoch_loss += loss.detach() / n_batches
                loss.backward()
                optimizer.step()

            _lr_list.append(optimizer.param_groups[0]['lr'])
            _loss_list.append(_epoch_loss.cpu())
            count += 1

            if vis is not None and j % 5 == 4:
                vis.line(np.array(_lr_list), np.arange(count), win="lr (epoch)",
                         opts=dict(title="learning rate",
                                   xlabel="epochs",
                                   ylabel="learning rate (s.e.)"))
                vis.line(np.array(_loss_list), np.arange(count),  win="loss (epoch)",
                         opts=dict(title="loss",
                                   xlabel="epochs",
                                   ylabel="training loss (s.e.)"))
                vis.line(np.array(_loss_list), np.arange(count),  win="loss (log) (epoch)",
                         opts=dict(title="loss (log)",
                                   xlabel="epochs",
                                   ylabel="training loss (s.e.)",
                                   ytype="log"))

        # End of cycle: freeze a copy of the weights for the ensemble.
        snapshots.append(copy.deepcopy(model.state_dict()))
    return snapshots


In [8]:
def train_se_valloss_per_epoch(model, epochs, cycles, initial_lr, vis=None):
    """Snapshot-ensemble training (per-epoch LR) that also tracks validation loss.

    Every epoch runs a training pass followed by a validation pass over the
    notebook-level ``data_loaders`` ("train" / "val"). The learning rate is
    set once per epoch via ``proposed_lr``. Validation runs in eval mode with
    autograd disabled and never updates the weights.

    Args:
        model: network optimised in place with plain SGD.
        epochs: total epoch budget; each cycle runs ``epochs // cycles``
            epochs, so a remainder is not trained.
        cycles: number of learning-rate cycles, i.e. number of snapshots.
        initial_lr: learning rate at the start of every cycle.
        vis: optional plotting client exposing ``line``; curves are redrawn
            on every 5th epoch of a cycle, with training and validation loss
            as two named traces in the same window.

    Returns:
        Tuple ``(snapshots, loss_snapshots, valloss_snapshots)``: the
        per-cycle ``state_dict`` copies and the training / validation loss of
        the last epoch of each cycle.
    """
    snapshots = []
    loss_snapshots = []
    valloss_snapshots = []
    _lr_list, _loss_list, _valloss_list = [], [], []
    count = 0
    epochs_per_cycle = epochs // cycles
    optimizer = optim.SGD(model.parameters(), lr=initial_lr)

    for _ in range(cycles):

        for j in range(epochs_per_cycle):

            for phase in ['train', 'val']:
                is_train = phase == 'train'
                if is_train:
                    lr = proposed_lr(initial_lr, j, epochs_per_cycle)
                    optimizer.param_groups[0]['lr'] = lr
                # Training mode for 'train', evaluation mode for 'val'.
                model.train(is_train)

                _epoch_loss = 0
                loader = data_loaders[phase]

                # No graph is needed for validation; skipping it saves memory.
                with torch.set_grad_enabled(is_train):
                    for data, target in loader:
                        if cuda:
                            data, target = data.cuda(), target.cuda()

                        output = model(data)
                        loss = F.nll_loss(output, target)
                        _epoch_loss += loss.detach() / len(loader)

                        if is_train:
                            optimizer.zero_grad()
                            loss.backward()
                            optimizer.step()

                if is_train:
                    _lr_list.append(optimizer.param_groups[0]['lr'])
                    _loss_list.append(_epoch_loss.cpu())
                    count += 1
                else:
                    _valloss_list.append(_epoch_loss.cpu())

            if vis is not None and j % 5 == 4:
                epoch_axis = np.arange(count)
                vis.line(np.array(_lr_list), epoch_axis, win="lr (epoch)",
                         opts=dict(title="learning rate",
                                   xlabel="epochs",
                                   ylabel="learning rate (s.e.)"))
                # The training trace redraws the window; the validation trace
                # is then appended under its own name. Previously the second
                # call replaced the window, leaving only the validation curve.
                vis.line(np.array(_loss_list), epoch_axis, win="loss (epoch)", name="loss",
                         opts=dict(title="loss",
                                   xlabel="epochs",
                                   ylabel="loss (s.e.)"))
                vis.line(np.array(_valloss_list), epoch_axis, win="loss (epoch)",
                         update='append', name="valloss")
                vis.line(np.array(_loss_list), epoch_axis, win="loss (log) (epoch)", name="loss",
                         opts=dict(title="loss (log)",
                                   xlabel="epochs",
                                   ylabel="loss (s.e.)",
                                   ytype="log"))
                vis.line(np.array(_valloss_list), epoch_axis, win="loss (log) (epoch)",
                         update='append', name="valloss")

        # End of cycle: freeze the weights and the last-epoch losses.
        snapshots.append(copy.deepcopy(model.state_dict()))
        loss_snapshots.append(_loss_list[-1])
        valloss_snapshots.append(_valloss_list[-1])
    return snapshots, loss_snapshots, valloss_snapshots


In [26]:
def train_se_valloss(model, epochs, cycles, initial_lr, vis=None):
    """Snapshot-ensemble training (per-batch LR) that also tracks validation loss.

    Every epoch runs a training pass followed by a validation pass over the
    notebook-level ``data_loaders`` ("train" / "val"). During training the
    learning rate is recomputed before every batch via ``proposed_lr``.
    Validation runs in eval mode with autograd disabled and never updates the
    weights.

    Args:
        model: network optimised in place with plain SGD.
        epochs: total epoch budget; each cycle runs ``epochs // cycles``
            epochs, so a remainder is not trained.
        cycles: number of learning-rate cycles, i.e. number of snapshots.
        initial_lr: learning rate at the start of every cycle.
        vis: optional plotting client exposing ``line``; curves are redrawn
            on every 5th epoch of a cycle.

    Returns:
        Tuple ``(snapshots, loss_snapshots, valloss_snapshots)``: the
        per-cycle ``state_dict`` copies and the training / validation loss of
        the last epoch of each cycle.
    """
    snapshots = []
    loss_snapshots = []
    valloss_snapshots = []
    _lr_list, _loss_list, _valloss_list = [], [], []
    count = 0
    epochs_per_cycle = epochs // cycles
    optimizer = optim.SGD(model.parameters(), lr=initial_lr)

    for _ in range(cycles):

        for j in range(epochs_per_cycle):

            for phase in ['train', 'val']:
                is_train = phase == 'train'
                # Training mode for 'train', evaluation mode for 'val'.
                model.train(is_train)

                _epoch_loss = 0
                loader = data_loaders[phase]
                n_batches = len(loader)

                # No graph is needed for validation; skipping it saves memory.
                with torch.set_grad_enabled(is_train):
                    for batch_idx, (data, target) in enumerate(loader):
                        if is_train:
                            # Fractional epoch position inside the cycle.
                            lr = proposed_lr(initial_lr, j + batch_idx / n_batches,
                                             epochs_per_cycle)
                            optimizer.param_groups[0]['lr'] = lr
                        if cuda:
                            data, target = data.cuda(), target.cuda()

                        output = model(data)
                        loss = F.nll_loss(output, target)
                        _epoch_loss += loss.detach() / n_batches

                        if is_train:
                            optimizer.zero_grad()
                            loss.backward()
                            optimizer.step()

                if is_train:
                    _lr_list.append(optimizer.param_groups[0]['lr'])
                    _loss_list.append(_epoch_loss.cpu())
                    count += 1
                else:
                    _valloss_list.append(_epoch_loss.cpu())

            if vis is not None and j % 5 == 4:
                vis.line(np.array(_lr_list), np.arange(count), win="lr",
                        opts=dict(title="learning rate",
                                xlabel="epochs",
                                ylabel="learning rate (s.e.)"))
                # The training trace redraws the window; the validation trace
                # is appended to it under its own name.
                vis.line(np.array(_loss_list), np.arange(count),  win="loss", name="loss",
                         opts=dict(title="loss",
                                   xlabel="epochs",
                                   ylabel="loss (s.e.)"))
                vis.line(np.array(_valloss_list), np.arange(count),  win="loss", update='append', name="valloss")
                vis.line(np.array(_loss_list), np.arange(count),  win="loss (log)", name="loss",
                        opts=dict(title="loss (log)",
                                xlabel="epochs",
                                ylabel="loss (s.e.)",
                                ytype="log"))
                vis.line(np.array(_valloss_list), np.arange(count),  win="loss (log)", update='append', name="valloss")

        # End of cycle: freeze the weights and the last-epoch losses.
        snapshots.append(copy.deepcopy(model.state_dict()))
        loss_snapshots.append(_loss_list[-1])
        valloss_snapshots.append(_valloss_list[-1])
    return snapshots, loss_snapshots, valloss_snapshots


In [20]:
# Scratch check of visdom's append mode: two independently drawn random
# traces, named '6' and '11', are sent to the same "test" window.
# NOTE(review): `vis` is only created in the experiment cell further down, so
# on a fresh kernel this cell fails unless that cell has been run first.
vis.line(
    X=np.arange(1, 38),
    Y=np.random.randn(37),
    win="test",
    name='6',
    update='append',
)

# Second trace appended to the same window under a different name.
vis.line(
    X=np.arange(1, 38),
    Y=np.random.randn(37),
    win="test",
    name='11',
    update='append',
)


'test'

In [9]:
def train_se_loss(model, epochs, cycles, initial_lr, vis=None):
    """Snapshot-ensemble training that also records each snapshot's training loss.

    An iteration is one forward/backward pass over a single batch; an epoch is
    one pass over every batch of the notebook-level ``train_loader``. The
    learning rate is recomputed before every batch via ``proposed_lr``.

    Args:
        model: network optimised in place with plain SGD.
        epochs: total epoch budget; each cycle runs ``epochs // cycles``
            epochs, so a remainder is not trained.
        cycles: number of learning-rate cycles, i.e. number of snapshots.
        initial_lr: learning rate at the start of every cycle.
        vis: optional plotting client exposing ``line``; curves are redrawn
            on every 5th epoch of a cycle.

    Returns:
        Tuple ``(snapshots, loss_snapshots)``: the per-cycle ``state_dict``
        copies and the mean training loss of the last epoch of each cycle.
    """
    snapshots = []
    loss_snapshots = []
    _lr_list, _loss_list = [], []
    count = 0
    epochs_per_cycle = epochs // cycles
    optimizer = optim.SGD(model.parameters(), lr=initial_lr)
    n_batches = len(train_loader)

    # Do not rely on the caller for the mode: a model left in eval mode would
    # otherwise be trained with frozen BatchNorm statistics.
    model.train()

    for _ in range(cycles):

        for j in range(epochs_per_cycle):
            _epoch_loss = 0

            for batch_idx, (data, target) in enumerate(train_loader):
                # Fractional epoch position inside the current cycle.
                lr = proposed_lr(initial_lr, j + batch_idx / n_batches,
                                 epochs_per_cycle)
                optimizer.param_groups[0]['lr'] = lr
                if cuda:
                    data, target = data.cuda(), target.cuda()

                optimizer.zero_grad()
                output = model(data)
                loss = F.nll_loss(output, target)
                # Running mean of the batch losses, kept out of the graph.
                _epoch_loss += loss.detach() / n_batches
                loss.backward()
                optimizer.step()

            _lr_list.append(optimizer.param_groups[0]['lr'])
            _loss_list.append(_epoch_loss.cpu())
            count += 1

            if vis is not None and j % 5 == 4:
                vis.line(np.array(_lr_list), np.arange(count), win="lr",
                         opts=dict(title="learning rate",
                                   xlabel="epochs",
                                   ylabel="learning rate (s.e.)"))
                vis.line(np.array(_loss_list), np.arange(count),  win="loss",
                         opts=dict(title="loss",
                                   xlabel="epochs",
                                   ylabel="training loss (s.e.)"))
                vis.line(np.array(_loss_list), np.arange(count),  win="loss (log)",
                         opts=dict(title="loss (log)",
                                   xlabel="epochs",
                                   ylabel="training loss (s.e.)",
                                   ytype="log"))

        # End of cycle: freeze the weights and the last-epoch training loss.
        snapshots.append(copy.deepcopy(model.state_dict()))
        loss_snapshots.append(_loss_list[-1])
    return snapshots, loss_snapshots


In [10]:
def train_se_loss_lr(model, epochs, cycles, initial_lr, burnin = 1, vis=None):
    """Snapshot-ensemble training with burn-in cycles and a floored cosine LR.

    For the first ``burnin`` cycles the learning rate is not modified, so it
    stays at the optimizer's ``initial_lr``. From then on it is recomputed
    before every batch via ``proposed_lr_new``, annealing from ``initial_lr``
    towards ``initial_lr / 100`` within each cycle. Batches come from the
    notebook-level ``train_loader``.

    Args:
        model: network optimised in place with plain SGD.
        epochs: total epoch budget; each cycle runs ``epochs // cycles``
            epochs, so a remainder is not trained.
        cycles: number of cycles, i.e. number of snapshots.
        initial_lr: learning rate during burn-in and at the start of every
            annealed cycle.
        burnin: number of leading cycles trained at constant learning rate.
        vis: optional plotting client exposing ``line``; curves are redrawn
            on every 5th epoch of a cycle.

    Returns:
        Tuple ``(snapshots, loss_snapshots)``: the per-cycle ``state_dict``
        copies and the mean training loss of the last epoch of each cycle.
    """
    snapshots = []
    loss_snapshots = []
    _lr_list, _loss_list = [], []
    count = 0
    epochs_per_cycle = epochs // cycles
    optimizer = optim.SGD(model.parameters(), lr=initial_lr)
    n_batches = len(train_loader)

    # Do not rely on the caller for the mode: a model left in eval mode would
    # otherwise be trained with frozen BatchNorm statistics.
    model.train()

    for i in range(cycles):
        # Annealing only starts once the burn-in cycles are over.
        anneal = i > burnin - 1

        for j in range(epochs_per_cycle):
            _epoch_loss = 0

            for batch_idx, (data, target) in enumerate(train_loader):
                if anneal:
                    lr = proposed_lr_new(
                        initial_lr, initial_lr/100, j + batch_idx / n_batches, epochs_per_cycle)
                    optimizer.param_groups[0]['lr'] = lr
                if cuda:
                    data, target = data.cuda(), target.cuda()

                optimizer.zero_grad()
                output = model(data)
                loss = F.nll_loss(output, target)
                # Running mean of the batch losses, kept out of the graph.
                _epoch_loss += loss.detach() / n_batches
                loss.backward()
                optimizer.step()

            _lr_list.append(optimizer.param_groups[0]['lr'])
            _loss_list.append(_epoch_loss.cpu())
            count += 1

            if vis is not None and j % 5 == 4:
                vis.line(np.array(_lr_list), np.arange(count), win="lr (new update)",
                         opts=dict(title="learning rate",
                                   xlabel="epochs",
                                   ylabel="learning rate (s.e.)"))
                vis.line(np.array(_loss_list), np.arange(count),  win="loss (new update)",
                         opts=dict(title="loss",
                                   xlabel="epochs",
                                   ylabel="training loss (s.e.)"))
                vis.line(np.array(_loss_list), np.arange(count),  win="loss (log) (new update)",
                         opts=dict(title="loss (log)",
                                   xlabel="epochs",
                                   ylabel="training loss (s.e.)",
                                   ytype="log"))

        # End of cycle: freeze the weights and the last-epoch training loss.
        snapshots.append(copy.deepcopy(model.state_dict()))
        loss_snapshots.append(_loss_list[-1])
    return snapshots, loss_snapshots


In [9]:
def test_se(Model, snapshots, use_model_num):
    """Evaluate an ensemble of the most recent snapshots on ``test_loader``.

    The last ``use_model_num`` snapshots are loaded into fresh ``Model()``
    instances, their outputs are averaged per sample, and the averaged output
    is scored with NLL loss and arg-max accuracy. With ``Net`` the outputs are
    log-probabilities, so the average is taken in log space.

    Args:
        Model: zero-argument callable building a network compatible with the
            stored state dicts.
        snapshots: list of ``state_dict`` objects in training order.
        use_model_num: how many of the latest snapshots to ensemble; values
            larger than ``len(snapshots)`` use every snapshot.

    Returns:
        Mean per-batch NLL loss as a tensor (on the evaluation device).
    """
    # Clamp at 0: a negative start index would slice from the end and
    # silently ensemble fewer models than are available.
    index = max(len(snapshots) - use_model_num, 0)
    snapshots = snapshots[index:]
    model_list = [Model() for _ in snapshots]

    for model, weight in zip(model_list, snapshots):
        model.load_state_dict(weight)
        model.eval()
        if cuda:
            model.cuda()

    test_loss = 0
    correct = 0
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for data, target in test_loader:
            if cuda:
                data, target = data.cuda(), target.cuda()
            # (models, batch, classes) -> (batch, classes). Reducing over
            # dim 0 alone keeps the batch axis even for a batch of size 1,
            # which the former trailing squeeze() would have dropped.
            output = torch.stack([model(data) for model in model_list]).mean(0)
            test_loss += F.nll_loss(output, target)
            pred = output.max(1)[1]
            # Count as a Python int so the percentage below uses true division.
            correct += pred.eq(target).sum().item()

    test_loss /= len(test_loader)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100 * correct / len(test_loader.dataset)))

    return test_loss


In [7]:
def test_se_loss(Model, snapshots, loss_snapshots, use_model_num):
    """Evaluate an ensemble of the snapshots with the smallest recorded loss.

    The ``use_model_num`` snapshots whose entry in ``loss_snapshots`` is
    lowest are loaded into fresh ``Model()`` instances; their outputs are
    averaged per sample and scored on ``test_loader`` with NLL loss and
    arg-max accuracy.

    Args:
        Model: zero-argument callable building a network compatible with the
            stored state dicts.
        snapshots: list of ``state_dict`` objects.
        loss_snapshots: one loss per snapshot (floats or 0-dim tensors), in
            the same order as ``snapshots``.
        use_model_num: number of lowest-loss snapshots to ensemble; values
            larger than the number of snapshots use every snapshot.

    Returns:
        Mean per-batch NLL loss as a tensor (on the evaluation device).
    """
    # use the models with least losses
    loss_snapshots = np.array([float(item) for item in loss_snapshots])
    # argsort + slice selects the same models as argpartition, but stays valid
    # when use_model_num equals the number of snapshots, where
    # argpartition(kth=use_model_num) raises "kth out of bounds".
    index = np.argsort(loss_snapshots)[:use_model_num]
    snapshots = [snapshots[i] for i in index]
    model_list = [Model() for _ in snapshots]

    for model, weight in zip(model_list, snapshots):
        model.load_state_dict(weight)
        model.eval()
        if cuda:
            model.cuda()

    test_loss = 0
    correct = 0
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for data, target in test_loader:
            if cuda:
                data, target = data.cuda(), target.cuda()
            # (models, batch, classes) -> (batch, classes). Reducing over
            # dim 0 alone keeps the batch axis even for a batch of size 1,
            # which the former trailing squeeze() would have dropped.
            output = torch.stack([model(data) for model in model_list]).mean(0)
            test_loss += F.nll_loss(output, target)
            pred = output.max(1)[1]
            # Count as a Python int so the percentage below uses true division.
            correct += pred.eq(target).sum().item()

    test_loss /= len(test_loader)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100 * correct / len(test_loader.dataset)))

    return test_loss


In [8]:
def test_se_vote(Model, snapshots, use_model_num):
    """Evaluate the latest snapshots with majority voting on ``test_loader``.

    The last ``use_model_num`` snapshots are loaded into fresh ``Model()``
    instances. The reported loss is the NLL of the averaged model outputs,
    while accuracy uses the most frequent arg-max class across the models for
    each sample.

    Args:
        Model: zero-argument callable building a network compatible with the
            stored state dicts.
        snapshots: list of ``state_dict`` objects in training order.
        use_model_num: how many of the latest snapshots to ensemble; values
            larger than ``len(snapshots)`` use every snapshot.

    Returns:
        Mean per-batch NLL loss as a tensor (on the evaluation device).
    """
    # Clamp at 0: a negative start index would slice from the end and
    # silently ensemble fewer models than are available.
    index = max(len(snapshots) - use_model_num, 0)
    snapshots = snapshots[index:]
    model_list = [Model() for _ in snapshots]

    for model, weight in zip(model_list, snapshots):
        model.load_state_dict(weight)
        model.eval()
        if cuda:
            model.cuda()

    test_loss = 0
    correct = 0
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for data, target in test_loader:
            if cuda:
                data, target = data.cuda(), target.cuda()
            # Stacked outputs: (models, batch, classes).
            outputs = torch.stack([model(data) for model in model_list])
            # Reducing over dim 0 alone keeps the batch axis even for a batch
            # of size 1, which the former trailing squeeze() would have dropped.
            output = outputs.mean(0)
            test_loss += F.nll_loss(output, target)
            # Per-model predicted class: (models, batch).
            pred_list = outputs.max(2)[1]
            pred = torch.mode(pred_list, 0)[0]  # most vote
            # Count as a Python int so the percentage below uses true division.
            correct += pred.eq(target).sum().item()

    test_loss /= len(test_loader)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100 * correct / len(test_loader.dataset)))

    return test_loss


In [14]:
def train_normal(model, epochs, vis=None):
    """Baseline training: Adam with default settings, no learning-rate cycling.

    Runs ``epochs`` passes over the notebook-level ``train_loader`` and
    optimises ``model`` in place with NLL loss.

    Args:
        model: network to train.
        epochs: number of passes over the training data.
        vis: optional plotting client exposing ``line``; curves are redrawn
            on every 5th epoch.

    Returns:
        The same ``model`` object, after training.
    """
    optimizer = optim.Adam(model.parameters())
    _lr_list, _loss_list = [], []
    n_batches = len(train_loader)

    # Do not rely on the caller for the mode: a model left in eval mode would
    # otherwise be trained with frozen BatchNorm statistics.
    model.train()

    for epoch in range(epochs):
        _epoch_loss = 0
        for data, target in train_loader:
            if cuda:
                data, target = data.cuda(), target.cuda()

            optimizer.zero_grad()
            output = model(data)
            loss = F.nll_loss(output, target)
            # Running mean of the batch losses, kept out of the graph.
            _epoch_loss += loss.detach() / n_batches
            loss.backward()
            optimizer.step()

        _loss_list.append(_epoch_loss.cpu())
        _lr_list.append(optimizer.param_groups[0]['lr'])

        if vis is not None and epoch % 5 == 4:
            vis.line(np.array(_lr_list), np.arange(epoch+1), win="lr_n",
                     opts=dict(title="learning rate",
                               xlabel="epochs",
                               ylabel="learning rate (normal)"))
            vis.line(np.array(_loss_list), np.arange(epoch+1), win="loss_n",
                     opts=dict(title="loss",
                               xlabel="epochs",
                               ylabel="training loss (normal)"))
            vis.line(np.array(_loss_list), np.arange(epoch+1),  win="loss_n (log)",
                         opts=dict(title="loss (log)",
                                   xlabel="epochs",
                                   ylabel="training loss (normal)",
                                   ytype="log"))

    return model


In [15]:
def test_normal(model):
    """Evaluate a single model on the notebook-level ``test_loader``.

    The model is switched to evaluation mode for the duration of the call, so
    BatchNorm layers use their running statistics rather than being updated
    from test batches. Its previous mode is restored afterwards.

    Args:
        model: trained network whose output is scored with NLL loss.

    Returns:
        Mean per-batch NLL loss as a tensor (on the evaluation device).
    """
    test_loss = 0
    correct = 0

    was_training = model.training
    model.eval()
    try:
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            for data, target in test_loader:
                if cuda:
                    data, target = data.cuda(), target.cuda()
                output = model(data)
                test_loss += F.nll_loss(output, target)
                pred = output.max(1)[1]
                # Count as a Python int so the percentage uses true division.
                correct += pred.eq(target).sum().item()
    finally:
        # Leave the model in the mode the caller had it in.
        model.train(was_training)

    test_loss /= len(test_loader)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100 * correct / len(test_loader.dataset)))

    return test_loss


In [27]:
# Plotting client; a visdom server must be listening on this port.
vis = visdom.Visdom(port=6006)

model1 = Net()
if cuda:
    model1.cuda()

# Snapshot ensemble: 300 epochs split into 12 cycles, initial learning rate 0.2.
models1, losses1, vallosses1 = train_se_valloss(model1, 300, 12, 0.2, vis)

# Persist the snapshots and their losses. The context manager closes the file
# even if pickling fails. The "save/" directory must already exist.
with open("save/se-val-original-300-12-0.2-2.p", "wb") as snapshot_file:
    pickle.dump((models1, losses1, vallosses1), snapshot_file)



Setting up a new session...


In [None]:
# Compare four ways of ensembling 5 of the snapshots in `models1` on the
# test set.
# NOTE(review): the saved output below has no "min valloss" section, so it
# appears to predate this version of the cell; re-run to refresh it.

# Last 5 snapshots, averaged outputs.
print("snapshot ensemble (original)")
test_se(Net, models1, 5)
print("---")
# 5 snapshots with the lowest training loss.
print("snapshot ensemble (min loss)")
test_se_loss(Net, models1, losses1, 5)
print("---")
# 5 snapshots with the lowest validation loss.
print("snapshot ensemble (min valloss)")
test_se_loss(Net, models1, vallosses1, 5)
print("---")
# Last 5 snapshots, majority vote on the predicted class.
print("snapshot ensemble (vote)")
test_se_vote(Net, models1, 5)


snapshot ensemble (original)

Test set: Average loss: 2.1052, Accuracy: 7590/10000 (76%)

---
snapshot ensemble (min loss)

Test set: Average loss: 2.1061, Accuracy: 7582/10000 (76%)

---
snapshot ensemble (vote)

Test set: Average loss: 2.1148, Accuracy: 7574/10000 (76%)



tensor(2.1148, device='cuda:0')