In [10]:
import torch, torchvision
from torchvision import datasets, models, transforms
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Subset
import time
from torchsummary import summary
from torch.optim import lr_scheduler
import copy

import sys
import pathlib
import numpy as np
import matplotlib.pyplot as plt
import os

from PIL import Image
from collections import OrderedDict
import shutil 
# reference : https://github.com/automan000/CyclicLR_Scheduler_PyTorch
from CyclicLR_Scheduler_PyTorch.cyclic_lr_scheduler import CyclicLR
from Residual_Attention_Network.model.residual_attention_network import ResidualAttentionModel_92_32input_update as ResidualAttentionModel
import resnet_modified

In [11]:
# Load the Data
data_dir = './modified_data/tiny-imagenet-200'
num_classes = 200


# Create the training data generator
batch_size = 500
im_height = 64
im_width = 64
phases = ['train', 'val', 'test']
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")    
batch_size = 500

def load_data(batch_size=500):
    data_transforms = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0, 0, 0), tuple(np.sqrt((255, 255, 255)))),
    ])
    

    # Load Data from folders
    image_datasets = {
        'train': datasets.ImageFolder(os.path.join(data_dir, 'train'), transform=data_transforms),
        'val': datasets.ImageFolder(os.path.join(data_dir, 'val'), transform=data_transforms),
        'test': datasets.ImageFolder(os.path.join(data_dir, 'test'), transform=data_transforms)
    }

    # subset_indices = np.random.permutation(range(100))
    # dataloaders = {x: DataLoader(image_datasets[x], batch_size=batch_size, shuffle=False, 
    #                              sampler=SubsetRandomSampler(subset_indices)) for x in phases}

    dataloaders = {'train': DataLoader(image_datasets['train'], batch_size=batch_size, shuffle=True),
                  'val': DataLoader(image_datasets['val'], batch_size=batch_size, shuffle=True),
                  'test': DataLoader(image_datasets['test'], batch_size=batch_size, shuffle=False)}
    dataset_sizes = {x: len(image_datasets[x]) for x in phases}
    class_names = image_datasets['train'].classes
    return image_datasets, dataloaders, dataset_sizes, class_names

image_datasets, dataloaders, dataset_sizes, class_names = load_data()

print(dataset_sizes)

{'train': 100000, 'val': 10000, 'test': 10000}


In [12]:
def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    tr_acc, val_acc = [], []
    tr_loss, val_loss  = [], []
    
    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
#                 with torch.set_grad_enabled(phase == 'train'):
                if phase == 'train':
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)
                    # backward + optimize only if in training phase
                    loss.backward()
                    optimizer.step()
                else:
                    with torch.no_grad():
                        outputs = model(inputs)
                        _, preds = torch.max(outputs, 1)
                        loss = criterion(outputs, labels)
                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)
            if phase == 'train':
                scheduler.step()

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]
            if phase == 'train':
                tr_acc.append(epoch_acc)
                tr_loss.append(epoch_loss)
            elif phase == 'val':
                val_acc.append(epoch_acc)
                val_loss.append(epoch_loss)
                
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            # deep copy the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model, tr_acc, val_acc, tr_loss, val_loss

## Resnet base model

In [10]:
# Load the pretrained model
model = models.resnet18(pretrained=True)
# Freeze model parameters to train only the last layer. 
# Comment out this cell if you want to fine tune the whole network
for param in model.parameters():
    param.requires_grad = False

model.fc = nn.Linear(num_ftrs, num_classes)
model = model.to(device)
model.eval()

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [12]:
# Define Optimizer and Loss Function
criterion = nn.CrossEntropyLoss()
# learning_rates = [0.00001, 0.0001, 0.001]
optimizer_ft = optim.Adam(model.parameters(), lr=0.001)
# Decay LR by a factor of 0.1 every 5 epochs
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=5, gamma=0.1)
model, tr_acc, val_acc, tr_loss, val_loss = train_model(model, criterion, optimizer_ft, exp_lr_scheduler, num_epochs=10)

Epoch 0/9
----------
train Loss: 3.7194 Acc: 0.2345
val Loss: 3.1500 Acc: 0.3170

Epoch 1/9
----------
train Loss: 2.9385 Acc: 0.3467
val Loss: 3.0045 Acc: 0.3407

Epoch 2/9
----------
train Loss: 2.7716 Acc: 0.3736
val Loss: 2.9721 Acc: 0.3381

Epoch 3/9
----------
train Loss: 2.6766 Acc: 0.3886
val Loss: 2.9419 Acc: 0.3480

Epoch 4/9
----------
train Loss: 2.6167 Acc: 0.3986
val Loss: 2.9417 Acc: 0.3502

Epoch 5/9
----------
train Loss: 2.4794 Acc: 0.4265
val Loss: 2.9084 Acc: 0.3559

Epoch 6/9
----------
train Loss: 2.4716 Acc: 0.4283
val Loss: 2.9089 Acc: 0.3565

Epoch 7/9
----------
train Loss: 2.4670 Acc: 0.4300
val Loss: 2.9055 Acc: 0.3581

Epoch 8/9
----------
train Loss: 2.4624 Acc: 0.4305
val Loss: 2.9155 Acc: 0.3543

Epoch 9/9
----------
train Loss: 2.4564 Acc: 0.4319
val Loss: 2.9058 Acc: 0.3553

Training complete in 46m 10s
Best val Acc: 0.358100


In [14]:
torch.save(model, './models/resnet18_model2.pt')

## Ensemble

In [76]:
# Load the model and saved state_dict
vgg_model = models.vgg11_bn(pretrained=True)
num_ftrs = vgg_model.classifier[6].in_features
vgg_model.classifier[6] = nn.Linear(num_ftrs,num_classes)
vgg_model.load_state_dict(torch.load('./models/vgg11_bn_best_model_state_dict.pt'))
vgg_model.eval()

VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (4): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (5): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU(inplace=True)
    (7): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (8): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): ReLU(inplace=True)
    (11): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (12): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (13): ReLU(inplace=True)
    (14): MaxPool2d(ke

In [77]:
resnet_model = models.resnet18(pretrained=True)
num_ftrs = resnet_model.fc.in_features
resnet_model.fc = nn.Linear(num_ftrs, num_classes)
resnet_model.load_state_dict(torch.load('./models/resnet18_model_state_dict.pt'))
resnet_model.eval()

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [78]:
dense_model = torchvision.models.densenet169(pretrained=True)
num_ftrs = dense_model.classifier.in_features
dense_model.classifier = nn.Linear(num_ftrs, num_classes)
dense_model.load_state_dict(torch.load('./models/dense169_model_state_dict.pt'))
dense_model.eval()

DenseNet(
  (features): Sequential(
    (conv0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (norm0): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu0): ReLU(inplace=True)
    (pool0): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (denseblock1): _DenseBlock(
      (denselayer1): _DenseLayer(
        (norm1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu1): ReLU(inplace=True)
        (conv1): Conv2d(64, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu2): ReLU(inplace=True)
        (conv2): Conv2d(128, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      )
      (denselayer2): _DenseLayer(
        (norm1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu

In [79]:
for param in resnet_model.parameters():
    param.requires_grad = False
for param in vgg_model.parameters():
    param.requires_grad = False
for param in dense_model.parameters():
    param.requires_grad = False
# Load models onto GPU
resnet_model = resnet_model.to(device)
vgg_model = vgg_model.to(device)
dense_model = dense_model.to(device)

In [13]:
class Ensemble():
    def __init__(self, models):
        self.models = models
        self.loss = 0.0
        self.top5_acc = 0.0
        self.top1_acc = 0.0
        
    def get_num_corrects(self, output, target, topk=(1,)):
        res = []
        with torch.no_grad():
            maxk = max(topk)
            batch_size = target.size(0)

            _, pred = output.topk(maxk, 1, True, True)
            pred = pred.t()
            correct = pred.eq(target.view(1, -1).expand_as(pred))

            for k in topk:
                correct_k = correct[:k].view(-1).float().sum(0, keepdim=True)
                res.append(correct_k)
        return res
        
    def find_majority_vote(self, preds):
        maj_vote = torch.zeros(preds.shape[1])
        for i in range(preds.shape[1]):
            _, counts = np.unique(preds[:, i], return_counts=True)
            maj_vote[i] = preds[np.argmax(counts), i]
        maj_vote = maj_vote.to(device)
        return maj_vote
    
    def evaluate_testdata(self, inputs, mode='average'):
        inputs = inputs.to(device)
        phase = 'val'
        for m in self.models:
            m.eval()
        with torch.no_grad():
            if mode == 'average':
                # Take average of the output to make prediction
                outputs = torch.zeros(1, num_classes)
                for m in self.models:
                    outputs += m(inputs)
                outputs /= len(self.models)
                _, preds = torch.max(outputs, 1)
            else:
                # Majority vote
                loss = 0
                predictions = torch.zeros(len(self.models), inputs.shape[0])
                for i in range(len(self.models)):
                    outputs = self.models[i](inputs)
                    _, preds = torch.max(outputs, 1)
                    predictions[i, :] = preds
                    loss += criterion(outputs, labels)
                preds = self.find_majority_vote(predictions)
        return preds

                
    def evaluate_all(self, criterion, mode='average'):
        running_loss = 0.0
        running_corrects = 0
        running_corrects1 = 0
        running_corrects5 = 0
        phase = 'val'
        for m in self.models:
            m.eval()
            
        with torch.no_grad():
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)
#                 if isinstance(m, ResidualAttentionModel):
#                 inputs = torch.nn.functional.interpolate(inputs, (32, 32))
                if mode == 'average':
                    # Take average of the output to make prediction
                    outputs = torch.zeros(batch_size, num_classes)
                    for m in self.models:
                        outputs += m(inputs)
                    outputs /= len(self.models)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)
                else:
                    # Majority vote
                    loss = 0
                    predictions = torch.zeros(len(self.models), inputs.shape[0])
                    for i in range(len(self.models)):
                        outputs = self.models[i](inputs)
                        _, preds = torch.max(outputs, 1)
                        predictions[i, :] = preds
                        loss += criterion(outputs, labels)
                    preds = self.find_majority_vote(predictions)
                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)
                corr1, corr5 = self.get_num_corrects(outputs, labels, topk=(1, 5))
                running_corrects1 += corr1[0]
                running_corrects5 += corr5[0]
                
            self.loss = running_loss / dataset_sizes[phase]
            self.top1_acc = running_corrects1.double() / dataset_sizes[phase]
            self.top5_acc = running_corrects5.double() / dataset_sizes[phase]
        return self.top1_acc, self.top5_acc, self.loss

In [81]:
# Ensemble by averaging
criterion = nn.CrossEntropyLoss()
ensemble_solver = Ensemble([resnet_model, vgg_model])
val_acc, val_loss = ensemble_solver.evaluate_all(criterion)
print("averaging")
print("validation accuracy", val_acc)
print("validation loss", val_loss)
print()

# Ensemble by averaging
criterion = nn.CrossEntropyLoss()
ensemble_solver = Ensemble([resnet_model, dense_model])
val_acc, val_loss = ensemble_solver.evaluate_all(criterion)
print("averaging")
print("validation accuracy", val_acc)
print("validation loss", val_loss)
print()

# Ensemble by averaging
criterion = nn.CrossEntropyLoss()
ensemble_solver = Ensemble([vgg_model, dense_model])
val_acc, val_loss = ensemble_solver.evaluate_all(criterion)
print("averaging")
print("validation accuracy", val_acc)
print("validation loss", val_loss)
print()



# Ensemble by averaging
criterion = nn.CrossEntropyLoss()
ensemble_solver = Ensemble([resnet_model, dense_model, vgg_model])
val_acc, val_loss = ensemble_solver.evaluate_all(criterion)
print("averaging")
print("validation accuracy", val_acc)
print("validation loss", val_loss)
print()

# Ensemble by majority vote
criterion = nn.CrossEntropyLoss()
ensemble_solver = Ensemble([resnet_model, dense_model, vgg_model])
val_acc, val_loss = ensemble_solver.evaluate_all(criterion, 'majority vote')
print("majority vote")
print("validation accuracy", val_acc)
print("validation loss", val_loss)

averaging
validation accuracy tensor(0.5321, device='cuda:0', dtype=torch.float64)
validation loss 1.9333795607089996

averaging
validation accuracy tensor(0.5360, device='cuda:0', dtype=torch.float64)
validation loss 1.9644196271896361

averaging
validation accuracy tensor(0.5200, device='cuda:0', dtype=torch.float64)
validation loss 2.013554871082306

averaging
validation accuracy tensor(0.5697, device='cuda:0', dtype=torch.float64)
validation loss 1.7308927834033967

majority vote
validation accuracy tensor(0.4447, device='cuda:0', dtype=torch.float64)
validation loss 7.863580560684204


## Snapshot ensemble

In [133]:
# def evaluate_ensemble_model(snapshots):

def snapshot_training(model, criterion, optimizer, scheduler, num_epochs=25, step_size=5):
    since = time.time()

    best_acc = 0.0
    tr_acc, val_acc = [], []
    tr_loss, val_loss  = [], []
    num_snapshots = num_epochs // (step_size * 2)
    print("num_snapshots", num_snapshots)
    snapshots = []
    current_lr = scheduler.get_lr()[0]
    
    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)
        print("current lr", current_lr)
        
        # Check if we take snapshot        
        take_snapshot = current_lr == scheduler.base_lr
        if take_snapshot:
            print("at the bottom!")

            
        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            next_ind = 0
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                if phase == 'train':
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)
                    # backward + optimize only if in training phase
                    loss.backward()
                    optimizer.step()
                else:
                    with torch.no_grad():
                        outputs = model(inputs)
                        _, preds = torch.max(outputs, 1)
                        loss = criterion(outputs, labels)

                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)
                
            # Update scheduler
            if phase == 'train':
                scheduler.step()
                current_lr = scheduler.get_lr()[0]
                
            # Update loss and acuracy
            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]
            if phase == 'train':
                tr_acc.append(epoch_acc)
                tr_loss.append(epoch_loss)
            elif phase == 'val':
                val_acc.append(epoch_acc)
                val_loss.append(epoch_loss)
                
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))
        print()
        if take_snapshot:
            print("snapshot taken")
            snapshots.append(copy.deepcopy(model.state_dict()))


    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    return snapshots, tr_acc, val_acc, tr_loss, val_loss

In [17]:
# Load the pretrained model
resnet_model = models.resnet18(pretrained=True)
# for param in resnet_model.parameters():
#     param.requires_grad = False
num_ftrs = resnet_model.fc.in_features
resnet_model.fc = nn.Linear(num_ftrs, num_classes)
# resnet_model.load_state_dict(torch.load('./models/resnet18_model_state_dict.pt'))
resnet_model = resnet_model.to(device)
resnet_model.eval()

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [20]:


# Define Optimizer and Loss Function
criterion = nn.CrossEntropyLoss()
optimizer_ft = optim.SGD(resnet_model.parameters(), lr=0.01, momentum=0.9)
# cyc_lr_scheduler = lr_scheduler.CyclicLR(optimizer_ft, base_lr=1e-4, max_lr=1e-1, step_size_up=10)
cyc_lr_scheduler = CyclicLR(optimizer_ft, base_lr=0.001, max_lr=0.1, step_size=5, mode='exp_range')
# cyc_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=5, gamma=0.1)
snapshots, tr_acc, val_acc, tr_loss, val_loss = snapshot_training(resnet_model, criterion, 
                                                                     optimizer_ft, cyc_lr_scheduler, num_epochs=22, step_size=5)



num_snapshots 2
Epoch 0/21
----------
current lr 0.021
train Loss: 4.8489 Acc: 0.0791
val Loss: 4.0772 Acc: 0.1873

Epoch 1/21
----------
current lr 0.001
at the bottom!
train Loss: 3.5228 Acc: 0.2788
val Loss: 3.1350 Acc: 0.3258

snapshot taken
Epoch 2/21
----------
current lr 0.020602
train Loss: 2.2223 Acc: 0.4599
val Loss: 2.1512 Acc: 0.4756

Epoch 3/21
----------
current lr 0.03981195999999999
train Loss: 1.8626 Acc: 0.5294
val Loss: 2.4376 Acc: 0.4241

Epoch 4/21
----------
current lr 0.05863576060000001
train Loss: 1.7787 Acc: 0.5489
val Loss: 2.7045 Acc: 0.3882

Epoch 5/21
----------
current lr 0.07707920399200001
train Loss: 1.6978 Acc: 0.5671
val Loss: 3.1626 Acc: 0.3473

Epoch 6/21
----------
current lr 0.0951480149401
train Loss: 1.5795 Acc: 0.5944
val Loss: 2.9106 Acc: 0.3793

Epoch 7/21
----------
current lr 0.0755652278325592
train Loss: 0.9086 Acc: 0.7472
val Loss: 3.3432 Acc: 0.3691

Epoch 8/21
----------
current lr 0.05636468166567521
train Loss: 0.3558 Acc: 0.8958
va

In [12]:
sn_models = []
for i in range(len(snapshots)):
    model = models.resnet18(pretrained=True)
    for param in model.parameters():
        param.requires_grad = False
    num_ftrs = model.fc.in_features
    model.fc = nn.Linear(num_ftrs, num_classes)

#     model.load_state_dict(torch.load('./models/snapshots/resnet18_snapshot{}.pt'.format(i)))
    model.load_state_dict()
    model = model.to(device)
    sn_models.append(model)

In [13]:
# Ensemble by averaging
criterion = nn.CrossEntropyLoss()
ensemble_solver = Ensemble(sn_models)
val_acc, val_loss = ensemble_solver.evaluate_all(criterion)
val_acc

### Hyperparameter tuning for snapshot ensemble

(tune only the last layer)  
base_lr: 0.0001  
max_lr:0.1  
num of epochs: 22  
stepsize: 5  
num of snapshots: 2  
validation accuracy without snapshot ensemble: 35.2%    
validation accuracy with snapshot ensemble: 32.57%  


(fine tune the whole layers)  
base_lr: 0.001  
max_lr:0.1  
num of epochs: 22  
stepsize: 5  
num of snapshots: 2  
validation accuracy without snapshot ensemble: 53.3%   
validation accuracy :  53.1%

(fine tune the whole layers)   
base_lr: 0.001  
max_lr:0.1  
num of epochs: 28  
stepsize: 3  
num of snapshots: 4  
validation accuracy without snapshot ensemble: 53.3%   
validation accuracy :  53.51%  

(fine tune the whole layers)  
base_lr: 0.0001   
max_lr:0.1  
stepsize 10  
num of snapshots 3  
validation accuracy without snapshot ensemble: 53.3%   
validation accuracy : 52.59%    

(fine tune the whole layers)    
base_lr: 0.001  
max_lr:0.1  
stepsize 15  
num of snapshots 2  
validation accuracy without snapshot ensemble: 53.3%   
validation accuracy : 52.63 %

In [50]:
def evaluate_snapshots(model_list, snapshots, criterion):
    phase = 'val'

    for model, weight in zip(model_list, snapshots):
        model.load_state_dict(weight)
        model.eval()
        model.to(device)
       
    test_acc = 0.0
    test_loss = 0
    correct = 0
    for data, target in dataloaders[phase]:
        data = data.to(device)
        target = target.to(device)
        output_list = [model(data).unsqueeze(0) for model in model_list]
        output = torch.mean(torch.cat(output_list), 0).squeeze()
        test_loss += criterion(output, target)
        
        pred = output.data.max(1)[1]
        correct += torch.sum(pred == target.data)
        
    test_acc = correct.double() / dataset_sizes[phase]
    test_loss /= dataset_sizes[phase]
    print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, test_loss, test_acc))

    return test_loss, test_acc

In [57]:
model_list = []
for i in range(len(snapshots)):
    model = models.resnet18(pretrained=True)

    for param in model.parameters():
        param.requires_grad = False
    num_ftrs = model.fc.in_features
    model.fc = nn.Linear(num_ftrs, num_classes)
    model_list.append(model)
evaluate_snapshots(model_list, snapshots, nn.CrossEntropyLoss())

val Loss: 0.0060 Acc: 0.5263


(tensor(0.0060, device='cuda:0', grad_fn=<DivBackward0>),
 tensor(0.5263, device='cuda:0', dtype=torch.float64))

In [56]:
stepsizes = [10, 15, 20]
for st in stepsizes:
    # Define Optimizer and Loss Function
    criterion = nn.CrossEntropyLoss()
    optimizer_ft = optim.SGD(resnet_model.parameters(), lr=0.01, momentum=0.9)
    cyc_lr_scheduler = CyclicLR(optimizer_ft, base_lr=0.0001, max_lr=0.1, step_size=st, mode='exp_range')
    snapshots, tr_acc, val_acc, tr_loss, val_loss = snapshot_training(resnet_model, criterion, 
                                                                         optimizer_ft, cyc_lr_scheduler, num_epochs=44, step_size=st)
    print("*******************")
    print("stepsize", st)
    print("num of snapshots", len(snapshots))
    model_list = []
    for i in range(len(snapshots)):
        model = models.resnet18(pretrained=True)

        for param in model.parameters():
            param.requires_grad = False
        num_ftrs = model.fc.in_features
        model.fc = nn.Linear(num_ftrs, num_classes)
        model_list.append(model)
    evaluate_snapshots(model_list, snapshots, nn.CrossEntropyLoss())

num_snapshots 2
Epoch 0/43
----------
current lr 0.010190909090909089
train Loss: 0.0015 Acc: 0.9998
val Loss: 2.7897 Acc: 0.5253

Epoch 1/43
----------
current lr 0.0001
at the bottom!
train Loss: 0.0016 Acc: 0.9998
val Loss: 2.7920 Acc: 0.5239

snapshot taken
Epoch 2/43
----------
current lr 0.009990100000000009
train Loss: 0.0015 Acc: 0.9998
val Loss: 2.7890 Acc: 0.5239

Epoch 3/43
----------
current lr 0.019682397999999997
train Loss: 0.0014 Acc: 0.9998
val Loss: 2.7870 Acc: 0.5243

Epoch 4/43
----------
current lr 0.029179861030000004
train Loss: 0.0013 Acc: 0.9997
val Loss: 2.7911 Acc: 0.5252

Epoch 5/43
----------
current lr 0.0384854165596
train Loss: 0.0013 Acc: 0.9997
val Loss: 2.7998 Acc: 0.5256

Epoch 6/43
----------
current lr 0.047601952992505
train Loss: 0.0013 Acc: 0.9997
val Loss: 2.8097 Acc: 0.5247

Epoch 7/43
----------
current lr 0.05653232015509595
train Loss: 0.0013 Acc: 0.9997
val Loss: 2.8184 Acc: 0.5247

Epoch 8/43
----------
current lr 0.0652793297791358
train

train Loss: 0.0005 Acc: 0.9997
val Loss: 3.0246 Acc: 0.5255

Epoch 26/43
----------
current lr 0.02600145126799158
train Loss: 0.0005 Acc: 0.9997
val Loss: 3.0255 Acc: 0.5259

Epoch 27/43
----------
current lr 0.020613949404249324
train Loss: 0.0005 Acc: 0.9997
val Loss: 3.0211 Acc: 0.5268

Epoch 28/43
----------
current lr 0.015331607432655126
train Loss: 0.0005 Acc: 0.9997
val Loss: 3.0255 Acc: 0.5260

Epoch 29/43
----------
current lr 0.010152860905552383
train Loss: 0.0005 Acc: 0.9997
val Loss: 3.0246 Acc: 0.5257

Epoch 30/43
----------
current lr 0.005076166148248429
train Loss: 0.0005 Acc: 0.9997
val Loss: 3.0343 Acc: 0.5259

Epoch 31/43
----------
current lr 0.0001
at the bottom!
train Loss: 0.0005 Acc: 0.9998
val Loss: 3.0293 Acc: 0.5265

snapshot taken
Epoch 32/43
----------
current lr 0.004977140441898302
train Loss: 0.0005 Acc: 0.9998
val Loss: 3.0321 Acc: 0.5262

Epoch 33/43
----------
current lr 0.009756738074958604
train Loss: 0.0005 Acc: 0.9997
val Loss: 3.0334 Acc: 0.52

KeyboardInterrupt: 

#### test submission

In [None]:

class TestEnsemble():
    def __init__(self, models):
        self.models = models
        self.loss = 0.0
        self.acc = 0.0
        
    def evaluate_all(self, criterion, mode='average'):
        running_loss = 0.0
        running_corrects = 0
        phase = 'val'
        for m in self.models:
            m.eval()
        
        with torch.no_grad():
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)
                
                # Take average of the output to make prediction
                outputs = None
                for m in self.models:
                    if outputs is None:
                        outputs = m(inputs)
                    else:
                        outputs += m(inputs)
                outputs /= len(self.models)
                _, preds = torch.max(outputs, 1)
                loss = criterion(outputs, labels)
                
                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)
            self.loss = running_loss / dataset_sizes[phase]
            self.acc = running_corrects.double() / dataset_sizes[phase]
        return self.acc, self.loss

In [4]:
# Load models for Ensemble

# Load Ahad's resnet152
resnet_model = resnet_modified.resnet152(pretrained=False, decay_factor=0.04278)
num_ftrs = resnet_model.fc.in_features
resnet_model.fc = nn.Linear(num_ftrs, num_classes)
best_model_path = "./models/resnet152_best_model_state_dict.pth"
resnet_model.load_state_dict(torch.load(best_model_path, map_location=torch.device(device)))
resnet_model = resnet_model.to(device)
resnet_model.eval()

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): BundledBottleneck(
    (layers): Sequential(
      (0): Bottleneck(
        (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (downsample): Sequential(
         

In [5]:
# Load Michael's dense162

dense_model = torchvision.models.densenet169(pretrained=True)
num_ftrs = dense_model.classifier.in_features
dense_model.classifier = nn.Linear(num_ftrs, num_classes)
dense_model.load_state_dict(torch.load('./models/densenet169_best_model_epoch_0.pth', map_location=torch.device(device)))
dense_model = dense_model.to(device)
dense_model.eval()

DenseNet(
  (features): Sequential(
    (conv0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (norm0): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu0): ReLU(inplace=True)
    (pool0): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (denseblock1): _DenseBlock(
      (denselayer1): _DenseLayer(
        (norm1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu1): ReLU(inplace=True)
        (conv1): Conv2d(64, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu2): ReLU(inplace=True)
        (conv2): Conv2d(128, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      )
      (denselayer2): _DenseLayer(
        (norm1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu

In [6]:
# Load Kei's VGG net

vgg_model = torch.hub.load('pytorch/vision:v0.6.0', 'vgg19_bn', pretrained=True)
num_ftrs = vgg_model.classifier[6].in_features
vgg_model.classifier[6] = nn.Linear(num_ftrs,num_classes)
vgg_model.load_state_dict(torch.load('./models/vgg19_bn_best_model_with_dataaug_state_dict.pt'))
vgg_model = vgg_model.to(device)
vgg_model.eval()

Using cache found in /Users/watanabekeisuke/.cache/torch/hub/pytorch_vision_v0.6.0


VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU(inplace=True)
    (6): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (7): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (9): ReLU(inplace=True)
    (10): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (12): ReLU(inplace=True)
    (13): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (14): Conv2d(128, 256

In [7]:

# Load Chris's resnet attention
attention_model = ResidualAttentionModel()
attention_model =  torch.nn.DataParallel(attention_model)
checkpoint = torch.load('./models/chris_resnet_model_best.pth.tar', map_location=torch.device(device))
attention_model.load_state_dict(checkpoint['state_dict'])
attention_model = attention_model.to(device)
attention_model.eval()


DataParallel(
  (module): ResidualAttentionModel_92_32input_update(
    (conv1): Sequential(
      (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
    )
    (residual_block1): ResidualBlock(
      (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv1): Conv2d(32, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn2): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn3): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(32, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (conv4): Conv2d(32, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
    )
    (atten

In [192]:
# vgg19_bn model 
# 61.7%
models = [resnet_model, dense_model, vgg_model]
for m in models:
    criterion = nn.CrossEntropyLoss()
    ensemble_solver = Ensemble([m])
    top1_acc, top5_acc, val_loss = ensemble_solver.evaluate_all(criterion)
    print("Validation top1_acc: {}, top5_acc:{}, loss:{}".format(top1_acc, top5_acc, val_loss))

Validation top1_acc: 0.635, top5_acc:0.8523, loss:1.4698823869228363
Validation top1_acc: 0.5327, top5_acc:0.7951, loss:1.8803689002990722
Validation top1_acc: 0.6172, top5_acc:0.8463, loss:1.5434519827365876


In [None]:
combo = [
    [resnet_model, dense_model, vgg_model],
         [resnet_model, dense_model], 
         [resnet_model, vgg_model], 
         [dense_model, vgg_model]
]

for c in combo:
    criterion = nn.CrossEntropyLoss()
    ensemble_solver = Ensemble(c)
    top1_acc, top5_acc, val_loss = ensemble_solver.evaluate_all(criterion)
    print("Validation top1_acc: {}, top5_acc:{}, loss:{}".format(top1_acc, top5_acc, val_loss))