In [None]:
import copy
from collections import OrderedDict

import sys
import time
import os
import math
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
import torch.nn.functional as F
import matplotlib.pyplot as plt
from torch.autograd import Variable

from torch import optim
import torch.nn.functional as F
import numpy as np

In [None]:
# CIFAR-10 input pipeline. Both splits are converted to tensors and normalised
# with the same (mean, std) triples; only the training split is augmented.
NORM_MEAN = (0.4914, 0.4822, 0.4465)
NORM_STD = (0.2023, 0.1994, 0.2010)

# Training split: random 32x32 crop after 4-pixel padding plus random horizontal flip.
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(NORM_MEAN, NORM_STD),
])
trainset = torchvision.datasets.CIFAR10(
    root='./data', train=True, download=True, transform=transform_train)
trainloader = torch.utils.data.DataLoader(
    trainset, batch_size=128, shuffle=True, num_workers=2)

# Test split: no augmentation and no shuffling.
transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(NORM_MEAN, NORM_STD),
])
testset = torchvision.datasets.CIFAR10(
    root='./data', train=False, download=True, transform=transform_test)
testloader = torch.utils.data.DataLoader(
    testset, batch_size=128, shuffle=False)

Files already downloaded and verified
Files already downloaded and verified


In [None]:
# CIFAR model (architecture from CS 242)

def conv_block(in_channels, out_channels, kernel_size=3, stride=1,
               padding=1):
    """Build a bias-free 2-D convolution followed by batch normalisation.

    Args:
        in_channels: number of channels of the input feature map.
        out_channels: number of channels produced by the convolution.
        kernel_size: convolution kernel size (default 3).
        stride: convolution stride (default 1).
        padding: zero padding added by the convolution (default 1).

    Returns:
        An ``nn.Sequential`` holding the ``nn.Conv2d`` and then the
        ``nn.BatchNorm2d``. No activation is included.
    """
    convolution = nn.Conv2d(
        in_channels=in_channels,
        out_channels=out_channels,
        kernel_size=kernel_size,
        stride=stride,
        padding=padding,
        bias=False,
    )
    batch_norm = nn.BatchNorm2d(out_channels)
    return nn.Sequential(convolution, batch_norm)

class ConvNet(nn.Module):
    """Three conv/batch-norm stages with ReLU, global average pooling and a linear head.

    The post-ReLU activation of every stage is stored in ``self.relu_outputs``
    under the keys 1, 2 and 3 on each forward pass, overwriting the previous
    pass. When the activation takes part in autograd, its gradient is retained
    so ``.grad`` can be read after ``backward()``.
    """

    def __init__(self):
        super(ConvNet, self).__init__()
        self.layer1 = conv_block(3, 32)
        self.layer2 = conv_block(32, 32)
        self.layer3 = conv_block(32, 64)
        self.pooler = nn.AdaptiveAvgPool2d(1)
        # A single in-place ReLU module is shared by all three stages.
        self.relu = nn.ReLU(inplace=True)
        self.classifier = nn.Linear(64, 10)

        # Stage index (1..3) -> post-ReLU activation of the latest forward pass.
        self.relu_outputs = {}

    def get_activations(self):
        """Return the dict mapping stage index to its latest post-ReLU activation."""
        return self.relu_outputs

    def _record_activation(self, index, activation):
        """Store ``activation`` under ``index`` and keep its gradient when possible."""
        # retain_grad() raises on tensors that do not require grad (for example
        # inside torch.no_grad()), so only call it when autograd tracks the tensor.
        if activation.requires_grad:
            activation.retain_grad()
        self.relu_outputs[index] = activation
        return activation

    def forward(self, x):
        """Compute class logits for a batch of 3-channel images.

        Args:
            x: tensor of shape (B, 3, H, W).

        Returns:
            Tensor of shape (B, 10) produced by the linear classifier.
        """
        relu1 = self._record_activation(1, self.relu(self.layer1(x)))
        relu2 = self._record_activation(2, self.relu(self.layer2(relu1)))
        # Fixed: the third slot previously stored relu2 instead of relu3.
        relu3 = self._record_activation(3, self.relu(self.layer3(relu2)))
        h = self.pooler(relu3)
        B, C, _, _ = h.shape
        h = h.view(B, C)
        return self.classifier(h)

In [None]:
def train(net, epoch, train_loss_tracker, train_acc_tracker):
    """Run one training epoch over the global ``trainloader``.

    Relies on the notebook globals ``trainloader``, ``device``, ``optimizer``
    and ``criterion``.

    Args:
        net: model to train; must expose ``relu_outputs`` with key 1 after a
            forward pass (read at the end of the epoch).
        epoch: epoch index, used only in the progress line.
        train_loss_tracker: list that receives the loss of every batch.
        train_acc_tracker: list that receives the running accuracy (percent)
            at the end of the epoch.
    """
    net.train()
    running_loss = 0.0
    correct = 0
    total = 0
    acc = 0.0  # stays defined even if the loader yields no batches
    for batch_idx, (inputs, targets) in enumerate(trainloader):
        inputs, targets = inputs.to(device), targets.to(device)
        # Kept from the original; presumably for input-gradient experiments.
        inputs.requires_grad = True
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, targets) # Add regularization term (define new class)
        loss.backward()
        # update optimizer state
        optimizer.step()
        # compute average loss (kept in its own name so the loss tensor is not shadowed)
        batch_loss = loss.item()
        running_loss += batch_loss
        train_loss_tracker.append(batch_loss)
        avg_loss = running_loss / (batch_idx + 1)
        # compute accuracy
        _, predicted = outputs.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()
        acc = 100. * correct / total
        # Print status
        sys.stdout.write(f'\rEpoch {epoch}: Train Loss: {avg_loss:.3f}' +
                         f'| Train Acc: {acc:.3f}')
        sys.stdout.flush()

    train_acc_tracker.append(acc)
    sys.stdout.flush()

    if total:
        # Per-sample L1 norm of the first activation of the last batch. Divide
        # by the actual batch size: the final batch is usually smaller than the
        # loader's nominal batch size, so a hard-coded 128 skews the value.
        first_activation = net.relu_outputs[1]
        print(torch.norm(first_activation.detach(), p=1) / first_activation.shape[0])
    

    

def test(net, epoch, test_loss_tracker, test_acc_tracker):
    """Evaluate ``net`` on the global ``testloader`` and checkpoint the best model.

    Relies on the notebook globals ``testloader``, ``device`` and ``criterion``
    and maintains the global ``best_acc``.

    Args:
        net: model to evaluate.
        epoch: epoch index stored in the checkpoint.
        test_loss_tracker: list that receives the loss of every batch.
        test_acc_tracker: list that receives the overall accuracy (percent).
    """
    global best_acc
    # Fixed: best_acc used to be reset to 0.0 on every call, so the checkpoint
    # was overwritten whenever accuracy was above zero. Start from 0.0 only
    # when no best accuracy exists yet in this kernel.
    best_acc = globals().get('best_acc', 0.0)

    net.eval()
    running_loss = 0.0
    correct = 0
    total = 0
    avg_loss = 0.0  # defaults keep the report well-defined for an empty loader
    acc = 0.0

    # NOTE(review): evaluation runs with autograd enabled, as in the original.
    # Wrapping this loop in torch.no_grad() would be cheaper but is only safe
    # if the model's forward does not call retain_grad() unconditionally.
    for batch_idx, (inputs, targets) in enumerate(testloader):
        inputs, targets = inputs.to(device), targets.to(device)
        outputs = net(inputs)
        loss = criterion(outputs, targets)

        batch_loss = loss.item()
        running_loss += batch_loss
        test_loss_tracker.append(batch_loss)
        _, predicted = outputs.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()

        avg_loss = running_loss / (batch_idx + 1)
        acc = 100.* correct / total
    sys.stdout.write(f' | Test Loss: {avg_loss:.3f} | Test Acc: {acc:.3f}\n')
    sys.stdout.flush()

    # Save checkpoint.
    test_acc_tracker.append(acc)
    if acc > best_acc:
        state = {
            'net': net.state_dict(),
            'acc': acc,
            'epoch': epoch,
        }
        os.makedirs('checkpoint', exist_ok=True)
        torch.save(state, './checkpoint/ckpt.pth')
        best_acc = acc

In [None]:
class SaveFeatures():
    """Capture the output of ``module`` on every forward pass.

    After a forward pass ``self.features`` holds a detached copy of the
    module's output that requires grad, on the same device as that output.
    It is ``None`` until the module has been run.
    """

    def __init__(self, module):
        # Defined up front so the attribute can be read before the first forward.
        self.features = None
        self.hook = module.register_forward_hook(self.hook_fn)

    def hook_fn(self, module, input, output):
        """Forward hook: snapshot ``output`` as a fresh leaf tensor."""
        # detach().clone() replaces torch.tensor(output), which is a warned
        # copy-construct. The unconditional .cuda() is dropped because it
        # raises on CPU-only machines; the copy already lives on output's device.
        self.features = output.detach().clone().requires_grad_(True)

    def close(self):
        """Remove the forward hook from the module."""
        self.hook.remove()

In [None]:

# ReLU module -> its most recent output tensor. A module that is called several
# times per forward pass keeps only the output of its last call.
visualisation = {}
# ReLU module -> handle of the forward hook registered on it by get_all_layers.
# NOTE: re-running this cell rebinds both dicts but does not remove hooks that
# are already attached to a network; call close_hooks() first.
hooks = {}


def hook_fn(m, i, o):
    """Forward hook that records the output ``o`` of module ``m``.

    The output tensor itself is stored, not a clone: a clone is not part of
    the graph that leads to the loss, so its ``.grad`` would stay ``None``
    after ``backward()``. The gradient is retained whenever autograd tracks
    the output.
    """
    if o.requires_grad:
        o.retain_grad()
    visualisation[m] = o


def get_all_layers(net):
    """Register ``hook_fn`` on every ``nn.ReLU`` reachable through ``nn.Sequential`` nesting.

    Only direct children of ``net`` are inspected, descending recursively into
    ``nn.Sequential`` containers. Handles are kept in ``hooks`` so the hooks
    can be removed later and so repeated calls do not register duplicates.
    """
    for name, layer in net._modules.items():
        if isinstance(layer, nn.Sequential):
            # Containers get no hook themselves; recurse into their children.
            get_all_layers(layer)
        elif isinstance(layer, nn.ReLU) and layer not in hooks:
            hooks[layer] = layer.register_forward_hook(hook_fn)


def close_hooks():
    """Remove every hook registered by ``get_all_layers`` and forget the handles."""
    for handle in hooks.values():
        handle.remove()
    hooks.clear()
        
            


In [None]:
def count_parameters(model):
    """Return the total element count of the parameters of ``model`` that require grad."""
    trainable_total = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable_total += param.numel()
    return trainable_total

In [None]:
torch.manual_seed(43) # to give stable randomness 

# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook also runs on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
net = ConvNet()
net = net.to(device)

# Number of trainable parameters of the model.
print(count_parameters(net))

# Attach the activation-recording forward hooks to the model's ReLU modules.
get_all_layers(net)

29418


In [None]:
# Show each direct child module of the network, in registration order.
for child_module in net._modules.values():
    print(child_module)

Sequential(
  (0): Sequential(
    (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
  )
  (1): Sequential(
    (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
  )
  (2): Sequential(
    (0): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
  )
  (3): AdaptiveAvgPool2d(output_size=1)
)
Linear(in_features=64, out_features=10, bias=True)


In [None]:
# Modules that currently have an activation recorded by the forward hook.
visualisation.keys()

dict_keys([ReLU()])

In [None]:
# NOTE(review): displays every recorded activation tensor in full, which
# produces a very large cell output; consider showing shapes instead.
visualisation

{ReLU(): tensor([[[[0.0000e+00, 4.6546e-02, 0.0000e+00,  ..., 3.4188e-01,
            4.0852e-01, 0.0000e+00],
           [0.0000e+00, 2.5571e-02, 0.0000e+00,  ..., 4.3483e-01,
            5.4083e-01, 1.4643e-01],
           [0.0000e+00, 3.4552e-02, 0.0000e+00,  ..., 3.3381e-01,
            6.0975e-01, 1.8885e-01],
           ...,
           [3.6434e-01, 1.1214e+00, 1.4111e+00,  ..., 0.0000e+00,
            6.6011e-02, 0.0000e+00],
           [6.9664e-01, 1.5155e+00, 1.7735e+00,  ..., 6.6560e-01,
            6.7098e-01, 3.8384e-01],
           [3.1135e-01, 7.3083e-01, 8.8344e-01,  ..., 4.2552e-01,
            3.2706e-01, 8.9385e-02]],
 
          [[6.2496e-01, 6.3573e-01, 8.6042e-01,  ..., 1.3619e-01,
            5.4083e-01, 5.3639e-01],
           [1.0403e+00, 1.2702e+00, 1.5899e+00,  ..., 1.0386e+00,
            1.5838e+00, 9.4751e-01],
           [7.4424e-01, 1.1550e+00, 1.2905e+00,  ..., 4.4427e-01,
            8.8615e-01, 4.0871e-01],
           ...,
           [0.0000e+00, 0.0000

None


In [None]:
# PART 1.1: set the learning rate (lr) used in the optimizer.
lr = 0.1    # Part 1.1 results tell us this has the best test accuracy out of the three 

# PART 1.1: Modify this to train for a short 5 epochs
# PART 1.2: Modify this to train a longer 100 epochs
epochs = 1

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(net.parameters(), lr=lr, momentum=0.9,
                            weight_decay=5e-4)

# PART 1.2: try different learning rate scheduler 
scheduler_name = 'multistep'   # set this to 'multistep' or 'cosine_annealing' (or None for Part 1.1)
if scheduler_name == 'multistep':
    milestones = [25, 50, 75, 100]
    gamma      = 0.1
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                    milestones=milestones,
                                                    gamma=gamma)
elif scheduler_name == 'cosine_annealing':
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)
    milestones = []

elif scheduler_name is None:
    # No scheduling: bind scheduler explicitly so the loop below neither fails
    # with a NameError nor silently reuses a scheduler from an earlier run.
    scheduler = None
    milestones = []; gamma = 0

else:
    # Previously a bare `NotImplementedError` expression, which did nothing.
    raise NotImplementedError(f'Unknown scheduler_name: {scheduler_name!r}')

# Records the training loss and training accuracy during training
train_loss_tracker, train_acc_tracker = [], []

# Records the test loss and test accuracy during training
test_loss_tracker, test_acc_tracker = [], []

print('Training for {} epochs, with learning rate {} and milestones {}'.format(
      epochs, lr, milestones))

start_time = time.time()
for epoch in range(0, epochs):
    ep_start_time = time.time()
    
    train(net, epoch, train_loss_tracker, train_acc_tracker)
    test(net, epoch, test_loss_tracker, test_acc_tracker)
    # Advance the learning-rate schedule once per epoch, if one is configured.
    if scheduler is not None:
        scheduler.step()
    
    ep_end_time = time.time()
    epoch_time = ep_end_time - ep_start_time
    print(f"Epoch {epoch}: {epoch_time} seconds")

total_time = time.time() - start_time
print('Total training time: {} seconds'.format(total_time))

Training for 1 epochs, with learning rate 0.1 and milestones [25, 50, 75, 100]
Epoch 0: Train Loss: 1.729| Train Acc: 34.446tensor(3889.8840, device='cuda:0', grad_fn=<MulBackward0>)
 | Test Loss: 1.607 | Test Acc: 40.730
Epoch 0: 8.38831639289856 seconds
Total training time: 8.389055013656616 seconds


In [None]:
# Gradient retained on the first recorded ReLU activation.
# NOTE(review): the recorded activations are overwritten by every forward pass;
# if the latest pass was the evaluation loop (no backward call), this is
# presumably None - confirm.
net.get_activations()[1].grad

In [None]:
# Gradient retained on the second recorded ReLU activation.
# NOTE(review): presumably None when the latest forward pass was not followed
# by backward() - confirm.
net.get_activations()[2].grad

In [None]:
# Stage indices for which the network currently holds a recorded activation.
list(net.get_activations())

[1, 2]

In [None]:
# List every convolutional and fully connected layer found anywhere in the network.
for module in net.modules():
    if isinstance(module, (nn.Conv2d, nn.Linear)):
        print(module)

Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
Linear(in_features=64, out_features=10, bias=True)
