In [None]:
import copy
from collections import OrderedDict

import sys
import time
import os
import math
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
import torch.nn.functional as F
import matplotlib.pyplot as plt
from torch.autograd import Variable

from torch import optim
import torch.nn.functional as F
import numpy as np


In [None]:
# Training split: random crop + horizontal flip augmentation, then
# conversion to a tensor and per-channel normalization.
transform_train = transforms.Compose([
    transforms.RandomCrop(size=32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.4914, 0.4822, 0.4465),
                         std=(0.2023, 0.1994, 0.2010)),
])
trainset = torchvision.datasets.CIFAR10(
    root='./data',
    train=True,
    download=True,
    transform=transform_train,
)
trainloader = torch.utils.data.DataLoader(
    trainset,
    batch_size=128,
    shuffle=True,
    num_workers=2,
)

# Test split: no augmentation, same normalization constants as training.
transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.4914, 0.4822, 0.4465),
                         std=(0.2023, 0.1994, 0.2010)),
])
testset = torchvision.datasets.CIFAR10(
    root='./data',
    train=False,
    download=True,
    transform=transform_test,
)
testloader = torch.utils.data.DataLoader(
    testset,
    batch_size=128,
    shuffle=False,
)

Files already downloaded and verified
Files already downloaded and verified


In [None]:
# CIFAR model (architecture from CS 242)

def conv_block(in_channels, out_channels, kernel_size=3, stride=1,
               padding=1):
    """Build a Conv2d -> BatchNorm2d -> ReLU stack.

    Args:
        in_channels: number of channels of the input feature map.
        out_channels: number of channels produced by the convolution.
        kernel_size: convolution kernel size (default 3).
        stride: convolution stride (default 1).
        padding: convolution padding (default 1).

    Returns:
        An ``nn.Sequential`` holding the three layers in that order.
    """
    layers = [
        # The convolution carries no bias term; BatchNorm follows directly.
        nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size,
                  stride=stride, padding=padding, bias=False),
        nn.BatchNorm2d(out_channels),
        nn.ReLU(inplace=True),
    ]
    return nn.Sequential(*layers)

class ConvNet(nn.Module):
    """Nine conv blocks, global average pooling, and a 10-way linear head."""

    def __init__(self):
        super().__init__()
        # (in_channels, out_channels, stride) of each conv block, in order.
        block_plan = [
            (3, 32, 1),
            (32, 32, 1),
            (32, 64, 2),
            (64, 64, 1),
            (64, 64, 1),
            (64, 128, 2),
            (128, 128, 1),
            (128, 256, 1),
            (256, 256, 1),
        ]
        stages = [conv_block(c_in, c_out, stride=step)
                  for c_in, c_out, step in block_plan]
        # Global average pooling collapses each feature map to 1x1.
        stages.append(nn.AdaptiveAvgPool2d(1))
        self.model = nn.Sequential(*stages)

        self.classifier = nn.Linear(256, 10)

    def forward(self, x):
        """Return the classifier output for input batch ``x``."""
        features = self.model(x)
        batch, channels, _, _ = features.shape
        # Drop the two trailing 1x1 spatial dimensions before the linear head.
        flat = features.view(batch, channels)
        return self.classifier(flat)

In [None]:
def train(net, epoch, train_loss_tracker, train_acc_tracker):
    """Run one training pass over ``trainloader``.

    Uses the module-level ``trainloader``, ``device``, ``optimizer`` and
    ``criterion``.

    Args:
        net: model to train (switched to training mode).
        epoch: epoch index, used only in the progress line.
        train_loss_tracker: list that receives each batch's loss value.
        train_acc_tracker: list that receives the running accuracy (percent)
            as of the final batch.
    """
    net.train()
    train_loss, correct, total = 0, 0, 0
    for step, (inputs, targets) in enumerate(trainloader, start=1):
        inputs, targets = inputs.to(device), targets.to(device)

        # Forward / backward / parameter update.
        optimizer.zero_grad()
        outputs = net(inputs)
        batch_loss = criterion(outputs, targets) # Add regularization term (define new class)
        batch_loss.backward()
        optimizer.step()

        # Record this batch's loss and refresh the running mean.
        batch_loss_value = batch_loss.item()
        train_loss += batch_loss_value
        train_loss_tracker.append(batch_loss_value)
        loss = train_loss / step

        # Running accuracy over all samples seen so far in this epoch.
        predicted = outputs.max(1)[1]
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()
        acc = 100. * correct / total

        # Overwrite the same console line with the latest statistics.
        sys.stdout.write(f'\rEpoch {epoch}: Train Loss: {loss:.3f}' +
                         f'| Train Acc: {acc:.3f}')
        sys.stdout.flush()
    train_acc_tracker.append(acc)
    sys.stdout.flush()

def test(net, epoch, test_loss_tracker, test_acc_tracker):
    global best_acc
    net.eval()
    test_loss = 0
    correct = 0
    total = 0
    best_acc = 0.0
    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(testloader):
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = net(inputs)
            loss = criterion(outputs, targets)

            test_loss += loss.item()
            test_loss_tracker.append(loss.item())
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()

            loss = test_loss / (batch_idx + 1)
            acc = 100.* correct / total
    sys.stdout.write(f' | Test Loss: {loss:.3f} | Test Acc: {acc:.3f}\n')
    sys.stdout.flush()
    
    # Save checkpoint.
    acc = 100.*correct/total
    test_acc_tracker.append(acc)
    if acc > best_acc:
        state = {
            'net': net.state_dict(),
            'acc': acc,
            'epoch': epoch,
        }
        if not os.path.isdir('checkpoint'):
            os.mkdir('checkpoint')
        torch.save(state, './checkpoint/ckpt.pth')
        best_acc = acc  

In [None]:
def regularizer(net, discretizer, distance='KL'):
    '''
    Given a net, creates a penalty based on the distance between the net's
    weight distribution and a uniform distribution over the weight range.

    The weights of every ``nn.Conv2d`` and ``nn.Linear`` module are pooled
    and softly assigned to ``discretizer`` equal-width bins spanning
    [min weight, max weight]. Soft assignment (a softmax over squared
    distances to the bin centers) is used instead of a hard histogram so
    the penalty stays differentiable with respect to the weights. Biases
    are not included.

    Args:
        net: model
        discretizer (int): (positive) number of bins to simplify weight
            distribution
        distance (str): distance measure; only 'KL' is implemented.

    Returns:
        Scalar tensor holding KL(binned weights || uniform), which lies in
        [0, log(discretizer)]. A zero tensor if ``net`` has no Conv2d/Linear
        modules.

    Raises:
        NotImplementedError: if ``distance`` is not 'KL'.
        ValueError: if ``discretizer`` is not a positive integer.
    '''
    if distance != 'KL':
        raise NotImplementedError(f"distance '{distance}' is not implemented")
    if int(discretizer) != discretizer or discretizer < 1:
        raise ValueError('discretizer must be a positive integer')
    num_bins = int(discretizer)

    weights = [m.weight.reshape(-1) for m in net.modules()
               if isinstance(m, (nn.Conv2d, nn.Linear))]
    if not weights:
        return torch.tensor(0.0)
    flat = torch.cat(weights)

    # The bin grid is treated as a constant: detach the range so gradients
    # flow only through the soft assignment of each weight.
    eps = 1e-12
    low = flat.min().detach()
    span = (flat.max().detach() - low).clamp_min(eps)
    width = span / num_bins
    offsets = torch.arange(num_bins, device=flat.device, dtype=flat.dtype) + 0.5
    centers = low + offsets * width

    # Soft histogram: each weight spreads unit mass over the bins.
    # Memory is O(num_weights * num_bins).
    scaled_dist = (flat.unsqueeze(1) - centers.unsqueeze(0)) / width
    assignment = torch.softmax(-scaled_dist ** 2, dim=1)
    probs = assignment.mean(dim=0)

    # KL(p || uniform) = sum_i p_i * log(p_i) + log(num_bins)
    return (probs * probs.clamp_min(eps).log()).sum() + math.log(num_bins)

IndentationError: expected an indented block (<ipython-input-9-5ad4b4b5825b>, line 18)

In [None]:
class UniformRegularizingLoss(nn.Module):
    """Cross-entropy loss plus a weighted weight-distribution penalty.

    Args:
        lbda: multiplier applied to the regularizing term.
        distribution: target distribution name; accepted but currently
            unused.
        weighted: optional per-class weight tensor forwarded to
            ``nn.CrossEntropyLoss``.
    """

    def __init__(self, lbda, distribution='uniform', weighted=None):
        # Zero-argument super(): the previous call named a class that does
        # not exist (RegularizingLoss) and raised NameError on construction.
        super().__init__()

        self.ce_loss = nn.CrossEntropyLoss(reduction="mean", weight=weighted)
        self.lbda = lbda
        #self.dist = distribution

    def forward(self, logits, labels, net, bins=30):
        """Return ``lbda * regularizer(net, bins) + cross_entropy``.

        Args:
            logits: model outputs passed to the cross-entropy loss.
            labels: targets passed to the cross-entropy loss.
            net: model whose weights are penalized.
            bins: number of bins handed to ``regularizer``.
        """
        regularizing_term = regularizer(net, discretizer=bins)

        return self.lbda * regularizing_term + self.ce_loss(logits, labels)
        
        

In [None]:
torch.manual_seed(43) # to give stable randomness 

# Use the GPU when one is available; otherwise fall back to the CPU so the
# cell still runs on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
net = ConvNet()
net = net.to(device)

# PART 1.1: set the learning rate (lr) used in the optimizer.
lr = 0.1    # Part 1.1 results tell us this has the best test accuracy out of the three 

# PART 1.1: Modify this to train for a short 5 epochs
# PART 1.2: Modify this to train a longer 100 epochs
epochs = 100

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(net.parameters(), lr=lr, momentum=0.9,
                            weight_decay=5e-4)

# PART 1.2: try different learning rate scheduler 
scheduler_name= 'multistep'   # set this to 'multistep' or 'cosine_annealing' (or None for Part 1.1)
if scheduler_name == 'multistep':
    milestones = [25, 50, 75, 100]
    gamma      = 0.1
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                    milestones=milestones,
                                                    gamma=gamma)
elif scheduler_name == 'cosine_annealing':
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)
    milestones = []

elif scheduler_name is None:
    # Constant learning rate: no scheduler object, so the loop below must
    # skip the scheduler step.
    milestones = []; gamma = 0
    scheduler = None

else:
    # Fail immediately on an unknown name instead of continuing with an
    # undefined scheduler.
    raise NotImplementedError(f"unknown scheduler_name: {scheduler_name!r}")

# Records the training loss and training accuracy during training
train_loss_tracker, train_acc_tracker = [], []

# Records the test loss and test accuracy during training
test_loss_tracker, test_acc_tracker = [], []

# Best test accuracy seen in this run; read and updated by test() through
# its `global best_acc` declaration.
best_acc = 0.0

print('Training for {} epochs, with learning rate {} and milestones {}'.format(
      epochs, lr, milestones))

start_time = time.time()
for epoch in range(0, epochs):
    ep_start_time = time.time()
    
    train(net, epoch, train_loss_tracker, train_acc_tracker)
    test(net, epoch, test_loss_tracker, test_acc_tracker)
    if scheduler is not None:
        scheduler.step()
    
    ep_end_time = time.time()
    epoch_time = ep_end_time - ep_start_time
    print(f"Epoch {epoch}: {epoch_time} seconds")

total_time = time.time() - start_time
print('Total training time: {} seconds'.format(total_time))

Training for 100 epochs, with learning rate 0.1 and milestones [25, 50, 75, 100]
Epoch 0: Train Loss: 1.630| Train Acc: 38.790 | Test Loss: 1.380 | Test Acc: 48.900
Epoch 0: 11.352970361709595 seconds
Epoch 1: Train Loss: 1.147| Train Acc: 58.532 | Test Loss: 1.090 | Test Acc: 61.620
Epoch 1: 11.700154781341553 seconds
Epoch 2: Train Loss: 0.897| Train Acc: 68.368 | Test Loss: 1.526 | Test Acc: 55.240
Epoch 2: 11.708998680114746 seconds
Epoch 3: Train Loss: 0.764| Train Acc: 73.256 | Test Loss: 1.148 | Test Acc: 66.000
Epoch 3: 11.338812112808228 seconds
Epoch 4: Train Loss: 0.697| Train Acc: 76.002 | Test Loss: 0.929 | Test Acc: 69.700
Epoch 4: 11.356273889541626 seconds
Epoch 5: Train Loss: 0.649| Train Acc: 77.598 | Test Loss: 1.101 | Test Acc: 64.110
Epoch 5: 11.651323080062866 seconds
Epoch 6: Train Loss: 0.611| Train Acc: 78.742 | Test Loss: 0.849 | Test Acc: 71.820
Epoch 6: 11.755674123764038 seconds
Epoch 7: Train Loss: 0.579| Train Acc: 80.030 | Test Loss: 0.894 | Test Acc: 70

In [None]:
# Scratch cell: grab the iterator over the modules contained in `net`.
M = net.modules()

In [None]:
# Scratch cell: pull the next module from the iterator; re-running this cell
# advances `M` by one more module each time.
m = next(M)

In [None]:
# Scratch cell: list the attribute names available on the module `m`.
dir(m)

['T_destination',
 '__annotations__',
 '__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_apply',
 '_backward_hooks',
 '_buffers',
 '_call_impl',
 '_forward_hooks',
 '_forward_pre_hooks',
 '_get_backward_hooks',
 '_get_name',
 '_is_full_backward_hook',
 '_load_from_state_dict',
 '_load_state_dict_pre_hooks',
 '_maybe_warn_non_full_backward_hook',
 '_modules',
 '_named_members',
 '_non_persistent_buffers_set',
 '_parameters',
 '_register_load_state_dict_pre_hook',
 '_register_state_dict_hook',
 '_replicate_for_data_parallel',
 '_save_to_state_dict',
 '_slow_forward',
 '_state_dict_hooks',
 '_version',
 'add_module',
 'apply',


In [None]:
count = 0
# Walk the modules in order and stop at the first Conv2d/Linear layer,
# keeping its parameters around for inspection.
for m in net.modules():
    if not isinstance(m, (nn.Conv2d, nn.Linear)):
        continue
    w, b = m.weight, m.bias
    W = w
    break

In [None]:
# Display the gradient currently stored on the selected weight tensor.
W.grad

tensor([[[[ 1.2592e-04, -5.7475e-05, -4.7485e-04],
          [-7.8162e-04, -1.2203e-03, -1.5978e-03],
          [-7.9703e-04, -1.3087e-03, -1.9508e-03]],

         [[ 1.9689e-04,  3.7990e-05, -4.2284e-04],
          [-8.3158e-04, -1.2807e-03, -1.6360e-03],
          [-9.3918e-04, -1.4063e-03, -1.9979e-03]],

         [[ 9.3454e-04,  7.5219e-04,  3.4766e-04],
          [-1.3931e-04, -5.8620e-04, -8.7297e-04],
          [-3.5968e-04, -7.1853e-04, -1.2178e-03]]],


        [[[-4.2590e-03, -1.9439e-03,  5.2018e-04],
          [-2.9612e-03, -1.2428e-05,  2.3305e-03],
          [-4.0652e-03, -1.4808e-03,  9.1480e-04]],

         [[-2.9791e-03, -1.8602e-03, -9.3304e-04],
          [-1.8971e-03, -2.8362e-04,  1.8103e-04],
          [-3.1598e-03, -1.2055e-03, -4.1428e-04]],

         [[-4.8152e-04,  1.5994e-03,  2.8851e-03],
          [ 4.7891e-04,  4.0578e-03,  5.1998e-03],
          [-1.5172e-04,  3.4703e-03,  5.2405e-03]]],


        [[[-1.4263e-04, -2.6837e-03, -4.2197e-03],
          [-3.9

In [None]:
# Display the shape of the selected weight tensor.
W.shape

torch.Size([32, 3, 3, 3])

In [None]:
# Display the shape of the stored gradient, for comparison with W.shape.
W.grad.shape

torch.Size([32, 3, 3, 3])