<a href="https://colab.research.google.com/github/Het-Shah/IKD_DAFL/blob/master/code/notebooks/DSN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing Libraries

In [0]:
import torch
import torch.nn as nn
import torch.optim as optim

import torchvision.transforms as transforms
import torchvision.datasets as datasets

import random
import time

import numpy as np
import copy

In [3]:
!nvidia-smi

Mon Mar 16 17:51:21 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.59       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P8    29W / 149W |      0MiB / 11441MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

# LeNet

In [0]:
class LeNet5(nn.Module):

    def __init__(self,params):
        super(LeNet5, self).__init__()

        self.conv1 = nn.Conv2d(1, params[0], kernel_size=(5, 5))
        self.relu1 = nn.ReLU()
        self.maxpool1 = nn.MaxPool2d(kernel_size=(2, 2), stride=2)
        self.conv2 = nn.Conv2d(params[0], params[1], kernel_size=(5, 5))
        self.relu2 = nn.ReLU()
        self.maxpool2 = nn.MaxPool2d(kernel_size=(2, 2), stride=2)
        self.conv3 = nn.Conv2d(params[1], params[2], kernel_size=(5, 5))
        self.relu3 = nn.ReLU()
        self.fc1 = nn.Linear(params[2], params[3])
        self.relu4 = nn.ReLU()
        self.fc2 = nn.Linear(params[3], 10)

    def forward(self, img, out_feature=False):
        output = self.conv1(img)
        output = self.relu1(output)
        output = self.maxpool1(output)
        output = self.conv2(output)
        output = self.relu2(output)
        output = self.maxpool2(output)
        output = self.conv3(output)
        output = self.relu3(output)
        feature = output.view(-1, 120)
        output = self.fc1(feature)
        output = self.relu4(output)
        output = self.fc2(output)
        if out_feature == False:
            return output
        else:
            return output,feature

In [0]:
original_parameters = [6,16,120,84]
net = LeNet5(original_parameters)

In [6]:
net

LeNet5(
  (conv1): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
  (relu1): ReLU()
  (maxpool1): MaxPool2d(kernel_size=(2, 2), stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (relu2): ReLU()
  (maxpool2): MaxPool2d(kernel_size=(2, 2), stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv3): Conv2d(16, 120, kernel_size=(5, 5), stride=(1, 1))
  (relu3): ReLU()
  (fc1): Linear(in_features=120, out_features=84, bias=True)
  (relu4): ReLU()
  (fc2): Linear(in_features=84, out_features=10, bias=True)
)

In [7]:
original_parameters = [6,16,120,84]
for depth in range(3):
  if depth == 0:
    print("Original network: ")
    net = LeNet5(original_parameters)
    print(net)
  else:
    print("At depth " + str(depth) + ": ")
    original_parameters = [int(i/2) for i in original_parameters]
    print(original_parameters)
    net = LeNet5(original_parameters)
    print(net)

Original network: 
LeNet5(
  (conv1): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
  (relu1): ReLU()
  (maxpool1): MaxPool2d(kernel_size=(2, 2), stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (relu2): ReLU()
  (maxpool2): MaxPool2d(kernel_size=(2, 2), stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv3): Conv2d(16, 120, kernel_size=(5, 5), stride=(1, 1))
  (relu3): ReLU()
  (fc1): Linear(in_features=120, out_features=84, bias=True)
  (relu4): ReLU()
  (fc2): Linear(in_features=84, out_features=10, bias=True)
)
At depth 1: 
[3, 8, 60, 42]
LeNet5(
  (conv1): Conv2d(1, 3, kernel_size=(5, 5), stride=(1, 1))
  (relu1): ReLU()
  (maxpool1): MaxPool2d(kernel_size=(2, 2), stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(3, 8, kernel_size=(5, 5), stride=(1, 1))
  (relu2): ReLU()
  (maxpool2): MaxPool2d(kernel_size=(2, 2), stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv3): Conv2d(8, 60, kernel

# VGG

In [0]:
class VGG(nn.Module):

    def __init__(self, features, params, num_classes=10, init_weights=True):
        super(VGG, self).__init__()
        self.features = features
        self.avgpool = nn.AdaptiveAvgPool2d((7, 7))
        self.classifier = nn.Sequential(
            nn.Linear(params[0] * 7 * 7, params[1]),
            nn.ReLU(True),
            nn.Dropout(),
            nn.Linear(params[1], params[2]),
            nn.ReLU(True),
            nn.Dropout(),
            nn.Linear(params[2], num_classes),
        )
        if init_weights:
            self._initialize_weights()

    def forward(self, x):
        x = self.features(x)
        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.classifier(x)
        return x

    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                nn.init.constant_(m.bias, 0)

In [0]:
def make_layers(cfg, batch_norm=False):
    layers = []
    in_channels = 3
    for v in cfg:
        if v == 'M':
            layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
        else:
            conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1)
            if batch_norm:
                layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)]
            else:
                layers += [conv2d, nn.ReLU(inplace=True)]
            in_channels = v
    return nn.Sequential(*layers)

def _vgg(arch, cfg,params, batch_norm, pretrained, progress, **kwargs):
    if pretrained:
        kwargs['init_weights'] = False
    model = VGG(make_layers(cfg, batch_norm=batch_norm),params = params, **kwargs)
    if pretrained:
        state_dict = load_state_dict_from_url(model_urls[arch],
                                              progress=progress)
        model.load_state_dict(state_dict)
    return model

In [0]:
def vgg16(features,params,pretrained=False,progress=True, **kwargs):
    return _vgg('vgg16', features, params, False, pretrained, progress, **kwargs)

In [0]:
def get_output_nodes(params):
  for i in reversed(params):
    if type(i) == int:
      last_number = i
      break

  return last_number

# Testing on Cifar-10

In [0]:
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [14]:
train_data = datasets.CIFAR10(root = 'data', 
                              train = True, 
                              download = True)

means = train_data.data.mean(axis = (0,1,2)) / 255
stds = train_data.data.std(axis = (0,1,2)) / 255

print(f'Calculated means: {means}')
print(f'Calculated stds: {stds}')

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to data/cifar-10-python.tar.gz


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Extracting data/cifar-10-python.tar.gz to data
Calculated means: [0.49139968 0.48215841 0.44653091]
Calculated stds: [0.24703223 0.24348513 0.26158784]


In [0]:
train_transforms = transforms.Compose([
                           transforms.RandomHorizontalFlip(),
                           transforms.RandomRotation(10),
                           transforms.ToTensor(),
                           transforms.Normalize(mean = means, 
                                                std = stds)
                       ])

test_transforms = transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize(mean = means, 
                                                std = stds)
                       ])

In [16]:
train_data = datasets.CIFAR10('data', 
                              train = True, 
                              download = True, 
                              transform = train_transforms)

test_data = datasets.CIFAR10('data', 
                             train = False, 
                             download = True, 
                             transform = test_transforms)

Files already downloaded and verified
Files already downloaded and verified


In [0]:
n_train_examples = int(len(train_data)*0.9)
n_valid_examples = len(train_data) - n_train_examples

train_data, valid_data = torch.utils.data.random_split(train_data, 
                                                       [n_train_examples, n_valid_examples])

In [18]:
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of training examples: 45000
Number of validation examples: 5000
Number of testing examples: 10000


In [0]:
BATCH_SIZE = 64

train_iterator = torch.utils.data.DataLoader(train_data, 
                                             shuffle = True, 
                                             batch_size = BATCH_SIZE)

valid_iterator = torch.utils.data.DataLoader(valid_data, 
                                             batch_size = BATCH_SIZE)

test_iterator = torch.utils.data.DataLoader(test_data, 
                                            batch_size = BATCH_SIZE)

# Building model and testing

In [0]:
def train_model(model, dataloaders, criterion, optimizer, num_epochs=25, is_inception=False):
    since = time.time()
    model = model.to(device)
    val_acc_history = []

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    # Get model outputs and calculate loss
                    # Special case for inception because in training it has an auxiliary output. In train
                    #   mode we calculate the loss by summing the final output and the auxiliary output
                    #   but in testing we only consider the final output.
                    if is_inception and phase == 'train':
                        # From https://discuss.pytorch.org/t/how-to-optimize-inception-model-with-auxiliary-classifiers/7958
                        outputs, aux_outputs = model(inputs)
                        loss1 = criterion(outputs, labels)
                        loss2 = criterion(aux_outputs, labels)
                        loss = loss1 + 0.4*loss2
                    else:
                        outputs = model(inputs)
                        loss = criterion(outputs, labels)

                    _, preds = torch.max(outputs, 1)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)

            epoch_loss = running_loss / len(dataloaders[phase].dataset)
            epoch_acc = running_corrects.double() / len(dataloaders[phase].dataset)

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))

            # deep copy the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
            if phase == 'val':
                val_acc_history.append(epoch_acc)

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model, val_acc_history

In [0]:
dataloaders_dict = {'train':train_iterator,'val':valid_iterator}

# Number of classes in the dataset
num_classes = 10

# Batch size for training (change depending on how much memory you have)
batch_size = 8

# Number of epochs to train for
EPOCHS = 200

In [46]:
original_parameters = [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M']
classifier_parameters = [512,4096,4096]

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

for depth in range(7):
  last_number = get_output_nodes(original_parameters)
  classifier_params = [last_number,4096,4096]
  if depth == 0:
    continue
    print("Original network: ")
    net = vgg16(original_parameters,classifier_parameters)
    # print(net)
    optimizer = optim.SGD(net.parameters(),lr=0.001)
    criterion = nn.CrossEntropyLoss()
    criterion = criterion.to(device)
    net, hist = train_model(net, dataloaders_dict, criterion, optimizer, num_epochs=EPOCHS)
    print("\n############################################################################\n")

  # For odd case :
  elif depth % 2 == 1:
    continue
    print("At depth " + str(depth) + ": ")
    print("Dividing filters by 2: ")
    original_parameters = [int(i/2) if type(i) == int else i for i in original_parameters]
    classifier_parameters = [int(i/2) if type(i) == int else i for i in classifier_parameters]
    print(original_parameters)
    net = vgg16(original_parameters,classifier_parameters)
    last_layer = [net.features[i] for i in range(len(net.features))][-1]
    if str(last_layer) ==  'ReLU(inplace=True)':
      net.features = nn.Sequential(*[net.features[i] for i in range(len(net.features) -1)])
    # print(net)
    optimizer = optim.SGD(net.parameters(),lr=0.001)
    criterion = nn.CrossEntropyLoss()
    criterion = criterion.to(device)
    net, hist = train_model(net, dataloaders_dict, criterion, optimizer, num_epochs=EPOCHS)
    print("\n############################################################################\n")

  # For even case :
  elif depth % 2 == 0:
    print("At depth " + str(depth) + ": ")
    print("Removing last layer: ")
    second_last_layer = [net.features[i] for i in range(len(net.features)-1)][-1]
    if str(second_last_layer) ==  'ReLU(inplace=True)':
      net.features = nn.Sequential(*[net.features[i] for i in range(len(net.features) -2)])
    else:
      net.features = nn.Sequential(*[net.features[i] for i in range(len(net.features) -1)])
    original_parameters = original_parameters[:-1]
    # print(net)
    optimizer = optim.SGD(net.parameters(),lr=0.001)
    criterion = nn.CrossEntropyLoss()
    criterion = criterion.to(device)
    net, hist = train_model(net, dataloaders_dict, criterion, optimizer, num_epochs=EPOCHS)
    print("\n############################################################################\n")


At depth 2: 
Removing last layer: 
Epoch 0/199
----------
train Loss: 2.3026 Acc: 0.0996
val Loss: 2.3025 Acc: 0.1022

Epoch 1/199
----------
train Loss: 2.3025 Acc: 0.1009
val Loss: 2.3024 Acc: 0.0878

Epoch 2/199
----------
train Loss: 2.3024 Acc: 0.1019
val Loss: 2.3023 Acc: 0.0996

Epoch 3/199
----------
train Loss: 2.3023 Acc: 0.1042
val Loss: 2.3022 Acc: 0.1006

Epoch 4/199
----------
train Loss: 2.3021 Acc: 0.1084
val Loss: 2.3021 Acc: 0.1012

Epoch 5/199
----------
train Loss: 2.3020 Acc: 0.1104
val Loss: 2.3020 Acc: 0.1012

Epoch 6/199
----------
train Loss: 2.3019 Acc: 0.1075
val Loss: 2.3018 Acc: 0.1018

Epoch 7/199
----------
train Loss: 2.3017 Acc: 0.1083
val Loss: 2.3016 Acc: 0.1012

Epoch 8/199
----------
train Loss: 2.3015 Acc: 0.1075
val Loss: 2.3014 Acc: 0.1014

Epoch 9/199
----------
train Loss: 2.3012 Acc: 0.1047
val Loss: 2.3011 Acc: 0.1016

Epoch 10/199
----------
train Loss: 2.3008 Acc: 0.1056
val Loss: 2.3007 Acc: 0.1012

Epoch 11/199
----------
train Loss: 2.30

KeyboardInterrupt: ignored