# EE-559: Project #1
## Classification, weight sharing, auxiliary losses

- Franck Dessimoz
- Léopold Bouraux
- Martin Esguerra
_____
_____

In [None]:
import sys, math, torch, time
import numpy as np
import torch.optim as optim
import dlc_practical_prologue as prologue
import matplotlib.pyplot as plt

from torch import nn
from torch.nn import functional as F

%matplotlib inline
%config InlineBackend.figure_format='retina'

sys.path.insert(0, "./src")

### 0. Visualize and prepare data

**• Load the dataset**

In [None]:
# Load the dataset
# `n` is forwarded to the course helper; presumably it is the number of image
# pairs generated for each of the train and test splits -- TODO confirm against
# dlc_practical_prologue. The helper returns, in this order, the inputs, targets
# and classes of the training split followed by those of the test split.
n = 1000
train_input, train_target, train_classes, test_input, test_target, test_classes = prologue.generate_pair_sets(n)

**• Visualize data**

In [None]:
# Report the shape of every tensor returned by the loader, one per line.
for caption, tensor in (
        ('train_input size:  ', train_input),
        ('train_target size: ', train_target),
        ('train_classes size:', train_classes),
        ('test_input size:   ', test_input),
        ('test_target size:  ', test_target),
        ('test_classes size: ', test_classes)):
    print(caption, list(tensor.shape))

In [None]:
# Display the two images of the first training sample side by side,
# then print the labels attached to that sample.

f = plt.figure(figsize=(10, 5))
f.suptitle('First pair of images from train_input')
ax1, ax2 = (f.add_subplot(1, 2, position) for position in (1, 2))

first_pair = train_input[0]
ax1.imshow(first_pair[0])
ax2.imshow(first_pair[1])
plt.show()

print('train_target[0]:', train_target[0].tolist())
print('train_classes[0]:', train_classes[0].tolist())

In [None]:
# Standardise both splits using the mean / std of the TRAINING inputs only,
# so that no statistic of the test set leaks into the preprocessing.
mu = train_input.mean()
std = train_input.std()
train_input_norm = train_input.sub(mu).div(std)
test_input_norm = test_input.sub(mu).div(std)

# Number of independent training runs performed for every architecture.
n_rounds = 10

# Accumulators filled by the training cells below: one entry per architecture.
nb_params = []
nb_errors = []
losses_mean = []
losses_std = []
# The evaluation-loss accumulators were missing, which made the first training
# cell fail with a NameError on `eval_mean.append(...)`.
eval_mean = []
eval_std = []
nb_errors_mean = []
nb_errors_std = []

### 1. Functions

In [None]:
def eval_loss(model, test_in, test_out):
    """ This function computes the cross-entropy loss of a model on a
        given set of samples, with the model switched to evaluation mode.
        args:
            model: network whose forward pass returns class scores
            test_in: input tensor fed to the model in a single forward pass
            test_out: tensor of target class indices matching test_in
        returns:
            loss: the cross-entropy loss as a Python float
        note:
            the model is put back in training mode before returning, whatever
            mode it was in when the function was called.
    """
    criterion = nn.CrossEntropyLoss()
    model.eval()
    try:
        # No gradient is needed to report a loss: skipping the autograd graph
        # avoids keeping the activations of the whole evaluation set alive.
        with torch.no_grad():
            loss = criterion(model(test_in), test_out).item()
    finally:
        # Always hand the model back in training mode, even if the forward
        # pass raised, so that a failed evaluation cannot freeze dropout/BN.
        model.train()
    return loss

In [None]:
def eval_loss_inter(intermed, test_in, test_out):
    """ This function computes the auxiliary loss of an intermediate network,
        i.e. the sum of the cross-entropy losses of its two outputs.
        args:
            intermed: network whose forward pass returns a pair (out1, out2)
                      of class scores
            test_in: input tensor fed to the network in a single forward pass
            test_out: tensor of target class indices with two columns; column 0
                      is compared with out1 and column 1 with out2
        returns:
            loss: sum of the two cross-entropy losses as a Python float
        note:
            the network is put back in training mode before returning.
    """
    criterion = nn.CrossEntropyLoss()
    # Toggle the network that is actually evaluated. The previous version
    # called eval()/train() on a global named `model`, which is either
    # undefined or an unrelated network.
    intermed.eval()
    try:
        with torch.no_grad():
            out1, out2 = intermed(test_in)
            loss1 = criterion(out1, test_out[:, 0]).item()
            loss2 = criterion(out2, test_out[:, 1]).item()
    finally:
        intermed.train()
    return loss1 + loss2

In [None]:
def train_model(model, train_input, train_target,
                test_input, test_target,
                mini_batch_size=25, eta=0.001, epoch=25, disp=False):
    """ Fit a model with Adam and a cross-entropy criterion, and record the
        training and evaluation losses after every epoch.
        args:
            model: model to train
            train_input: training input data
            train_target: training target data
            test_input: input data used for the per-epoch evaluation loss
            test_target: target data used for the per-epoch evaluation loss
            mini_batch_size: number of samples per optimisation step
            eta: learning rate given to Adam
            epoch: number of passes over the training data
            disp: when True, print the losses every second epoch
        returns:
            loss_arr: list with, for each epoch, the sum of the mini-batch
                      training losses
            eval_arr: list with, for each epoch, the evaluation loss
    """
    optimizer = optim.Adam(model.parameters(), lr=eta)
    criterion = nn.CrossEntropyLoss()
    loss_arr, eval_arr = [], []

    for e in range(epoch):
        mini_batches = zip(train_input.split(mini_batch_size),
                           train_target.split(mini_batch_size))
        sum_loss = 0
        for batch_input, batch_target in mini_batches:
            loss = criterion(model(batch_input), batch_target)
            model.zero_grad()
            loss.backward()
            optimizer.step()
            # The loss value was fixed by the forward pass, so reading it
            # after the parameter update gives the same number.
            sum_loss += loss.item()

        loss_eval = eval_loss(model, test_input, test_target)
        loss_arr.append(sum_loss)
        eval_arr.append(loss_eval)

        if disp and e % 2 == 0:
            print("Epoch: {}\tLoss out: {}\tLoss eval: {}".format(e, sum_loss, loss_eval))

    return loss_arr, eval_arr

In [None]:
def train_model_comp(intermed, output_net, train_input, train_target, train_classes,
                     test_input, test_target, test_classes,
                     eta=0.001, mini_batch_size=25, epoch1=25, epoch2=25, lr1=0.001, lr2=0.001, disp=False):
    """ This function trains a composite model in two consecutive stages:
        first the intermediate network on the auxiliary (per-image) classes,
        then the output network on the final target, on top of the outputs of
        the now frozen intermediate network.
        args:
            intermed: intermediate network, returns a pair (out1, out2)
            output_net: output network, fed with the concatenation of out1 and out2
            train_input: training input data
            train_target: training target data (final task)
            train_classes: training class labels with two columns, one per
                           output of the intermediate network
            test_input: input data used for the evaluation losses
            test_target: target data used for the per-epoch evaluation loss
            test_classes: class labels used for the displayed auxiliary
                          evaluation loss
            eta: unused; kept so that existing calls stay valid. The learning
                 rates actually applied are lr1 and lr2.
            mini_batch_size: number of samples per optimisation step
            epoch1: number of epochs for the intermediate network
            epoch2: number of epochs for the output network
            lr1: Adam learning rate for the intermediate network
            lr2: Adam learning rate for the output network
            disp: when True, print the losses during training
        returns:
            loss_arr: list with, for each stage-2 epoch, the sum of the
                      mini-batch training losses
            eval_arr: list with, for each stage-2 epoch, the evaluation loss
                      of the composite model
    """
    loss_arr = []
    eval_arr = []
    criterion = nn.CrossEntropyLoss()
    optim1 = optim.Adam(intermed.parameters(), lr=lr1)
    optim2 = optim.Adam(output_net.parameters(), lr=lr2)

    # Stage 1: train the intermediate network with the auxiliary loss.
    for e in range(epoch1):
        sum_loss = 0
        for input_, class_ in zip(train_input.split(mini_batch_size),
                                  train_classes.split(mini_batch_size)):
            out1, out2 = intermed(input_)
            loss = criterion(out1, class_[:, 0]) + criterion(out2, class_[:, 1])
            intermed.zero_grad()
            loss.backward()
            sum_loss += loss.item()
            optim1.step()
        # Logical `and` (not bitwise `&`) so the evaluation is skipped entirely
        # whenever display is off.
        if disp and e % 2 == 0:
            print("Epoch: {}\tLoss inter train: {}\tLoss inter eval: {}\t".format(e, sum_loss,
                                                                                  eval_loss_inter(intermed,
                                                                                                  test_input,
                                                                                                  test_classes)))

    # Stage 2: train the output network only.
    for e in range(epoch2):
        sum_loss = 0
        for input_, target_ in zip(train_input.split(mini_batch_size),
                                   train_target.split(mini_batch_size)):
            # optim2 only updates output_net, so the intermediate network is
            # frozen here: running it without autograd gives the same features
            # while avoiding a useless backward pass and gradients piling up
            # on its parameters.
            with torch.no_grad():
                out1, out2 = intermed(input_)
            nums = torch.cat([out1, out2], 1)
            output = output_net(nums)
            loss = criterion(output, target_)
            output_net.zero_grad()
            loss.backward()
            sum_loss += loss.item()
            optim2.step()

        loss_eval = eval_loss(Composite(intermed, output_net), test_input, test_target)
        if disp:
            print("Epoch", e, "Loss out:", sum_loss, "Loss eval: ", loss_eval)
        loss_arr.append(sum_loss)
        eval_arr.append(loss_eval)
    return loss_arr, eval_arr

In [None]:
def compute_nb_errors(model, input_data, target_data):
    """ This function computes the number of errors given a trained model,
        test input and corresponding test output.
        args:
            model: model trained
            input_data: test input, fed to the model in a single forward pass
            target_data: test output (expected class indices)
        return:
            nb_errors: the number of misclassified samples
        note:
            the model is evaluated in evaluation mode and without gradient
            tracking; the mode it was in before the call is restored afterwards.
    """
    was_training = model.training
    # Evaluation mode matters for layers such as BatchNorm: in training mode
    # they would normalise with the statistics of the test batch and update
    # their running averages with test data.
    model.eval()
    try:
        with torch.no_grad():
            output = model(input_data)
    finally:
        model.train(was_training)
    _, predicted_classes = output.max(1)
    return (target_data != predicted_classes).sum().item()

### 2. Networks implementation

#### 2.1 Net - Simple network

In [None]:
class Net(nn.Module):
    """ Baseline convolutional network: both images of a pair are given as the
        two input channels and the network outputs 2 scores.
        The shapes in the comments assume 2*14*14 inputs, which is what the
        hard-coded 288 (= 32*3*3) flattened size corresponds to.
    """
    def __init__(self, nb_hidden=100):
        super().__init__()
        self.conv1 = nn.Conv2d(2, 16, kernel_size=3)    # 2*14*14  --> 16*12*12
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3)   # 16*10*10 --> 32*8*8
        self.fc1 = nn.Linear(288, nb_hidden)
        self.fc2 = nn.Linear(nb_hidden, 2)

    def forward(self, x):
        features = self.conv1(x)
        features = F.relu(F.max_pool2d(features, kernel_size=3, stride=1))   # 16*12*12 --> 16*10*10
        features = self.conv2(features)
        features = F.relu(F.max_pool2d(features, kernel_size=3, stride=2))   # 32*8*8 --> 32*3*3
        hidden = F.relu(self.fc1(features.view(-1, 288)))
        return self.fc2(hidden)


# Trainable parameter count of a freshly built default network.
print('Number of parameters:',
      sum(param.numel() for param in Net().parameters() if param.requires_grad))

In [None]:
# Per-round results for the Net architecture (reset for every architecture).
losses = []
evals = []
nb_errors = []

for i in range(n_rounds):
    # Train the model
    # A fresh, randomly initialised network is trained in every round so that
    # the rounds are independent runs.
    model = Net()
    l, e = train_model(model, train_input_norm, train_target, test_input_norm, test_target)
    losses.append(l)
    evals.append(e)
    # Compute the number of errors
    nb_errors.append(compute_nb_errors(model, test_input_norm, test_target))

# Aggregate over the rounds. `axis=0` averages across rounds, which keeps one
# value per epoch for the loss curves and a single scalar for the error counts.
# The parameter count is read from the model of the last round.
# NOTE(review): assumes eval_mean and eval_std were created as lists in an
# earlier cell, next to losses_mean / losses_std -- confirm.
nb_params.append(sum(p.numel() for p in model.parameters() if p.requires_grad))
losses_mean.append(np.mean(losses, axis=0))
eval_mean.append(np.mean(evals, axis=0))
nb_errors_mean.append(np.mean(nb_errors, axis=0))
losses_std.append(np.std(losses, axis=0))
eval_std.append(np.std(evals, axis=0))
nb_errors_std.append(np.std(nb_errors, axis=0))

#### 2.2 Net2 - More complex network

In [None]:
class Net2(nn.Module):
    """ Deeper variant of the baseline: three convolutions followed by two
        fully connected layers, with 2 output scores.
        The shapes in the comments assume 2*14*14 inputs, which is what the
        hard-coded 16 * 64 (= 64*4*4) flattened size corresponds to.
    """
    def __init__(self, nb_hidden=200):
        super().__init__()
        self.conv1 = nn.Conv2d(2, 16, kernel_size=3)    # 2*14*14  --> 16*12*12
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3)   # 16*10*10 --> 32*8*8
        self.conv3 = nn.Conv2d(32, 64, kernel_size=3)   # 32*6*6   --> 64*4*4
        self.fc1 = nn.Linear(16 * 64, nb_hidden)
        self.fc2 = nn.Linear(nb_hidden, 2)

    def forward(self, x):
        features = self.conv1(x)
        features = F.relu(F.max_pool2d(features, kernel_size=3, stride=1))   # 16*12*12 --> 16*10*10
        features = self.conv2(features)
        features = F.relu(F.max_pool2d(features, kernel_size=3, stride=1))   # 32*8*8 --> 32*6*6
        features = F.relu(self.conv3(features))                              # 64*4*4
        hidden = F.relu(self.fc1(features.view(-1, 16 * 64)))
        return self.fc2(hidden)


# Trainable parameter count of a freshly built default network.
print('Number of parameters:',
      sum(param.numel() for param in Net2().parameters() if param.requires_grad))

In [None]:
# Per-round results for the Net2 architecture (reset for every architecture).
losses = []
evals = []
nb_errors = []

for i in range(n_rounds):
    # Train the model
    # A fresh, randomly initialised network is trained in every round so that
    # the rounds are independent runs.
    model = Net2()
    l, e = train_model(model, train_input_norm, train_target, test_input_norm, test_target)
    losses.append(l)
    evals.append(e)
    # Compute the number of errors
    nb_errors.append(compute_nb_errors(model, test_input_norm, test_target))

# Aggregate over the rounds. `axis=0` averages across rounds, which keeps one
# value per epoch for the loss curves and a single scalar for the error counts.
# The parameter count is read from the model of the last round.
# NOTE(review): assumes eval_mean and eval_std were created as lists in an
# earlier cell, next to losses_mean / losses_std -- confirm.
nb_params.append(sum(p.numel() for p in model.parameters() if p.requires_grad))
losses_mean.append(np.mean(losses, axis=0))
eval_mean.append(np.mean(evals, axis=0))
nb_errors_mean.append(np.mean(nb_errors, axis=0))
losses_std.append(np.std(losses, axis=0))
eval_std.append(np.std(evals, axis=0))
nb_errors_std.append(np.std(nb_errors, axis=0))

#### 2.3 Composite - Use auxiliary loss

In [None]:
class IntermediateNet(nn.Module):
    """ Intermediate network of the composite model: the two images of a pair
        go through the SAME layers (weight sharing) and each gets its own
        vector of 10 scores.
        The shapes in the comments assume 14*14 images, which is what the
        hard-coded 64*9 (= 64*3*3) flattened size corresponds to.
    """
    def __init__(self, nb_hidden=1000):
        super(IntermediateNet, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3)  # 1*14*14  --> 32*12*12
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3) # 32*10*10 --> 64*8*8
        self.fc1 = nn.Linear(64*9, nb_hidden)
        self.out = nn.Linear(nb_hidden, 10)

    def _classify_image(self, img):
        """ Score a batch of single-channel images; returns the 10 scores per image. """
        img = F.relu(F.max_pool2d(self.conv1(img), kernel_size=3, stride=1)) # 32*12*12 --> 32*10*10
        img = F.relu(F.max_pool2d(self.conv2(img), kernel_size=3, stride=2)) # 64*8*8 --> 64*3*3
        img = F.relu(self.fc1(img.view(-1, 64*9)))
        return self.out(img)

    def forward(self, x):
        """ args:
                x: batch of image pairs, the two images being indexed on dim 1
            returns:
                y1, y2: scores of the first and of the second image of each pair
        """
        # Each image becomes a one-channel input of the shared classifier.
        x1, x2 = x[:,0].unsqueeze(1), x[:,1].unsqueeze(1)
        y1 = self._classify_image(x1)
        y2 = self._classify_image(x2)
        return y1, y2
    
class OutputNet(nn.Module):
    """ Output network of the composite model: a two-layer perceptron mapping
        a vector of 20 values to 2 scores.
    """
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(20, 50)
        self.fc2 = nn.Linear(50, 2)

    def forward(self, x):
        hidden = F.relu(self.fc1(x))
        return self.fc2(hidden)
    
class Composite(nn.Module):
    """ Chains an intermediate network returning a pair of tensors with a
        final network fed with their concatenation along dim 1.
    """
    def __init__(self, intermediate, final):
        super().__init__()
        self.intermediate = intermediate
        self.final = final

    def forward(self, x):
        first, second = self.intermediate(x)
        merged = torch.cat((first, second), dim=1)
        return self.final(merged)
    
# Trainable parameter count of a Composite built from a default IntermediateNet and a default OutputNet.
print('Number of parameters:', sum(p.numel() for p in Composite(IntermediateNet(), OutputNet()).parameters() if p.requires_grad))

In [None]:
# Per-round results for the composite model (reset for every architecture).
losses = []
evals = []
nb_errors = []

for i in range(n_rounds):
    # Train the model
    # Fresh intermediate and output networks are trained in every round so
    # that the rounds are independent runs.
    inter = IntermediateNet()
    out = OutputNet()
    # NOTE(review): `eta` is passed positionally to the `eta` parameter of
    # train_model_comp; whether it has any effect depends on that function,
    # whose learning rates appear to come from lr1 / lr2 -- confirm.
    eta = 0.001
    l, e = train_model_comp(inter, out, train_input_norm, train_target, train_classes, test_input_norm,
                            test_target, test_classes, eta, epoch1=25, epoch2=25)
    losses.append(l)
    evals.append(e)
    # Compute the number of errors
    # The two trained parts are chained so the error count is taken on the final task.
    model = Composite(inter, out)
    nb_errors.append(compute_nb_errors(model, test_input_norm, test_target))

# Aggregate over the rounds. `axis=0` averages across rounds, which keeps one
# value per epoch for the loss curves and a single scalar for the error counts.
# The parameter count is read from the composite model of the last round.
# NOTE(review): assumes eval_mean and eval_std were created as lists in an
# earlier cell, next to losses_mean / losses_std -- confirm.
nb_params.append(sum(p.numel() for p in model.parameters() if p.requires_grad))
losses_mean.append(np.mean(losses, axis=0))
eval_mean.append(np.mean(evals, axis=0))
nb_errors_mean.append(np.mean(nb_errors, axis=0))
losses_std.append(np.std(losses, axis=0))
eval_std.append(np.std(evals, axis=0))
nb_errors_std.append(np.std(nb_errors, axis=0))

#### 2.4 ResNet

In [None]:
class ResNetBlock(nn.Module):
    """ Residual block made of two size-preserving convolutions.
        Batch normalisation after each convolution and the skip connection
        around the block can be switched off independently. The BatchNorm
        layers are always created, even when they are not used.
    """
    def __init__(self, nb_channels, kernel_size, skip_connections, batch_normalization):
        super().__init__()
        # Padding chosen so that an odd kernel keeps the spatial size unchanged.
        same_padding = (kernel_size - 1) // 2
        self.conv1 = nn.Conv2d(nb_channels, nb_channels, kernel_size=kernel_size, padding=same_padding)
        self.bn1 = nn.BatchNorm2d(nb_channels)
        self.conv2 = nn.Conv2d(nb_channels, nb_channels, kernel_size=kernel_size, padding=same_padding)
        self.bn2 = nn.BatchNorm2d(nb_channels)

        self.skip_connections = skip_connections
        self.batch_normalization = batch_normalization

    def forward(self, x):
        use_bn = self.batch_normalization

        hidden = self.conv1(x)
        if use_bn:
            hidden = self.bn1(hidden)

        hidden = self.conv2(F.relu(hidden))
        if use_bn:
            hidden = self.bn2(hidden)

        # The residual is added before the final non-linearity.
        if self.skip_connections:
            hidden = hidden + x
        return F.relu(hidden)
    
class ResNet(nn.Module):
    """ Residual network taking 2-channel inputs: a convolution + batch norm
        stem, a stack of residual blocks, an average pooling with a 14*14
        window and a linear classifier.
        The skip_connections / batch_normalization flags are forwarded to the
        residual blocks; the batch norm of the stem is always applied.
    """
    def __init__(self, nb_residual_blocks, nb_channels, kernel_size=3, nb_classes=10,
                 skip_connections=True, batch_normalization=True):
        super().__init__()
        self.conv = nn.Conv2d(2, nb_channels, kernel_size=kernel_size, padding=(kernel_size - 1) // 2)
        self.bn = nn.BatchNorm2d(nb_channels)
        blocks = [ResNetBlock(nb_channels, kernel_size, skip_connections, batch_normalization)
                  for _ in range(nb_residual_blocks)]
        self.resnet_blocks = nn.Sequential(*blocks)
        self.fc1 = nn.Linear(nb_channels, nb_classes)

    def forward(self, x):
        stem = F.relu(self.bn(self.conv(x)))
        features = self.resnet_blocks(stem)
        # One averaged value per channel and per sample (for 14*14 feature maps).
        pooled = F.avg_pool2d(features, 14)
        flat = pooled.view(pooled.size(0), -1)
        return self.fc1(flat)

    

In [None]:
# Per-round results for the ResNet architecture (reset for every architecture).
losses = []
evals = []
nb_errors = []

for i in range(n_rounds):
    # Train the model
    # A fresh, randomly initialised network is trained in every round so that
    # the rounds are independent runs. nb_classes=2 because the network is
    # trained directly on train_target with the 2-output setting used by the
    # other direct models.
    model = ResNet(nb_residual_blocks=2, nb_channels=16, kernel_size=3, nb_classes=2,
                   skip_connections=True, batch_normalization=True)
    l, e = train_model(model, train_input_norm, train_target, test_input_norm, test_target)
    losses.append(l)
    evals.append(e)
    # Compute the number of errors
    nb_errors.append(compute_nb_errors(model, test_input_norm, test_target))

# Aggregate over the rounds. `axis=0` averages across rounds, which keeps one
# value per epoch for the loss curves and a single scalar for the error counts.
# The parameter count is read from the model of the last round.
# NOTE(review): assumes eval_mean and eval_std were created as lists in an
# earlier cell, next to losses_mean / losses_std -- confirm.
nb_params.append(sum(p.numel() for p in model.parameters() if p.requires_grad))
losses_mean.append(np.mean(losses, axis=0))
eval_mean.append(np.mean(evals, axis=0))
nb_errors_mean.append(np.mean(nb_errors, axis=0))
losses_std.append(np.std(losses, axis=0))
eval_std.append(np.std(evals, axis=0))
nb_errors_std.append(np.std(nb_errors, axis=0))

#### 2.5 ResNet - Composite

In [None]:
class IntermediateResNet(nn.Module):
    """ Residual intermediate network of the composite model: the two images
        of a pair go through the SAME layers (weight sharing) and each gets
        its own vector of nb_classes scores.
        args:
            nb_residual_blocks: number of residual blocks after the stem
            nb_channels: number of channels of the stem and of every block
            kernel_size: kernel size of all the convolutions
            nb_classes: number of scores returned per image (10 by default)
            skip_connections: forwarded to the residual blocks
            batch_normalization: forwarded to the residual blocks; the batch
                                 norm of the stem is always applied
            nb_hidden: width of the hidden linear layer
    """
    def __init__(self, nb_residual_blocks, nb_channels, kernel_size=3, nb_classes=10,
                 skip_connections=True, batch_normalization=True, nb_hidden=1000):
        super(IntermediateResNet, self).__init__()
        self.conv = nn.Conv2d(1, nb_channels, kernel_size=kernel_size, padding=(kernel_size - 1) // 2)
        self.bn = nn.BatchNorm2d(nb_channels)
        self.resnet_blocks = nn.Sequential(
            *(ResNetBlock(nb_channels, kernel_size, skip_connections, batch_normalization)
              for _ in range(nb_residual_blocks)))
        self.fc1 = nn.Linear(nb_channels, nb_hidden)
        # Honour nb_classes: the output size used to be hard-coded to 10, so
        # the argument was silently ignored. The default keeps the old size.
        self.out = nn.Linear(nb_hidden, nb_classes)

    def _classify_image(self, img):
        """ Score a batch of single-channel images; returns nb_classes scores per image. """
        img = F.relu(self.bn(self.conv(img)))
        img = self.resnet_blocks(img)
        # One averaged value per channel and per sample (for 14*14 feature maps).
        img = F.avg_pool2d(img, 14).view(img.size(0), -1)
        # No non-linearity between the two linear layers, as in the original design.
        img = self.fc1(img)
        return self.out(img)

    def forward(self, x):
        """ args:
                x: batch of image pairs, the two images being indexed on dim 1
            returns:
                y1, y2: scores of the first and of the second image of each pair
        """
        # Each image becomes a one-channel input of the shared classifier.
        x1, x2 = x[:,0].unsqueeze(1), x[:,1].unsqueeze(1)
        y1 = self._classify_image(x1)
        y2 = self._classify_image(x2)
        return y1, y2

In [None]:
# Per-round results for the composite ResNet model (reset for every architecture).
losses = []
nb_errors = []
evals = []

for i in range(n_rounds):
    # Train the model
    # Fresh intermediate and output networks are trained in every round so
    # that the rounds are independent runs.
    inter = IntermediateResNet(nb_residual_blocks=2, nb_channels=16, nb_classes=10)
    out = OutputNet()
    # NOTE(review): `eta` is passed positionally to the `eta` parameter of
    # train_model_comp; whether it has any effect depends on that function,
    # whose learning rates appear to come from lr1 / lr2 -- confirm.
    eta = 0.001
    l, e = train_model_comp(inter, out, train_input_norm, train_target, train_classes,
                                   test_input_norm, test_target, test_classes, eta)
    losses.append(l)
    evals.append(e)
    # Compute the number of errors
    # The two trained parts are chained so the error count is taken on the final task.
    model = Composite(inter, out)
    nb_errors.append(compute_nb_errors(model, test_input_norm, test_target))

# Aggregate over the rounds. `axis=0` averages across rounds, which keeps one
# value per epoch for the loss curves and a single scalar for the error counts.
# The parameter count is read from the composite model of the last round.
# NOTE(review): assumes eval_mean and eval_std were created as lists in an
# earlier cell, next to losses_mean / losses_std -- confirm.
nb_params.append(sum(p.numel() for p in model.parameters() if p.requires_grad))
losses_mean.append(np.mean(losses, axis=0))
eval_mean.append(np.mean(evals, axis=0))
nb_errors_mean.append(np.mean(nb_errors, axis=0))
losses_std.append(np.std(losses, axis=0))
eval_std.append(np.std(evals, axis=0))
nb_errors_std.append(np.std(nb_errors, axis=0))

### 3. Model comparison

In [None]:
# Compare the architectures on four stacked panels: training loss curves,
# evaluation loss curves, error rate on the test set and parameter count.
labels = ['Net', 'Net2', 'CompositeNet', 'ResNet', 'CompositeResNet']
colors = ['peru', 'maroon', 'forestgreen', 'steelblue', 'darkorange']
fig, ax = plt.subplots(4, 1, figsize=(16, 40))

# Both values used to be hard-coded (25 epochs, 1000 test samples), which made
# the figure wrong or crash as soon as the training settings changed. They are
# now read from the recorded curves and from the test set itself.
n_epochs = len(losses_mean[0])
n_test = test_target.size(0)


def _plot_loss_curves(axis, means, stds, title):
    """ Draw one mean +/- standard deviation loss curve per architecture on `axis`. """
    for mean_curve, std_curve, label, color in zip(means, stds, labels, colors):
        epochs = list(range(len(mean_curve)))
        axis.errorbar(epochs, mean_curve, yerr=std_curve, fmt='.-',
                      label=label, color=color)
    axis.legend()
    axis.set_title(title)
    axis.set_xlabel('Epoch')
    axis.set_ylabel('Loss')
    axis.grid(True)


_plot_loss_curves(
    ax[0], losses_mean, losses_std,
    'Mean training loss over {} rounds of {} epochs with standard deviation'.format(n_rounds, n_epochs))
_plot_loss_curves(
    ax[1], eval_mean, eval_std,
    'Mean evaluation loss over {} rounds of {} epochs with standard deviation'.format(n_rounds, n_epochs))

# Error counts are converted to a percentage of the test set size.
ax[2].bar(labels, [count / n_test * 100 for count in nb_errors_mean], color=colors, width=0.25,
          yerr=[count / n_test * 100 for count in nb_errors_std])
ax[2].set_title('Mean percentage of errors over {} rounds with standard deviation'.format(n_rounds))
ax[2].set_ylabel('Errors')
ax[2].yaxis.grid(True)

ax[3].bar(labels, nb_params, color=colors, width=0.25)
ax[3].set_title('Number of parameters')
ax[3].set_ylabel('Number of parameters')
ax[3].yaxis.grid(True)

plt.show()