# **Imports**

In [1]:
!pip install wandb -qU
import wandb
wandb.login()
!pip install torch
!pip install torchvision
import torch.nn as nn
import torch.nn.functional as F
import torch
import torchvision
import torchvision.transforms as transforms
from itertools import product
import torch.optim as optim
device = "cuda:0" if torch.cuda.is_available() else "cpu"

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m64.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.8/194.8 KB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m184.3/184.3 KB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 KB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for pathtools (setup.py) ... [?25l[?25hdone


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## **Define the data loaders (for both train & test)**

In [2]:
def get_dataloader(is_train, batch_size=4):
    transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
    
    dataset = torchvision.datasets.CIFAR10(root='./data', train=is_train,
                                            download=True, transform=transform)
    loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                              shuffle=is_train, num_workers=2)

    return loader
  
trainloader = get_dataloader(is_train = True)
testloader = get_dataloader(is_train = False)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


  0%|          | 0/170498071 [00:00<?, ?it/s]

Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified


# **Define the CNN modules for the different questions**

In [12]:
# The basic model of question #1
class Q1Net(nn.Module):
    def __init__(self, n_conv_layers, ker, cout1, c_factor):
        super().__init__()
        self.c_factor = c_factor        # determine in how many neurons increase the receptive field each layer
        self.layers = nn.ModuleList()   # This container holds the layers in the net
        self.pool = nn.MaxPool2d(2, 2)  # A MaxPooling layer
        self.build_layers(n_conv_layers, ker, cout1)

    def build_layers(self, n_conv_layers, ker, cout):
        c_in = 3   # c_in is 3 for RGB channels in the input

        # Build the convolution layers
        for l in range(n_conv_layers):
          self.layers.append(nn.Conv2d(c_in, cout, ker, padding='same'))
          c_in = cout
          cout += self.c_factor

        # Build the fully-connected layer
        pow = 5 - n_conv_layers
        self.layers.append(nn.Linear((cout-self.c_factor)*(4**pow), 10)) 

    def forward(self, x):
        for layer in self.layers[:-1]:          # run input in every layer but last that is fully-connected
            x = self.pool(F.relu(layer(x)))
        x = torch.flatten(x, 1)
        x = self.layers[-1](x)
        return x


# The model for qusetion #2, like the basic but without the non-linear layers
class Q2Net(nn.Module):
    def __init__(self, n_conv_layers, ker, cout1, c_factor):
        super().__init__()
        self.c_factor = c_factor        
        self.layers = nn.ModuleList()   
        self.build_layers(n_conv_layers, ker, cout1)

    def build_layers(self, n_conv_layers, ker, cout):
        c_in = 3 
        for l in range(n_conv_layers):
          self.layers.append(nn.Conv2d(c_in, cout, ker, padding="same"))
          c_in = cout
          cout += self.c_factor
        self.layers.append(nn.Linear((cout-self.c_factor)*32*32, 10))  

    def forward(self, x):
        for layer in self.layers[:-1]:          
            x = layer(x)
        x = torch.flatten(x, 1)
        x = self.layers[-1](x)
        return x


# The model for qusetion #3
# if with_avg=True, then: Conv -> Average over channels -> FC
# otherwise, without the averageing
class Q3Net(nn.Module):
    def __init__(self, ker, cout, with_avg=True):
        super().__init__()  
        self.with_avg = with_avg
        self.conv = nn.Conv2d(3, cout, ker, padding='same')
        if self.with_avg:
          self.fc = nn.Linear(1*32*32, 10) 
        else:
          self.fc = nn.Linear(cout*32*32, 10) 

    def forward(self, x):
        x = self.conv(x)
        if self.with_avg:
          x = torch.mean(x, dim=1)
        x = F.relu(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)
        return x

        
def get_model(model, parameters):
  # Simply create an instance of the given model, with the given parameters list
  return model(*parameters).to(device)

# **Define the evaluation (for test set)**
This function is for the evaluation of the model using the test set.
The evaluation is called after every epoch in the training set.
Here I also computed the accuracy event thought is not necessary for the submission, just for sanity checking that the model actually works fine.

In [4]:
def evaluate(model, test_set_loader, criterion):
    correct = 0.
    total = 0.
    with torch.no_grad():
        for data in test_set_loader:
            inputs, labels = data[0].to(device), data[1].to(device)

            # The loss is accumulated during the run and then normalized
            # It uses the CrossEntropyLoss which perform Softmax within it
            outputs = model(inputs)
            test_loss = criterion(outputs, labels)

            # The accuracy computed by taking the label which is corresponding  
            # to the maximum value in the output vector of each prediction
            # accumulate it if the label is correct, and the normalize
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    print(f'Accuracy of the network on the 10000 test images: {100 * correct // total} %')
    return test_loss, correct / total

# **Define the train loop**

In [5]:
def train_loop(model, train_data_loader, test_data_loader, lr, epochs):
  criterion = nn.CrossEntropyLoss()
  optimizer = optim.SGD(model.parameters(), lr=lr)
  running_loss = 0.0

  for epoch in range(epochs):
      for i, data in enumerate(train_data_loader):
          inputs, labels = data[0].to(device), data[1].to(device)

          # Update model's parameters with gradient descend method
          optimizer.zero_grad()
          outputs = model(inputs)
          train_loss = criterion(outputs, labels)
          train_loss.backward()
          optimizer.step()

          # print statistics
          running_loss += train_loss.item()
          if i % 2000 == 1999:    # print every 2000 mini-batches
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}')
            running_loss = 0.0

          # Plot the loss
          metrics = {"train/train_loss": train_loss.item()}
          wandb.log(metrics)   

      # Evaluate the model's phase and plot it
      test_loss, accuracy = evaluate(model, test_data_loader, criterion)
      test_metrics = {"test/test_loss": test_loss,
                     "test/test_accuracy": accuracy}
      wandb.log(test_metrics)

  print('Finished Training')

# **Define the Hyper-Parameters**
This function defined in separate from other implementations in order to keep tracking the paramteres easy. 
Here I defined the different options for kernel sizes (both kernels of the same size), for the number of channels from each conv2d layer, and for the fully-connected layer.
NOTICE that all the kernels are from the same size, and the cout values are increasing by 4 in every layer (implemented in the model class).

Postprocess, in WANDB we can group the charts by specific parameters and compare.

In [10]:
def get_parameters(q):

  # for question 1
  if q==1:
    # Define options groups for parameters
    number_of_layers = [4,3]
    kernel_options = [5]
    cout1_options = [10]
    receptive_factor = [20]    # How many channels will be added to c_out every layer
    res = []
    # create lists of parameters using product of the groups
    # also, create as many cout needed for the number of layers, each one is
    # bigger in 4 from the previous cout
    for p in product(number_of_layers, kernel_options, cout1_options, receptive_factor): 
        res.append(p)
    return res

  # for question 2
  elif q==2:
    best_fit_param = [[4,5,10,20]]
    bigger_param = [[6,5,10,20], [6,5,20,20], [10,5,20,30]]
    names = ["Best Fit", "More layers", "More Channels", "Even More of Both"]
    return zip(names, best_fit_param + bigger_param)

   # for question 2
  elif q==3:
    # List format: [<kernel size>, <channels amount>, <averaging: boolean>]
    param = [[5,10,True],[5,20,True],[5,10,False],[5,20,False]]
    return param

# **Q. 1: Experiment different hyper-parameters**
The hyper-parameters mentioned are only of the size of the layers, meanly the amount of neurons in the net.

In [None]:
for i, param in enumerate(get_parameters(q=1)):
  model = get_model(Q1Net, param)
  n_param = sum(p.numel() for p in model.parameters() if p.requires_grad)

  wandb.init(
      project="Ex1_Q1",
      config={
          "epochs": 15,
          "lr": 0.001,
          "num_of_conv": param[0],
          "ker": param[1],
          "cout1": param[2],
          "c_factor" : param[3],
          "n_param" : n_param
          })
  config = wandb.config
  
  # train and evaluate the model
  train_loop(model, trainloader, testloader, config.lr, config.epochs)  

  wandb.finish()

# **Q2: Importance of Non-Linearity**
Reminder: The best parameters I found on Q1 were: 
4 layers of convolution of kernel size 5. Also, in each layer the amount of channels increase by 20, starting in 10 in the first layer.

In [None]:
for name, param in get_parameters(q=2):
  model = get_model(Q2Net, param)
  n_param = sum(p.numel() for p in model.parameters() if p.requires_grad)
  wandb.init(
      project="Ex1_Q2",
      config={
          "epochs": 15,
          "lr": 0.001,
          "name" :name,
          "num_of_conv": param[0],
          "ker": param[1],
          "cout1": param[2],
          "c_factor" : param[3],
          "n_param" : n_param
          })
  config = wandb.config
  train_loop(model, trainloader, testloader, config.lr, config.epochs)  
  wandb.finish()

# **Q3: Cascaded Receptive Field**
Here I ran the two options (send the activations to FC directly, and average the activations over the channels) with changing the net size (altering the amount of channelc out from the convolution layer)

In [13]:
for param in get_parameters(q=3):
  model = get_model(Q3Net, param)
  n_param = sum(p.numel() for p in model.parameters() if p.requires_grad)
  wandb.init(
      project="Ex1_Q3",
      config={
          "epochs": 15,
          "lr": 0.001,
          "cout1": param[1],
          "averaged" : param[2],
          "n_param" : n_param
          })
  config = wandb.config
  
  # train and evaluate the model
  train_loop(model, trainloader, testloader, config.lr, config.epochs)  

  wandb.finish()

VBox(children=(Label(value='0.009 MB of 0.009 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

[1,  2000] loss: 2.277
[1,  4000] loss: 2.172
[1,  6000] loss: 2.111
[1,  8000] loss: 2.090
[1, 10000] loss: 2.069
[1, 12000] loss: 2.047
Accuracy of the network on the 10000 test images: 30.0 %
[2,  2000] loss: 2.527
[2,  4000] loss: 2.013
[2,  6000] loss: 1.996
[2,  8000] loss: 1.984
[2, 10000] loss: 1.983
[2, 12000] loss: 1.977
Accuracy of the network on the 10000 test images: 31.0 %
[3,  2000] loss: 2.467
[3,  4000] loss: 1.944
[3,  6000] loss: 1.956
[3,  8000] loss: 1.948
[3, 10000] loss: 1.954
[3, 12000] loss: 1.940
Accuracy of the network on the 10000 test images: 31.0 %
[4,  2000] loss: 2.420
[4,  4000] loss: 1.940
[4,  6000] loss: 1.932
[4,  8000] loss: 1.921
[4, 10000] loss: 1.916
[4, 12000] loss: 1.920
Accuracy of the network on the 10000 test images: 32.0 %
[5,  2000] loss: 2.392
[5,  4000] loss: 1.910
[5,  6000] loss: 1.914
[5,  8000] loss: 1.902
[5, 10000] loss: 1.903
[5, 12000] loss: 1.911
Accuracy of the network on the 10000 test images: 33.0 %
[6,  2000] loss: 2.371
[6

VBox(children=(Label(value='0.001 MB of 0.012 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.093093…

0,1
test/test_accuracy,▁▃▃▄▅▆▆▆▇█▆▇███
test/test_loss,█▅▃▃▃▃▂▂▁▃▄▂▂▁▃
train/train_loss,▅▄▅▄▃▃▃▆▅▆▂▁▆▆▂▃▃▆▅█▅▃▄▅▁▃▂▂▂▁▆▂▃▅▄▂▄▆▄▃

0,1
test/test_accuracy,0.3521
test/test_loss,1.62267
train/train_loss,2.10065


[1,  2000] loss: 2.304
[1,  4000] loss: 2.299
[1,  6000] loss: 2.244
[1,  8000] loss: 2.142
[1, 10000] loss: 2.108
[1, 12000] loss: 2.087
Accuracy of the network on the 10000 test images: 27.0 %
[2,  2000] loss: 2.606
[2,  4000] loss: 2.066
[2,  6000] loss: 2.055
[2,  8000] loss: 2.049
[2, 10000] loss: 2.034
[2, 12000] loss: 2.038
Accuracy of the network on the 10000 test images: 29.0 %
[3,  2000] loss: 2.541
[3,  4000] loss: 2.028
[3,  6000] loss: 2.021
[3,  8000] loss: 2.013
[3, 10000] loss: 2.005
[3, 12000] loss: 1.990
Accuracy of the network on the 10000 test images: 29.0 %
[4,  2000] loss: 2.497
[4,  4000] loss: 1.990
[4,  6000] loss: 2.002
[4,  8000] loss: 1.989
[4, 10000] loss: 1.968
[4, 12000] loss: 1.984
Accuracy of the network on the 10000 test images: 32.0 %
[5,  2000] loss: 2.460
[5,  4000] loss: 1.957
[5,  6000] loss: 1.978
[5,  8000] loss: 1.964
[5, 10000] loss: 1.967
[5, 12000] loss: 1.957
Accuracy of the network on the 10000 test images: 32.0 %
[6,  2000] loss: 2.425
[6

0,1
test/test_accuracy,▁▂▃▅▅▆▅▆▆▇▇▇▇██
test/test_loss,█▂▁▅▂▃▂▄▅▄▅▄▃▄▄
train/train_loss,▆▅█▅▆▃▆▅▂▆▆▆▄▆▃▇▃▆▃▄▃▄▃▅▆▃▅▄▁▄▃▃▃▃▃▁█▄▁▁

0,1
test/test_accuracy,0.3628
test/test_loss,1.50665
train/train_loss,1.47612


[1,  2000] loss: 1.958
[1,  4000] loss: 1.712
[1,  6000] loss: 1.600
[1,  8000] loss: 1.535
[1, 10000] loss: 1.469
[1, 12000] loss: 1.426
Accuracy of the network on the 10000 test images: 51.0 %
[2,  2000] loss: 1.718
[2,  4000] loss: 1.367
[2,  6000] loss: 1.359
[2,  8000] loss: 1.352
[2, 10000] loss: 1.328
[2, 12000] loss: 1.333
Accuracy of the network on the 10000 test images: 53.0 %
[3,  2000] loss: 1.612
[3,  4000] loss: 1.286
[3,  6000] loss: 1.297
[3,  8000] loss: 1.296
[3, 10000] loss: 1.267
[3, 12000] loss: 1.288
Accuracy of the network on the 10000 test images: 53.0 %
[4,  2000] loss: 1.554
[4,  4000] loss: 1.226
[4,  6000] loss: 1.245
[4,  8000] loss: 1.252
[4, 10000] loss: 1.237
[4, 12000] loss: 1.220
Accuracy of the network on the 10000 test images: 55.0 %
[5,  2000] loss: 1.493
[5,  4000] loss: 1.200
[5,  6000] loss: 1.192
[5,  8000] loss: 1.202
[5, 10000] loss: 1.199
[5, 12000] loss: 1.197
Accuracy of the network on the 10000 test images: 56.0 %
[6,  2000] loss: 1.433
[6

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
test/test_accuracy,▁▃▃▅▆▆▇█▇▇▇▇███
test/test_loss,█▄▁▃▅▅▄▄▅▄▆▇▄▆█
train/train_loss,▅▅█▄▄▂▁▃▂▄▄▂▃▄▄▂▂▃▄▁▃▁▃▅▂▄▄▁▁▂▂▂▂▅▃▃▃▃▅▂

0,1
test/test_accuracy,0.5866
test/test_loss,1.34758
train/train_loss,0.91637


[1,  2000] loss: 1.904
[1,  4000] loss: 1.712
[1,  6000] loss: 1.602
[1,  8000] loss: 1.512
[1, 10000] loss: 1.457
[1, 12000] loss: 1.411
Accuracy of the network on the 10000 test images: 50.0 %
[2,  2000] loss: 1.704
[2,  4000] loss: 1.344
[2,  6000] loss: 1.333
[2,  8000] loss: 1.295
[2, 10000] loss: 1.309
[2, 12000] loss: 1.284
Accuracy of the network on the 10000 test images: 54.0 %
[3,  2000] loss: 1.569
[3,  4000] loss: 1.230
[3,  6000] loss: 1.210
[3,  8000] loss: 1.223
[3, 10000] loss: 1.196
[3, 12000] loss: 1.194
Accuracy of the network on the 10000 test images: 56.0 %
[4,  2000] loss: 1.423
[4,  4000] loss: 1.146
[4,  6000] loss: 1.131
[4,  8000] loss: 1.126
[4, 10000] loss: 1.133
[4, 12000] loss: 1.131
Accuracy of the network on the 10000 test images: 58.0 %
[5,  2000] loss: 1.342
[5,  4000] loss: 1.057
[5,  6000] loss: 1.067
[5,  8000] loss: 1.078
[5, 10000] loss: 1.041
[5, 12000] loss: 1.055
Accuracy of the network on the 10000 test images: 60.0 %
[6,  2000] loss: 1.283
[6

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
test/test_accuracy,▁▃▄▅▆▇▇▇▇▇█████
test/test_loss,▆▄█▄▅▃▅▅▁▂▃▅▄▃▄
train/train_loss,█▆▆▆▃▄▄▃▂▂▆▁▃▄▂▄▃▂▂▁▃▂▅▅▃▂▂▄▂▄▂▃▁▂▂▃▃▃▇▂

0,1
test/test_accuracy,0.6315
test/test_loss,0.83671
train/train_loss,0.93051
