# Experimenting

In [2]:
from torch import nn
from copy import deepcopy
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.backends.cudnn as cudnn
import torchvision
import tqdm
import os
import mlp.data_providers as data_providers
import numpy as np

In [3]:
class LinearLayerWithActivation(nn.Module):
    """A linear (fully connected) layer followed by an activation module.

    Args:
        input_shape: Shape of the expected input; ``input_shape[1]`` is used as
            the number of input features.
        num_units: Number of output units of the linear transform.
        bias: If True, a bias vector initialised to zeros is learned.
        activation_type: Module applied to the linear output. When omitted, a
            fresh ``nn.ReLU()`` is created for this layer.
    """

    def __init__(self, input_shape, num_units, bias=False, activation_type=None):
        super(LinearLayerWithActivation, self).__init__()
        # Build the default activation per instance: a module used as a default
        # argument value would be one object shared by every layer.
        self.activation_type = nn.ReLU() if activation_type is None else activation_type
        self.weights = nn.Parameter(torch.empty(size=(num_units, input_shape[1])), requires_grad=True)

        # Weights are drawn from a standard normal distribution.
        nn.init.normal_(self.weights)

        if bias:
            self.bias = nn.Parameter(torch.zeros(num_units), requires_grad=True)
        else:
            self.bias = None

    def forward(self, x):
        """Apply the linear transform, then the activation, to ``x``."""
        out = F.linear(x, self.weights, self.bias)
        # Call the module itself (not .forward) so registered hooks are run.
        out = self.activation_type(out)
        return out
        

In [4]:
class MultiLayerFCCNetwork(nn.Module):
    """A stack of fully connected layers built from LinearLayerWithActivation.

    The network has ``num_hidden_layers`` hidden layers with PReLU activations
    (stored as ``layer_0`` ... ``layer_{n-1}`` in ``layer_dict``) followed by an
    ``output_layer`` with an identity activation.

    Args:
        input_shape: Shape of the expected input batch; a zero tensor of this
            shape is pushed through the layers while building so that each
            layer can read its input size from the previous layer's output.
        num_hidden_units: Number of units in every hidden layer.
        num_output_units: Number of units in the output layer.
        num_hidden_layers: Number of hidden layers.
    """

    def __init__(self, input_shape, num_hidden_units, num_output_units, num_hidden_layers):
        super(MultiLayerFCCNetwork, self).__init__()
        self.input_shape = input_shape
        self.num_hidden_units = num_hidden_units
        self.num_output_units = num_output_units
        self.num_hidden_layers = num_hidden_layers

        # ModuleDict registers every layer so its parameters are tracked.
        self.layer_dict = nn.ModuleDict()

        # Dummy input used only to infer each layer's input shape.
        out = torch.zeros(input_shape)
        for i in range(self.num_hidden_layers):
            layer_name = 'layer_{}'.format(i)
            self.layer_dict[layer_name] = LinearLayerWithActivation(input_shape=out.shape,
                                                                    num_units=self.num_hidden_units,
                                                                    bias=True,
                                                                    activation_type=nn.PReLU())
            # Call the module (not .forward) so that hooks are honoured.
            out = self.layer_dict[layer_name](out)

        self.layer_dict['output_layer'] = LinearLayerWithActivation(input_shape=out.shape,
                                                                    num_units=self.num_output_units,
                                                                    bias=True,
                                                                    activation_type=nn.Identity())
        # Dry run through the output layer: fails here if the shapes do not line up.
        self.layer_dict['output_layer'](out)

    def forward(self, x):
        """Pass ``x`` through all hidden layers and then the output layer."""
        out = x
        for i in range(self.num_hidden_layers):
            out = self.layer_dict['layer_{}'.format(i)](out)

        return self.layer_dict['output_layer'](out)
            
        

In [5]:
# Toy batch: 16 samples with 128 features each, plus one integer label per sample.
x = torch.arange(16 * 128).view(16, 128).float()
y = torch.arange(16)

fcc_net = MultiLayerFCCNetwork(input_shape=x.shape, num_hidden_units=64, num_output_units=512,
                               num_hidden_layers=4)

# Use dedicated loop names: reusing `x` here would overwrite the input tensor above.
for layer_name in fcc_net.layer_dict:
    print(layer_name)

for param_name, _ in fcc_net.named_parameters():
    print(param_name)

layer_0
layer_1
layer_2
layer_3
output_layer
layer_dict.layer_0.weights
layer_dict.layer_0.bias
layer_dict.layer_0.activation_type.weight
layer_dict.layer_1.weights
layer_dict.layer_1.bias
layer_dict.layer_1.activation_type.weight
layer_dict.layer_2.weights
layer_dict.layer_2.bias
layer_dict.layer_2.activation_type.weight
layer_dict.layer_3.weights
layer_dict.layer_3.bias
layer_dict.layer_3.activation_type.weight
layer_dict.output_layer.weights
layer_dict.output_layer.bias


In [6]:
# Print every parameter name that contains 'weight' together with 'conv',
# and (independently) every one that contains 'weight' together with 'linear'.
for name, value in fcc_net.named_parameters():
    for layer_kind in ('conv', 'linear'):
        if layer_kind in name and 'weight' in name:
            print(name)

### Learning to use pytorch_mlp_framework

Running the following code gives the following output

```
System learnable parameters
model.layer_dict.input_conv.layer_dict.conv_0.weight torch.Size([32, 3, 3, 3])
model.layer_dict.input_conv.layer_dict.bn_0.weight torch.Size([32])
model.layer_dict.input_conv.layer_dict.bn_0.bias torch.Size([32])
model.layer_dict.reduction_block_0.layer_dict.conv_0.weight torch.Size([32, 32, 3, 3])
model.layer_dict.reduction_block_0.layer_dict.conv_0.bias torch.Size([32])
model.layer_dict.reduction_block_0.layer_dict.conv_1.weight torch.Size([32, 32, 3, 3])
model.layer_dict.reduction_block_0.layer_dict.conv_1.bias torch.Size([32])
model.logit_linear_layer.weight torch.Size([100, 32])
model.logit_linear_layer.bias torch.Size([100])
Total number of parameters 22724
Total number of conv layers 4
Total number of linear layers 1
```

And the "layers" are `input_conv` and `reduction_block_0`.

Finally, we have the following from the documentation:
```
Welcome to the MLP course's Pytorch training and inference helper script

optional arguments:
  -h, --help            show this help message and exit
  --batch_size [BATCH_SIZE]
                        Batch_size for experiment
  --continue_from_epoch [CONTINUE_FROM_EPOCH]
                        Which epoch to continue from. 
                        If -2, continues from where it left off
                        If -1, starts from scratch
                        if >=0, continues from given epoch
  --seed [SEED]         Seed to use for random number generator for experiment
  --image_num_channels [IMAGE_NUM_CHANNELS]
                        The channel dimensionality of our image-data
  --image_height [IMAGE_HEIGHT]
                        Height of image data
  --image_width [IMAGE_WIDTH]
                        Width of image data
  --num_stages [NUM_STAGES]
                        Number of convolutional stages in the network. A stage
                        is considered a sequence of convolutional layers where
                        the input volume remains the same in the spacial
                        dimension and is always terminated by a dimensionality
                        reduction stage
  --num_blocks_per_stage [NUM_BLOCKS_PER_STAGE]
                        Number of convolutional blocks in each stage, not
                        including the reduction stage. A convolutional block
                        is made up of two convolutional layers activated using
                        the leaky-relu non-linearity
  --num_filters [NUM_FILTERS]
                        Number of convolutional filters per convolutional
                        layer in the network (excluding dimensionality
                        reduction layers)
  --num_epochs [NUM_EPOCHS]
                        The experiment's epoch budget
  --num_classes [NUM_CLASSES]
                        The experiment's epoch budget
  --experiment_name [EXPERIMENT_NAME]
                        Experiment name - to be used for building the
                        experiment folder
  --use_gpu [USE_GPU]   A flag indicating whether we will use GPU acceleration
                        or not
  --weight_decay_coefficient [WEIGHT_DECAY_COEFFICIENT]
                        Weight decay to use for Adam
  --block_type BLOCK_TYPE
                        Type of convolutional blocks to use in our network
                        (This argument will be useful in running experiments
                        to debug your network)
                        
 ```

The parameters that seem to control the number of layers are `--num_stages`, `--num_blocks_per_stage` and (probably) `--num_filters`.

Here, we've set num_stages to 1, num_blocks_per_stage to 0. Intuitively, this sounds like there should be 0 conv layers. Instead, we have 4 -- why?

We've also set num_filters to 32, so that probably has something to do with it.

I'm also not sure what the difference is between `--num_epochs` and `--num_classes`, since their help texts above are identical.

`--num_classes` is actually the number of output classes, as expected

filters = feature maps? so `num_filters` is actually number of feature maps
   - but why does FCC also have that parameter? because here num_filters is apparently used as the number of hidden units.

`--num_blocks_per_stage` is 0, so the only layers are:
   - the input conv layer (what's `EntryConvolutionalBlock`?) which has 32 "filters"
   - one reduction stage for one stage in `--num_stages`

what's `EntryConvolutionalBlock`?
   - it has conv parameters and bn parameters, what are they?

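When the ExperimentBuilder() says that there are 4 convolutional layers, it's talking about:
- `model.layer_dict.input_conv.layer_dict.conv_0`
- `model.layer_dict.input_conv.layer_dict.bn_0`
- `model.layer_dict.reduction_block_0.layer_dict.conv_0`
- `model.layer_dict.reduction_block_0.layer_dict.conv_1`

i.e. 2 "conv" layers in `input_conv` and 2 conv layers in `reduction_block_0`. Note that `bn_0` has no `conv` in its own name (presumably it is a batch-norm layer); it most likely gets counted only because its full parameter name contains both `conv` (from `input_conv`) and `weight`.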


In [8]:
# Draw a random 3x3 tensor and reduce it to a single scalar mean.
a = torch.randn(3, 3)
print(a)
t = a.mean()
print(t)

tensor([[ 0.1212,  0.0902,  0.9037],
        [-1.5467, -0.3662, -0.8754],
        [-0.3033,  0.9105, -0.7133]])
tensor(-0.1977)


In [24]:
# Plain Python list ending with the tensor mean `t` converted to a float.
b = [1, 2, float(t)]

In [25]:
# Show the current contents of `b`.
b

[1, 2, -0.19769951701164246]

In [26]:
# Arithmetic mean of the values in `b`.
sum(b)/len(b)

0.9341001609961191

In [None]:
def average_gradients_per_layer(model):
    """Collect the average gradient of every sub-layer of a two-level model.

    ``model`` is expected to expose a ``layer_dict`` of blocks, where each block
    in turn exposes its own ``layer_dict`` of layers. Blocks without a
    ``layer_dict`` and layers whose parameters have no gradient yet are skipped.

    Args:
        model: Module with a ``layer_dict`` mapping block names to blocks.

    Returns:
        A tuple ``(layers, layers_r, all_grads)`` of three parallel lists:
        plot names (``<block>_<layer>``), full names
        (``model.layer_dict.<block>.layer_dict.<layer>``) and, per layer, the
        average over its parameters of the mean gradient of each parameter.
    """
    layers = []
    layers_r = []
    all_grads = []

    for block_name, block in model.layer_dict.items():
        sub_layers = getattr(block, 'layer_dict', None)
        if sub_layers is None:
            continue

        for layer_name, layer in sub_layers.items():
            # One scalar per parameter tensor: the mean of its gradient.
            grads = [float(torch.mean(param.grad))
                     for param in layer.parameters()
                     if param.grad is not None]
            if not grads:
                # No gradients available (e.g. backward() not called yet).
                continue

            layers.append(block_name + '_' + layer_name)
            layers_r.append('model.layer_dict.' + block_name + '.layer_dict.' + layer_name)
            all_grads.append(sum(grads) / len(grads))

    return layers, layers_r, all_grads

In [None]:
    # NOTE(review): unfinished draft. The innermost `for` has no body and the
    # cell starts indented, so it cannot run as written; it also refers to
    # `self`, which is not defined at notebook level.
    for x in self.named_parameters():
            # now classify by layer, calculate the avg grads per layer and append name and avg grads to layers and all_grads
            for block in self.model.layer_dict:
                for layer in locals()[block].layer_dict:

In [34]:
# Derive a short plot name from a fully qualified parameter name:
# component 2 is the block name and component 4 is the layer inside the block.
rlayer = 'model.layer_dict.input_conv.layer_dict.conv_0.weight'
print(rlayer)
par = rlayer.split('.')
print(par)
layer = '{}_{}'.format(par[2], par[4])
print(layer)

model.layer_dict.input_conv.layer_dict.conv_0.weight
['model', 'layer_dict', 'input_conv', 'layer_dict', 'conv_0', 'weight']
input_conv_conv_0


In [35]:
# Both the block name (par[2]) and the layer name (par[4]) must occur in the
# full name. `par[2] and par[4] in rlayer` would only test par[4] for
# membership and merely check that par[2] is a non-empty string.
if all(part in rlayer for part in (par[2], par[4])):
    print(rlayer)

model.layer_dict.input_conv.layer_dict.conv_0.weight


In [37]:
# Quick check of the built-in sum on a list of numbers.
sum([1,2])

3

In [41]:
# A parameter counts as a linear-layer weight when its name contains both substrings.
name = 'logit_linear_layer.weight'
if 'linear' in name and 'weight' in name:
    print('Yes')

Yes


In [44]:
# Plain substring test on the parameter name held in `name`.
if 'weight' in name:
    print('Yes')

Yes


In [45]:
# Fresh random 3x3 tensor (rebinds `a`); the bare name on the last line displays it.
a = torch.randn(3,3)
a

tensor([[ 0.9973,  0.8574,  0.2572],
        [-0.5075, -0.9552, -1.8104],
        [-1.3216, -0.9684, -0.0414]])

In [46]:
# Second random 3x3 tensor; note this rebinds `b`, which an earlier cell used for a Python list.
b = torch.randn(3,3)
b

tensor([[ 2.3509, -0.4538, -0.7192],
        [-0.3950, -0.7167, -0.3476],
        [ 0.0886,  1.3320, -0.1409]])

In [47]:
# Element-wise sum of the two 3x3 tensors.
a+b

tensor([[ 3.3482,  0.4036, -0.4620],
        [-0.9026, -1.6719, -2.1581],
        [-1.2330,  0.3635, -0.1822]])

In [85]:
# Random 4-D input used to exercise the blocks below; they read x.shape[1] as the channel count.
x = torch.randn(100, 32, 32, 32)

In [99]:
class ConvolutionalDimensionalityReductionBlockBNRC(nn.Module):
    """Residual conv block that shrinks the spatial size of its input.

    Main path: conv_0 -> bn_0 -> leaky ReLU -> average pooling by
    ``reduction_factor`` -> conv_1 -> bn_1. Shortcut path: a 1x1 convolution
    (``conv_r``) strided by ``reduction_factor``. The two paths are summed and
    passed through a leaky ReLU.

    Args:
        input_shape: Shape of the expected input; index 1 is used as the
            number of input channels.
        num_filters: Number of output channels of every convolution.
        kernel_size: Kernel size of conv_0 and conv_1.
        padding: Padding of conv_0 and conv_1.
        bias: Whether the convolutions learn a bias.
        dilation: Dilation passed to all convolutions.
        reduction_factor: Pooling size of the main path and stride of the
            shortcut convolution. The two paths only produce matching shapes
            when the spatial size entering the pooling is divisible by it.
    """

    def __init__(self, input_shape, num_filters, kernel_size, padding, bias, dilation, reduction_factor):
        super(ConvolutionalDimensionalityReductionBlockBNRC, self).__init__()

        self.num_filters = num_filters
        self.kernel_size = kernel_size
        self.input_shape = input_shape
        self.padding = padding
        self.bias = bias
        self.dilation = dilation
        self.reduction_factor = reduction_factor
        self.build_module()

    def build_module(self):
        """Create the layers, inferring channel counts from a dummy forward pass."""
        self.layer_dict = nn.ModuleDict()
        x = torch.zeros(self.input_shape)
        out = x
        # The shortcut must shrink the input by the same factor as the pooling
        # on the main path, otherwise the residual sum has mismatched shapes.
        self.layer_dict['conv_r'] = nn.Conv2d(in_channels=out.shape[1], out_channels=self.num_filters, bias=self.bias,
                                              kernel_size=1, dilation=self.dilation, stride=self.reduction_factor)
        self.layer_dict['conv_0'] = nn.Conv2d(in_channels=out.shape[1], out_channels=self.num_filters, bias=self.bias,
                                              kernel_size=self.kernel_size, dilation=self.dilation,
                                              padding=self.padding, stride=1)

        out = self.layer_dict['conv_0'](out)
        self.layer_dict['bn_0'] = nn.BatchNorm2d(num_features=out.shape[1])
        out = F.leaky_relu(self.layer_dict['bn_0'](out))
        out = F.avg_pool2d(out, self.reduction_factor)
        self.layer_dict['conv_1'] = nn.Conv2d(in_channels=out.shape[1], out_channels=self.num_filters, bias=self.bias,
                                              kernel_size=self.kernel_size, dilation=self.dilation,
                                              padding=self.padding, stride=1)

        out = self.layer_dict['conv_1'](out)
        self.layer_dict['bn_1'] = nn.BatchNorm2d(num_features=out.shape[1])

        out = self.layer_dict['bn_1'](out) + self.layer_dict['conv_r'](x)
        out = F.leaky_relu(out)

        # The dummy pass above ran the batch-norm layers in training mode and
        # therefore updated their running statistics; restore the pristine state.
        for bn_name in ('bn_0', 'bn_1'):
            self.layer_dict[bn_name].reset_running_stats()

        print(out.shape)

    def forward(self, x):
        """Run the main path and the strided shortcut, sum them and activate."""
        out = x

        out = self.layer_dict['conv_0'](out)
        out = F.leaky_relu(self.layer_dict['bn_0'](out))

        out = F.avg_pool2d(out, self.reduction_factor)

        out = self.layer_dict['conv_1'](out)
        out = self.layer_dict['bn_1'](out) + self.layer_dict['conv_r'](x)
        out = F.leaky_relu(out)

        return out


In [100]:
# Build a reduction block sized for `x`; the constructor prints the inferred output shape.
reduction_block = ConvolutionalDimensionalityReductionBlockBNRC(
    input_shape=x.shape,
    num_filters=32,
    kernel_size=3,
    padding=1,
    bias=True,
    dilation=1,
    reduction_factor=2,
)

torch.Size([100, 32, 32, 32])
torch.Size([100, 32, 16, 16])
torch.Size([100, 32, 16, 16])


In [94]:
a = x  # .view(-1, x.shape[2])
# nn.Linear transforms the LAST dimension of its input, so in_features must be
# read from shape[-1]; shape[2] only worked because both are 32 for this `x`.
linear_proj = nn.Linear(in_features=a.shape[-1], out_features=16)
linear_proj(a).shape

torch.Size([100, 32, 32, 16])

In [88]:
class ConvolutionalProcessingBlockBNRC(nn.Module):
    """Residual conv block that keeps the shape of its input.

    Computes conv_0 -> bn_0 -> leaky ReLU -> conv_1 -> bn_1, adds the block
    input to the result and applies a final leaky ReLU.

    Args:
        input_shape: Shape of the expected input; index 1 is used as the
            number of input channels.
        num_filters: Number of output channels of both convolutions. Because
            the input is added to the output, this has to be compatible with
            the input's channel count.
        kernel_size: Kernel size of both convolutions.
        padding: Padding of both convolutions.
        bias: Whether the convolutions learn a bias.
        dilation: Dilation of both convolutions.
    """

    def __init__(self, input_shape, num_filters, kernel_size, padding, bias, dilation):
        super(ConvolutionalProcessingBlockBNRC, self).__init__()

        self.num_filters = num_filters
        self.kernel_size = kernel_size
        self.input_shape = input_shape
        self.padding = padding
        self.bias = bias
        self.dilation = dilation

        self.build_module()

    def build_module(self):
        """Create the layers, inferring channel counts from a dummy forward pass."""
        self.layer_dict = nn.ModuleDict()
        x = torch.zeros(self.input_shape)
        out = x

        self.layer_dict['conv_0'] = nn.Conv2d(in_channels=out.shape[1], out_channels=self.num_filters, bias=self.bias,
                                              kernel_size=self.kernel_size, dilation=self.dilation,
                                              padding=self.padding, stride=1)

        out = self.layer_dict['conv_0'](out)
        self.layer_dict['bn_0'] = nn.BatchNorm2d(num_features=out.shape[1])
        out = F.leaky_relu(self.layer_dict['bn_0'](out))

        self.layer_dict['conv_1'] = nn.Conv2d(in_channels=out.shape[1], out_channels=self.num_filters, bias=self.bias,
                                              kernel_size=self.kernel_size, dilation=self.dilation,
                                              padding=self.padding, stride=1)

        out = self.layer_dict['conv_1'](out)
        self.layer_dict['bn_1'] = nn.BatchNorm2d(num_features=out.shape[1])
        # Residual connection: fails here if the main path changed the shape incompatibly.
        out = self.layer_dict['bn_1'](out) + x
        out = F.leaky_relu(out)

        # The dummy pass above ran the batch-norm layers in training mode and
        # therefore updated their running statistics; restore the pristine state.
        for bn_name in ('bn_0', 'bn_1'):
            self.layer_dict[bn_name].reset_running_stats()

        print(out.shape, x.shape)

    def forward(self, x):
        """Run both conv/batch-norm stages and add the residual input."""
        out = x

        out = self.layer_dict['conv_0'](out)
        out = F.leaky_relu(self.layer_dict['bn_0'](out))

        out = self.layer_dict['conv_1'](out)
        out = self.layer_dict['bn_1'](out) + x
        out = F.leaky_relu(out)

        return out

In [89]:
# Build a shape-preserving residual block sized for `x`; the constructor prints the output and input shapes.
conv_block = ConvolutionalProcessingBlockBNRC(
    input_shape=x.shape,
    num_filters=32,
    kernel_size=3,
    padding=1,
    bias=False,
    dilation=1,
)

torch.Size([100, 32, 32, 32]) torch.Size([100, 32, 32, 32])
