In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import numpy as np

In [None]:
# Report whether PyTorch can see a usable CUDA device on this machine.
print(torch.cuda.is_available())

# Linear Layer ([Documentation](https://pytorch.org/docs/stable/generated/torch.nn.Linear.html))
``torch.nn.Linear(in_features, out_features, bias=True, device=None, dtype=None)``


Let us start with a single neuron having 10 inputs and just 1 output, but no bias

## Single Neuron

Let us start with a single neuron having 10 input features and just 1 output feature

In [None]:
# A single neuron: 10 input features -> 1 output feature, without a bias term.
# nn.Linear requires in_features and out_features; calling it without
# arguments raises a TypeError. bias=False makes `single_neuron.bias` None,
# so an all-zero input maps exactly to 0.
single_neuron = nn.Linear(in_features=10, out_features=1, bias=False)


PyTorch automatically initializes the weights of all layers. We can look at those weights through the layer's ``weight`` and ``bias`` attributes:

In [None]:
# Inspect the neuron's parameters: first the shape of its weight matrix,
# then its bias, each on its own line.
print(single_neuron.weight.shape, single_neuron.bias, sep="\n")

In [None]:
# Display the weight tensor itself (a notebook renders the value of the
# last expression in a cell).
single_neuron.weight

An all-zero input always evaluates to 0 when there is no bias. Let us verify this by creating suitable input tensors of shape ``(b, input_features) == (1, 10)``, with ``b`` being the batch size and ``input_features`` being the number of input features.

In [None]:
# Three probe inputs of shape (batch, input_features) == (1, 10):
# all zeros, all ones, and all 1000s.
input_0 = torch.zeros(1, 10)
print(input_0.shape)
input_1 = torch.ones(1, 10)
print(input_1)
input_1000 = torch.full((1, 10), 1000.0)
print(input_1000)

In [None]:
# Feed each probe input through the neuron and print its response.
for probe in (input_0, input_1, input_1000):
    print(single_neuron(probe))

In [None]:
# Sum of all weights, for comparison with the neuron's response to the
# all-ones input printed above.
print(single_neuron.weight.sum())

## Linear Layer with 4 input features and 3 output features

In [None]:
# Fully connected layer mapping 4 input features to 3 output features
# (bias enabled by default). nn.Linear requires both sizes; calling it
# without arguments raises a TypeError.
linear_layer = nn.Linear(in_features=4, out_features=3)

In [None]:
# One random sample (batch size 1) with 4 features, drawn uniformly from [0, 1).
linear_input = torch.rand(1, 4)

In [None]:
# Forward pass through the layer; the result has shape (batch, out_features).
out_feats = linear_layer(linear_input)
print(out_feats.shape)
# nn.Linear stores its weight matrix as (out_features, in_features).
linear_layer.weight.shape

In [None]:
# The bias is a 1-D tensor with one entry per output feature.
linear_layer.bias.shape

A linear layer is applied to an input tensor by applying a linear transformation of the form $y = xA^T + b$. This can easily be verified by evaluating the equation manually:

In [None]:
# Manual evaluation of y = x A^T + b; should match linear_layer(linear_input).
linear_input @ linear_layer.weight.T + linear_layer.bias

In [None]:
# Easy to produce errors with the wrong layout.
# Here weight is (out_features, in_features) and linear_input is (1, in_features).
print(linear_layer.bias.shape)

# weight @ input.T is a column of shape (out_features, 1). Adding the 1-D bias
# broadcasts it along the LAST dimension, which silently yields a square
# (out_features, out_features) matrix instead of the layer output.
# (.T is a no-op on a 1-D tensor, so both lines behave the same.)
print(torch.mm(linear_layer.weight, linear_input.T) + linear_layer.bias.T)
print(torch.mm(linear_layer.weight, linear_input.T) + linear_layer.bias)

# Only correct variant in this layout: turn the bias into a column vector so
# it lines up with the (out_features, 1) product. reshape(-1, 1) works for any
# number of output features (the former reshape(5, 1) was invalid for this bias).
print(torch.mm(linear_layer.weight, linear_input.T) + linear_layer.bias.reshape(-1, 1))

# Forgetting the transpose makes the inner dimensions mismatch, so torch.mm
# raises. Catch and show the error so the remaining cells still run.
try:
    print(torch.mm(linear_layer.weight, linear_input) + linear_layer.bias)
except RuntimeError as err:
    print("RuntimeError:", err)



In [None]:
# NumPy cross-check of y = x A^T + b: detach every tensor from autograd,
# convert it to an ndarray, and repeat the computation with np.dot.
x_np = linear_input.detach().numpy()
weight_t_np = linear_layer.weight.T.detach().numpy()
bias_np = linear_layer.bias.reshape(3).detach().numpy()
result = np.dot(x_np, weight_t_np) + bias_np
print('Result: \n', result.shape)


## Multi Layer Perceptron (MLP)

A Neural Network consisting of more than one linear layer (also referred to as Fully Connected Layer) is also called a Multi Layer Perceptron (MLP).

In [None]:
# often a neural network is defined as object derived from "nn.Module"
# we would like to define an "__init__" and "forward" function

class ThreeLayerMLP(nn.Module):
    """Three stacked fully connected layers with no activation in between.

    Args:
        input_features: size of each input sample.
        hidden_features: width of the two intermediate representations.
        output_features: size of each output sample.
    """

    def __init__(self, input_features, hidden_features, output_features):
        super().__init__()
        # Layers are created in forward order; the attribute names are the
        # names under which nn.Module registers the sub-modules.
        self.layer_0 = nn.Linear(input_features, hidden_features)
        self.layer_1 = nn.Linear(hidden_features, hidden_features)
        self.layer_2 = nn.Linear(hidden_features, output_features)

    def forward(self, x):
        """Pass ``x`` through layer_0, layer_1 and layer_2, in that order."""
        for layer in (self.layer_0, self.layer_1, self.layer_2):
            x = layer(x)
        return x

In [None]:
# Instantiate the MLP: 10 input features, 5 hidden features, 1 output feature.
my_mlp = ThreeLayerMLP(input_features=10, hidden_features=5, output_features=1)

In [None]:
# One random sample (batch size 1) with 10 features, drawn uniformly from [0, 1).
mlp_input = torch.rand(1, 10)

In [None]:
# NOTE(review): calling the module does not by itself distribute work across
# GPUs; that requires an explicit wrapper such as nn.DataParallel.
my_mlp(mlp_input) # goes through nn.Module.__call__, which runs any registered hooks and then forward()

In [None]:
# add activation functions
# 

class ThreeLayerMLPNew(nn.Module):
    """Three fully connected layers, each followed by a ReLU activation.

    Because the last operation is a ReLU, the output is never negative.

    Args:
        input_features: size of each input sample.
        hidden_features: width of the two intermediate representations.
        output_features: size of each output sample.
    """

    def __init__(self, input_features, hidden_features, output_features):
        super().__init__()
        self.layer_0 = nn.Linear(input_features, hidden_features)
        self.layer_1 = nn.Linear(hidden_features, hidden_features)
        self.layer_2 = nn.Linear(hidden_features, output_features)
        # ReLU has no parameters, so a single instance is shared by all three
        # call sites. This is a matter of coding style; defining one
        # activation module per use is often considered cleaner.
        self.relu = nn.ReLU()

    def forward(self, x):
        """Apply linear -> ReLU three times and return the result."""
        x = self.relu(self.layer_0(x))
        x = self.relu(self.layer_1(x))
        return self.relu(self.layer_2(x))

In [None]:
# Execute this cell multiple times: every run draws new random weights and a
# new random input, yet the output never gets < 0.

my_mlpnew = ThreeLayerMLPNew(input_features=3, hidden_features=3, output_features=1)
mlp_input = torch.rand(1, 3)
my_mlpnew(mlp_input)

# Conv2d Layer ([Documentation](https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html))
``torch.nn.Conv2d(in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True, padding_mode='zeros', device=None, dtype=None)``

In [None]:
# Random image-like tensor laid out as (batch_size, num_channels, height, width).
conv_input = torch.randn((1, 3, 9, 9))

In [None]:
# Square 3x3 kernel, 3 input channels -> 1 output channel.
# stride=1 with padding=1 keeps the spatial size unchanged.
m = nn.Conv2d(in_channels=3, out_channels=1, kernel_size=3, stride=1, padding=1)

In [None]:
# Apply the convolution, then show the output's shape followed by its values.
output = m(conv_input)
print(output.shape, output, sep="\n")

In [None]:
# Square 3x3 kernel again, but with stride 2: the kernel moves two pixels per
# step, which roughly halves the spatial resolution of the output.
m = nn.Conv2d(in_channels=3, out_channels=1, kernel_size=3, stride=2, padding=1)

In [None]:
# Apply the strided convolution, then show the output's shape followed by its values.
output = m(conv_input)
print(output.shape, output, sep="\n")

## Convolutional Neural Network

A neural network built from convolutional layers is referred to as a convolutional neural network (CNN). When it consists only of convolutional layers (a so-called fully convolutional network), the big advantage is that the input size can vary: the input only has to have the right number of input channels, while the spatial height and width are free.

In [None]:
class ConvNet(nn.Module):
    """Three 3x3 convolutions, each followed by its own ReLU.

    Every convolution uses stride 1 and padding 1, so the spatial height and
    width of the input are preserved; only the channel count changes.

    Args:
        input_channels: number of channels of the input tensor.
        hidden_channels: number of channels of the intermediate feature maps.
        output_channels: number of channels of the output tensor.
    """

    def __init__(self, input_channels, hidden_channels, output_channels):
        super().__init__()
        # Sub-modules are registered in forward order under unchanged names.
        self.layer_0 = nn.Conv2d(input_channels, hidden_channels, kernel_size=3, stride=1, padding=1)
        self.relu0 = nn.ReLU()
        self.layer_1 = nn.Conv2d(hidden_channels, hidden_channels, kernel_size=3, stride=1, padding=1)
        self.relu1 = nn.ReLU()
        self.layer_2 = nn.Conv2d(hidden_channels, output_channels, kernel_size=3, stride=1, padding=1)
        self.relu2 = nn.ReLU()

    def forward(self, x):
        """Run ``x`` through the three conv -> ReLU stages in order."""
        stages = (
            (self.layer_0, self.relu0),
            (self.layer_1, self.relu1),
            (self.layer_2, self.relu2),
        )
        for conv, activation in stages:
            x = activation(conv(x))
        return x

# TODO
* Randomly initialise the weights
* Implement forward propagation to get the network output $h_\Theta(x^{(i)})$ for any input $x^{(i)}$
* Implement backprop to compute partial derivatives
* For all the samples, perform forward propagation and backpropagation
* Use numerical estimation of the gradient to check the gradient calculation; disable it after checking
* Use gradient descent or an advanced optimization method with backpropagation to try to minimize the cost function