# Machine Learning II 2021-2022 - UMONS 
# Assignment IV

In [None]:
# Setup
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import matplotlib.pyplot as plt

In this assignment you will gain more detailed knowledge of neural networks. You will first implement a neural network without using the PyTorch NN module. Then, you will compare it with the NN module implementation.



## Linear layer

### Using only basic tensor operations
**Here you may only use basic tensor operations from the `torch` module: no `torch.nn`, `torch.nn.F` or others.**

In [None]:
class Linear(object):
    """
    Fully connected layer computing ``y = x @ weight + bias``.

    The weight is stored with shape ``[in_features, out_features]``, i.e. the
    transpose of the layout used by ``torch.nn.Linear``. The bias has shape
    ``[out_features]``.

    Args:
        in_features: number of input features
        out_features: number of output features
    """

    def __init__(self, in_features, out_features):
        super().__init__()

        # Placeholder tensors: only their shapes matter here, the values are
        # overwritten by init_params() below.
        self.weight = torch.empty(in_features, out_features)
        self.bias = torch.empty(out_features)

        # Initialize parameters
        self.init_params()

        # Define a cache variable to save computation, because some of the
        # forward pass values are needed again during the backward pass.
        self.cache = None

        # Define variables to store the gradients of the weight and bias
        # calculated during the backward pass
        self.weight_grad = None
        self.bias_grad = None

    def init_params(self, std=1.):
        """
        Initialize layer parameters. Sample weight from a Gaussian distribution
        scaled by `std` and bias from a uniform distribution on [0, 1).

        Args:
            std: Standard deviation of Gaussian distribution (default: 1.0)
        """

        self.weight = std*torch.randn_like(self.weight)
        self.bias = torch.rand_like(self.bias)

    def forward(self, x):
        """
        Forward pass of linear layer: multiply input tensor by weights and add
        bias. Store input tensor as cache variable.

        Args:
            x: input tensor of shape [n_samples, in_features]

        Returns:
            y: output tensor of shape [n_samples, out_features]
        """

        y = x @ self.weight + self.bias
        # The input is needed in backward() to compute the weight gradient.
        self.cache = x

        return y

    def backward(self, dupstream):
        """
        Backward pass of linear layer: calculate gradients of loss with respect
        to weight and bias and return downstream gradient dx.

        Args:
            dupstream: Gradient of loss with respect to output of this layer,
                shape [n_samples, out_features].

        Returns:
            dx: Gradient of loss with respect to input of this layer,
                shape [n_samples, in_features].

        Raises:
            RuntimeError: if called before forward().
        """

        if self.cache is None:
            raise RuntimeError('forward() must be called before backward()')

        x = self.cache
        # y = x W + b  =>  dL/dx = dL/dy W^T,  dL/dW = x^T dL/dy
        dx = dupstream @ self.weight.t()
        self.weight_grad = x.t() @ dupstream
        # The bias is broadcast over the batch, so its gradient sums over it.
        self.bias_grad = dupstream.sum(dim=0)

        return dx

In [None]:
# Define layer dimensions
n_samples, in_features, out_features = 2, 3, 4
# Make random input tensor of dimensions [n_samples, in_features]
x = torch.randn((n_samples, in_features))
# Define upstream gradient dL/dy as randn
dy = torch.randn((n_samples, out_features))

# Create a layer from the Linear class above
layer = Linear(in_features, out_features)
# Forward pass
y = layer.forward(x)
# Backward pass: propagate the upstream gradient dL/dy back to the input
dx = layer.backward(dy)

# Output tensor y has shape [n_samples, out_features]
print('Shape of ouput tensor y:', y.shape)

# Gradient of the loss w.r.t. x has the same shape as x: [n_samples, in_features]
print('Shape of gradient x is:', dx.shape)

### Using PyTorch NN module

Now you can compare your implementation, written without the PyTorch NN module, with the PyTorch NN implementation.

In [None]:
# Create a copy of the input tensor so we can use x for all other layers
# without overwriting its gradients
x_lin = x.clone()
# Enable requires_grad for x_lin
x_lin.requires_grad = True

# Create Linear layer from torch.nn module
torch_layer = nn.Linear(in_features, out_features)

# Load the parameters from our layer into the Pytorch layer.
# nn.Linear stores its weight as [out_features, in_features], so our
# [in_features, out_features] weight is transposed; clone() keeps the two
# layers from sharing storage.
torch_layer.weight = nn.Parameter(layer.weight.t().clone())
torch_layer.bias = nn.Parameter(layer.bias.clone())

# Perform forward pass
torch_y = torch_layer(x_lin)

# Perform backward pass with the same upstream gradient dy used for our layer
torch_y.backward(dy)

# Output tensor torch_y has shape [n_samples, out_features]
print('Shape of ouput tensor torch_y:', torch_y.shape)

# Gradient of the loss w.r.t. x as computed by autograd (same shape as x)
print('Shape of gradient x is:', x_lin.grad.shape)

In [None]:
# Check the hand-written gradients against the ones autograd produced for
# the torch.nn layer. The torch weight gradient is transposed because the two
# layers store their weights in transposed layouts.
dx_same = torch.allclose(dx, x_lin.grad)
dw_same = torch.allclose(layer.weight_grad, torch_layer.weight.grad.T)
db_same = torch.allclose(layer.bias_grad, torch_layer.bias.grad)

# Report each comparison result
print('dx identical: ', dx_same)
print('dw identical: ', dw_same)
print('db identical: ', db_same)

## Non-linear activation functions

Implement the ReLU and Sigmoid activation functions. They are defined as follows:

$$\text{ReLU}(x)=\max(0,x)\\
\text{Sigmoid}(x) = \sigma(x) = \frac{1}{1+\exp(-x)}$$

### Using only basic tensor operations
**Here you may only use basic tensor operations from the `torch` module: no `torch.nn`, `torch.nn.F` or others.**

In [None]:
class ReLU(object):
    """
    ReLU non-linear activation function.
    """

    def __init__(self):
        super().__init__()

        # Define a cache variable because some of the forward pass values
        # are needed again during the backward pass.
        self.cache = None

    def forward(self, x):
        """
        Forward pass of ReLU non-linear activation function: y=max(0,x). Store
        the output tensor as cache variable; backward() uses it to find the
        positions where the activation was clamped to zero.

        Args:
            x: input tensor

        Returns:
            y: output tensor, same shape as x
        """

        y = torch.clamp(x, min=0)
        self.cache = y

        return y

    def backward(self, dupstream):
        """
        Backward pass of ReLU non-linear activation function: return downstream
        gradient dx. The incoming upstream gradient is not modified.

        Args:
            dupstream: Gradient of loss with respect to output of this layer.

        Returns:
            dx: Gradient of loss with respect to input of this layer.

        Raises:
            RuntimeError: if called before forward().
        """

        if self.cache is None:
            raise RuntimeError('forward() must be called before backward()')

        y = self.cache
        # The gradient passes through unchanged where the output is positive
        # and is zero wherever the output was clamped to zero. torch.where
        # builds a new tensor, so dupstream is left untouched.
        dx = torch.where(y > 0, dupstream, torch.zeros_like(dupstream))

        return dx

class Sigmoid(object):
    """
    Sigmoid non-linear activation function.
    """

    def __init__(self):
        super().__init__()

        # Define a cache variable because some of the forward pass values
        # are needed again during the backward pass.
        self.cache = None

    def forward(self, x):
        """
        Forward pass of Sigmoid non-linear activation function: y=1/(1+exp(-x)).
        Store the output tensor as cache variable, because the derivative can
        be expressed through the output alone.

        Args:
            x: input tensor

        Returns:
            y: output tensor, same shape as x
        """

        y = 1 / (1 + torch.exp(-x))
        self.cache = y

        return y

    def backward(self, dupstream):
        """
        Backward pass of Sigmoid non-linear activation function: return
        downstream gradient dx.

        Args:
            dupstream: Gradient of loss with respect to output of this layer.

        Returns:
            dx: Gradient of loss with respect to input of this layer.

        Raises:
            RuntimeError: if called before forward().
        """

        if self.cache is None:
            raise RuntimeError('forward() must be called before backward()')

        y = self.cache
        # d sigma(x) / dx = sigma(x) * (1 - sigma(x))
        dx = dupstream * y * (1 - y)

        return dx

In [None]:
# Define layer dimensions and dummy input
n_samples, in_features = 2, 3
# Make random input tensor of dimensions [n_samples, in_features]
x = torch.randn((n_samples, in_features))

# Define upstream gradient dL/dy as randn (activations keep the input shape)
dy = torch.randn((n_samples, in_features))
print('dy:', dy)

relu = ReLU()
sigmoid = Sigmoid()

# Forward and backward pass through the ReLU layer
y_relu = relu.forward(x)
dx_relu = relu.backward(dy)

# Forward and backward pass through the Sigmoid layer
y_sigmoid = sigmoid.forward(x)
dx_sigmoid = sigmoid.backward(dy)

# Both activations are element-wise, so the outputs have the shape of x
print('Shape of ouput tensors y_relu and y_sigmoid:', y_relu.shape, y_sigmoid.shape)

# The gradients w.r.t. the input also have the shape of x
print('Output shapes from ReLU and Sigmoid: ', dx_relu.shape, dx_sigmoid.shape)

### Using PyTorch NN module
A list of all available non-linearities in PyTorch can be found [[here](https://pytorch.org/docs/stable/nn.html#non-linear-activations-weighted-sum-nonlinearity)].

In [None]:
# Create a copy of the input tensor so we can use x for all other layers
# without overwriting its gradients
x_relu = x.clone()
x_sigmoid = x.clone()
# Enable requires_grad
x_relu.requires_grad = True
x_sigmoid.requires_grad = True

print('dy: ', dy)
print('---')

# ReLU forward pass
torch_relu = nn.ReLU()
torch_y_relu = torch_relu(x_relu)
# Perform backward pass with the same upstream gradient used for our layer
torch_y_relu.backward(dy)

# Sigmoid forward pass
torch_sigmoid = nn.Sigmoid()
torch_y_sigmoid = torch_sigmoid(x_sigmoid)
# Perform backward pass with the same upstream gradient used for our layer
torch_y_sigmoid.backward(dy)

# Compare our gradients with autograd's using torch.allclose
dx_relu_same = torch.allclose(dx_relu, x_relu.grad)
print('dx_relu identical: ', dx_relu_same) # Make sure dx_relu_same = True


dx_sigmoid_same = torch.allclose(dx_sigmoid, x_sigmoid.grad)
print('dx_sigmoid identical: ', dx_sigmoid_same) # Make sure dx_sigmoid_same = True


## Network class

### Using only basic tensor operations
**Here you may only use basic tensor operations from the `torch` module: no `torch.nn`, `torch.nn.F` or others.**

In [None]:
class Net(object):
    """
    Neural network object containing layers.

    Every layer must provide `forward(x)` and `backward(dupstream)`. Layers
    with learnable parameters additionally expose `weight`, `bias`,
    `weight_grad`, `bias_grad` and `init_params(std=...)`.

    Args:
        layers: list of layers in neural network
    """
    def __init__(self, layers):
        self.layers = layers

        # Initialize params
        self.reset_params()

    def reset_params(self, std=1.):
        """
        Reset network parameters. Applies `init_params` to all layers with
        learnable parameters.

        Args:
            std: Standard deviation of Gaussian distribution (default: 1.0)
        """

        for layer in self.layers:
            if hasattr(layer, 'init_params'):
                layer.init_params(std=std)

    def forward(self, x):
        """
        Performs forward pass through all layers of the network.

        Args:
            x: input tensor

        Returns:
            x: output tensor
        """

        # Feed the output of each layer into the next one
        for layer in self.layers:
            x = layer.forward(x)

        return x

    def backward(self, dupstream):
        """
        Performs backward pass through all layers of the network.

        Args:
            dupstream: Gradient of loss with respect to output.

        Returns:
            dx: Gradient of loss with respect to the network input.
        """

        # Propagate the gradient from the last layer back to the first
        dx = dupstream
        for layer in reversed(self.layers):
            dx = layer.backward(dx)

        return dx

    def optimizer_step(self, lr):
        """
        Updates network weights by performing a step in the negative gradient
        direction in each layer. The step size is determined by the learning
        rate. Layers without a `weight` attribute are skipped.

        Args:
            lr: Learning rate to use for update step.
        """

        for layer in self.layers:
            if hasattr(layer, 'weight'):
                # Rebind instead of updating in place so tensors that alias
                # the old parameters are not modified behind the caller's back.
                layer.weight = layer.weight - lr * layer.weight_grad
                layer.bias = layer.bias - lr * layer.bias_grad

In [None]:
# Define layer dimensions
n_samples, in_features, hidden_dim, out_features = 2, 3, 5, 4
# Make random input tensor of dimensions [n_samples, in_features]
x = torch.randn((n_samples, in_features))


# Define and initialize layers
layers = [Linear(in_features, hidden_dim),
          ReLU(),
          Linear(hidden_dim, out_features)]

# Initialize network
net = Net(layers)

# Do forward pass
y = net.forward(x)

# Gradient of y w.r.t. y is 1
dy = torch.ones_like(y)

# Do backward pass
dx = net.backward(dy)

# Output tensor y has shape [n_samples, out_features]
print('Shape of ouput tensor y:', y.shape)

# The gradient w.r.t. x has the same shape as x: [n_samples, in_features]
print('Shape of gradient x:', dx.shape)

### Using PyTorch NN module

In [None]:
class TorchNet(nn.Module):
    """
    PyTorch neural network. Network layers are defined in __init__ and forward
    pass implemented in forward. The architecture is Linear -> ReLU -> Linear.

    Args:
        in_features: number of features in input layer
        hidden_dim: number of features in hidden dimension
        out_features: number of features in output layer
    """

    def __init__(self, in_features, hidden_dim, out_features):
        super().__init__()

        self.layer1 = nn.Linear(in_features, hidden_dim)
        self.relu = nn.ReLU()
        self.layer2 = nn.Linear(hidden_dim, out_features)

    def forward(self, x):
        """
        Apply the first linear layer, the ReLU and the second linear layer.

        Args:
            x: input tensor whose last dimension is in_features

        Returns:
            x: output tensor whose last dimension is out_features
        """
        x = self.layer1(x)
        x = self.relu(x)
        x = self.layer2(x)
        return x

# Build the PyTorch counterpart of our network with the same layer sizes and
# print its layer summary
torch_net = TorchNet(in_features=in_features,
                     hidden_dim=hidden_dim,
                     out_features=out_features)
print(torch_net)

In [None]:
# Load the parameters from our model into the Pytorch model
torch_net.layer1.weight = nn.Parameter(net.layers[0].weight.T) # transpose weight by .T
torch_net.layer1.bias = nn.Parameter(net.layers[0].bias)
torch_net.layer2.weight = nn.Parameter(net.layers[2].weight.T) # transpose weight by .T
torch_net.layer2.bias = nn.Parameter(net.layers[2].bias)

# Make copy of x
torch_x = x.clone()
torch_x.requires_grad = True

# Perform forward pass
torch_y = torch_net(torch_x)

# Perform backward pass with the same upstream gradient dy that was passed to
# net.backward, so that the input gradients are comparable
torch_y.backward(dy)

# Output tensor torch_y has shape [n_samples, out_features]
print('Shape of ouput tensor y:', torch_y.shape)

# The gradient w.r.t. x has the same shape as x: [n_samples, in_features]
print('Shape of gradient x :', torch_x.grad.shape)

# Compare gradients using torch.allclose
dx_same = torch.allclose(dx, torch_x.grad)
print('Gradients identical: ', dx_same)

## Softmax and Cross entropy loss
In a classification task we would like to interpret the output of our network as class probabilities, i.e. for all inputs $x$ and all classes $k$ we should have $$0 \leq P(Y=k|X=x) \leq 1, \qquad \sum_kP(Y=k|X=x) = 1.$$
This can be achieved by normalizing the logits $z$ using the **Softmax layer**:
$$P(Y=k|X=x) = \text{softmax}(\mathbf{z})_k = \frac{\exp{z_k}}{\sum_i^K \exp{z_i}},$$
where $K$ is the number of classes.

The **Cross Entropy (CE)** loss is defined as:
$$H(y,p) = -\sum_i^K y_i \log(p_i),$$
where $y$ is the one-hot encoded label and $p$ is the class probability vector from the Softmax layer.
<!-- Using the CE loss with the Softmax layer is effective as the $\log$ in the CE loss can undo the $\exp$ in the Softmax layer:
$$\log \text{softmax}(\mathbf{z})_k = z_k - \log \sum_i^K \exp z_i.$$
Because now $z_k$ has a direct contribution to the loss it can never saturate and the gradients will never become too small. -->

The Softmax layer and Cross Entropy loss are used together so often that they are implemented as a single function in Pytorch as `torch.nn.CrossEntropyLoss`.

In [None]:
def Softmax(z):
    """
    Computes softmax output for each sample in batch.

    The softmax is taken over the last dimension (the classes).

    Args:
      z: Tensor of logits, dimension [batch, classes].

    Return:
      p: Softmax probability distribution, same shape as z.
    """

    # Subtracting the per-sample maximum leaves the result unchanged but keeps
    # exp() from overflowing for large logits.
    shifted = z - z.max(dim=-1, keepdim=True).values
    exp_z = torch.exp(shifted)
    p = exp_z / exp_z.sum(dim=-1, keepdim=True)

    return p

def CrossEntropyLoss(y_true, y_pred):
    """
    Computes softmax output and cross-entropy loss.

    Args:
      y_true: Tensor containing the true labels, one-hot encoded,
        dimension [batch, classes].
      y_pred: Tensor containing the predicted logits,
        dimension [batch, classes].

    Return:
      loss: Cross-entropy loss, averaged over the batch.
      grad: Gradient of loss w.r.t. y_pred, same shape as y_pred.
    """

    # Calculate softmax using previously defined function
    probs = Softmax(y_pred)

    # Convert one-hot vector to class id
    labels = torch.argmax(y_true, dim=1)
    # Get number of samples in batch
    n = labels.shape[0]
    rows = torch.arange(n)

    # Negative log-probability assigned to the true class of each sample
    log_likelihood = -torch.log(probs[rows, labels])
    # Average over all samples
    loss = torch.mean(log_likelihood)

    # Calculate the gradient: (softmax - one_hot) / n. Work on a copy so the
    # probability tensor returned by Softmax is not modified in place.
    grad = probs.clone()
    grad[rows, labels] -= 1
    grad /= n

    return loss, grad

## MNIST digit classification

In [None]:
# Preprocessing data: convert to tensors and normalize by subtracting dataset
# mean and dividing by std.
transform = transforms.Compose([transforms.ToTensor(),
                                transforms.Normalize((0.1307,), (0.3081,))])

# Get data from torchvision.datasets
train_data = datasets.MNIST('../data', train=True, download=True, transform=transform)
test_data = datasets.MNIST('../data', train=False, download=True, transform=transform)

# Define data loaders used to iterate through dataset
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
test_loader = DataLoader(test_data, batch_size=1000)

# Show the first 25 test images in a 5x5 grid
fig, axs = plt.subplots(5, 5, figsize=(5, 5))
for i in range(25):
    x, _ = test_data[i]
    ax = axs[i // 5][i % 5]
    # Each sample is viewed as a 28x28 image for plotting
    ax.imshow(x.view(28, 28), cmap='gray')
    ax.axis('off')
plt.tight_layout()
plt.show()

In [None]:
# Network dimensions: flattened 28x28 image -> hidden layer -> 10 digit classes
in_features, hidden_dim, out_features = 28*28, 30, 10

# Training parameters
learning_rate = 5e-1  # step size for gradient descent
epochs = 10  # how many times to iterate through the entire training set

# Define and initialize layers
layers = [Linear(in_features, hidden_dim),
          Sigmoid(),
          Linear(hidden_dim, out_features)]

# Initialize network
net = Net(layers)

# Per-iteration training loss and accuracy (in percent), stored as floats
train_losses = []
train_accs = []

for epoch in range(epochs):
    # Training loop
    for i, (x_batch, y_batch) in enumerate(train_loader):
        # Flatten each image to a 1D feature vector
        x_batch = x_batch.flatten(start_dim=1)
        # Convert labels to one-hot encoding
        y_batch = nn.functional.one_hot(y_batch, num_classes=out_features)

        # Forward pass and Cross Entropy loss between prediction and labels
        y_pred = net.forward(x_batch)
        loss, grad = CrossEntropyLoss(y_batch, y_pred)
        train_losses.append(loss.item())

        # Backward pass and gradient descent step
        net.backward(grad)
        net.optimizer_step(learning_rate)

        # Calculate accuracy of prediction, in percent to match the plot label
        correct = torch.argmax(y_pred, axis=1) == torch.argmax(y_batch, axis=1)
        train_accs.append(100. * correct.sum().item() / len(y_pred))

        # Print progress
        if i % 200 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch + 1, i * len(x_batch), len(train_loader.dataset),
                100. * i / len(train_loader), loss))

    # Validation loop
    test_loss = 0.
    total_correct = 0
    for x_batch, y_batch in test_loader:
        # Flatten each image to a 1D feature vector
        x_batch = x_batch.flatten(start_dim=1)
        # Convert labels to one-hot encoding
        y_batch = nn.functional.one_hot(y_batch, num_classes=out_features)

        # Perform forward pass with x_batch as input and y_pred as output variables
        y_pred = net.forward(x_batch)

        # Cross Entropy loss between prediction and labels (gradient unused)
        loss, _ = CrossEntropyLoss(y_batch, y_pred)

        # CrossEntropyLoss returns the batch mean, so weight it by the batch
        # size; dividing the total by the dataset size below then yields the
        # true per-sample average.
        test_loss += loss.item() * len(x_batch)

        # Calculate accuracy of prediction
        correct = torch.argmax(y_pred, axis=1) == torch.argmax(y_batch, axis=1)
        total_correct += correct.sum().item()

    test_loss /= len(test_loader.dataset)
    test_acc = 100. * total_correct / len(test_loader.dataset)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, total_correct, len(test_loader.dataset), test_acc))

# Plot training curves
plt.figure(figsize=(9,4))
plt.subplot(1,2,1)
plt.xlabel('Iterations')
plt.ylabel('Loss')
plt.plot(train_losses)
plt.grid()

plt.subplot(1,2,2)
plt.xlabel('Iterations')
plt.ylabel('Accuracy (%)')
plt.plot(train_accs)
plt.grid()

plt.tight_layout()
plt.show()