In [None]:
!pip install ipyplot
!pip install "git+https://github.com/broutonlab/deep-learning-course.git"

In [None]:
import numpy as np
import random
import torch
import torchvision
import ipyplot
import torchvision.transforms as transforms
from typing import Tuple
from tqdm import tqdm
from IPython.display import clear_output

import dl_course.numpy_net.utils as courseutils

# CIFAR10 dataset

In this notebook you'll build a classifier from scratch using numpy!  
Today's dataset is CIFAR10 — a collection of 60000 32x32 images split into 10 classes.

In [None]:
# Seed every RNG the notebook relies on so that "Restart & Run All" is
# repeatable: np.random drives the weight initialisation of the networks below,
# torch drives the shuffling of the training DataLoader.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Per-image preprocessing: image -> tensor -> single grayscale channel -> flat
# vector. A 32x32 grayscale image flattens to 1024 values, which is the
# default `input_dim` of the networks defined later in this notebook.
# NOTE(review): lambdas cannot be pickled; on platforms where DataLoader
# workers are spawned rather than forked this may fail with num_workers > 0 --
# set 'num_workers' to 0 below if that happens.
transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Grayscale(),
     transforms.Lambda(lambda x: torch.flatten(x))])

# Keyword arguments shared by the train and test dataset constructors.
dataset_params = {
    'root': './data',
    'transform': transform,
    'download': True
}

# Keyword arguments shared by the train and test DataLoaders.
dataloader_params = {
    # decrease the batch size if you're getting "Out of memory" error
    'batch_size': 64,
    'num_workers': 2
}

# Download the dataset using pytorch:

trainset = torchvision.datasets.CIFAR10(train=True, **dataset_params)
# Only the training data is shuffled; the test order stays fixed.
trainloader = torch.utils.data.DataLoader(trainset, shuffle=True,
                                          **dataloader_params)

testset = torchvision.datasets.CIFAR10(train=False, **dataset_params)
testloader = torch.utils.data.DataLoader(testset, shuffle=False,
                                         **dataloader_params)

# Human-readable class names, indexed by integer label.
classes = ('plane', 'car', 'bird', 'cat',
           'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

In [None]:
# Course helper; presumably renders a few sample images per class from the
# dataset stored under dataset_params['root'] -- see dl_course.numpy_net.utils.
courseutils.preview_CIFAR10(dataset_params['root'], classes)

# Sigmoid

Have a look at the sigmoid activation function.  
(https://en.wikipedia.org/wiki/Sigmoid_function)

Like any other activation function, it is used to add non-linearity to models.

As you (hopefully) already know, neural networks are trained using an algorithm called **backpropagation**. So, if you want to use something inside an architecture, it must have a **backward pass** that propagates the gradient from the node that comes after it to the one before it.

You will be implementing backward passes for different layers today.  
Look at how Sigmoid forward and backward passes are implemented:

In [None]:
def sigmoid(X: np.ndarray):
    """Element-wise logistic sigmoid, ``1 / (1 + exp(-X))``.

    The value is computed in a numerically stable way: the exponential is only
    ever taken of a non-positive number, so inputs with a large magnitude do
    not overflow (the naive formula evaluates ``exp(-X)``, which overflows for
    very negative ``X`` and emits a RuntimeWarning).

    Args:
        X: array (or scalar) of pre-activations.

    Returns:
        Sigmoid of every element, same shape as ``X``.
    """
    X = np.asarray(X)
    # exp(-|x|) always lies in [0, 1], so it can never overflow.
    z = np.exp(-np.abs(X))
    # x >= 0:  1 / (1 + e^-x)
    # x <  0:  e^x / (1 + e^x)   -- the same function, rewritten.
    out = np.where(X >= 0, 1 / (1 + z), z / (1 + z))
    # Indexing with () converts a 0-d result back to a scalar (what the naive
    # formula returns for scalar input) and is a plain view for real arrays.
    return out[()]

def sigmoid_backward(d: np.ndarray, prev: np.ndarray):
    """Backward pass of the sigmoid activation.

    Args:
        d: upstream gradient, i.e. the gradient flowing in from the node that
            follows the sigmoid.
        prev: the input the sigmoid was applied to in the forward pass.

    Returns:
        Gradient with respect to that input:
        ``sigmoid(prev) * (1 - sigmoid(prev)) * d``.
    """
    # Work on a private copy of the cached forward input.
    forward_input = prev.copy()
    activation = sigmoid(forward_input)
    # d(sigmoid)/dx = s * (1 - s); the chain rule then scales it by `d`.
    local_grad = activation * (1 - activation)
    return local_grad * d

# ReLU

ReLU is another activation function. It has become immensely popular.  
(https://en.wikipedia.org/wiki/Rectifier_(neural_networks))

In [None]:
def relu(X: np.ndarray):
    """ReLU forward pass: element-wise ``max(0, x)``.

    Args:
        X: array of pre-activations.

    Returns:
        A pair ``(activation, cache)``. The cache is the input ``X`` itself
        (the same object, not a copy), kept for later use by the backward pass.
    """
    activated = np.maximum(0, X)
    cache = X
    return activated, cache

def relu_backward(d: np.ndarray, prev: np.ndarray):
    """Backward pass of ReLU.

    Args:
        d: upstream gradient; same shape as the ReLU output.
        prev: the cache returned by ``relu``, i.e. the forward-pass input X.

    Returns:
        Gradient with respect to X: equal to ``d`` wherever X was positive and
        zero everywhere else. A new array is returned; ``d`` is not modified.
    """
    dx = d.copy()
    # ReLU has slope 1 for positive inputs and slope 0 for negative ones, so
    # the gradient only flows through units that were active in the forward
    # pass. At exactly 0 the derivative is undefined; 0 is used by convention.
    dx[np.asarray(prev) <= 0] = 0
    return dx

# Linear layer

A linear layer is basically a matrix multiplication between its input and its internal state (its ***weights***), with a ***bias*** added to the result.  
(just look at the forward pass if you're confused)

It's also called an **affine** or **fully-connected** layer, and it learns an affine transformation of the input.

Your task is to write a backward pass with respect to the input **X**, the **W**eights and the **b**ias.

In [None]:
def linear_backward(d: np.ndarray, prev: tuple):
    """Backward pass of the linear (affine) layer ``out = X . W + b``.

    Args:
        d: upstream gradient with respect to the layer output; its last axis
            has ``W.shape[1]`` entries and its leading axes match those of X.
        prev: the cache returned by ``linear``: the tuple ``(X, W, b)``.

    Returns:
        ``(dX, dW, db)``:
        dX has the shape of X, dW has the shape of W, and db has one entry per
        output unit.
    """
    # The bias value itself is not needed to compute any of the gradients.
    X, W, _ = prev

    # Gradient w.r.t. the input: route the upstream gradient back through W.
    dX = np.dot(d, W.T)

    # Collapse any leading (batch) axes into one, so the reductions below work
    # both for a batch of samples and for a single un-batched sample.
    X_rows = np.reshape(X, (-1, W.shape[0]))
    d_rows = np.reshape(d, (-1, W.shape[1]))

    # Each weight W[i, j] multiplies input feature i into output unit j, so its
    # gradient is the sum over samples of X[:, i] * d[:, j].
    dW = np.dot(X_rows.T, d_rows)
    # The bias is added to every sample, so its gradient sums over samples.
    db = d_rows.sum(axis=0)

    return dX, dW, db

def linear(X: np.ndarray, W: np.ndarray, b: np.ndarray):
    """Forward pass of the linear (affine) layer: ``np.dot(X, W) + b``.

    Args:
        X: layer input.
        W: weight matrix.
        b: bias, added to the product.

    Returns:
        A pair ``(output, cache)``. The cache is the tuple ``(X, W, b)`` of the
        very objects passed in (no copies), kept for the backward pass.
    """
    projected = np.dot(X, W)
    cache = (X, W, b)
    return projected + b, cache

# Loss functions
Softmax followed by a cross-entropy loss is a very common choice for classification tasks like ours.  
We completed this one for you. Feel free to explore how this works.

In [None]:
def softmax(X: np.ndarray):
    """Softmax: exponentiate and normalise so the values sum to 1.

    For input with more than one dimension the normalisation runs along
    axis 1 (one distribution per row of a batch); for 1-D input it runs over
    the whole vector.

    The maximum is subtracted before exponentiating for numerical stability.
    It is taken along the same axis as the normalisation: subtracting one
    global maximum from a whole batch lets a row whose values are far below
    that maximum underflow to all zeros, which then divides 0 by 0.

    Args:
        X: scores (logits), 1-D or batched.

    Returns:
        Array of the same shape as ``X`` holding the probabilities.
    """
    X = np.asarray(X)
    axis = 1 if X.ndim > 1 else None
    # After the shift the largest entry of every slice is exactly 0, so each
    # slice contributes at least exp(0) = 1 to its own normaliser.
    shifted = X - np.max(X, axis=axis, keepdims=True)
    e_x = np.exp(shifted)
    return e_x / np.sum(e_x, axis=axis, keepdims=True)


def softmax_loss(X: np.ndarray, y: np.ndarray):
    """Softmax cross-entropy loss and its gradient.

    Args:
        X: scores of shape (N, C) -- one row of class scores per sample.
        y: integer class index of the correct class for each of the N rows.

    Returns:
        ``(loss, dX)``: the mean negative log-probability of the correct
        classes, and the gradient of that loss with respect to ``X``.
    """
    batch = X.shape[0]
    rows = np.arange(batch)

    # Shift every row by its own maximum so the exponentials cannot overflow.
    shifted = X - np.max(X, axis=1, keepdims=True)
    log_normaliser = np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))
    log_probs = shifted - log_normaliser

    loss = -np.sum(log_probs[rows, y]) / batch

    # Gradient of the mean loss: (softmax probabilities - one-hot target) / N.
    dX = np.exp(log_probs)
    dX[rows, y] -= 1
    dX /= batch
    return loss, dX

# Let's see if it works!

We've implemented a very basic network:  
`Linear(1024, 500) -> ReLU -> Linear(500, 10) -> Softmax`  
Let's see if your layers work properly.

In [None]:
class NNet(courseutils.BaseNet):
    """Two-layer fully-connected classifier: Linear -> ReLU -> Linear.

    ``forward`` returns the raw scores of the second linear layer; softmax is
    registered as ``output_activation`` and softmax cross-entropy as ``loss``.
    This class only defines the architecture (``forward`` / ``backward``).
    ``train()`` and ``test()`` are not defined here and presumably come from
    ``courseutils.BaseNet`` -- check there for how the attributes set in
    ``__init__`` are consumed.

    Relies on the notebook-level globals ``classes``, ``trainloader`` and
    ``testloader``.
    """

    def __init__(self, num_cls: int = 10, input_dim: int = 1024,
                 hidden_dim: int = 500, std: float = 1e-4):
        """Create and initialise the parameters.

        Args:
            num_cls: number of output classes.
            input_dim: length of one flattened input sample.
            hidden_dim: width of the hidden layer.
            std: scale applied to the standard-normal weight initialisation.
        """
        super().__init__()
        ## Architecture
        # Every trainable parameter lives in this dict; backward() returns the
        # gradients under the same keys.
        self.params = {}
        # first linear layer
        self.params['W1'] = std * np.random.randn(input_dim, hidden_dim)
        self.params['b1'] = np.zeros(hidden_dim)

        # second linear layer
        self.params['W2'] = std * np.random.randn(hidden_dim, num_cls)
        self.params['b2'] = np.zeros(num_cls)

        self.num_layers = 2
        ## params
        self.output_activation = softmax
        self.loss = softmax_loss
        self.optimizer = courseutils.Optimizer("adam")
        self.loss_history = []

        ## datasets
        self.labels2names = classes
        self.trainloader = trainloader
        self.testloader = testloader
        # NOTE(review): validation reuses the test loader, so any validation
        # metric is measured on the same data as the final test.
        self.valloader = testloader

    def forward(self, X: np.ndarray) -> Tuple[np.ndarray, dict]:
        """Run the forward pass.

        Args:
            X: batch of flattened input samples.

        Returns:
            ``(output, caches)``: the scores of the last linear layer and a
            dict with the cache of every layer (keys ``'linear1'``, ``'relu'``
            and ``'linear2'``), as needed by ``backward``.
        """
        caches = {}

        # Copy, so the array stored in the cache is decoupled from the
        # caller's. (Named `inputs` to avoid shadowing the built-in `input`.)
        inputs = X.copy()

        h1, cache1 = linear(inputs, self.params["W1"], self.params["b1"])
        caches['linear1'] = cache1
        h_relu, cache_relu = relu(h1)
        caches['relu'] = cache_relu

        output, cache = linear(h_relu, self.params['W2'], self.params['b2'])

        caches["linear2"] = cache

        return output, caches

    def backward(self, X: np.ndarray, y: np.ndarray, caches: dict) -> Tuple[float, dict]:
        """Compute the loss and the gradient of every parameter.

        Args:
            X: passed straight to the loss function, so despite its name this
                is expected to be the output of ``forward`` (the scores), not
                the network input -- NOTE(review): confirm against the caller
                in BaseNet.
            y: target class indices.
            caches: the cache dict returned by ``forward``.

        Returns:
            ``(loss, grads)`` where ``grads`` has one entry per key of
            ``self.params`` (``'W1'``, ``'b1'``, ``'W2'``, ``'b2'``).
        """
        grads = {}
        loss, dOut = self.loss(X, y)

        # Walk the layers in reverse order, feeding each layer's input
        # gradient to the layer before it.
        dOut, grads["W2"], grads["b2"] = linear_backward(dOut, caches['linear2'])

        dOut = relu_backward(dOut, caches['relu'])
        dOut, grads['W1'], grads['b1'] = linear_backward(dOut, caches['linear1'])

        return loss, grads


# Train the model!
Training may take a while, because we're only using a CPU.

In [None]:
# Build the network with its default sizes. The weights are drawn with
# np.random.randn, so they differ between runs unless NumPy's RNG is seeded.
model = NNet()

In [None]:
# train() is not defined on NNet itself; presumably inherited from
# courseutils.BaseNet. Runs for 3 epochs.
model.train(num_epochs=3)

You should see about 30% accuracy.

In [None]:
# test() is not defined on NNet itself; presumably inherited from
# courseutils.BaseNet and evaluated on model.testloader -- TODO confirm.
model.test()

In [None]:
# Course helper: outputs images together with the labels predicted by `model`
# (10 images requested via num_images).
courseutils.test_CIFAR10(model, classes, dataset_params['root'], transform, num_images=10)

# What now?

30% accuracy is still better than pure random guessing (which should give 10% for 10 classes).

But this is too low to be useful. Things you can do:

*   Try different architectures (add more hidden layers) and see to what extent you can improve accuracy.  
Find a nice way to experiment with different architectures to avoid rewriting forward() and backward() each time you change something.
*   Implement new types of layers

Of course, right now you are very limited in tools (and processing power, since we're using a CPU), so don't expect much.

# Your turn

Use our previous model as an example.



In [None]:
class YourNet(courseutils.BaseNet):
    """Skeleton for your own architecture; fill in the three TODO sections.

    Mirrors the structure of the example network above: parameters are created
    in ``__init__``, ``forward`` returns the scores plus a cache dict, and
    ``backward`` turns that cache into one gradient per parameter.

    Relies on the notebook-level globals ``classes``, ``trainloader`` and
    ``testloader``.
    """

    def __init__(self, num_cls: int = 10, input_dim: int = 1024):
        """Create the parameters of the network.

        Args:
            num_cls: number of output classes.
            input_dim: length of one flattened input sample.
        """
        super().__init__()
        ## Architecture
        self.params = {}
        # Scale for the random weight initialisation (unused until the TODO
        # below is filled in).
        std = 1e-4
        
        # TODO: your layers go here.
        # It's CRUCIAL that all of your trainable parameters
        # stay inside self.params, e.g. self.params['W1']
        # (this is how our base class works)
        ...

        # NOTE(review): hard-coded to 2; presumably this should match the
        # number of layers you define above -- check how BaseNet uses it.
        self.num_layers = 2
        ## params
        self.output_activation = softmax
        self.loss = softmax_loss
        self.optimizer = courseutils.Optimizer("adam")
        self.loss_history = []

        ## datasets
        self.labels2names = classes
        self.trainloader = trainloader
        self.testloader = testloader
        # NOTE(review): validation reuses the test loader.
        self.valloader = testloader

    def forward(self, X: np.ndarray) -> Tuple[list, dict]:
        """Run the forward pass (to be implemented).

        Args:
            X: batch of flattened input samples.

        Returns:
            ``(output, caches)``: the network output and whatever ``backward``
            needs to compute the gradients. Until the TODO is filled in,
            ``output`` is the ``...`` placeholder.
        """
        caches = {}
        
        # TODO: implement forward pass.
        # Populate 'caches' variable with whatever you need for backward pass.
        # It is passed directly to backward() as 'caches' argument
        output = ...

        return output, caches

    def backward(self, X: np.ndarray, y: np.ndarray, caches: dict) -> Tuple[float, dict]:
        """Compute the loss and the parameter gradients (to be implemented).

        Args:
            X: passed straight to the loss function, so this is expected to be
                the output of ``forward`` rather than the network input --
                NOTE(review): confirm against the caller in BaseNet.
            y: target class indices.
            caches: the cache dict returned by ``forward``.

        Returns:
            ``(loss, grads)``; ``grads`` must end up with one entry per key of
            ``self.params``. Until the TODO is filled in, ``grads`` is empty.
        """
        grads = {}
        # dOut is the gradient of the loss w.r.t. the network output; start
        # back-propagating from it.
        loss, dOut = self.loss(X, y)

        # TODO: implement backward pass.
        # NOTE: for each trainable parameter in self.params
        # there should be a parameter in grads with the same key,
        # e.g. grads['W1'] is used to train self.params['W1'].
        # otherwise our optimizer will get KeyError :(
        ...

        return loss, grads

In [None]:
# Instantiate your architecture with its default arguments.
your_model = YourNet()

In [None]:
# Same schedule as the example model above (3 epochs), then evaluate.
# Both methods are presumably inherited from courseutils.BaseNet.
your_model.train(num_epochs=3)
your_model.test()

In [None]:
# Same course helper as used for the example model above, now called with
# `your_model` (10 images requested via num_images).
courseutils.test_CIFAR10(your_model, classes, dataset_params['root'], transform, num_images=10)