# Seminar 1. Intro.
Hi! Today we are going to refresh our knowledge of NumPy, scikit-learn and Matplotlib, build our own simple neural network, and train it to solve some classification tasks.

In [None]:
!pip install mnist

In [None]:
from IPython import display
import numpy as np
import random

In [None]:
# Fix the seeds of Python's and NumPy's global generators so that reruns of
# the notebook draw the same random numbers.
random.seed(42)
np.random.seed(42)

## Numpy

### Initialization

In [None]:
# Common ways of creating arrays: from a Python list, filled with zeros,
# integer ranges, evenly spaced points and the identity matrix.
a = [1.0, 1.4, 2.5]
print(
    "\n".join(
        [
            f"Simple way: {np.array(a)}",
            f"Zeros:\n {np.zeros((2,3))}",
            f"Range: {np.arange(10)}",
            f"Complicated range: {np.arange(4, 12, 2)}",
            f"Space: {np.linspace(1, 4, 6)}",
            f"Identity matrix:\n {np.eye(4)}",
        ]
    )
)

### Random

In [None]:
# Draw a scalar, a float vector and an integer vector from NumPy's global
# generator. The list is evaluated left to right, so the draws happen in the
# same order as the lines are printed.
print(
    "\n".join(
        [
            f"From 0 to 1: {np.random.rand()}",
            f"Vector from 0 to 1: {np.random.rand(5)}",
            f"Vector from 0 to 10: {np.random.randint(10, size=5)}",
        ]
    )
)

### Matrix Operation

In [None]:
# Element-wise arithmetic, reductions and reshaping on small arrays.
a = np.arange(10)
b = np.linspace(-10, 10, 10)
print(f"a: {a}\nshape:{a.shape}")
# Show `b` itself under the "b" label (the values of `a` used to be printed).
print(f"b: {b}\nshape:{b.shape}")
print(f"a + b: {a + b},\n\t a * b: {a * b}")
print(f"Dot product: {a.dot(b)}")
print(f"Mean: {a.mean()}, STD: {a.std()}")
print(f"Sum: {a.sum()}, Min: {a.min()}, Max: {a.max()}")
print(f"Reshape:\n{a.reshape(-1, 1)}\nshape: {a.reshape(-1, 1).shape}")
# Turn `a` into a column and repeat that column 5 times -> shape (10, 5).
c = a.reshape(-1, 1).repeat(5, axis=1)
print(f"Repeat:\n{c}")
# Report the shape of the transpose (the matrix itself used to be printed twice).
print(f"Transpose:\n{c.T}\nshape: {c.T.shape}")
print(f"Unique items: {np.unique(c)}")

### Indexing

In [None]:
# Slicing and indexing on a 10x10 matrix holding the numbers 0..99.
a = np.arange(100).reshape(10, 10)
print(f"Array:\n{a}\nshape:{a.shape}")
print(f"Get first column: {a[:, 0]}")
print(f"Get last row: {a[-1, :]}")
# np.newaxis inserts a dimension of length 1 at that position.
print(f"Add new axis:\n{a[:, np.newaxis]}\nshape: {a[:, np.newaxis].shape}")
print(f"Specific indexing:\n{a[4:6, 7:]}")

## Scikit Learn
[Docs](https://scikit-learn.org/stable/modules/classes.html)

In [None]:
from sklearn.datasets import make_blobs
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

## Matplotlib

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Draw the line y = 6x at the integer points x = 0..9.
plt.plot(np.arange(10), np.arange(0, 60, 6))

In [None]:
# Three clusters of 50 points each, coloured by cluster label.
X, y = make_blobs(n_samples=(50, 50, 50))
plt.scatter(x=X[:, 0], y=X[:, 1], c=y)

In [None]:
# Same scatter plot on a larger canvas, with a title and a grid.
plt.figure(figsize=(10, 10))
plt.title("Adversal data")
plt.scatter(x=X[:, 0], y=X[:, 1], c=y)
plt.grid()
plt.show()

In [None]:
# Hold out part of the points for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Training points are drawn opaque, test points semi-transparent.
plt.figure(figsize=(10, 10))
plt.scatter(x=X_train[:, 0], y=X_train[:, 1], c=y_train, alpha=1.0)
plt.scatter(x=X_test[:, 0], y=X_test[:, 1], c=y_test, alpha=.3)
plt.grid()
plt.show()

## Neural Network
Based on [ml-mipt](https://github.com/girafe-ai/ml-mipt) course.

In [None]:
from time import time, sleep


class Module:
    """
    Base class for every layer of the network.

    A module is a black box that turns `input` data into `output` data by
    applying its `forward` function:

        output = module.forward(input)

    It must also support a backward pass, i.e. differentiate `forward`.
    Because a module is normally one link of a chain, the gradient coming
    from the next step of the chain rule is passed in as `gradOutput`:

        gradInput = module.backward(input, gradOutput)
    """

    def __init__(self):
        self.training = True
        self.gradInput = None
        self.output = None

    def forward(self, input):
        """
        Compute the output of the module for `input`.

        Delegates to `updateOutput` and returns whatever it returns.
        """
        return self.updateOutput(input)

    def backward(self, input, gradOutput):
        """
        Run one backpropagation step through the module for the given input.

        Two things happen, in this order:
         - `updateGradInput` computes the gradient w.r.t. `input`
           (needed to continue backpropagation),
         - `accGradParameters` computes the gradient w.r.t. the parameters
           (needed by the optimizer).

        Returns the value stored in `gradInput`.
        """
        self.updateGradInput(input, gradOutput)
        self.accGradParameters(input, gradOutput)
        return self.gradInput

    def updateOutput(self, input):
        """
        Compute the output from the current parameters and `input`.

        Subclasses override this; an override should both store the result
        in the `output` field and return it. The simplest possible override
        is the identity: `self.output = input; return self.output`.

        The base implementation does nothing and returns None.
        """
        return None

    def updateGradInput(self, input, gradOutput):
        """
        Compute the gradient of the module with respect to its own input.

        Subclasses override this; an override should both store the result
        in the `gradInput` field and return it. The shape of `gradInput` is
        always the same as the shape of `input`. The simplest possible
        override passes the gradient through:
        `self.gradInput = gradOutput; return self.gradInput`.

        The base implementation does nothing and returns None.
        """
        return None

    def accGradParameters(self, input, gradOutput):
        """
        Compute the gradient of the module with respect to its parameters.

        Modules without parameters (e.g. ReLU) do not need to override it.
        """
        return None

    def zeroGradParameters(self):
        """
        Reset the parameter gradients to zero, if the module has parameters.
        """
        return None

    def getParameters(self):
        """
        Return the list of parameters; empty for a parameter-free module.
        """
        return []

    def getGradParameters(self):
        """
        Return the list of gradients w.r.t. the parameters; empty for a
        parameter-free module.
        """
        return []

    def train(self):
        """
        Switch the module to training mode.

        Modules such as Dropout or BatchNorm behave differently in training
        and evaluation mode.
        """
        self.training = True

    def evaluate(self):
        """
        Switch the module to evaluation mode.

        Modules such as Dropout or BatchNorm behave differently in training
        and evaluation mode.
        """
        self.training = False

    def __repr__(self):
        """
        Human-readable name; override it in every module that should print
        a meaningful description.
        """
        return "Module"

In [None]:
class Sequential(Module):
    """
    Container that applies its modules one after another.

    `input` is processed by every module (layer) in `self.modules` in the
    order they were added; the result of the last one is the `output`.
    A container without modules behaves as the identity in both directions.
    """

    def __init__(self):
        super(Sequential, self).__init__()
        self.modules = []

    def add(self, module):
        """
        Append a module to the end of the container.
        """
        self.modules.append(module)

    def updateOutput(self, input):
        """
        Forward pass:

            y_0    = module[0].forward(input)
            y_1    = module[1].forward(y_0)
            ...
            output = module[n-1].forward(y_{n-2})

        The final value is stored in `self.output` and returned.
        """
        self.output = input

        for module in self.modules:
            self.output = module.forward(self.output)

        return self.output

    def backward(self, input, gradOutput):
        """
        Backward pass:

            g_{n-1} = module[n-1].backward(y_{n-2}, gradOutput)
            g_{n-2} = module[n-2].backward(y_{n-3}, g_{n-1})
            ...
            g_1 = module[1].backward(y_0, g_2)
            gradInput = module[0].backward(input, g_1)

        Every module must receive the input it saw during the forward pass:
        for `module[i]` with i > 0 that is the stored output of
        `module[i-1]`, and only for `module[0]` it is the `input` of the
        container itself. `forward` therefore has to be called first.

        With no modules the incoming gradient is passed through unchanged.
        The result is stored in `self.gradInput` and returned.
        """
        # Input of module i during the forward pass: the container input for
        # the first module, the saved output of module i-1 for the others.
        layer_inputs = [input] + [module.output for module in self.modules[:-1]]

        grad = gradOutput
        for module, layer_input in zip(reversed(self.modules), reversed(layer_inputs)):
            grad = module.backward(layer_input, grad)

        self.gradInput = grad
        return self.gradInput

    def zeroGradParameters(self):
        """
        Reset the parameter gradients of every contained module.
        """
        for module in self.modules:
            module.zeroGradParameters()

    def getParameters(self):
        """
        Gather the parameters of all modules: one inner list per module.
        """
        return [x.getParameters() for x in self.modules]

    def getGradParameters(self):
        """
        Gather the parameter gradients of all modules: one inner list per
        module, aligned with `getParameters`.
        """
        return [x.getGradParameters() for x in self.modules]

    def __repr__(self):
        string = "".join([str(x) + '\n' for x in self.modules])
        return string

    def __getitem__(self, x):
        return self.modules.__getitem__(x)

    def train(self):
        """
        Switch the container and all contained modules to training mode.
        """
        self.training = True
        for module in self.modules:
            module.train()

    def evaluate(self):
        """
        Switch the container and all contained modules to evaluation mode.
        """
        self.training = False
        for module in self.modules:
            module.evaluate()

In [None]:
class Criterion:
    """
    Base class for loss functions.

    Subclasses put their code into `updateOutput` (value of the loss) and
    `updateGradInput` (gradient of the loss w.r.t. the input).
    """

    def __init__(self):
        self.gradInput = None
        self.output = None

    def forward(self, input, target):
        """
        Compute the loss for `input` and `target` and return it.

        For consistency do not override this method; all the code belongs
        in `updateOutput`.
        """
        return self.updateOutput(input, target)

    def backward(self, input, target):
        """
        Compute the gradient of the loss for `input` and `target` and
        return it.

        For consistency do not override this method; all the code belongs
        in `updateGradInput`.
        """
        return self.updateGradInput(input, target)

    def updateOutput(self, input, target):
        """
        Function to override. The base version returns the stored `output`.
        """
        return self.output

    def updateGradInput(self, input, target):
        """
        Function to override. The base version returns the stored `gradInput`.
        """
        return self.gradInput

    def __repr__(self):
        """
        Human-readable name; override it in every criterion that should
        print a meaningful description.
        """
        return "Criterion"

In [None]:
class Linear(Module):
    """
    Linear transformation `output = input . W^T + b`.

    Also known as a fully-connected layer (InnerProductLayer in caffe).
    The module works with a 2D `_input` of shape (n_samples, n_feature);
    `W` has shape (n_out, n_in) and `b` has shape (n_out,).
    """

    def __init__(self, n_in, n_out):
        super().__init__()

        # Uniform initialization in [-1/sqrt(n_in), 1/sqrt(n_in)].
        bound = 1. / np.sqrt(n_in)
        self.W = np.random.uniform(low=-bound, high=bound, size=(n_out, n_in))
        self.b = np.random.uniform(low=-bound, high=bound, size=n_out)

        # Gradient accumulators with the same shapes as the parameters.
        self.gradW = np.zeros_like(self.W)
        self.gradb = np.zeros_like(self.b)

    def updateOutput(self, _input):
        """Apply the affine map to every row of `_input`."""
        projected = np.dot(_input, self.W.T)
        self.output = projected + self.b
        return self.output

    def updateGradInput(self, _input, gradOutput):
        """Propagate the gradient back through the weights."""
        self.gradInput = np.dot(gradOutput, self.W)
        return self.gradInput

    def accGradParameters(self, _input, gradOutput):
        """
        Add the gradients of this batch to `gradW` and `gradb`.

        The gradients are accumulated, so `zeroGradParameters` has to be
        called to start from zero.
        """
        self.gradW += np.dot(gradOutput.T, _input)
        self.gradb += gradOutput.sum(axis=0)

    def zeroGradParameters(self):
        """Reset both gradient accumulators in place."""
        self.gradW[...] = 0
        self.gradb[...] = 0

    def getParameters(self):
        return [self.W, self.b]

    def getGradParameters(self):
        return [self.gradW, self.gradb]

    def __repr__(self):
        n_out, n_in = self.W.shape
        return f"Linear {n_in} -> {n_out}"

In [None]:
class LogSoftMax(Module):
    """
    Row-wise log-softmax:

        output[i, j] = x[i, j] - log(sum_k exp(x[i, k]))

    Works with a 2D `_input` of shape (n_samples, n_classes); every row of
    the output holds the log-probabilities of the classes for one sample.
    """

    def __init__(self):
        super(LogSoftMax, self).__init__()

    def updateOutput(self, _input):
        """Compute the log-softmax of every row of `_input`."""
        # Subtracting the row maximum does not change the result but keeps
        # every exponent <= 0, so np.exp cannot overflow.
        shifted = np.subtract(_input, _input.max(axis=1, keepdims=True))
        log_norm = np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))
        self.output = shifted - log_norm

        return self.output

    def updateGradInput(self, _input, gradOutput):
        """Compute the gradient of the log-softmax w.r.t. `_input`."""
        shifted = np.subtract(_input, _input.max(axis=1, keepdims=True))
        exp_shifted = np.exp(shifted)
        softmax = exp_shifted / np.sum(exp_shifted, axis=1, keepdims=True)

        # d output_j / d x_i = delta_ij - softmax_i, hence
        # gradInput_i = gradOutput_i - softmax_i * sum_j gradOutput_j.
        grad_sum = np.sum(gradOutput, axis=1, keepdims=True)
        self.gradInput = gradOutput - softmax * grad_sum

        return self.gradInput

    def __repr__(self):
        return "LogSoftMax"

In [None]:
class ReLU(Module):
    """
    Rectified linear unit: keeps positive values and replaces the rest by 0.
    """

    def __init__(self):
        super().__init__()

    def updateOutput(self, _input):
        """Element-wise maximum of `_input` and zero."""
        rectified = np.maximum(_input, 0)
        self.output = rectified
        return rectified

    def updateGradInput(self, _input, gradOutput):
        """Let the gradient through only where the input was positive."""
        positive_mask = _input > 0
        self.gradInput = gradOutput * positive_mask
        return self.gradInput

    def __repr__(self):
        return "ReLU"

In [None]:
class ClassNLLCriterion(Criterion):
    """
    Negative log-likelihood loss for classification.

    `_input` is indexed as `_input[sample, class]` and is read as
    log-probabilities (e.g. the output of LogSoftMax); `target` is a 1D
    array with the index of the correct class of every sample.
    """

    def __init__(self):
        super(ClassNLLCriterion, self).__init__()

    def updateOutput(self, _input, target):
        """Return minus the mean log-probability of the correct classes."""
        rows = np.arange(target.shape[0])
        self.output = - np.mean(_input[rows, target])
        return self.output

    def updateGradInput(self, _input, target):
        """
        Return the gradient of the loss w.r.t. `_input`.

        It is -1 / n_samples at the position of the correct class of every
        sample and 0 everywhere else.
        """
        rows = np.arange(target.shape[0])
        self.gradInput = np.zeros_like(_input)
        self.gradInput[rows, target] = -1 / _input.shape[0]
        return self.gradInput

    def __repr__(self):
        return "ClassNLLCriterion"

In [None]:
def simple_sgd(variables, gradients, config, state):
    """
    Perform one step of plain stochastic gradient descent, in place.

    `variables` and `gradients` are nested: one inner list per layer, with
    matching positions. Every variable is decreased by
    `config['learning_rate']` times its gradient. Returns None.
    """
    # The entry is created if missing, but this optimizer never reads it.
    if 'accumulated_grads' not in state:
        state['accumulated_grads'] = {}

    for layer_params, layer_grads in zip(variables, gradients):
        for param, grad in zip(layer_params, layer_grads):
            # In-place subtraction, so the arrays owned by the layers change.
            param -= config['learning_rate'] * grad

### Simple classification task

In [None]:
# Three clusters of 200 points each; `dataset` is the (points, labels) pair.
dataset = make_blobs(n_samples=(200, 200, 200))

# Hold out 30% of the points for testing.
X_train, X_test, y_train, y_test = train_test_split(*dataset, test_size=0.3)

# Training points are drawn opaque, test points semi-transparent.
plt.figure(figsize=(10, 10))
plt.scatter(x=X_train[:, 0], y=X_train[:, 1], c=y_train, alpha=1.0)
plt.scatter(x=X_test[:, 0], y=X_test[:, 1], c=y_test, alpha=.3)
plt.grid()
plt.show()

# Split the remaining training points once more to get a validation set.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.4)

In [None]:
# Loss function paired with the log-probabilities the network produces.
criterion = ClassNLLCriterion()

# Linear classifier: 2 input features -> 3 class scores -> log-probabilities.
net = Sequential()
net.add(Linear(2, 3))
net.add(LogSoftMax())

print(net)

In [None]:
# Optimizer params
optimizer_config = {'learning_rate' : 1e-2}
optimizer_state = {}

# Looping params
n_epoch = 20
batch_size = 128

In [None]:
# batch generator
def get_batches(dataset, batch_size):
    """
    Yield shuffled mini-batches `(X_batch, Y_batch)` from `dataset`.

    `dataset` is a pair `(X, Y)` of arrays indexed along the first axis.
    The sample order is reshuffled on every call using NumPy's global
    generator; the last batch may be smaller than `batch_size`.
    """
    features, targets = dataset

    # A fresh random order for this pass over the data.
    order = np.arange(features.shape[0])
    np.random.shuffle(order)

    for offset in range(0, len(order), batch_size):
        # Slicing stops at the end of the array, so the tail batch is short.
        chunk = order[offset:offset + batch_size]
        yield features[chunk], targets[chunk]

In [None]:
def train():
    """
    Train the global `net` for `n_epoch` epochs and plot the progress.

    Reads the notebook globals `net`, `criterion`, `X_train`, `y_train`,
    `X_val`, `y_val`, `batch_size`, `n_epoch`, `optimizer_config` and
    `optimizer_state`. After every epoch the network is evaluated on the
    validation data, three curves are redrawn (training loss per step,
    validation loss and validation accuracy per epoch) and the mean
    validation loss of the epoch is printed.
    """
    train_loss_history = []
    # Validation curves start with a placeholder 0 at step 0.
    val_loss_history = [0,]
    val_acc_history = [0,]
    steps = [0,]

    for i in range(n_epoch):
        # steps[-1] counts the optimizer steps done up to the end of this epoch.
        steps.append(steps[-1])
        for x_batch, y_batch in get_batches((X_train, y_train), batch_size):

            net.zeroGradParameters()

            # Forward
            predictions = net.forward(x_batch)
            loss = criterion.forward(predictions, y_batch)

            # Backward
            dp = criterion.backward(predictions, y_batch)
            net.backward(x_batch, dp)

            # Update weights
            simple_sgd(net.getParameters(),
                    net.getGradParameters(),
                    optimizer_config,
                    optimizer_state)

            train_loss_history.append(loss)
            steps[-1] += 1

        # Validation: average the per-batch loss and accuracy.
        sum_loss = 0
        sum_acc = 0
        count_val_steps = 0
        for x_batch, y_batch in get_batches((X_val, y_val), batch_size):
            predictions = net.forward(x_batch)
            loss = criterion.forward(predictions, y_batch)
            sum_loss += loss
            sum_acc += accuracy_score(y_batch, np.argmax(predictions, axis=1))
            count_val_steps += 1

        val_loss_history.append(sum_loss / count_val_steps)
        val_acc_history.append(sum_acc / count_val_steps)

        # Visualize
        display.clear_output(wait=True)
        fig, ax = plt.subplots(1, 3, figsize=(15, 5))

        ax[0].set_title("Training loss")
        ax[0].set_xlabel("#iteration")
        ax[0].set_ylabel("loss")
        ax[0].plot(train_loss_history, 'b')
        ax[0].grid()

        ax[1].set_title("Validation loss")
        ax[1].set_xlabel("#iteration")
        ax[1].set_ylabel("loss")
        ax[1].plot(steps, val_loss_history, 'b')
        ax[1].grid()

        ax[2].set_title("Validation Accuracy")
        ax[2].set_xlabel("#iteration")
        ax[2].set_ylabel("accuracy")
        ax[2].plot(steps, val_acc_history, 'b')
        ax[2].grid()

        plt.show()

        # Report the epoch's mean validation loss (the value that was just
        # plotted), not the loss of whichever validation batch came last.
        print('Current loss: %f' % val_loss_history[-1])

In [None]:
# Run the training loop on the currently defined `net`, `criterion` and data splits.
train()

Let's make the network more complicated:

In [None]:
# Loss function paired with the log-probabilities the network produces.
criterion = ClassNLLCriterion()

# One hidden layer of 10 units with a ReLU non-linearity, then 3 class scores.
net = Sequential()
net.add(Linear(2, 10))
net.add(ReLU())
net.add(Linear(10, 3))
net.add(LogSoftMax())

print(net)

In [None]:
# Train again; `train` reads the global `net`, which now refers to the deeper network.
train()

### MNIST

In [None]:
import mnist


# Training labels, and training images divided by 255
# (presumably 8-bit pixel values, which this maps to [0, 1] - TODO confirm).
labels = mnist.train_labels()
images = mnist.train_images() / 255

In [None]:
# Show the first 25 images on a 5x5 grid, filled row by row.
_, ax = plt.subplots(nrows=5, ncols=5, figsize=(10, 10))
for i in range(25):
    ax.flat[i].imshow(images[i], cmap="gray")


In [None]:
# Inspect the shape of a single image.
images[0].shape

In [None]:
# Project the flattened images onto their first two principal components.
pca = PCA(n_components=2)
# A single reshape yields the (n_images, n_pixels) matrix directly, instead
# of building a Python list with one flattened array per image.
new_features = pca.fit_transform(images.reshape(len(images), -1))

In [None]:
# Scatter plot of the two PCA components, coloured by digit label.
plt.figure(figsize=(10, 10))
plt.scatter(x=new_features[:, 0], y=new_features[:, 1], c=labels)
plt.grid()

In [None]:
# Split the images and labels into training and validation parts
# (train_test_split is called with its default proportions).
X_train, X_val, y_train, y_val = train_test_split(images, labels)

In [None]:
# Optimizer params
optimizer_state = dict()
optimizer_config = dict(learning_rate=1e-2)

# Number of passes over the training data and the mini-batch size
batch_size = 512
n_epoch = 20

In [None]:
# Loss function paired with the log-probabilities the network produces.
criterion = ClassNLLCriterion()

# Linear classifier on flattened 28x28 images with 10 output classes.
net = Sequential()
net.add(Linear(28*28, 10))
net.add(LogSoftMax())

print(net)

In [None]:
def train():
    """
    Train the global `net` on 28x28 images and plot the progress.

    Reads the notebook globals `net`, `criterion`, `X_train`, `y_train`,
    `X_val`, `y_val`, `batch_size`, `n_epoch`, `optimizer_config` and
    `optimizer_state`. Every batch is flattened to rows of 28*28 values
    before it is fed to the network. After every epoch three curves are
    redrawn (training loss per step, validation loss and validation accuracy
    per epoch) and the mean validation loss of the epoch is printed.
    """
    train_losses = []
    # Validation curves start with a placeholder 0 at step 0.
    val_losses = [0,]
    val_accuracies = [0,]
    steps = [0,]

    for _ in range(n_epoch):
        # ---- training pass over shuffled mini-batches
        batches_seen = 0
        for x_batch, y_batch in get_batches((X_train, y_train), batch_size):
            net.zeroGradParameters()
            flat_batch = x_batch.reshape(-1, 28*28)

            # Forward
            predictions = net.forward(flat_batch)
            loss = criterion.forward(predictions, y_batch)

            # Backward
            net.backward(flat_batch, criterion.backward(predictions, y_batch))

            # Update weights
            simple_sgd(net.getParameters(),
                       net.getGradParameters(),
                       optimizer_config,
                       optimizer_state)

            train_losses.append(loss)
            batches_seen += 1

        # Total number of optimizer steps done so far.
        steps.append(steps[-1] + batches_seen)

        # ---- validation pass: average the per-batch loss and accuracy
        loss_total = 0
        acc_total = 0
        n_val_batches = 0
        for x_batch, y_batch in get_batches((X_val, y_val), batch_size):
            predictions = net.forward(x_batch.reshape(-1, 28*28))
            loss_total += criterion.forward(predictions, y_batch)
            acc_total += accuracy_score(y_batch, np.argmax(predictions, axis=1))
            n_val_batches += 1

        mean_val_loss = loss_total / n_val_batches
        val_losses.append(mean_val_loss)
        val_accuracies.append(acc_total / n_val_batches)

        # ---- redraw the three curves
        display.clear_output(wait=True)
        fig, ax = plt.subplots(1, 3, figsize=(15, 5))

        panels = (
            (ax[0], "Training loss", "loss", (train_losses, 'b')),
            (ax[1], "Validation loss", "loss", (steps, val_losses, 'b')),
            (ax[2], "Validation Accuracy", "accuracy", (steps, val_accuracies, 'b')),
        )
        for axis, title, y_label, plot_args in panels:
            axis.set_title(title)
            axis.set_xlabel("#iteration")
            axis.set_ylabel(y_label)
            axis.plot(*plot_args)
            axis.grid()

        plt.show()

        print('Current loss: %f' % mean_val_loss)

In [None]:
# Run the MNIST training loop on the currently defined `net` and data splits.
train()