In [1]:
import numpy as np

**Module** is an abstract class which defines the fundamental methods necessary for training a neural network. You do not need to change anything here, just read the comments.

In [2]:
class Module(object):
    """
    Abstract building block of a neural network.

    A module can be seen as a black box that turns `input` data into
    `output` data by applying a function called `forward`:

        output = module.forward(input)

    A module must also be able to run a backward pass, i.e. to differentiate
    its `forward` function. Because a module is usually one link of a longer
    chain, the backward pass receives the gradient produced by the previous
    step of the chain rule (`gradOutput`):

        gradInput = module.backward(input, gradOutput)
    """

    def __init__(self):
        # Result of the latest `updateOutput` call (filled in by subclasses).
        self.output = None
        # Result of the latest `updateGradInput` call (filled in by subclasses).
        self.gradInput = None
        # Mode flag; nothing in this base class reads it.
        self.training = True

    def forward(self, input):
        """
        Compute the output of the module for the given `input`.

        Simply delegates to `updateOutput` and returns its result.
        """
        result = self.updateOutput(input)
        return result

    def backward(self, input, gradOutput):
        """
        Run one backpropagation step through the module for the given `input`.

        Two things happen, in this order:
         - the gradient w.r.t. `input` is computed (needed to keep backpropagating),
         - the gradient w.r.t. the parameters is computed (needed by the optimizer).

        Returns the content of the `gradInput` field.
        """
        self.updateGradInput(input, gradOutput)
        self.accGradParameters(input, gradOutput)
        return self.gradInput

    def updateOutput(self, input):
        """
        Compute the output from `input` and the current parameters.

        Implementations must both store the result in the `output` field and
        return it. The simplest possible implementation is the identity:

            self.output = input
            return self.output

        The base class does nothing and returns None.
        """
        return None

    def updateGradInput(self, input, gradOutput):
        """
        Compute the gradient of the module with respect to its own input.

        The shape of `gradInput` is always the same as the shape of `input`.
        Implementations must both store the result in the `gradInput` field
        and return it. The simplest possible implementation is:

            self.gradInput = gradOutput
            return self.gradInput

        The base class does nothing and returns None.
        """
        return None

    def accGradParameters(self, input, gradOutput):
        """
        Compute the gradient of the module with respect to its own parameters.

        Modules without parameters (e.g. ReLU) do not need to override this.
        """
        return None

    def zeroGradParameters(self):
        """
        Reset the parameter gradients to zero, if the module has parameters.
        """
        return None

    def getParameters(self):
        """
        Return a list with the parameters of the module.

        A module without parameters returns an empty list.
        """
        return []

    def getGradParameters(self):
        """
        Return a list with the gradients w.r.t. the parameters of the module.

        A module without parameters returns an empty list.
        """
        return []

    def __repr__(self):
        """
        Pretty printing. Should be overridden in every module that wants a
        readable description.
        """
        return "Module"

# Sequential container

**Define** the forward and backward pass procedures.

In [None]:
# Seed NumPy's global random generator so that the random draws made after
# this cell are reproducible from run to run.
np.random.seed(17)

In [3]:
class Sequential(Module):
    """
    Container that chains modules one after another.

    The output of each module is fed as the input of the next one:

        input -> modules[0] -> modules[1] -> ... -> modules[-1] -> output
    """

    def __init__(self):
        super(Sequential, self).__init__()
        self.modules = []
        # Input received by each module during the latest forward pass.
        # `backward` needs them to call each module's own `backward`.
        self.inputs = []

    def add(self, module):
        """
        Append `module` to the end of the chain.
        """
        self.modules.append(module)

    def forward(self, input):
        """
        Run `input` through every module in order.

        The input seen by each module is remembered in `self.inputs` for the
        backward pass. Returns the output of the last module (or `input`
        itself when the container is empty), which is also stored in
        `self.output`.
        """
        self.inputs = []
        current = input

        for module in self.modules:
            self.inputs.append(current)
            current = module.forward(current)

        self.output = current
        return self.output

    def backward(self, input, gradOutput):
        """
        Backpropagate `gradOutput` through the chain, last module first.

        Each module receives the input it saw during the forward pass and the
        gradient returned by the module that follows it. Returns the gradient
        w.r.t. the container input, also stored in `self.gradInput`.
        """
        # The cache is missing or stale when `forward` has not been run since
        # the container was created or since a module was added. Rebuild it
        # from `input` instead of failing or pairing modules with wrong inputs.
        if len(self.inputs) != len(self.modules):
            self.forward(input)

        grad = gradOutput
        for module, module_input in zip(reversed(self.modules), reversed(self.inputs)):
            grad = module.backward(module_input, grad)

        self.gradInput = grad
        return self.gradInput

    def zeroGradParameters(self):
        """
        Reset the parameter gradients of every contained module.
        """
        for module in self.modules:
            module.zeroGradParameters()

    def getParameters(self):
        """
        Gather the parameters of all modules: one list per module, in order.
        """
        return [module.getParameters() for module in self.modules]

    def getGradParameters(self):
        """
        Gather the parameter gradients of all modules: one list per module, in order.
        """
        return [module.getGradParameters() for module in self.modules]

    def __repr__(self):
        """
        One line per contained module, each terminated by a newline.
        """
        return "".join(str(module) + '\n' for module in self.modules)

    def __getitem__(self, x):
        """
        Index (or slice) into the list of contained modules.
        """
        return self.modules.__getitem__(x)

# Layers

## 1. Linear transform layer
Also known as dense layer, fully-connected layer, FC-layer, InnerProductLayer (in caffe), affine transform
- input:   **`batch_size x n_feats1`**
- output: **`batch_size x n_feats2`**

You need to **define forward and backward** passes for this layer (updateOutput, updateGradInput, accGradParameters).

In [None]:
class Linear(Module):
    """
    Affine (fully-connected) layer: `output = input . W^T + b`.

    `W` has shape `(n_out, n_in)` and `b` has shape `(n_out,)`. The last axis
    of `input` must have length `n_in`; typically `input` is
    `batch_size x n_in`.
    """

    def __init__(self, n_in, n_out):
        super(Linear, self).__init__()

        # Uniform initialization in [-1/sqrt(n_in), 1/sqrt(n_in)].
        stdv = 1. / np.sqrt(n_in)
        self.W = np.random.uniform(-stdv, stdv, size=(n_out, n_in))
        self.b = np.random.uniform(-stdv, stdv, size=n_out)

        self.gradW = np.zeros_like(self.W)
        self.gradb = np.zeros_like(self.b)

    def updateOutput(self, input):
        """
        Compute `input . W^T + b`, store it in `self.output` and return it.
        """
        self.output = input.dot(self.W.T) + self.b

        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Compute the gradient w.r.t. `input`: `gradOutput . W`.

        The result has the shape of `input`; it is stored in `self.gradInput`
        and returned.
        """
        self.gradInput = gradOutput.dot(self.W).reshape(input.shape)

        assert self.gradInput.shape == input.shape, "wrong shape"

        return self.gradInput

    def accGradParameters(self, input, gradOutput):
        """
        Compute the gradients w.r.t. `W` and `b` for the given batch.

        `gradW = gradOutput^T . input` and `gradb = sum of gradOutput over
        the batch`. The values computed here replace the ones previously
        stored in `self.gradW` / `self.gradb` (they are not added to them).

        Returns the tuple `(gradW, gradb)`.
        """
        n_out, n_in = self.W.shape

        # Collapse every leading axis into a single batch axis so that a
        # single 1-D sample (or extra leading axes) is handled the same way
        # as the usual `batch_size x n_feats` case.
        grad_rows = gradOutput.reshape(-1, n_out)
        input_rows = input.reshape(-1, n_in)

        self.gradW = grad_rows.T.dot(input_rows)
        self.gradb = grad_rows.sum(axis=0)

        return self.gradW, self.gradb

    def zeroGradParameters(self):
        """
        Fill `gradW` and `gradb` with zeros in place.
        """
        self.gradW.fill(0)
        self.gradb.fill(0)

    def getParameters(self):
        """
        Return `[W, b]`.
        """
        return [self.W, self.b]

    def getGradParameters(self):
        """
        Return `[gradW, gradb]`.
        """
        return [self.gradW, self.gradb]

    def __repr__(self):
        n_out, n_in = self.W.shape
        return 'Linear %d -> %d' % (n_in, n_out)

## 2. SoftMax
- input:   **`batch_size x n_feats`**
- output: **`batch_size x n_feats`**

$\text{softmax}(x)_i = \frac{\exp x_i} {\sum_j \exp x_j}$

Recall that $\text{softmax}(x) == \text{softmax}(x - \text{const})$. This makes it possible to avoid computing exp() of a large argument.

You need to **define forward and backward** passes for this layer (softmax, updateOutput, updateGradInput).

In [None]:
class SoftMax(Module):
    """
    Row-wise softmax: `softmax(x)_i = exp(x_i) / sum_j exp(x_j)`.

    Input and output are both `batch_size x n_feats`; normalization runs
    along axis 1.
    """

    def __init__(self):
        super(SoftMax, self).__init__()

    @staticmethod
    def _probabilities(input):
        """
        Softmax along axis 1, computed on `input - rowmax` so that `exp`
        never receives a large positive argument.
        """
        shifted = input - input.max(axis=1, keepdims=True)
        exps = np.exp(shifted)
        return exps / exps.sum(axis=1, keepdims=True)

    def updateOutput(self, input):
        """
        Compute the softmax of `input`, store it in `self.output` and return it.
        """
        self.output = self._probabilities(input)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Gradient of the softmax w.r.t. its input.

        With `s = softmax(input)` and `g = gradOutput`, each row is
        `gradInput_i = s_i * (g_i - sum_j s_j * g_j)`.
        """
        probs = self._probabilities(input)
        # Per-row inner product <s, g>; a row-wise sum avoids materializing
        # any batch_size x batch_size intermediate.
        weighted_sum = (probs * gradOutput).sum(axis=1, keepdims=True)
        self.gradInput = probs * (gradOutput - weighted_sum)
        return self.gradInput

    def __repr__(self):
        return "SoftMax"

## 3. LogSoftMax
- input:   **`batch_size x n_feats`**
- output: **`batch_size x n_feats`**

$\text{logsoftmax}(x)_i = \log\text{softmax}(x)_i = x_i - \log {\sum_j \exp x_j}$

The main goal of this layer is to be used in computation of log-likelihood loss.

You need to **define forward and backward** passes for this layer (softmax, updateOutput, updateGradInput).

In [None]:
class LogSoftMax(Module):
    """
    Row-wise log-softmax: `logsoftmax(x)_i = x_i - log(sum_j exp(x_j))`.

    Input and output are both `batch_size x n_feats`; normalization runs
    along axis 1.
    """

    def __init__(self):
        super(LogSoftMax, self).__init__()

    def softmax(self, output):
        """
        Return the row-wise softmax of `output`.

        Softmax is unchanged by subtracting a per-row constant, and
        log-softmax differs from its input only by such a constant, so this
        gives the same probabilities whether `output` holds raw scores or the
        log-softmax of those scores.
        """
        shifted = output - output.max(axis=1, keepdims=True)
        exps = np.exp(shifted)
        return exps / exps.sum(axis=1, keepdims=True)

    def updateOutput(self, input):
        """
        Compute the log-softmax of `input`, store it in `self.output` and return it.

        The row maximum is subtracted first so that `exp` never receives a
        large positive argument.
        """
        shifted = input - input.max(axis=1, keepdims=True)
        log_norm = np.log(np.exp(shifted).sum(axis=1, keepdims=True))
        self.output = shifted - log_norm
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Gradient of the log-softmax w.r.t. its input.

        Since `d logsoftmax(x)_j / d x_i = [i == j] - softmax(x)_i`, each row is
        `gradInput_i = g_i - softmax(x)_i * sum_j g_j` with `g = gradOutput`.
        """
        probs = self.softmax(input)
        self.gradInput = gradOutput - probs * gradOutput.sum(axis=1, keepdims=True)
        return self.gradInput

    def __repr__(self):
        return "LogSoftMax"
    

# Activation functions

Here's the complete example for the **Rectified Linear Unit** non-linearity (aka **ReLU**): 

In [None]:
class ReLU(Module):
    """
    Rectified Linear Unit: `output = max(input, 0)`, element-wise.
    """

    def __init__(self):
        super(ReLU, self).__init__()

    def updateOutput(self, input):
        """
        Compute `max(input, 0)`, store it in `self.output` and return it.

        Implemented as `updateOutput` (not `forward`) so that both
        `forward(input)` and a direct `updateOutput(input)` call work.
        """
        # np.maximum maps -inf to 0, whereas `0 * input` would give NaN there.
        self.output = np.maximum(input, 0)
        # Pass-through mask of the latest forward input:
        # 0 where input < 0, 1 elsewhere.
        self.negative = np.where(input < 0, 0, 1)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Gradient w.r.t. `input`: `gradOutput` where `input >= 0`, zero elsewhere.

        The mask is derived from the `input` argument, so the result does not
        depend on which input the last forward pass happened to see.
        """
        pass_through = np.where(input < 0, 0, 1)
        self.gradInput = pass_through * gradOutput
        return self.gradInput

    def __repr__(self):
        return "ReLU"

## 6. Leaky ReLU
Implement [**Leaky Rectified Linear Unit**](http://en.wikipedia.org/wiki/Rectifier_%28neural_networks%29#Leaky_ReLUs). Experiment with the slope.

You need to **define forward and backward** passes for this layer (updateOutput, updateGradInput).

In [None]:
class LeakyReLU(Module):
    """
    Leaky Rectified Linear Unit, element-wise:
    `output = input` where `input >= 0`, `slope * input` where `input < 0`.
    """

    def __init__(self, slope = 0.03):
        super(LeakyReLU, self).__init__()

        self.slope = slope

    def _local_slope(self, input):
        """
        Derivative of the activation at every element of `input`:
        `slope` where `input < 0`, 1 elsewhere.
        """
        return np.where(input < 0, self.slope, 1)

    def updateOutput(self, input):
        """
        Apply the activation, store the result in `self.output` and return it.
        """
        self.output = np.where(input < 0, self.slope * input, input)
        # Per-element derivative for the latest forward input.
        self.negative = self._local_slope(input)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Gradient w.r.t. `input`: `gradOutput` scaled by the local derivative.

        The derivative is computed from the `input` argument, so the result
        does not depend on which input the last forward pass happened to see.
        """
        self.gradInput = self._local_slope(input) * gradOutput
        return self.gradInput

    def __repr__(self):
        return "LeakyReLU"

## 7. ELU
Implement [**Exponential Linear Units**](http://arxiv.org/abs/1511.07289) activations.

You need to **define forward and backward** passes for this layer (updateOutput, updateGradInput).

In [None]:
class ELU(Module):
    """
    Exponential Linear Unit, element-wise:
    `output = input` where `input >= 0`,
    `alpha * (exp(input) - 1)` where `input < 0`.
    """

    def __init__(self, alpha = 1.0):
        super(ELU, self).__init__()

        self.alpha = alpha

    def updateOutput(self, input):
        """
        Apply the activation, store the result in `self.output` and return it.

        Implemented as `updateOutput` (not `forward`) so that both
        `forward(input)` and a direct `updateOutput(input)` call work.
        """
        # np.where evaluates both branches for every element. Clamping the
        # argument at 0 keeps the exponential from overflowing on large
        # positive inputs, whose exponential branch is discarded anyway.
        negative_branch = self.alpha * np.expm1(np.minimum(input, 0))
        self.output = np.where(input < 0, negative_branch, input)
        # Sign mask of the latest forward input: -1 where input < 0, 1 elsewhere.
        self.negative = np.where(input < 0, -1, 1)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Gradient w.r.t. `input`: `gradOutput` where `input >= 0`,
        `alpha * exp(input) * gradOutput` where `input < 0`.

        Everything is derived from the `input` argument, so the result does
        not depend on which input the last forward pass happened to see.
        """
        # Same clamping as in `updateOutput`: exp never sees a positive value.
        negative_slope = self.alpha * np.exp(np.minimum(input, 0))
        self.gradInput = np.where(input < 0, negative_slope * gradOutput, gradOutput)
        return self.gradInput

    def __repr__(self):
        return "ELU"

# Criterions

Criterions are used to score the model's answers.

In [None]:
class Criterion(object):
    """
    Base class of loss functions: scores an `input` against a `target`.
    """

    def __init__(self):
        # Latest loss value (filled in by subclasses).
        self.output = None
        # Latest gradient of the loss w.r.t. the input (filled in by subclasses).
        self.gradInput = None

    def forward(self, input, target):
        """
            Compute the loss associated with the criterion for the given
            `input` and `target`, and return it.

            For consistency this function should not be overridden;
            all the code goes in `updateOutput`.
        """
        loss = self.updateOutput(input, target)
        return loss

    def backward(self, input, target):
        """
            Compute the gradient of the loss associated with the criterion
            for the given `input` and `target`, and return it.

            For consistency this function should not be overridden;
            all the code goes in `updateGradInput`.
        """
        gradient = self.updateGradInput(input, target)
        return gradient

    def updateOutput(self, input, target):
        """
        Function to override. The base version returns the `output` field as is.
        """
        return self.output

    def updateGradInput(self, input, target):
        """
        Function to override. The base version returns the `gradInput` field as is.
        """
        return self.gradInput

    def __repr__(self):
        """
        Pretty printing. Should be overridden in every criterion that wants
        a readable description.
        """
        return "Criterion"

The **MSECriterion**, which is the basic L2 loss usually used for regression, is implemented here for you.
- input:   **`batch_size x n_feats`**
- target: **`batch_size x n_feats`**
- output: **scalar**

In [None]:
class MSECriterion(Criterion):
    """
    Squared-error loss: the sum of squared differences between `input` and
    `target`, divided by the batch size `input.shape[0]`.
    """

    def __init__(self):
        super(MSECriterion, self).__init__()

    def updateOutput(self, input, target):
        """
        Compute the loss, store it in `self.output` and return it.

        The computation lives in `updateOutput` (not `forward`), so both
        `forward(input, target)` and a direct `updateOutput(input, target)`
        call return the freshly computed loss.
        """
        self.output = np.sum(np.power(input - target, 2)) / input.shape[0]
        return self.output

    def updateGradInput(self, input, target):
        """
        Gradient of the loss w.r.t. `input`: `2 * (input - target) / batch_size`.
        """
        self.gradInput = (input - target) * 2 / input.shape[0]
        return self.gradInput

    def __repr__(self):
        return "MSECriterion"

## 9. Negative LogLikelihood criterion (numerically unstable)
Your task is to implement the **ClassNLLCriterion**. It should implement [multiclass log loss](http://scikit-learn.org/stable/modules/model_evaluation.html#log-loss). Although there is a sum over `y` (target) in that formula, 
remember that targets are one-hot encoded. This fact simplifies the computations a lot. Note that criterions are the only places where you divide by the batch size. Also, there is a small hack: a small number is added to the probabilities to avoid computing log(0).
- input:   **`batch_size x n_feats`** - probabilities
- target: **`batch_size x n_feats`** - one-hot representation of ground truth
- output: **scalar**

You need to **define forward and backward** passes for this criterion (updateOutput, updateGradInput).

In [None]:
class ClassNLLCriterionUnstable(Criterion):
    """
    Negative log-likelihood computed from probabilities.

    `input` holds probabilities and `target` has the same shape; the loss is
    `-sum(target * log(input)) / batch_size`.
    """

    # Probabilities are clipped to [EPS, 1 - EPS] before log / division.
    EPS = 1e-15

    def __init__(self):
        super(ClassNLLCriterionUnstable, self).__init__()

    def updateOutput(self, input, target):
        """
        Compute the loss, store it in `self.output` and return it.
        """
        probs = np.clip(input, self.EPS, 1 - self.EPS)
        batch_size = probs.shape[0]

        weighted_log_sum = np.sum(target * np.log(probs))
        self.output = -weighted_log_sum / batch_size
        return self.output

    def updateGradInput(self, input, target):
        """
        Gradient of the loss w.r.t. `input`: `-target * (1 / input) / batch_size`,
        evaluated on the clipped probabilities.
        """
        probs = np.clip(input, self.EPS, 1 - self.EPS)
        batch_size = probs.shape[0]

        inverse_probs = 1 / probs
        self.gradInput = -(target * inverse_probs) / batch_size
        return self.gradInput

    def __repr__(self):
        return "ClassNLLCriterionUnstable"
    

## 10. Negative LogLikelihood criterion (numerically stable)
- input:   **`batch_size x n_feats`** - log probabilities
- target: **`batch_size x n_feats`** - one-hot representation of ground truth
- output: **scalar**

Task is similar to the previous one, but now the criterion input is the output of log-softmax layer. This decomposition allows us to avoid problems with computation of forward and backward of log().

You need to **define forward and backward** passes for this criterion (updateOutput, updateGradInput).

In [None]:
class ClassNLLCriterion(Criterion):
    """
    Negative log-likelihood computed from log-probabilities.

    `input` holds log-probabilities and `target` has the same shape; the loss
    is `-sum(target * input) / batch_size`.
    """

    # Not read by any method of this class.
    EPS = 0

    def __init__(self):
        super(ClassNLLCriterion, self).__init__()

    def updateOutput(self, input, target):
        """
        Compute the loss, store it in `self.output` and return it.
        """
        batch_size = input.shape[0]
        picked_sum = np.sum(target * input)
        self.output = -picked_sum / batch_size
        return self.output

    def updateGradInput(self, input, target):
        """
        Gradient of the loss w.r.t. `input`: `-target / batch_size`.
        """
        batch_size = input.shape[0]
        self.gradInput = -target / batch_size
        return self.gradInput

    def __repr__(self):
        return "ClassNLLCriterion"
    