In [2]:
import numpy as np

# Module 

A **Module** is a black box that performs forward propagation (computing an output from an input) and backward propagation (computing gradients from the gradient of the output).

In [3]:
class Module(object):
    """
    Base building block of the network.

    Treat it as an abstract class: concrete layers are expected to override
    the methods below. As written, it behaves like an identity layer without
    parameters (the output is the input, the input gradient is the output
    gradient).
    """

    def __init__(self):
        """
        Typical usage:

            output = module.forward(input)
            gradInput = module.backward(input, gradOutput)
        """
        self.output = None     # value stored by the latest updateOutput call
        self.gradInput = None  # value stored by the latest updateGradInput call
        self.train = True      # mode flag, toggled by training() / evaluate()

    def forward(self, inpt):
        """
        Forward propagation: maps an input to the corresponding output.
        Simply delegates to `updateOutput`.
        """
        result = self.updateOutput(inpt)
        return result

    def backward(self, inpt, gradOutput):
        """
        One backpropagation step through the module for the given input.

        Two things happen, in this order:
         - the gradient w.r.t. the input is computed (see updateGradInput),
         - the gradient w.r.t. the parameters is computed so that an optimizer
           can update them (see accGradParameters).

        Returns the stored `gradInput`.
        """
        self.updateGradInput(inpt, gradOutput)
        self.accGradParameters(inpt, gradOutput)
        return self.gradInput

    def updateOutput(self, inpt):
        """
        Computes the output from the input and the current parameters, stores
        it in `output` and returns it. Called by `forward`.
        """
        # Default behaviour: identity mapping.
        self.output = inpt
        return inpt

    def updateGradInput(self, inpt, gradOutput):
        """
        Computes the gradient of the module with respect to its own input,
        stores it in `gradInput` and returns it.

        The shape of `gradInput` is expected to match the shape of the input.
        Overriding implementations must both store and return the gradient.
        """
        # Default behaviour: the gradient passes through untouched.
        self.gradInput = gradOutput
        return gradOutput

    def accGradParameters(self, inpt, gradOutput):
        """
        Computes the gradient of the module with respect to its own parameters.
        Nothing to do for modules without parameters (e.g. ReLU).
        """

    def zeroGradParameters(self):
        """
        Resets the parameter gradients to zero if the module has parameters.
        """

    def getParameters(self):
        """
        Returns the list of parameters; empty for a parameter-free module.
        """
        return []

    def getGradParameters(self):
        """
        Returns the list of gradients w.r.t. the parameters; empty for a
        parameter-free module.
        """
        return []

    def training(self):
        """
        Switches the module to training mode.
        Modules such as Dropout or BatchNorm behave differently in each mode.
        """
        self.train = True

    def evaluate(self):
        """
        Switches the module to evaluation mode.
        Modules such as Dropout or BatchNorm behave differently in each mode.
        """
        self.train = False

    def __repr__(self):
        """
        Pretty printing. Override in every module that should have a readable
        description.
        """
        return "Module"

# Sequential container

Define forward and backward propagation workflows here.

In [4]:
class Sequential(Module):
    """
    Container that chains modules (layers) one after another.

    The input is passed through every module in `self.modules` in order; the
    output of the last module is the output of the container. A container
    without modules behaves as the identity.
    """

    def __init__(self):
        super(Sequential, self).__init__()
        self.modules = []
        # Outputs of every module from the latest forward pass; they are the
        # inputs that backward() hands to the following module.
        self.y = []

    def add(self, module):
        """
        Adds a module (layer) to the end of the container.
        """
        self.modules.append(module)

    def updateOutput(self, inpt):
        """
        Forward pass (called from `forward` of the base class):

            y_0    = module[0].forward(input)
            y_1    = module[1].forward(y_0)
            ...
            output = module[n-1].forward(y_{n-2})

        Every intermediate y_i is kept in `self.y` for the backward pass.
        """
        self.y = []
        current = inpt
        for module in self.modules:
            current = module.forward(current)
            self.y.append(current)
        self.output = current
        return self.output

    def backward(self, inpt, gradOutput):
        """
        Backward pass:

            g_{n-1} = module[n-1].backward(y_{n-2}, gradOutput)
            g_{n-2} = module[n-2].backward(y_{n-3}, g_{n-1})
            ...
            g_1 = module[1].backward(y_0, g_2)
            gradInput = module[0].backward(input, g_1)

        Each module receives the input it saw during the forward pass
        (y_{i-1}) and the gradient produced by the module after it.

        Raises RuntimeError if the stored activations do not match the current
        list of modules (forward was not run, or modules were added since).
        """
        if len(self.y) != len(self.modules):
            raise RuntimeError(
                "Sequential.backward requires a forward pass over the current modules first"
            )

        # Input of module i is the container input for i == 0, else y_{i-1}.
        # Building the list this way also covers containers with 0 or 1 module.
        layer_inputs = [inpt] + self.y[:-1]

        grad = gradOutput
        for module, layer_input in zip(reversed(self.modules), reversed(layer_inputs)):
            grad = module.backward(layer_input, grad)

        self.gradInput = grad
        return self.gradInput

    def zeroGradParameters(self):
        """
        Zeroes the parameter gradients of every contained module.
        """
        for module in self.modules:
            module.zeroGradParameters()

    def getParameters(self):
        """
        Gathers the parameters of all modules: one entry (itself a list) per module.
        """
        return [module.getParameters() for module in self.modules]

    def getGradParameters(self):
        """
        Gathers the parameter gradients of all modules: one entry (itself a
        list) per module, in the same order as `getParameters`.
        """
        return [module.getGradParameters() for module in self.modules]

    def training(self):
        """
        Switches the container and every contained module to training mode.
        """
        super(Sequential, self).training()
        for module in self.modules:
            module.training()

    def evaluate(self):
        """
        Switches the container and every contained module to evaluation mode.
        """
        super(Sequential, self).evaluate()
        for module in self.modules:
            module.evaluate()

    def __repr__(self):
        # One line per module, each terminated by a newline.
        return "".join([str(module) + '\n' for module in self.modules])

    def __getitem__(self, x):
        """
        Indexes (or slices) the underlying list of modules.
        """
        return self.modules.__getitem__(x)

# Layers

In [5]:
class Linear(Module):
    """
    Affine (fully connected) layer: output = input . W^T + b.

    Intended for batched 2D input of shape (n_samples, n_features).
    `W` has shape (n_out, n_in) and `b` has shape (n_out,).
    """
    def __init__(self, n_in, n_out):
        super(Linear, self).__init__()

        # Parameters are drawn uniformly from [-0.05, 0.05) (not from a
        # standard normal). The bias is sampled before the weights; keep that
        # order so that seeded runs stay reproducible.
        low, high = -0.05, 0.05
        self.b = np.random.uniform(low, high, n_out)
        self.W = np.random.uniform(low, high, (n_out, n_in))

        # Gradient buffers with the shapes of the parameters.
        self.gradW = np.zeros_like(self.W)
        self.gradb = np.zeros_like(self.b)

    def updateOutput(self, inpt):
        """
        Computes, stores and returns `inpt . W^T + b`.
        """
        projected = inpt.dot(self.W.T)
        self.output = projected + self.b
        return self.output

    def updateGradInput(self, inpt, gradOutput):
        """
        Gradient w.r.t. the input: `gradOutput . W`.
        """
        grad_wrt_input = gradOutput.dot(self.W)
        self.gradInput = grad_wrt_input
        return self.gradInput

    def accGradParameters(self, inpt, gradOutput):
        """
        Gradients w.r.t. the parameters for the current batch.

        NOTE(review): the gradients are assigned, not added to the previous
        values, so `gradW` / `gradb` are rebound to new arrays on every call.
        """
        grad_out_t = gradOutput.T
        self.gradW = grad_out_t.dot(inpt)
        self.gradb = gradOutput.sum(axis=0)

    def zeroGradParameters(self):
        """
        Zeroes both gradient buffers in place.
        """
        for buf in (self.gradW, self.gradb):
            buf.fill(0)

    def getParameters(self):
        """
        Returns `[W, b]`.
        """
        return [self.W, self.b]

    def getGradParameters(self):
        """
        Returns `[gradW, gradb]`, in the same order as `getParameters`.
        """
        return [self.gradW, self.gradb]

    def __repr__(self):
        n_out, n_in = self.W.shape
        return 'Linear %d -> %d' % (n_in, n_out)

In [6]:
class SoftMax(Module):
    """
    Row-wise softmax: every row of a 2D input is mapped to a vector of
    positive values that sum to one.
    """
    def __init__(self):
        super(SoftMax, self).__init__()

    def updateOutput(self, inpt):
        """
        Computes, stores and returns softmax(inpt) along axis 1.
        """
        # Softmax is invariant to adding a constant to a row. Subtracting the
        # row maximum keeps every exponent <= 0, so np.exp cannot overflow
        # (the naive form yields inf / inf = NaN for large logits).
        shifted = inpt - np.max(inpt, axis=1, keepdims=True)
        exps = np.exp(shifted)
        self.output = exps / np.sum(exps, axis=1, keepdims=True)
        return self.output

    def updateGradInput(self, inpt, gradOutput):
        """
        Gradient w.r.t. the input, using the output stored by the latest
        forward pass:

            gradInput = s * (gradOutput - sum_j(gradOutput_j * s_j))

        where `s` is the softmax output and the sum runs over each row.
        """
        weighted_sum = np.sum(gradOutput * self.output, axis=1, keepdims=True)
        self.gradInput = self.output * (gradOutput - weighted_sum)
        return self.gradInput

    def __repr__(self):
        return "SoftMax"

# Activations

In [7]:
class Sigmoid(Module):
    """
    Element-wise logistic sigmoid: 1 / (1 + exp(-x)).
    """
    def __init__(self):
        super(Sigmoid, self).__init__()

    def updateOutput(self, inpt):
        """
        Computes, stores and returns the sigmoid of the input.
        """
        denominator = 1 + np.exp(-inpt)
        self.output = 1 / denominator
        return self.output

    def updateGradInput(self, inpt, gradOutput):
        """
        Gradient w.r.t. the input: gradOutput * s * (1 - s), with s = sigmoid(inpt).

        The activation is recomputed from `inpt` here (and `self.output` is
        overwritten with it), so the result does not depend on the output
        cached by a previous forward pass.
        """
        denominator = 1 + np.exp(-inpt)
        self.output = 1 / denominator

        local_derivative = self.output * (1 - self.output)
        self.gradInput = gradOutput * local_derivative
        return self.gradInput

    def __repr__(self):
        return "Sigmoid"
    


In [8]:
class Tanh(Module):
    """
    Element-wise hyperbolic tangent activation.
    """
    def __init__(self):
        super(Tanh, self).__init__()

    def updateOutput(self, inpt):
        """
        Computes, stores and returns tanh(inpt).
        """
        # np.tanh instead of (1 - exp(-2x)) / (1 + exp(-2x)): for large
        # negative x the exponential overflows and that formula evaluates to
        # -inf / inf = NaN, while np.tanh saturates cleanly at -1.
        self.output = np.tanh(inpt)
        return self.output

    def updateGradInput(self, inpt, gradOutput):
        """
        Gradient w.r.t. the input: gradOutput * (1 - tanh(x)^2), using the
        output stored by the latest forward pass.
        """
        self.gradInput = gradOutput * (1 - self.output ** 2)
        return self.gradInput

    def __repr__(self):
        return "Tanh"

In [9]:
class ReLU(Module):
    """
    Rectified linear unit: element-wise max(x, 0).
    """
    def __init__(self):
        super(ReLU, self).__init__()

    def updateOutput(self, inpt):
        """
        Computes, stores and returns the element-wise maximum of the input and 0.
        """
        rectified = np.maximum(inpt, 0)
        self.output = rectified
        return self.output

    def updateGradInput(self, inpt, gradOutput):
        """
        Gradient w.r.t. the input: `gradOutput` where the input is strictly
        positive, zero elsewhere.
        """
        is_active = inpt > 0
        self.gradInput = gradOutput * is_active
        return self.gradInput

    def __repr__(self):
        return "ReLU"

# Criterions

In [10]:
class Criterion(object):
    """
    Base class for criterions (loss functions).

    Subclasses override `updateOutput` and `updateGradInput`; the base
    versions only return the stored values without computing anything.
    """
    def __init__(self):
        self.output = None     # expected to hold the latest loss value
        self.gradInput = None  # expected to hold the latest gradient w.r.t. the input

    def forward(self, inpt, target):
        """
        Computes the loss associated with the criterion for the given input
        and target, and returns it.

        (the actual computation lives in updateOutput)
        """
        loss = self.updateOutput(inpt, target)
        return loss

    def backward(self, inpt, target):
        """
        Computes the gradient of the loss with respect to the input, for the
        given input and target, and returns it.

        (the actual computation lives in updateGradInput)
        """
        grad = self.updateGradInput(inpt, target)
        return grad

    def updateOutput(self, inpt, target):
        """
        Override this. The base version returns the stored `output` unchanged.
        """
        return self.output

    def updateGradInput(self, inpt, target):
        """
        Override this. The base version returns the stored `gradInput` unchanged.
        """
        return self.gradInput

    def __repr__(self):
        """
        Pretty printing. Override in every criterion that should have a
        readable description.
        """
        return "Criterion"

**MSECriterion**: half of the mean squared error between the input and the target (a scaled squared L2 distance).

In [26]:
class MSECriterion(Criterion):
    """
    Half mean squared error: 0.5 * mean((inpt - target)^2), where the mean is
    taken over every element of the input.
    """
    def __init__(self):
        super(MSECriterion, self).__init__()

    def updateOutput(self, inpt, target):
        """
        Computes, stores and returns the loss value.
        """
        residual = inpt - target
        # 0.5 rather than (1/2): the latter is integer division (== 0) on Python 2.
        self.output = 0.5 * np.mean(residual ** 2)
        return self.output

    def updateGradInput(self, inpt, target):
        """
        Computes, stores and returns the gradient of the loss w.r.t. the input.

        The loss averages over all elements, so its derivative is
        (inpt - target) / (number of elements). Dividing by the number of rows
        only matches the loss when the input has a single column.
        """
        n_elements = float(np.size(inpt))
        self.gradInput = (inpt - target) / n_elements
        return self.gradInput

    def __repr__(self):
        return "MSECriterion"

**CrossEntropyCriterion**. ([multiclass log loss](http://ml-cheatsheet.readthedocs.io/en/latest/loss_functions.html#cross-entropy)). Remember that targets are one-hot encoded.

In [12]:
class CrossEntropyCriterion(Criterion):
    """
    Multiclass cross-entropy (log loss) averaged over the batch:

        loss = -(1 / N) * sum(target * log(inpt))

    where N is the number of rows. `inpt` is expected to hold probabilities
    and `target` is one-hot encoded.
    """
    def __init__(self):
        super(CrossEntropyCriterion, self).__init__()

    def updateOutput(self, inpt, target):
        """
        Computes, stores and returns the loss value.
        """
        # Keep probabilities away from 0 and 1 so that log() stays finite.
        input_clamp = np.clip(inpt, 1e-15, 1 - 1e-15)

        n_samples = float(len(input_clamp))
        self.output = -np.sum(target * np.log(input_clamp)) / n_samples
        return self.output

    def updateGradInput(self, inpt, target):
        """
        Computes, stores and returns the gradient of the loss w.r.t. the input.

        Differentiating the loss above gives -target / inpt / N. There is no
        (1 - target) / (1 - inpt) term (that belongs to binary cross-entropy)
        and the 1 / N factor of the batch average must be present.
        """
        # Same clamp as in the forward pass; it also avoids division by zero.
        input_clamp = np.clip(inpt, 1e-15, 1 - 1e-15)

        n_samples = float(len(input_clamp))
        self.gradInput = -target / input_clamp / n_samples
        return self.gradInput

    def __repr__(self):
        return "CrossEntropyCriterion"

In [15]:
class Dropout(Module):
    """
    Standard (non-inverted) dropout.

    `p` is the probability of KEEPING a unit: the mask is sampled from
    Bernoulli(p). In training mode the input is multiplied by the mask; in
    evaluation mode it is scaled by `p` instead.
    """
    def __init__(self, p=0.5):
        super(Dropout, self).__init__()
        self.p = p
        self.mask = None  # mask sampled by the latest training-mode forward pass

    def updateOutput(self, inpt):
        """
        Computes, stores and returns the output for the current mode.
        """
        if self.train:
            # A fresh 0/1 mask is drawn on every training forward pass.
            self.mask = np.random.binomial(1, self.p, size=inpt.shape)
            self.output = inpt * self.mask
        else:
            self.output = inpt * self.p
        return self.output

    def updateGradInput(self, inpt, gradOutput):
        """
        Computes, stores and returns the gradient w.r.t. the input.

        Training mode reuses the mask from the forward pass. Evaluation mode
        mirrors the forward scaling by `p`, so `gradInput` is always refreshed
        and a stale value (or None) is never returned.
        """
        if self.train:
            self.gradInput = gradOutput * self.mask
        else:
            self.gradInput = gradOutput * self.p
        return self.gradInput

    def __repr__(self):
        return "Dropout"

In [27]:
class Dropout_Inverted(Module):
    """
    Inverted dropout.

    `p` is the probability of KEEPING a unit. In training mode the Bernoulli(p)
    mask is divided by `p`, so surviving activations are scaled up; in
    evaluation mode the input is passed through unchanged.
    """
    def __init__(self, p=0.5):
        super(Dropout_Inverted, self).__init__()
        # The mask is divided by p, so p == 0 would produce NaN/inf masks.
        if not 0 < p <= 1:
            raise ValueError("keep probability p must be in (0, 1], got %r" % (p,))
        self.p = p
        self.mask = None  # scaled mask sampled by the latest training-mode forward pass

    def updateOutput(self, inpt):
        """
        Computes, stores and returns the output for the current mode.
        """
        if self.train:
            # Entries of the mask are either 0 or 1 / p.
            self.mask = np.random.binomial(1, self.p, size=inpt.shape) / self.p
            self.output = inpt * self.mask
        else:
            self.output = inpt
        return self.output

    def updateGradInput(self, inpt, gradOutput):
        """
        Computes, stores and returns the gradient w.r.t. the input.

        Training mode reuses the scaled mask from the forward pass. Evaluation
        mode is the identity, so the gradient passes through unchanged instead
        of leaving a stale `gradInput` (or None) behind.
        """
        if self.train:
            self.gradInput = gradOutput * self.mask
        else:
            self.gradInput = gradOutput
        return self.gradInput

    def __repr__(self):
        # Distinct from plain Dropout so that printed networks are unambiguous.
        return "Dropout_Inverted"