In [1]:
import numpy as np

In [2]:
class Operation:
    """Represents a Node in the Computation Graph.

    An operation reads the outputs of its ``input_nodes`` and produces an
    output for zero or more ``consumers``. Creating an operation appends it
    to the ``consumers`` list of each input and registers it in
    ``_default_graph.operations``.
    """

    def __init__(self, input_nodes=None):
        """Constructs an Operation with input_nodes as inputs
           which computes outputs to zero or more consumers.

        input_nodes: list of nodes feeding this operation; ``None`` (the
            default) means the operation has no inputs.
        """
        # A literal ``[]`` default would be created once and shared by every
        # operation constructed without arguments, so build a fresh list here.
        self.input_nodes = [] if input_nodes is None else input_nodes
        self.consumers = []

        # Connect this node with its inputs, by adding it as a consumer to its inputs
        for input_node in self.input_nodes:
            input_node.consumers.append(self)

        # Add this operation to the Computation Graph
        # TODO: provide the graph explicitly
        _default_graph.operations.append(self)

    def compute(self):
        """Computes the output of the operation. Depends on the specific operation.

        The base implementation does nothing and returns None; subclasses
        override it with one positional parameter per input node.
        """
        pass

In [3]:
class Add(Operation):
    """Operation with two inputs whose output is ``x + y``."""

    def __init__(self, x, y):
        """Registers x and y (in that order) as the inputs of this node."""
        operands = [x, y]
        super().__init__(input_nodes=operands)

    def compute(self, x, y):
        """Returns the sum of the two already computed input values."""
        total = x + y
        return total

In [4]:
class Matmul(Operation):
    """Operation with two inputs whose output is ``A.dot(B)``."""

    def __init__(self, A, B):
        """Registers A and B (in that order) as the inputs of this node."""
        factors = [A, B]
        super().__init__(input_nodes=factors)

    def compute(self, A, B):
        """Returns the dot product of the two already computed input values."""
        product = A.dot(B)
        return product

In [5]:
class Sigmoid(Operation):
    """Operation with one input whose output is the logistic function of x."""

    def __init__(self, x):
        """Registers x as the only input of this node."""
        super().__init__(input_nodes=[x])

    def compute(self, x):
        """Returns 1 / (1 + e^(-x)) for the already computed input value."""
        exp_of_negated_input = np.exp(-x)
        return 1 / (1 + exp_of_negated_input)

In [6]:
class Softmax(Operation):
    """Operation with one input whose output is the softmax taken along axis 1."""

    def __init__(self, x):
        """Registers x as the only input of this node."""
        super().__init__(input_nodes=[x])

    def compute(self, x):
        """Returns exp(x) normalized so that the entries along axis 1 sum to 1.

        x must have at least 2 dimensions because the normalization runs
        along axis 1 (for a 2-D batch: one distribution per row).
        """
        # Subtracting the maximum along axis 1 leaves the result unchanged
        # mathematically, but keeps np.exp from overflowing to inf (and the
        # division from producing nan) when the inputs are large.
        shifted = x - np.max(x, axis=1, keepdims=True)

        # exponentiate once and reuse it for numerator and denominator
        exponentials = np.exp(shifted)

        # keepdims=True keeps the summed axis as size 1, so the division
        # broadcasts the same way the explicit [:, None] indexing does
        return exponentials / np.sum(exponentials, axis=1, keepdims=True)

In [7]:
class Log(Operation):
    """Operation with one input whose output is ``np.log(x)``."""

    def __init__(self, x):
        """Registers x as the only input of this node."""
        super().__init__(input_nodes=[x])

    def compute(self, x):
        """Returns the natural logarithm of the already computed input value."""
        logarithm = np.log(x)
        return logarithm

In [8]:
class Multiply(Operation):
    """Multiplies its two inputs A and B with the ``*`` operator
       (element-wise for numpy arrays)."""

    def __init__(self, A, B):
        """Registers A and B (in that order) as the inputs of this node."""
        factors = [A, B]
        super().__init__(input_nodes=factors)

    def compute(self, A, B):
        """Returns ``A * B`` for the already computed input values."""
        product = A * B
        return product

In [9]:
class ReduceSum(Operation):
    """Sums the input tensor A along ``axis``.
       With axis=None every element of A is summed.
    """

    def __init__(self, A, axis=None):
        """Registers A as the only input and remembers the axis to sum over."""
        super().__init__(input_nodes=[A])
        self.axis = axis

    def compute(self, A):
        """Returns ``np.sum`` of the already computed input over ``self.axis``."""
        reduction_axis = self.axis
        return np.sum(A, axis=reduction_axis)

In [10]:
class Negate(Operation):
    """Operation with one input whose output is ``-x``."""

    def __init__(self, x):
        """Registers x as the only input of this node."""
        super().__init__(input_nodes=[x])

    def compute(self, x):
        """Returns the negation of the already computed input value."""
        negated = -x
        return negated

In [11]:
class Placeholder:
    """Input node of the Computation Graph.

       It has no inputs of its own and can only be consumed by other nodes.
       The node stores no value itself: Session.run reads the value for a
       placeholder from the feed_dict passed to it."""

    def __init__(self):
        # nodes which read this placeholder; filled in by the consuming Operations
        self.consumers = list()

        # TODO: provide the graph explicitly
        # make the placeholder known to the current default graph
        default_graph = _default_graph
        default_graph.placeholders.append(self)

In [12]:
class Variable:
    """Parameter node of the Computation Graph.

       It has no inputs of its own and can only be consumed by other nodes.
       Its current value lives in ``value``, starts out as initial_value
       and may be reassigned later."""

    def __init__(self, initial_value=None):
        # nodes which read this variable; filled in by the consuming Operations
        self.consumers = list()
        self.value = initial_value

        # TODO: provide the graph explicitly
        # make the variable known to the current default graph
        default_graph = _default_graph
        default_graph.variables.append(self)

In [13]:
class Graph:
    """Represents the actual Computation Graph which has 3 types of Nodes:
       - placeholders
       - variables
       - operations

       Each type is kept in its own list on the graph instance.
    """

    def __init__(self, placeholders=None, variables=None, operations=None):
        """Creates a graph, optionally pre-populated with the given node lists.

        A list passed in is used as-is (not copied); an omitted argument
        gives the graph a new empty list of its own.
        """
        # Literal ``[]`` defaults would be created once and shared by every
        # Graph(), so nodes registered in one graph would show up in all of
        # them. Build fresh lists per instance instead.
        self.placeholders = [] if placeholders is None else placeholders
        self.variables = [] if variables is None else variables
        self.operations = [] if operations is None else operations

    def as_default(self):
        """Makes this graph the module-level ``_default_graph`` and returns it."""
        global _default_graph
        _default_graph = self
        return _default_graph

In [14]:
class Session:
    """Represents a single execution of the whole Computation graph."""
    # TODO: provide the Graph explicitly

    def run(self, operation, feed_dict=None):
        """Computes and returns the output of ``operation``.

        Performs a post-order traversal of all nodes ``operation`` depends on,
        so that the inputs of every operation are computed before the
        operation itself.

        operation: the node whose output is requested.
        feed_dict: maps each Placeholder the operation depends on to its
            value; ``None`` (the default) means no placeholders are fed.

        Raises KeyError if a required Placeholder is missing from feed_dict.
        """
        if feed_dict is None:
            feed_dict = {}

        nodes_in_post_order = Session.traverse_post_order(operation)

        # nodes of an unknown type keep None as their output
        outputs = dict.fromkeys(nodes_in_post_order)

        for node in nodes_in_post_order:
            if isinstance(node, Placeholder):
                outputs[node] = feed_dict[node]
            elif isinstance(node, Variable):
                outputs[node] = node.value
            elif isinstance(node, Operation):
                # post-order guarantees that every input is already computed
                computed_inputs = [outputs[input_node] for input_node in node.input_nodes]
                outputs[node] = node.compute(*computed_inputs)

        return outputs[operation]

    @staticmethod
    def traverse_post_order(operation):
        """Returns the nodes ``operation`` depends on (itself included, last),
        ordered so that every node comes after all of its inputs.

        Each node is listed exactly once, even when several operations
        consume it, so shared sub-graphs are not traversed or computed
        repeatedly.
        """
        nodes_post_order = []
        visited = set()

        def traverse(node):
            if node in visited:
                return
            visited.add(node)

            # Placeholders and Variables do not have input_nodes
            if isinstance(node, Operation):
                for input_node in node.input_nodes:
                    traverse(input_node)

            nodes_post_order.append(node)

        traverse(operation)
        return nodes_post_order

In [15]:
# Linear Perceptron: y = A.dot(x) + b

graph = Graph()
graph.as_default()

# parameters of the perceptron
A = Variable(initial_value=np.array([[1, 0], [0, -1]]))
b = Variable(initial_value=np.array([1, 1]))

# the input is supplied when the graph is run
x = Placeholder()

y = Add(Matmul(A, x), b)

Session().run(y, feed_dict={x: np.array([1, 2])})

array([ 2, -1])

In [16]:
# Sigmoid Perceptron: sigmoid(w.dot(x) + b) with randomly initialized w and b

graph = Graph()
graph.as_default()

# the input is supplied when the graph is run
x = Placeholder()

# weights and bias are drawn from a standard normal distribution
w = Variable(initial_value=np.random.normal(0, 1, 2))
b = Variable(initial_value=np.random.normal(0, 1))

perceptron = Sigmoid(Add(Matmul(w, x), b))

Session().run(perceptron, feed_dict={x: np.array([-1, 1])})

0.17487770808997996

In [17]:
# Multi-class Perceptron: softmax(X.dot(W) + b) evaluated on a batch

graph = Graph()
graph.as_default()

# one training example per row
training_examples = np.array([[-3, -3], [-3, -4], [4, 5], [3, 6]])

# one label row per training example
labels = np.array([[0, 1], [0, 1], [1, 0], [1, 0]])

# will be a matrix used for batch computation
X = Placeholder()

W = Variable(initial_value=np.array([[1, -1], [1, -1]]))
b = Variable(initial_value=np.array([0, 0]))

classifier = Softmax(Add(Matmul(X, W), b))

Session().run(classifier, feed_dict={X: training_examples})

array([[6.14417460e-06, 9.99993856e-01],
       [8.31528028e-07, 9.99999168e-01],
       [9.99999985e-01, 1.52299795e-08],
       [9.99999985e-01, 1.52299795e-08]])

In [18]:
# Cross-entropy loss: -sum(C * log(classifier)) over every element

# the expected labels are supplied when the graph is run
C = Placeholder()

# TODO: not sure about the ReduceSum over all dimensions here
cross_entropy_loss = Negate(ReduceSum(Multiply(C, Log(classifier))))

Session().run(cross_entropy_loss, feed_dict={X: training_examples, C: labels})

7.006181810554592e-06

In [19]:
# Gradient Descent Optimizer

class GradientDescentOptimizer:
    """Builds an operation which performs one gradient descent step."""

    def __init__(self, learning_rate=0.01):
        """learning_rate: factor applied to each gradient before it is
        subtracted from the corresponding Variable."""
        self.learning_rate = learning_rate

    def minimize(self, loss):
        """Returns an Operation which, every time it is computed, updates all
        Variables that ``loss`` depends on by one gradient descent step.

        loss: the node of the Computation Graph to minimize.
        """
        learning_rate = self.learning_rate

        class Minimize(Operation):
            # The operation has no input nodes, so Session.run calls compute()
            # without arguments; the loss node is taken from the enclosing
            # minimize() call instead of being a parameter.
            def compute(self):
                # TODO: how to inject compute_gradients or just the gradients at each step?
                gradients_table = compute_gradients(loss)

                for node, gradient in gradients_table.items():
                    if isinstance(node, Variable):
                        # Rebind instead of using ``-=``: an in-place update
                        # cannot store the float result in an integer array
                        # and would also modify the array the caller passed
                        # as initial value.
                        node.value = node.value - learning_rate * gradient

        return Minimize()

In [20]:
# Backwards pass of Backpropagation which computes the gradients

from queue import Queue

def compute_gradients(loss):
    """Backwards pass: computes the gradient of ``loss`` with regard to every
    node it depends on.

    loss: the node to differentiate. Its own gradient is defined as 1.

    Every node with a non-empty ``input_nodes`` list must provide
    ``gradient(loss_gradient_wrt_output)``, which returns the gradient of the
    loss with regard to its inputs: the gradient itself when the node has a
    single input, otherwise a sequence with one entry per input, in the order
    of ``input_nodes``.

    Returns a dict which maps each node (``loss`` included) to its gradient.
    """
    # Find every node the loss depends on and count how many input slots of
    # those nodes each one fills. A node's gradient is complete only after
    # all of these slots have contributed, so the count tells when it is safe
    # to continue backwards from that node.
    pending_contributions = {loss: 0}
    stack = [loss]
    while stack:
        current = stack.pop()
        # Placeholders and Variables do not have input_nodes
        for input_node in getattr(current, 'input_nodes', ()):
            if input_node not in pending_contributions:
                pending_contributions[input_node] = 0
                stack.append(input_node)
            pending_contributions[input_node] += 1

    # the loss gradient with regard to itself is 1
    gradients_table = {loss: 1}

    # nodes whose gradient is fully accumulated
    ready = Queue()
    ready.put(loss)

    # walk from the loss backwards to the inputs
    while not ready.empty():
        node = ready.get()

        input_nodes = getattr(node, 'input_nodes', ())
        if not input_nodes:
            # nothing to propagate to
            continue

        # apply the chain rule, using the node's accumulated gradient,
        # to get the loss gradient with regard to each of the node's inputs
        loss_gradient_wrt_inputs = node.gradient(gradients_table[node])

        if len(input_nodes) == 1:
            # a node with only 1 input returns the gradient itself
            loss_gradient_wrt_inputs = [loss_gradient_wrt_inputs]

        for input_node, gradient in zip(input_nodes, loss_gradient_wrt_inputs):
            # accumulate the gradient coming over this edge
            gradients_table[input_node] = gradients_table.get(input_node, 0) + gradient

            pending_contributions[input_node] -= 1
            if pending_contributions[input_node] == 0:
                ready.put(input_node)

    return gradients_table