In [1]:
import numpy as np

In [2]:
class Node(object):
    """Base class for a vertex of the computation graph.

    A node keeps the list of nodes it reads from (``inbound_nodes``) and the
    list of nodes that read from it (``outbound_nodes``).  Constructing a node
    appends it to the ``outbound_nodes`` of each of its inbound nodes.
    """

    def __init__(self, inbound_nodes=None):
        """Create the node and register it with its inputs.

        Args:
            inbound_nodes: Optional list of nodes this node consumes.  When
                omitted (``None``) the node gets its own new empty list.
        """
        # A literal ``[]`` default is evaluated once, at definition time, so
        # every node created without arguments would share the very same list
        # object; use a ``None`` sentinel and allocate per instance instead.
        if inbound_nodes is None:
            inbound_nodes = []
        self.inbound_nodes = inbound_nodes
        # Filled in by the constructors of the nodes that consume this one.
        self.outbound_nodes = []

        for node in self.inbound_nodes:
            node.outbound_nodes.append(self)

        # Result of the most recent forward() call (None until computed).
        self.value = None
        # Maps a node to a gradient; populated by backward() in subclasses.
        self.gradients = {}

    def forward(self):
        """Compute ``self.value``; subclasses must override."""
        raise NotImplementedError

    def backward(self):
        """Compute ``self.gradients``; subclasses must override."""
        raise NotImplementedError
        
        
class Placeholder(Node):
    """Graph input with no inbound nodes; its value is assigned from outside."""

    def __init__(self):
        Node.__init__(self)

    def forward(self, value=None):
        """Store ``value`` on the node; passing ``None`` leaves it unchanged."""
        if value is None:
            return
        self.value = value

    def backward(self):
        """Sum, into ``gradients[self]``, the gradient each consumer holds for this node."""
        self.gradients = {self: 0}
        for consumer in self.outbound_nodes:
            upstream = consumer.gradients[self]
            # The node passes its value through unchanged, hence the factor 1.
            self.gradients[self] += upstream * 1
            
class Variable(Node):
    """Node with no inbound nodes that carries a value given at construction."""

    def __init__(self, value):
        Node.__init__(self)

        # Node.__init__ resets value to None, so assign afterwards.
        self.value = value

    def forward(self):
        """Nothing to compute: the value is already stored on the node."""
        pass

    def backward(self):
        """Sum, into ``gradients[self]``, the gradient each consumer holds for this node."""
        self.gradients = {self: 0}
        for consumer in self.outbound_nodes:
            upstream = consumer.gradients[self]
            # The node passes its value through unchanged, hence the factor 1.
            self.gradients[self] += upstream * 1
            
class Add(Node):
    """Node whose value is the sum of the values of all its inputs."""

    def __init__(self, *inputs):
        Node.__init__(self, list(inputs))

    def forward(self):
        """Add the inbound values together, starting from ``0``, and store the total."""
        total = 0
        for source in self.inbound_nodes:
            total = total + source.value
        self.value = total
                     
class Mul(Node):
    """Node whose value is the product of the values of all its inputs."""

    def __init__(self, *inputs):
        Node.__init__(self, list(inputs))

    def forward(self):
        """Multiply the inbound values together and store the result.

        The product is taken input by input with ``np.multiply``, so scalar
        inputs give the same scalar as before, while array inputs are
        multiplied element-wise (with broadcasting).  The previous
        ``np.prod(np.array([...]))`` stacked the inputs and reduced over every
        axis, collapsing array-valued inputs into one scalar, which did not
        match the element-wise behaviour of ``Add``.
        """
        product = 1
        for source in self.inbound_nodes:
            product = np.multiply(product, source.value)
        self.value = product

class Linear(Node):
    """Affine node: ``dot(inputs, weights) + bias``."""

    def __init__(self, inputs, weights, bias):
        Node.__init__(self, [inputs, weights, bias])

    def forward(self):
        """Store ``dot(inputs, weights) + bias`` as this node's value."""
        x_node = self.inbound_nodes[0]
        w_node = self.inbound_nodes[1]
        b_node = self.inbound_nodes[2]
        projected = np.dot(x_node.value, w_node.value)
        self.value = projected + b_node.value

    def backward(self):
        """Accumulate gradients for inputs, weights and bias from every consumer."""
        # Start each partial from zeros shaped like the corresponding input value.
        fresh = {}
        for source in self.inbound_nodes:
            fresh[source] = np.zeros_like(source.value)
        self.gradients = fresh

        x_node = self.inbound_nodes[0]
        w_node = self.inbound_nodes[1]
        b_node = self.inbound_nodes[2]

        for consumer in self.outbound_nodes:
            upstream = consumer.gradients[self]

            # w.r.t. inputs: upstream . weights^T
            self.gradients[x_node] += np.dot(upstream, w_node.value.T)
            # w.r.t. weights: inputs^T . upstream
            self.gradients[w_node] += np.dot(x_node.value.T, upstream)
            # w.r.t. bias: upstream summed over axis 0
            self.gradients[b_node] += np.sum(upstream, axis=0, keepdims=False)


            
class Sigmoid(Node):
    """Node applying the logistic function element-wise to its single input."""

    def __init__(self, node):
        Node.__init__(self, [node])

    def _sigmoid(self, x):
        """Return ``1 / (1 + exp(-x))``."""
        denominator = 1.0 + np.exp(-x)
        return 1.0 / denominator

    def forward(self):
        """Store the sigmoid of the inbound value."""
        source = self.inbound_nodes[0]
        self.value = self._sigmoid(source.value)

    def backward(self):
        """Accumulate ``upstream * value * (1 - value)`` for the input node."""
        fresh = {}
        for source in self.inbound_nodes:
            fresh[source] = np.zeros_like(source.value)
        self.gradients = fresh

        for consumer in self.outbound_nodes:
            upstream = consumer.gradients[self]
            # The derivative is expressed through the cached forward value.
            self.gradients[self.inbound_nodes[0]] += upstream * self.value * (1.0 - self.value)

        
class MSE(Node):
    """Mean-squared-error node over its two inputs ``y`` and ``a``."""

    def __init__(self, y, a):
        Node.__init__(self, [y, a])

    def forward(self):
        """Store the mean of the squared differences between the two inputs.

        Both values are reshaped to column vectors before subtracting.  The
        difference and the row count of the first input are cached on the
        node (``self.diff``, ``self.m``) for use by ``backward``.
        """
        first = self.inbound_nodes[0].value
        second = self.inbound_nodes[1].value
        self.diff = first.reshape(-1, 1) - second.reshape(-1, 1)
        self.m = first.shape[0]
        self.value = np.sum(np.square(self.diff)) / self.m

    def backward(self):
        """Set the gradients for both inputs from the cached difference."""
        first_node = self.inbound_nodes[0]
        second_node = self.inbound_nodes[1]
        # The two partials differ only in sign.
        self.gradients[first_node] = (2 / self.m) * self.diff
        self.gradients[second_node] = (-2 / self.m) * self.diff


In [3]:
def topological_sort(feed_dict):
    """Return the nodes reachable from ``feed_dict`` in topological order.

    Uses Kahn's algorithm.  As a side effect, every ``Placeholder`` key of
    ``feed_dict`` has its ``value`` set to the mapped entry.

    Args:
        feed_dict: Dict whose keys are the graph's input nodes.  Values are
            only read for keys that are ``Placeholder`` instances.

    Returns:
        A list of nodes in which each node appears after the nodes it
        consumes.  The relative order of independent nodes is unspecified
        (ready nodes are taken from a set).
    """
    from collections import deque

    input_nodes = list(feed_dict.keys())

    # Phase 1: walk the graph from the inputs and record, for every reachable
    # node, the sets of its predecessors ('in') and successors ('out').
    G = {}
    seen = set(input_nodes)
    queue = deque(input_nodes)
    while queue:
        n = queue.popleft()
        if n not in G:
            G[n] = {'in': set(), 'out': set()}
        for m in n.outbound_nodes:
            if m not in G:
                G[m] = {'in': set(), 'out': set()}
            G[n]['out'].add(m)
            G[m]['in'].add(n)
            # Enqueue each node once; re-enqueueing it for every incoming
            # path only repeats work already recorded in the edge sets.
            if m not in seen:
                seen.add(m)
                queue.append(m)

    # Phase 2: repeatedly emit a node with no remaining incoming edges.
    L = []
    S = set(input_nodes)
    while S:
        n = S.pop()

        if isinstance(n, Placeholder):
            n.value = feed_dict[n]

        L.append(n)
        for m in n.outbound_nodes:
            # A consumer that uses n for several of its inputs is listed in
            # n.outbound_nodes once per use, but the edge sets hold it once;
            # skip the repeats instead of letting set.remove raise KeyError.
            if m not in G[n]['out']:
                continue
            G[n]['out'].remove(m)
            G[m]['in'].remove(n)
            # if no other incoming edges add to S
            if not G[m]['in']:
                S.add(m)
    return L

In [4]:
# Minimal example graph: two placeholder inputs feeding one Add node.
x = Placeholder()
y = Placeholder()

add = Add(x, y)

# Values for the inputs; topological_sort copies them onto the placeholders.
feed_dict = {x: 10, y: 20}
sorted_nodes = topological_sort(feed_dict=feed_dict)


def forward_pass(output_node, sorted_nodes):
    """Run ``forward()`` on each node in order and return the output's value.

    Args:
        output_node: Node whose ``value`` is returned after the pass.
        sorted_nodes: Iterable of nodes, visited in the order given.

    Returns:
        ``output_node.value`` as it stands once every node has been visited.
    """
    for current in sorted_nodes:
        current.forward()

    result = output_node.value
    return result

def forward_and_backward(graph):
    """Run a forward pass over ``graph`` and then a backward pass in reverse.

    Args:
        graph: Sequence of nodes; ``forward()`` is called on each in the given
            order, then ``backward()`` on each in the opposite order.
    """
    for current in graph:
        current.forward()

    # Snapshot of the sequence in reverse, taken after the forward pass.
    backward_order = graph[::-1]
    for current in backward_order:
        current.backward()
        
def sgd_update(trainables, learning_rate=1e-2):
    """Apply one gradient-descent step to each trainable node.

    Args:
        trainables: Iterable of nodes exposing ``value`` and a ``gradients``
            dict that contains an entry keyed by the node itself.
        learning_rate: Factor applied to the gradient before subtracting.
    """
    for param in trainables:
        step = learning_rate * param.gradients[param]
        # Augmented assignment, so array values are updated in place.
        param.value -= step


In [5]:
# Smoke test: build a small 2 -> 5 -> 1 network, run forward/backward passes
# and print the cost and the gradients.
# Mean and standard deviation passed to np.random.normal for the initial parameters.
mu, sigma = 0, 0.1
    
X, y = Placeholder(), Placeholder()
# Note: (5) and (1) are plain ints, not tuples; they are passed as the size argument.
W1, b1 = Variable(np.random.normal(mu, sigma, (2, 5))), Variable(np.random.normal(mu, sigma, (5)))
W2, b2 = Variable(np.random.normal(mu, sigma, (5, 1))), Variable(np.random.normal(mu, sigma, (1)))

# Graph: X -> Linear -> Sigmoid -> Linear -> MSE against y.
l1 = Linear(X, W1, b1)
s1 = Sigmoid(l1)
l2 = Linear(s1, W2, b2)
# MSE is declared as MSE(y, a); here the network output is passed first and
# the target second.
cost = MSE(l2, y)

# Two identical input rows with two different targets.
X_ = np.array([[-1., -2.], [-1, -2]])
y_ = np.array([1, 2])

# topological_sort only reads the mapped value for Placeholder keys; the
# Variable keys are listed so the sort starts from them, and their None
# values are not used.
feed_dict = {
    X: X_,
    y: y_ ,
    W1 : None,
    b1: None,
    W2 : None,
    b2: None,
}

graph = topological_sort(feed_dict)

print (graph)

forward_and_backward(graph)

print (cost.value)
print (X.outbound_nodes)

# Each input node stores the gradient accumulated for itself under its own key.
gradients = [t.gradients[t] for t in [X, y, W1, b1, W2, b2]]

print(gradients)

# NOTE(review): sgd_update is not called between these passes, so the
# parameters are not changed by this cell and the printed cost is presumably
# the same each time - confirm whether an update step was intended.
forward_and_backward(graph)
print (cost.value)
forward_and_backward(graph)
print (cost.value)
forward_and_backward(graph)
print (cost.value)

[<__main__.Variable object at 0x0000021FC571E898>, <__main__.Placeholder object at 0x0000021FC571E4A8>, <__main__.Placeholder object at 0x0000021FC571E8D0>, <__main__.Variable object at 0x0000021FC571E358>, <__main__.Variable object at 0x0000021FC571E5C0>, <__main__.Variable object at 0x0000021FC571EDD8>, <__main__.Linear object at 0x0000021FC571EC18>, <__main__.Sigmoid object at 0x0000021FC571EF60>, <__main__.Linear object at 0x0000021FC571EF28>, <__main__.MSE object at 0x0000021FC571EFD0>]
2.31231747044
[<__main__.Linear object at 0x0000021FC571EC18>]
[array([[ 0.00048625,  0.0061361 ],
       [ 0.0010057 ,  0.01269123]]), array([[ 0.93607711],
       [ 1.93607711]]), array([[ 0.03037664, -0.07852801,  0.09250184, -0.00452026,  0.00383185],
       [ 0.06075327, -0.15705602,  0.18500368, -0.00904052,  0.0076637 ]]), array([-0.03037664,  0.07852801, -0.09250184,  0.00452026, -0.00383185]), array([[-1.92010901],
       [-1.51600467],
       [-1.7258849 ],
       [-1.38240431],
       [-

In [18]:
import numpy as np
from sklearn.datasets import load_boston
from sklearn.utils import shuffle, resample

# Load data
# NOTE(review): load_boston is reportedly removed from recent scikit-learn
# releases - confirm the pinned version before re-running this cell.
data = load_boston()

X_ = data['data']
y_ = data['target']

# Normalize data: zero mean and unit standard deviation per column (axis 0)
X_ = (X_ - np.mean(X_, axis=0)) / np.std(X_, axis=0)

n_features = X_.shape[1]
n_hidden = 10

# Mean and standard deviation passed to np.random.normal for the initial parameters.
mu, sigma = 0, 0.1
    
X, y = Placeholder(), Placeholder()
# Note: (n_hidden) and (1) are plain ints, not tuples.
W1, b1 = Variable(np.random.normal(mu, sigma, (n_features, n_hidden))), Variable(np.random.normal(mu, sigma, (n_hidden)))
W2, b2 = Variable(np.random.normal(mu, sigma, (n_hidden, 1))), Variable(np.random.normal(mu, sigma, (1)))

# Graph: X -> Linear -> Sigmoid -> Linear -> MSE against y.
l1 = Linear(X, W1, b1)
s1 = Sigmoid(l1)
l2 = Linear(s1, W2, b2)
# MSE is declared as MSE(y, a); the network output is passed first here.
cost = MSE(l2, y)


# topological_sort only reads the mapped value for Placeholder keys; the
# Variable keys are listed so the sort starts from them.
feed_dict = {
    X: X_,
    y: y_ ,
    W1 : None,
    b1: None,
    W2 : None,
    b2: None,
}


epochs = 1090
# Total number of examples
m = X_.shape[0]

batch_size = 11
# Integer division: a remainder smaller than one batch is not counted.
steps_per_epoch = m // batch_size

graph = topological_sort(feed_dict)
# Nodes whose values sgd_update adjusts.
trainables = [W1, b1, W2, b2]

print("Total number of examples = {}".format(m))

# Step 4: repeat steps 1-3 for every batch of every epoch
for i in range(epochs):
    # Running sum of the per-batch cost, averaged when printed below.
    loss = 0
    for j in range(steps_per_epoch):
        # Step 1
        # Randomly sample a batch of examples
        # NOTE(review): resample presumably draws with replacement by default,
        # so a batch may repeat rows - confirm against the sklearn docs.
        X_batch, y_batch = resample(X_, y_, n_samples=batch_size)

        # Reset value of X and y Inputs
        X.value = X_batch
        y.value = y_batch

        # Step 2: forward pass, then backward pass, over the sorted graph
        forward_and_backward(graph)

        # Step 3: gradient-descent update of the trainables (learning rate 0.01)
        sgd_update(trainables, 0.01)

        # graph[-1] is expected to be the cost node: it is the only node in
        # this graph with no consumers, so it sorts last.
        loss += graph[-1].value

    print("Epoch: {}, Loss: {:.3f}".format(i+1, loss/steps_per_epoch))


Total number of examples = 506
Epoch: 1, Loss: 108.988
Epoch: 2, Loss: 33.833
Epoch: 3, Loss: 26.047
Epoch: 4, Loss: 27.612
Epoch: 5, Loss: 25.595
Epoch: 6, Loss: 21.142
Epoch: 7, Loss: 16.972
Epoch: 8, Loss: 18.198
Epoch: 9, Loss: 13.070
Epoch: 10, Loss: 13.922
Epoch: 11, Loss: 13.354
Epoch: 12, Loss: 12.371
Epoch: 13, Loss: 16.164
Epoch: 14, Loss: 11.801
Epoch: 15, Loss: 10.918
Epoch: 16, Loss: 14.995
Epoch: 17, Loss: 11.945
Epoch: 18, Loss: 12.300
Epoch: 19, Loss: 11.247
Epoch: 20, Loss: 13.156
Epoch: 21, Loss: 10.711
Epoch: 22, Loss: 11.379
Epoch: 23, Loss: 11.247
Epoch: 24, Loss: 8.591
Epoch: 25, Loss: 8.535
Epoch: 26, Loss: 9.445
Epoch: 27, Loss: 7.935
Epoch: 28, Loss: 10.067
Epoch: 29, Loss: 10.589
Epoch: 30, Loss: 9.826
Epoch: 31, Loss: 11.094
Epoch: 32, Loss: 9.410
Epoch: 33, Loss: 11.571
Epoch: 34, Loss: 7.423
Epoch: 35, Loss: 11.034
Epoch: 36, Loss: 10.288
Epoch: 37, Loss: 7.106
Epoch: 38, Loss: 10.808
Epoch: 39, Loss: 10.790
Epoch: 40, Loss: 8.257
Epoch: 41, Loss: 8.382
Epo