This exercies has an implementation on an arbitrary length network defined by you the user. This network was written for 
$$ Y = w_n(cos(w_{n-1}*z_{n-1})) $$
Where $$ z_{n-1} = (cos(w_{n-2}*z_{n-2})) $$
starting with $z_0$
This first block just initiallizes some random weights to train both the residual and nonresidual network.

In [9]:
import numpy as np

X = 3.
Y = 1.

numnodes = 50

eta = 1.0e-4

nodeval = np.random.rand(numnodes)
nodeval[0] = X

weights = np.random.rand(numnodes-1)
nodegrad = np.random.rand(numnodes-1)

nodeval_1 = nodeval + 0
weights_1 = weights + 0
nodegrad_1 = nodegrad + 0

nodeval_res = nodeval + 0
weights_res = weights + 0
nodegrad_res = nodegrad + 0

This defines forward and backward modes for these derivatives. I'm about 80 percent sure this is right.

In [10]:
def forward(nodes, weights):

    for i in range(len(nodes)-2):
        nodenext = np.sin(weights[i]*nodes[i])
        nodes[i+1] = nodenext
    
        grad = np.cos(weights[i]*nodes[i])*weights[i]
        nodegrad[i+1] = grad
    
    nodes[-1] = weights[-1]*nodes[-2]
    nodegrad[-1] = weights[-1]
    
    return nodes, nodegrad


def backward(nodes, weights, nodegrad, seed):
    
    gradientw = []
    
    gradienttot = seed
    gradientw.append(gradienttot*nodes[-2])
    
    for i in range(len(nodegrad)-1):
        gradienttot *= nodegrad[-(i+2)]
        gradientw.append(gradienttot*nodes[-(3+i)]*np.cos(nodes[-(3+i)]*weights[-(2+i)]))
    
    gradientw = np.array(gradientw)
    
    return gradientw[::-1]

def dL(out, target):
    L = (out - target)**2
    dL = 2*(out - target)
    return L, dL

In [11]:
for i in range(100000):
    nodeval_1, nodegrad_1 = forward(nodeval_1, weights_1)
    
    loss, lossgrad = dL(nodeval_1[-1], Y)
    
    gradients = backward(nodeval_1, weights_1, nodegrad_1, lossgrad)
    
    weights_1 -= gradients*eta
    
    if i%10000 == 0:
        print(loss)
    
print(nodeval)
print(nodeval_1)
nodeval_1, nodegrad_1 = forward(nodeval_1, weights_1)
print('Target = ', Y, ' Guess = ', nodeval_1[-1])
    

1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
[3.00000000e+00 4.08384981e-01 3.39149921e-01 7.02314940e-01
 2.92176620e-01 1.02602672e-01 6.99597249e-01 5.12020183e-01
 3.57203802e-02 3.19347867e-01 8.28991872e-01 4.72988378e-01
 2.36260510e-01 6.93387570e-01 3.21415819e-01 1.09443502e-01
 5.89612030e-01 9.58107473e-01 5.61776521e-01 9.81404366e-01
 4.22523368e-01 4.80065522e-01 6.43824559e-01 9.91975580e-02
 1.35539200e-02 6.41008420e-01 8.62239531e-01 1.92078642e-02
 9.91075847e-01 6.71430352e-01 2.20105950e-01 5.76294054e-01
 9.09666328e-01 3.19807408e-01 8.49576541e-01 4.92339558e-01
 2.32753866e-01 5.05685447e-01 4.33598341e-01 3.05250763e-01
 7.10683700e-01 2.20124917e-01 7.58456632e-02 1.91547454e-03
 4.01390100e-01 5.45318115e-01 6.77529410e-01 2.43566017e-01
 8.87893888e-01 8.25949022e-01]
[3.00000000e+00 7.49275943e-01 4.80825628e-01 4.45580539e-01
 6.42168000e-02 3.69042739e-02 2.69952430e-02 1.67088381e-02
 1.57038015e-02 1.49521352e-02 8.32293256e-03 4.92112499e-03
 4.47615958e-

Well, that isn't the best. Let's try a residual implementation where $$ Y = w_n(cos(w_{n-1}*z_{n-1})) + z_{n-1}$$
Where $$ z_{n-1} = (cos(w_{n-2}*z_{n-2})) + z_{n-2} $$
starting with $z_0$

In [12]:
def forward_res(nodes, weights):

    for i in range(len(nodes)-2):
        nodenext = np.sin(weights[i]*nodes[i]) + nodes[i]
        nodes[i+1] = nodenext
    
        grad = np.cos(weights[i]*nodes[i])*weights[i] + 1
        nodegrad[i+1] = grad
    
    nodes[-1] = weights[-1]*nodes[-2] + nodes[-2]
    nodegrad[-1] = weights[-1] + 1
    
    return nodes, nodegrad


def backward_res(nodes, weights, nodegrad, seed):
    
    gradientw = []
    
    gradienttot = seed*nodes[-2]
    gradientw.append(gradienttot)
    
    for i in range(len(nodegrad)-1):
        gradienttot *= nodegrad[-(i+2)]
        gradientw.append(gradienttot*nodes[-(3+i)]*np.cos(nodes[-(3+i)]*weights[-(2+i)]))
    
    gradientw = np.array(gradientw)
    
    return gradientw[::-1]


In [13]:
for i in range(100000):
    nodeval_res, nodegrad_res = forward_res(nodeval_res, weights_res)
    
    loss, lossgrad = dL(nodeval_res[-1], Y)

    gradients = backward_res(nodeval_res, weights_res, nodegrad_res, lossgrad)
    
    weights_res -= gradients*eta
    
    if i%10000 == 0:
        print(loss)
        
print(nodeval)
print(nodeval_1)
nodeval_res, nodegrad_res = forward_res(nodeval_res, weights_res)
print('Target = ', Y, ' Guess = ', nodeval_res[-1])

198.5660328412957
1.97599535109164e-07
2.8008147958213646e-13
3.975275718052928e-19
5.427917202706479e-25
9.201273598497882e-27
9.201273598497882e-27
9.201273598497882e-27
9.201273598497882e-27
9.201273598497882e-27
[3.00000000e+00 4.08384981e-01 3.39149921e-01 7.02314940e-01
 2.92176620e-01 1.02602672e-01 6.99597249e-01 5.12020183e-01
 3.57203802e-02 3.19347867e-01 8.28991872e-01 4.72988378e-01
 2.36260510e-01 6.93387570e-01 3.21415819e-01 1.09443502e-01
 5.89612030e-01 9.58107473e-01 5.61776521e-01 9.81404366e-01
 4.22523368e-01 4.80065522e-01 6.43824559e-01 9.91975580e-02
 1.35539200e-02 6.41008420e-01 8.62239531e-01 1.92078642e-02
 9.91075847e-01 6.71430352e-01 2.20105950e-01 5.76294054e-01
 9.09666328e-01 3.19807408e-01 8.49576541e-01 4.92339558e-01
 2.32753866e-01 5.05685447e-01 4.33598341e-01 3.05250763e-01
 7.10683700e-01 2.20124917e-01 7.58456632e-02 1.91547454e-03
 4.01390100e-01 5.45318115e-01 6.77529410e-01 2.43566017e-01
 8.87893888e-01 8.25949022e-01]
[3.00000000e+00 7.49

Assuming, I implemented this correctly, just increadible.

I was trying to get this to work with T-money's autodiff, but was failing. It would be more general and fun if we can get it working. 

In [1]:
import numpy as np

class Constant:
    
    def __init__(self, a):
        # Initialize the value of this variable with the value passed in
        self.a_value = a
        # to make sure our children don't throw an exception when they access our grad variable 
        # as in backward() in the BinaryAdd()
        self.grad = 0
    
    def forward(self):
        # what should this return?
        # TODO: fill in the return value
        # Remember this is the last node of the graph.
        return self.a_value
    
    def backward(self):
        # What should go here if it's a constant?
        # Nothing. We want the backpropagation to stop here. 
        # In python, we use pass as shorthand for "return None".
        # Doing this means we can inject constants at any point in the computational graph.
        pass

class BinaryAdd:
    
    def __init__(self, a, b):
        # record the two parents of the binary add
        self.a = a
        self.b = b
        # and initialize the gradient to 0.
        self.grad = 0
        
    def forward(self):
        # a _value and b_value
        # are intermediate values in the computational graph
        # like v4 in Table 3 in the paper.
        # We don't have to store the value of a or b, 
        # but caching them now means we don't have to recompute them on the backward pass.
        self.a_value = self.a.forward()
        self.b_value = self.b.forward()
        return self.a_value + self.b_value
    
    def backward(self):
        # z = a + b
        # dz/da = ?
        # TODO: fill in the None values.
        # Remember, a and b are the parents of this object. 
        dzda = 1
        dzdb = 1
        self.a.grad += dzda*self.grad
        self.b.grad += dzdb*self.grad

class BinaryMul:
    
    def __init__(self, a, b):
        self.a = a
        self.b = b
        self.grad = 0
        # TODO: what's missing? What other value do we have to store in a node?
        
    
    def forward(self):
        # again, we don't have to cache self.a_value or self.b_value
        # but it makes the backward pass not have to call a.forward() or b.forward()
        # TODO: fill in None
        self.a_value = self.a.forward()
        self.b_value = self.b.forward()
        return self.a_value * self.b_value

    def backward(self):
        # TODO: fill in the gradient values below (dzda, dzdb)
        # z = a*b 
        # dz/da = ?
        dzda = self.b_value
        dzdb = self.a_value
        self.a.grad += dzda*self.grad
        self.b.grad += dzdb*self.grad
        
class Ln:
    
    def __init__(self, a):
        self.a = a
        self.grad = 0

    def forward(self):
        # TODO: fill me in
        self.a_value = self.a.forward()
        return np.log(self.a_value)
    
    def backward(self):
        # TODO: fill me in
        # z = ln(a)
        # dz/da = ?
        self.a.grad += 1/self.a_value*self.grad

        
class Sin:
    
    def __init__(self, a):
        self.a = a
        self.grad = 0
        
    def forward(self):
        # TODO: fill me in
        self.a_value = self.a.forward()
        return np.sin(self.a_value)
    
    def backward(self):
        # TODO: fill me in 
        # z = sin(a)
        # dz/da = ?
        self.a.grad += np.cos(self.a_value)*self.grad


class BinarySub:
    
    def __init__(self, a, b):
        # TODO:
        # record the two parents of the binary subtract
        self.a = a
        self.b = b
        # and initialize the gradient to 0.
        self.grad = 0
        
    def forward(self):
        # TODO: How do I get a and b?
        self.a_value = self.a.forward()
        self.b_value = self.b.forward()
        return self.a_value - self.b_value
    
    def backward(self):
        # z = a - b
        # dz/da = ?
        # TODO: fill in the None values.
        # Remember, a and b are the parents of this object. 
        dzda = 1
        dzdb = -1
        self.a.grad += dzda*self.grad
        self.b.grad += dzdb*self.grad  # Why is this a += operator?

In [7]:
X = Constant(3)
Y = 2.

w1 = Constant(.4)
w2 = Constant(.1)

v1 = BinaryMul(X, w1)
v2 = Sin(v1)
v3 = BinaryMul(v2, w2)

eta = 1.0e-5

for i in range(100000):

    y = v3.forward()
    
    #print('y: {}'.format(y))
    #print(.5*np.sin(.4*4))

    # TODO: seed the gradient value
    v3.grad = 2*(y - Y) 
    v3.backward()

    # Instead of calling backward() on each node by hand, put the nodes in 
    # a list and iterate over it backwards.

    ls = [X, w1, v1, v2, w2]

    for node in ls[::-1]:
        #print(node.grad)
        node.backward()
    
    w1.a_value -= w1.grad*eta
    w2.a_value -= w2.grad*eta

print('y: {}'.format(y))


y: -0.9785235488295365


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['figure.figsize'] = (9,9)
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.datasets as dsets
import torchvision.transforms as transforms
from torch.autograd import Variable
np.random.seed(0)

In [None]:
train_dataset = dsets.MNIST(root='./data',
                           train=True,
                           transform=transforms.ToTensor(),
                           download=True)

test_dataset = dsets.MNIST(root='./data',
                           train=False,
                           transform=transforms.ToTensor())
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                          batch_size=100,
                                          shuffle=True)

test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                          batch_size=100,
                                          shuffle=False)

In [None]:
class Net(nn.Module):
    def __init__(self,layers, input_size, num_class):
        super(Net, self).__init__()
        self.input = input_size
        self.classes = num_class
        self.layers = layers
        self.linears = nn.ModuleList([nn.Linear(self.input,self.input)])
        self.linears.extend([nn.Linear(self.input, self.input) for i in range(1, self.layers-1)])
        self.linears.append(nn.Linear(self.input, self.classes))
        
    def forward(self, x):
        temp = None
        i = 0
        for layer in self.linears:
            if i%3 == 0:
                x = torch.sigmoid(layer(x))
                i += 1
            else:
                x = layer(x)
                i += 1

        return x    

In [None]:
input_size = 784       # The image size = 28 x 28 = 784
num_classes = 10       # The number of output classes. In this case, from 0 to 9
num_epochs = 10         # The number of times entire dataset is trained
batch_size = 100       # The size of input data took for one iteration
learning_rate = 1e-3  # The speed of convergence
net = Net(10, input_size,num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(nn.ParameterList(net.parameters()), lr=learning_rate)

In [None]:
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):   # Load a batch of images with its (index, data, class)
        images = Variable(images.view(-1,28*28))         # Convert torch tensor to Variable: change image from a vector of size 784 to a matrix of 28 x 28
        labels = Variable(labels)
        optimizer.zero_grad()                             # Intialize the hidden weight to all zeros
        outputs = net(images)                             # Forward pass: compute the output class given a image
        loss = criterion(outputs, labels)                 # Compute the loss: difference between the output class and the pre-given label
        loss.backward()                                   # Backward pass: compute the weight
        optimizer.step()
    total=0
    correct=0
    # Loop over all the test examples and accumulate the number of correct results in each batch
    for d,t in test_loader:
        outputs = net(d.view(-1,28*28))
        _, predicted = torch.max(outputs.data,1)
        total += Variable(t).size(0)
        correct += (predicted==t).sum()
        
    # Print the epoch, the training loss, and the test set accuracy.
    print(epoch,loss.item(),(100.*correct/total).item())