In [1]:
from autograd import grad
import autograd.numpy as np

### Recursion

This seems similar to boosting: we are (recursively) adding on an extra (residual) learner. The only apparent difference between ResNets and gradient boosting is when/where the learners are added. What difference does this make? Or does it reduce down to the same thing?

Let's just have a play with using recursion to define some algorithms.

In [2]:
def sigmoid(z):
    """Element-wise logistic function 1 / (1 + exp(-z))."""
    return 1/(1+np.exp(-z))

#A simple one-layer NN function to play with
class Net():
    """A single sigmoid layer: x -> sigmoid(W.x + b).

    `size` is the shape of the weight matrix W. Every instance is numbered
    in creation order and prints as 'f<number>'.
    """
    count = 0  # how many Net instances have been created so far

    def __init__(self,size):
        Net.count += 1
        self.name = str(Net.count)

        self.weights = np.random.standard_normal(size)
        # np.dot(weights, x) has size[0] rows, so the bias column needs
        # size[0] entries (size[1] only happened to work for square layers).
        self.biases = np.zeros((size[0],1))

    def __call__(self,x):
        """Apply the layer to x; the bias column is broadcast over x's columns."""
        return sigmoid(np.dot(self.weights,x) + self.biases)

    def __repr__(self):
        return 'f' + self.name

In [None]:
#Applies a list of functions, F, to some data, one after the other,
#as according to some mapping
def Unroll(F,x,mapping): ## this is actually just fold...
    """Left-fold `mapping` over the functions in F, starting from x.

    Computes mapping(... mapping(mapping(x, F[0]), F[1]) ..., F[-1]).

    F       -- iterable of functions/layers, applied in order
    x       -- the initial value
    mapping -- callable (value, f) -> new value that decides how each
               function is combined with the running value

    Returns x unchanged when F is empty. Written as a loop rather than a
    recursion so long lists neither hit the recursion limit nor get
    re-sliced at every step.
    """
    value = x
    for f in F:
        value = mapping(value,f)
    return value

<img src='../Images/Unrollmapper.png' height = 400 width=400>

* What if we swapped around the mappers and the functions?
* It's cool how we can look at the functions as objects like this.

What are these maps really doing? They give us a way to think about how we combine variables and functions (which is really just another function). How does this help us?

In [4]:
#Interesting cases
#Every map takes (running value, function), which is the order Unroll passes them in.
ff_map = lambda z,f: f(z) #a feedforward net: the value is replaced by f(value)
skip_map = lambda z,f: f(z)+z #a Resnet: f(value) is added onto the value itself
#The two maps below read the module-level input `x` when they are called,
#so `x` must be defined before they are used.
#The second parameter of rnn is the function (it used to be called `x`,
#which hid the input and left `f` undefined).
rnn = lambda h,f: f(x) + h #a RNN
boost = lambda z,f: f(x) + z #boosting

#others?
#mul_map = lambda f,x: x * f(x)  ???
#

In [5]:
###inits
#a random 3x3 input to push through the layers
x = np.random.standard_normal(size=(3,3))

#five independent 3x3 layers to fold over  = [f_1,f_2,f_3... f_n]
F = [Net((3,3)) for _ in range(5)]
print(F)

[f1, f2, f3, f4, f5]


In [6]:
#A feedforward net: each layer is applied to the previous layer's output
print(Unroll(F,x,ff_map)) # =  f_n(... f_3(f_2(f_1(x))) ...)

#=  f_3(  f_2(  f_1(y_1,y_2)  ,y_3)  ,y_4)
#!! so if we set y_1 = the target
#then we get  f_3(  f_2(  loss(target,out_1),  out_2)  ,out_3)

[[ 0.21072356  0.21826266  0.21358719]
 [ 0.21964227  0.21001793  0.21359689]
 [ 0.34807249  0.34477761  0.34626691]]


In [7]:
#A ResNet: every layer adds its output onto its own input
print(Unroll(F,x,skip_map)) # = z_n where z_0 = x and z_k = f_k(z_{k-1}) + z_{k-1}; e.g. z_2 = f_2(f_1(x) + x) + f_1(x) + x ... complicated...

[[ 3.20512124  0.22183679  0.91949788]
 [ 0.43095705  2.70578206  3.64931753]
 [ 1.66645688  1.53153556  2.51976068]]


In [8]:
#A RNN
g = Net((3,3))
G = [g] * 5 #five references to the same layer: g is shared across time
print(Unroll(G,x,rnn))

#ahh, doesnt quite work. x should be changing with each layer

#what about a multilayer RNN??
#we are passing information in two directions now.
#across in time and down in depth.

[[ 2.72497842  2.42460916  0.06777119]
 [ 2.00572831  2.576681    4.35027651]
 [ 3.89167146  0.07942172  1.81858963]]


In [9]:
#Boosting: start from 0 and add each learner's output on the same input x
print(Unroll(F,0,boost)) # = 0 + f_1(x) + f_2(x) + ... + f_n(x)

[[ 3.32100509  2.1610769   1.56781679]
 [ 2.06170987  3.81795911  3.43916054]
 [ 2.10083201  3.61580354  3.26092478]]


### Reduce

In [10]:
from functools import reduce

#evaluate every layer on the same input x
func_out = [f(x) for f in F]
print('Sum of function outputs = \n',sum(func_out ))
#the same total, built by folding addition over the outputs
print('Map reduce of addition onto function outputs \n',reduce(lambda acc,out:acc+out,func_out ))

Sum of function outputs = 
 [[ 3.32100509  2.1610769   1.56781679]
 [ 2.06170987  3.81795911  3.43916054]
 [ 2.10083201  3.61580354  3.26092478]]
Map reduce of addition onto function outputs 
 [[ 3.32100509  2.1610769   1.56781679]
 [ 2.06170987  3.81795911  3.43916054]
 [ 2.10083201  3.61580354  3.26092478]]


In [11]:
### Resnet
#forward: fold the skip connection over the layers, starting from x
print('Forward = \n',reduce(skip_map, F, x))
#backward: push a random array through the same layers in reverse order
dL = np.random.random((3,3))
print('Backward =\n',reduce(skip_map, reversed(F), dL))

Forward = 
 [[ 3.20512124  0.22183679  0.91949788]
 [ 0.43095705  2.70578206  3.64931753]
 [ 1.66645688  1.53153556  2.51976068]]
Backward =
 [[ 3.71010021  2.81894675  3.96976147]
 [ 2.60590452  2.75511983  1.90141416]
 [ 2.12412577  2.15911346  2.05853539]]


In [18]:
### RNN
#forward
#one step: add g(input) onto the hidden state h (g is shared by every step)
rnn = lambda h,z:g(z) + h
inputs = [np.random.random((3,3)) for i in range(5)]
#Start from an explicit zero hidden state. Without an initial value, reduce
#would use inputs[0] itself as the first hidden state, so that input would
#never be passed through g.
print('Forward = \n',reduce(rnn, inputs, np.zeros_like(inputs[0])))

Forward = 
 [[ 1.77422587  1.61150717  1.55036097]
 [ 2.45322177  2.15327711  1.73290924]
 [ 3.59526762  2.95965196  3.28490744]]


### A quick implementation of a ResNet using autograd

In [12]:
class Layer():
    """One residual layer whose weights are stored outside it, in a params dict.

    The layer only remembers the keys ('W<num>' and 'b<num>') under which
    its weight and bias are looked up when it is called.
    """
    def __init__(self,num):
        self.w, self.b = 'W{}'.format(num), 'b{}'.format(num)

    def __call__(self,params,x):
        """Return x + sigmoid(x.W + b), with W and b read from params."""
        weight, bias = params[self.w], params[self.b]
        pre_activation = np.dot(x,weight) + bias
        return x + sigmoid(pre_activation)

In [13]:
def generate_net():
    """Build a 10-layer net and the dict holding its parameters.

    Returns (params, layers): params maps 'W<i>' to a random 3x3 weight
    matrix and 'b<i>' to a 1x3 zero bias, and layers is the list of
    Layer objects that read those entries.
    """
    params, layers = {}, []
    for idx in range(10):
        suffix = str(idx)
        params['W' + suffix] = np.random.standard_normal((3,3))
        params['b' + suffix] = np.zeros((1,3))
        layers.append(Layer(idx))
    return params, layers

# NOTE: from here on the module-level name `Net` is the list of layers,
# not the Net class defined earlier in this notebook.
params,Net = generate_net()

In [14]:
def predict(x,Net,params):
    """Forward pass: feed x through each layer of Net in order.

    Every layer is called as layer(params, value) and its result becomes
    the next layer's input. Returns the last layer's output, or x itself
    when Net is empty.
    """
    activation = x
    for apply_layer in Net:
        activation = apply_layer(params,activation)
    return activation

def Loss(params,ims,labels):
    """Sum of squared errors between labels and the net's prediction for ims.

    The layers come from the module-level `Net`; params is the first
    argument so that grad(Loss) differentiates with respect to it.
    """
    residual = labels - predict(ims,Net,params)
    return np.sum(residual**2)

#gradient of the loss with respect to params
dL = grad(Loss)

In [15]:
#some data...
# NOTE(review): `x` is never used below, every batch is fresh random noise
# instead. Confirm whether training was meant to run on `x`.
x = np.random.random((10,3))
target = np.random.random((10,3))

def update(epochs = 10,learning_rate = 0.01):
    """Train the module-level params in place with plain SGD.

    Each epoch takes 10 gradient steps, each on a newly drawn 10x3 batch
    paired with the fixed `target`, then prints the loss on the last batch
    of that epoch. Returns the (mutated) params dict.
    """
    for epoch in range(epochs):
        for step in range(10):
            batch = np.random.standard_normal((10,3))
            grads = dL(params,batch,target)
            #SGD: move each parameter against its gradient
            for name in params:
                params[name] -= learning_rate * grads[name]

        print(Loss(params,batch,target))
    return params
params = update()

147.300333652
79.1481480023
66.0446403205
55.8333736782
23.491532155
25.433560046
32.0363912341
15.0434883477
17.7814038765
23.4146620214
