### 1d case for starters

In [2]:
w = rand(5)
t = 1.5   # observed value

1.5

In [3]:
y(w,x) = isempty(w) ? x :  w[end] * y(w[1:end-1],x)  # w[end]*w[end-1]...w[1]*x
δ(w,x) = y(w,x)-t
E(w,x) = δ(w,x)^2/2  # loss function

E (generic function with 1 method)

In [4]:
x = 2
ForwardDiff.gradient( w->E(w,x), w)

5-element Array{Float64,1}:
 -0.332454
 -0.326179
 -0.204273
 -0.184636
 -0.483139

In [5]:
#  backward part  δ'(w,x)*w[end]*w[end-1] ... w[k+1]
#  forward  part   x*w[1]*w[2]*...*w[k-1]
[ δ(w,x) * prod(w[k+1:end]) *  prod(w[1:k-1]) * x for  k=1:5]

5-element Array{Float64,1}:
 -0.332454
 -0.326179
 -0.204273
 -0.184636
 -0.483139

In [6]:
[x;accumulate(*,x,w[1:end-1])] .* [reverse(accumulate(*,δ(w,x),reverse(w[2:end])));δ(w,x)]

5-element Array{Float64,1}:
 -0.332454
 -0.326179
 -0.204273
 -0.184636
 -0.483139

### Matrix Case

In [344]:
n = [7,5,3,2]
n = [1,1,1]
d = length(n)
W = [ rand(n[i+1],n[i]) for i=1:d-1] # Neural network weights
x = rand(n[1])                       # start of neural network
nodes = [x]
t = rand(n[d])                       # observation

1-element Array{Float64,1}:
 0.44125

In [345]:
# a[1] is just x  (vector of length n[1])
# a[2] is just W[1]*x (vector of length n[2])
# z[2] is h.(a[2])
# a[3] is just W[2]*z[2] = W[2]*h.(W[1]*x)


# a[1] = x
# z[1] = a[1]
# for k=2:???
#  a[k] is W[k-1]*z[k]
#  z[k] is h.(a[k])
# end

In [346]:
# FORWARD:  the nodes
for i=1:d-2  push!(nodes,  W[i]*nodes[i]) end
yᵖ = W[d-1]*nodes[d-1] # predicted value
δ⁰ = yᵖ-t      # error in prediction
deltas = [δ⁰']
# BACKWARD: the Deltas
for i=d-1:-1:2 deltas=append!( [deltas[1]*W[i]],deltas) end

In [347]:
G = deltas'.*nodes'
for i=1:d-1 
    display(G[i]) 
end

1×1 Array{Float64,2}:
 -0.0843216

1×1 Array{Float64,2}:
 -0.0865783

 These are the gradients of  δ'δ/2 with respect to each of the W matrices in the W vector
 where δ depends on W,x, and t

## Check using numerical FowardDiff

In [348]:
function a(W,x,t)  
    z = x
    for i=1:(d-1)
        z = W[i]*z
    end
    sum(abs2,z-t)/2
end
        

a (generic function with 2 methods)

In [349]:
for k=1:(d-1)
  Z = zeros(W[k])
  ϵ = .0001
  for i=1:size(W[k],1), j=1:size(W[k],2)
    WW = deepcopy(W)
    WW[k][i,j] += ϵ  
    Z[i,j]=  (a(WW,x,t)-a(W,x,t))/ϵ
  end
  display(Z)
end

1×1 Array{Float64,2}:
 -0.0843183

1×1 Array{Float64,2}:
 -0.0865748

## Add a function h

In [350]:
h(x)  =   x^1.5
h′(x) = 1.5*x^.5

h′ (generic function with 1 method)

In [384]:
d

3

In [405]:
# FORWARD:  the nodes
nodes = [x, W[1]*x]
hnodes = h.(nodes[2])
for i=2:(d-2)     
     push!(nodes,  W[i]*hnodes[i-1])
     push!(hnodes, h.(nodes[end]))
end
yᵖ = (W[d-1]*hnodes[d-2]) # predicted value
δ⁰ = yᵖ-t      # error in prediction
deltas = [δ⁰']
# BACKWARD: the Deltas
for i=d-1:-1:3 deltas=append!( [h′.(nodes[i]').*(deltas[1]*W[i])],deltas) end
deltas=append!( [(deltas[1]*W[2])],deltas)

2-element Array{Array{Float64,2},1}:
 [-0.164155]
 [-0.383237]

In [391]:
nodes

3-element Array{Array{Float64,1},1}:
 [0.599655] 
 [0.263729] 
 [0.0580129]

In [406]:
hnodes

1-element Array{Float64,1}:
 0.135437

In [407]:
[nodes;yᵖ]

3×1 Array{Any,2}:
  [0.599655]
  [0.263729]
 0.0580129  

In [399]:
[x  W[1]*x  W[2]*h.(W[1]*x)]

1×3 Array{Float64,2}:
 0.599655  0.263729  0.0580129

In [392]:
deltas

2-element Array{RowVector{Float64,Array{Float64,1}},1}:
 [-0.140617]
 [-0.328284]

In [393]:
hnodes

1-element Array{Array{Float64,1},1}:
 [0.135437]

In [423]:
function a(W,x,t)  
    z = x
    for i=1:(d-2)
        z = h.(W[i]*z)
    end
    z = W[d-1]*z
    sum(abs2,z-t)/2
end

a (generic function with 2 methods)

In [421]:
a(W,x,t),yᵖ,W[2]*hnodes[1],h.(W[1]*x),W[2]*h.(W[1]*x)

[0.135437]


([0.0580129], [0.0580129], [0.0580129], [0.135437], [0.0580129])

In [424]:
for k=1:(d-1)
  Z = zeros(W[k])
  ϵ = .001
  for i=1:size(W[k],1), j=1:size(W[k],2)
    WW = deepcopy(W)
    WW[k][i,j] += ϵ  
    Z[i,j]=  (a(WW,x,t)-a(W,x,t))/ϵ
  end
  display(Z)
end

1×1 Array{Float64,2}:
 -0.0758509

1×1 Array{Float64,2}:
 -0.0518953

In [425]:
G = deltas'.*nodes'
for i=1:d-1 
    display(G[i]) 
end

1×1 Array{Float64,2}:
 -0.0984364

1×1 Array{Float64,2}:
 -0.101071

In [430]:
δ⁰ .* h.(W[1]*x)

1×1 Array{Float64,2}:
 -0.0519045

In [431]:
δ⁰ .*w[2]*h′.(W[1]*x)

1-element Array{Float64,1}:
 -0.148004

In [429]:
δ⁰

1×1 Array{Float64,2}:
 -0.383237