In [1]:
using LinearAlgebra

In [2]:
# Activation function: h(x) = exp(-x).
function h(x)
    return exp(-x)
end

# Derivative of the activation expressed through its output y = h(x);
# the pre-activation x is accepted but not used.
function h′(x, y)
    return -y
end

# Loss: half of the sum of squared entries of x - y.
function 𝓁(x, y)
    residual = x - y
    return sum(abs2, residual) / 2
end

# Derivative of the loss with respect to its first argument.
function 𝓁′(x, y)
    return x - y
end

# Random initial values: `randn` samples of the requested size, scaled by 0.1.
function init(sizes...)
    return 0.1 * randn(sizes...)
end

init (generic function with 1 method)

In [3]:
𝜀 = 1e-4            # perturbation size used by the finite-difference checks
n = [5, 4, 3, 1]    # layer widths used to size the matrix networks' parameters
N = length(n) - 1   # number of layers
B = 7               # number of columns of the matrix input and target

7

### Scalar Neural Network

In [4]:
# Forward pass of a scalar network.
#
# `params[i]` is a two-element collection `[weight, bias]`; layer `i` computes
# `weight * X[i] + bias` and applies `h` to it.
#
# Keywords: `h` is the activation, `h′(x, y)` its derivative given the
# pre-activation `x` and the activation output `y`, and `N` the number of layers
# to evaluate (defaults to all of `params`).
#
# Returns `(X, δ)`: `X` holds the input followed by every layer output, and `δ`
# holds the activation derivative of every layer.
function neural_net(params, input; h=h, h′=h′, N=length(params))
    X = [input]
    δ = Any[]
    for layer in 1:N
        # Append a constant 1 so the bias is picked up by the same product.
        augmented = [last(X), 1]
        preact = sum(params[layer] .* augmented)
        push!(X, h(preact))
        push!(δ, h′.(preact, last(X)))
    end
    return X, δ
end


neural_net (generic function with 1 method)

In [5]:
# One random [W, B] pair of scalars per layer.
params = map(_ -> [init(), init()], 1:N)
# Random scalar input and target output.
x, y = init(), init()

(0.07672914106422761, -0.06442261441765063)

In [6]:
# Gradient of the loss with respect to every [weight, bias] pair, written as a
# single linear solve rather than an explicit backward loop.

# Forward pass: X holds the input and every layer output, δ the activation derivatives.
X,δ = neural_net(params,x)
# Lower bidiagonal matrix with a zero main diagonal; its subdiagonal holds
# δ[i] * weight_i for layers 2..N.
L   = Bidiagonal(zeros(N),[δ[i] * params[i][1] for i=2:N],:L)
# Diagonal matrix whose i-th entry is δ[i] times the row vector [X[i], 1]'.
D   = Diagonal(δ.*[[X[i],1]' for i=1:N])
# Right-hand side: zero everywhere except the last entry, which is the loss
# derivative at the network output.
g   = [zeros(N-1);𝓁′(X[N+1],y)]
# Solve the upper-triangular system (I - L') z = g and map the solution through D'.
∇J  = D'*((I-L')\g)

3-element Array{Array{Float64,1},1}:
 [-0.000313304, -0.00408325]
 [-0.0779776, -0.0748074]   
 [-1.47829, -1.11171]       

In [7]:
# ∇Jfd is the gradient estimated with central finite differences; it should
# agree with ∇J computed above.
∇Jfd = ∇J * 0
ϵ = ∇J * 0          # perturbation with the same nesting as params, all zeros
for i in 1:N
    for j in 1:2
        # Perturb exactly one parameter, evaluate the loss on both sides, then reset.
        ϵ[i][j] = 𝜀
        loss_plus = 𝓁(neural_net(params .+ ϵ, x)[1][N+1], y)
        loss_minus = 𝓁(neural_net(params .- ϵ, x)[1][N+1], y)
        ∇Jfd[i][j] = (loss_plus - loss_minus) / 2𝜀
        ϵ[i][j] = 0.0
    end
end
∇Jfd

3-element Array{Array{Float64,1},1}:
 [-0.000313304, -0.00408325]
 [-0.0779776, -0.0748074]   
 [-1.47829, -1.11171]       

### Simple Matrix Neural Network

In [8]:
import Base: +,-,*,/,zero,one,adjoint,convert,inv,size,iszero,transpose,length

# Supertype of every mapping (operation) defined in this file.
abstract type Map end

# Right-multiplication mapping: applying RM(A) to X yields X * A.
struct RM <: Map
    A
end

function -(K::RM)
    return RM(-K.A)
end

function *(K::RM, X::Union{AbstractArray,Number})
    return X * K.A
end

function adjoint(K::RM)
    return RM(K.A')
end

# Hadamard mapping: element-wise product with the stored array A.
struct (Δ) <: Map
    A
end

function -(X::Δ)
    return Δ(-X.A)
end

function *(X::Δ, Y::Union{AbstractArray,Number})
    return X.A .* Y
end

function *(Y::Union{AbstractArray,Number}, X::Δ)
    return Y .* X.A
end

# The adjoint keeps the stored array unchanged.
function adjoint(X::Δ)
    return Δ(X.A)
end

# Generic zero mapping: the zero element of the vector space.
struct Zero <: Map end

# A mapping counts as zero only when it is a `Zero`.
iszero(X::Map) = X isa Zero

# Any product that involves Zero is Zero.
*(::Zero, ::Zero) = Zero()
*(X::Zero, Y) = Zero()
*(Y, X::Zero) = Zero()

# Adding Zero returns the other operand.
+(::Zero, ::Zero) = Zero()
+(X::Zero, Y) = Y
+(Y, X::Zero) = X + Y

# Negation and subtraction.
-(X::Zero) = X
-(::Zero, ::Zero) = Zero()
-(X::Zero, Y) = -Y
-(Y, X::Zero) = Y

# Zero is its own adjoint.
adjoint(X::Zero) = X

# Identity mapping of the vector space, carrying a sign:
# `One(true)` leaves its operand unchanged and `One(false)` negates it.
struct One <: Map
    sign::Bool
end

# `One()` is the positive identity.
One() = One(true)

# Multiplying by the identity returns the other operand, negated when the sign is false.
*(Y, X::One) = X.sign ? Y : -Y
*(X::One, Y) = Y * X

# The two generic methods above are ambiguous with each other when both operands
# are `One`, and ambiguous with the generic `Zero` products when the other operand
# is `Zero`; without the methods below, `One() * One()`, `Zero() * One()` and
# `One() * Zero()` throw a MethodError. They return what either candidate would.
*(Y::One, X::One) = X.sign ? Y : -Y
*(::Zero, ::One) = Zero()
*(::One, ::Zero) = Zero()

# Negation flips the sign flag.
-(X::One) = One(!X.sign)

# The (signed) identity is its own adjoint.
adjoint(X::One) = X

# Composition of mappings: a chain of two operations, where (A ⦿ B) applied to X
# evaluates A * (B * X). The symbol ∘ is already defined in Julia Base, so ⦿ is
# used instead.
struct (⦿) <: Map
    A
    B
end

function *(C::⦿, X::Union{AbstractArray,Number})
    inner = C.B * X
    return C.A * inner
end

# The adjoint of a composition is the composition of the adjoints in reverse order.
function adjoint(K::⦿)
    return ⦿(K.B', K.A')
end

# Negating a composition negates its outer factor only.
function -(K::⦿)
    return ⦿(-K.A, K.B)
end

- (generic function with 183 methods)

In [9]:
# A box type is needed because
# i) D and L are actually matrices of `Map`s, and a matrix of an abstract type
#    causes problems;
# ii) we need a global zero element in the backsolve.
struct Box
    X
end

# Unwrap a box; for an array, unwrap every element.
value(R::Box) = R.X
value(A::Array) = value.(A)

# Zero and one of the box type are the boxed `Zero` and `One` mappings.
zero(::Type{Box}) = Box(Zero())
zero(::Box) = zero(Box)
iszero(R::Box) = value(R) isa Zero
one(::Type{Box}) = Box(One())
one(::Box) = one(Box)

# Adjoint and inverse act on the wrapped value and re-wrap the result.
adjoint(R::Box) = Box(adjoint(value(R)))
inv(R::Box) = Box(inv(value(R)))

# Conversion wraps anything that is not already a box.
convert(::Type{Box}, x) = Box(x)
convert(::Type{Box}, x::Box) = x

# Arithmetic unwraps box operands, delegates to the wrapped values, and re-wraps.
*(R::Box, X) = Box(value(R) * X)
*(X, R::Box) = Box(X * value(R))
*(R1::Box, R2::Box) = Box(value(R1) * value(R2))

-(R::Box) = Box(-value(R))
-(R::Box, X) = Box(value(R) - X)
-(X, R::Box) = Box(X - value(R))
-(R1::Box, R2::Box) = Box(value(R1) - value(R2))

+(R1::Box, R2::Box) = Box(value(R1) + value(R2))

# Right division. `inv(R)` is already a `Box`, and multiplying by a box re-wraps
# the product, so the result must not be wrapped a second time: the earlier
# `Box(X*inv(R))` returned a nested `Box(Box(...))`.
/(X, R::Box) = X * inv(R)

# Wrap a value in a 1×1 matrix.
array(x) = fill(x, 1, 1)

array (generic function with 1 method)

In [10]:
# Forward pass of a layered matrix network.
#
# Each element of `params` is indexed as `[1]` for the weight and `[2]` for the
# bias; a layer computes `weight * previous .+ bias` and applies `h` element-wise.
#
# Keywords: `h` is the activation and `h′(x, y)` its derivative given the
# pre-activation `x` and the activation output `y`.
#
# Returns `(X, δ)`: `X` holds the input followed by every layer output, and `δ`
# holds the element-wise activation derivative of every layer.
function neural_net(params, input; h=h, h′=h′)
    X = [input]
    δ = Any[]
    for layer in params
        preact = layer[1] * last(X) .+ layer[2]
        push!(X, h.(preact))
        push!(δ, h′.(preact, last(X)))
    end
    return X, δ
end


neural_net (generic function with 1 method)

In [11]:
# params: for each layer i, a weight matrix `Wi` of size n[i+1]×n[i] followed by
# a bias vector `Bi` of length n[i+1].
params = map(1:N) do i
    W = init(n[i+1], n[i])
    b = init(n[i+1])
    [W, b]
end
# Random input with n[1] rows and B columns, and a 1×B target.
x, y = init(n[1], B), init(1, B)

([-0.0217737 0.122212 … 0.00382922 0.00916764; 0.0343039 0.00938758 … -0.0459479 -0.122375; … ; 0.112194 -0.0227026 … 0.13616 -0.130963; -0.0852267 -0.0389311 … -0.0801138 -0.013431], [-0.119377 -0.200791 … 0.151157 0.0513181])

In [12]:
# Gradient of the loss with respect to every weight and bias of the matrix network,
# using the same solve-based formulation as the scalar case but with `Map` entries.

# Forward pass: X holds the input and every layer output, δ the activation derivatives.
X,δ = neural_net(params,x)
# Diagonal matrix whose i-th entry is a 1×2 row of mappings:
# [Δ(δ[i]) ⦿ RM(X[i])   Δ(δ[i])], one column for the weight and one for the bias.
D = Diagonal([[Δ(δ[i]) ⦿ RM(X[i]) Δ(δ[i])] for i=1:N])
# Lower bidiagonal matrix of boxes: zero(Box) on the main diagonal and
# Δ(δ[i]) ⦿ W_i on the subdiagonal for layers 2..N.
L = Bidiagonal(zeros(Box,N), [Box(Δ(δ[i]) ⦿ params[i][1]) for i=2:N] , :L)
# Right-hand side: boxed zeros except the last entry, which boxes the loss
# derivative at the network output.
g = [[zero(Box) for i=1:N-1]; Box(𝓁′(X[N+1],y))]
# UnitUpperTriangular treats the diagonal as ones, so wrapping -L' plays the role
# of (I - L') in the scalar version. Each solution entry is wrapped in a 1×1
# array before D' is applied.
∇J = D'*array.((UnitUpperTriangular(-L')\g))

3-element Array{Array{Any,2},1}:
 [Box([0.00101469 0.00594186 … 0.00310045 -0.00723866; -0.00121427 -0.00716893 … -0.00350869 0.00879899; 0.000325603 0.00297791 … 0.00130538 -0.00351086; 0.000386531 0.0022207 … 0.0010546 -0.00270529]); Box([0.0323801 0.0348919 … 0.0253965 0.0268607; -0.0395638 -0.0425254 … -0.0304471 -0.034312; 0.0154709 0.0166175 … 0.0117816 0.013146; 0.012057 0.0132513 … 0.00930622 0.0105188])]
 [Box([-0.770839 -0.809318 -0.755258 -0.845816; -0.507298 -0.532644 -0.49705 -0.556675; 0.99776 1.04751 0.97756 1.09478]); Box([-0.128534 -0.13656 … -0.0983171 -0.109804; -0.0842735 -0.090233 … -0.0644232 -0.0726468; 0.166101 0.177418 … 0.127445 0.141214])]                                                                                                                              
 [Box([-7.47286 -6.28883 -6.30022]); Box([-1.21219 -1.29607 … -0.928966 -1.03681])]                                                                                                                    

In [13]:
# ∇Jfd is the gradient estimated with central finite differences; it has the
# same nesting as params (layer, then weight/bias, then linear entry index).
∇Jfd = params * 0
ϵ = params * 0      # perturbation with the same nesting as params, all zeros
for i in 1:length(params), wb in 1:2, j in 1:length(ϵ[i][wb])
    # Perturb exactly one entry, evaluate the loss on both sides, then reset.
    ϵ[i][wb][j] = 𝜀
    loss_plus = 𝓁(neural_net(params + ϵ, x)[1][N+1], y)
    loss_minus = 𝓁(neural_net(params - ϵ, x)[1][N+1], y)
    ∇Jfd[i][wb][j] = (loss_plus - loss_minus) / 2𝜀
    ϵ[i][wb][j] = 0.0
end
∇Jfd

3-element Array{Array{Array{Float64,N} where N,1},1}:
 [[0.00101469 0.00594186 … 0.00310045 -0.00723866; -0.00121427 -0.00716893 … -0.00350869 0.00879899; 0.000325603 0.00297791 … 0.00130538 -0.00351086; 0.000386531 0.0022207 … 0.0010546 -0.00270529], [0.207448, -0.254527, 0.100064, 0.0783629]]
 [[-0.770839 -0.809318 -0.755258 -0.845816; -0.507298 -0.532644 -0.49705 -0.556675; 0.99776 1.04751 0.97756 1.09478], [-0.826567, -0.543981, 1.06985]]                                                                                           
 [[-7.47286 -6.28883 -6.30022], [-7.81211]]                                                                                                                                                                                                      

In [14]:
# Finite-difference gradient for the first layer's weight matrix (same position as params[1][1]).
∇Jfd[1][1]

4×5 Array{Float64,2}:
  0.00101469    0.00594186  -0.00590623   0.00310045  -0.00723866
 -0.00121427   -0.00716893   0.00746274  -0.00350869   0.00879899
  0.000325603   0.00297791  -0.00296868   0.00130538  -0.00351086
  0.000386531   0.0022207   -0.00225397   0.0010546   -0.00270529

In [15]:
# The corresponding entry of the backsolve gradient, unwrapped from its Box for comparison.
value(∇J[1][1])

4×5 Array{Float64,2}:
  0.00101469    0.00594186  -0.00590623   0.00310045  -0.00723866
 -0.00121427   -0.00716893   0.00746274  -0.00350869   0.00879899
  0.000325603   0.00297791  -0.00296868   0.00130538  -0.00351086
  0.000386531   0.0022207   -0.00225397   0.0010546   -0.00270529

### Densely Connected Matrix Network

In [16]:
# Forward pass of a densely connected matrix network: every layer receives the
# input and the outputs of all earlier layers.
#
# `params[i]` is multiplied element by element with `[X; One()]`, so it must hold
# one entry per value in `X` so far, followed by one entry that is paired with
# `One()`. The resulting terms are summed with broadcasting and passed through
# `h` element-wise.
#
# Keywords: `h` is the activation and `h′(x, y)` its derivative given the
# pre-activation `x` and the activation output `y`.
#
# Returns `(X, δ)`: `X` holds the input followed by every layer output, and `δ`
# holds the element-wise activation derivative of every layer.
function neural_net(params, input; h=h, h′=h′)
    X = [input]
    δ = Any[]
    for layer in params
        operands = vcat(X, One())
        terms = layer .* operands
        preact = broadcast(+, terms...)
        push!(X, h.(preact))
        push!(δ, h′.(preact, last(X)))
    end
    return X, δ
end
# Wrap a value in a 1×1 matrix.
function array(x)
    return fill(x, 1, 1)
end;

In [17]:
# Layer i holds one n[i+1]×n[j] block for every earlier value j = 1..i, followed
# by an n[i+1]×1 block in the last position.
params = [
    [j <= i ? init(n[i+1], n[j]) : init(n[i+1], 1) for j in 1:i+1]
    for i in 1:N
]
# Random input with n[1] rows and B columns, and a 1×B target.
x, y = init(n[1], B), init(1, B);

In [18]:
# Gradient of the loss with respect to every parameter block of the densely
# connected network, using the same solve-based formulation as the layered case.

# Forward pass: X holds the input and every layer output, δ the activation derivatives.
X,δ = neural_net(params,x)
# Diagonal matrix whose i-th entry is a row of i+1 mappings: one built from
# Δ(δ[i]) and RM(X[j]) for every j = 1..i, followed by Δ(δ[i]).
# NOTE(review): the inner adjoint appears to cancel against the element-wise
# adjoint applied when the comprehension is turned into a row with `'` — confirm.
D = Diagonal([[[ (Δ(δ[i]) ⦿ RM(X[j]))' for j=1:i]' Δ(δ[i])] for i=1:N])
# Lower-triangular matrix of boxes, initially all zero(Box).
L = LowerTriangular(zeros(Box,N,N)) 
# Fill the strictly lower part: entry (i, j) composes the δ[i] Hadamard factor
# with params[i][j+1], the block that multiplies X[j+1] in layer i.
for i=2:N, j=1:i-1
    L[i,j] = Box(RM(Δ(δ[i])) ⦿ params[i][j+1]) 
end
# Right-hand side: boxed zeros except the last entry, which boxes the loss
# derivative at the network output.
g  = [[zero(Box) for i=1:N-1]; Box(𝓁′(X[N+1],y))]
# UnitUpperTriangular treats the diagonal as ones, so wrapping -L' stands for
# (I - L'). Each solution entry is wrapped in a 1×1 array before D' is applied.
∇J = D'*array.(UnitUpperTriangular(-L')\g)

3-element Array{Array{Any,2},1}:
 [Box([0.0115569 0.00549805 … -0.0161394 0.00147656; 0.00306045 0.0017063 … -0.00473592 0.00038049; -0.0276811 -0.0140677 … 0.0418369 -0.00394034; -0.0251779 -0.0112138 … 0.0351862 -0.00343662]); Box([-0.0430054 -0.0506701 … -0.0449932 -0.0509088; -0.0119413 -0.0141006 … -0.0125106 -0.015109; 0.110069 0.127833 … 0.113095 0.129818; 0.0971891 0.109575 … 0.100665 0.108758])]                                             
 [Box([-0.00919674 -0.00497564 … 0.0145846 -0.00143558; 0.00844697 0.0043144 … -0.0115445 0.00113046; 0.00422829 0.00194669 … -0.00651271 0.000514676]); Box([0.244772 0.329478 0.367253 0.26078; -0.197947 -0.266371 -0.296825 -0.210844; -0.112565 -0.151513 -0.168872 -0.119915]); Box([0.0380329 0.0431343 … 0.0384889 0.044549; -0.0297322 -0.0366535 … -0.0314979 -0.0367982; -0.0175076 -0.020336 … -0.0186085 -0.0207271])]
 [Box([0.181412 0.0937544 … -0.272871 0.0253847]); Box([-4.63588 -6.23969 -6.9542 -4.9386]); Box([-5.40713 -5.47197 -6.78082]);

In [19]:
# ∇Jfd is the gradient estimated with central finite differences; it has the
# same nesting as params (layer, then block, then linear entry index).
∇Jfd = params * 0
ϵ = params * 0      # perturbation with the same nesting as params, all zeros
for i in 1:length(ϵ)
    for j in 1:length(ϵ[i])
        for k in 1:length(ϵ[i][j])
            # Perturb exactly one entry, evaluate the loss on both sides, then reset.
            ϵ[i][j][k] = 𝜀
            loss_plus = 𝓁(neural_net(params + ϵ, x)[1][N+1], y)
            loss_minus = 𝓁(neural_net(params - ϵ, x)[1][N+1], y)
            ∇Jfd[i][j][k] = (loss_plus - loss_minus) / 2𝜀
            ϵ[i][j][k] = 0.0
        end
    end
end

In [20]:
# Finite-difference gradient for the first block of the first layer (same position as params[1][1]).
∇Jfd[1][1]

4×5 Array{Float64,2}:
  0.0115569    0.00549805  -0.015176    -0.0161394    0.00147656
  0.00306045   0.0017063   -0.00454535  -0.00473592   0.00038049
 -0.0276811   -0.0140677    0.0394598    0.0418369   -0.00394034
 -0.0251779   -0.0112138    0.0325317    0.0351862   -0.00343662

In [21]:
# The corresponding entry of the backsolve gradient, unwrapped from its Box for comparison.
value(∇J[1][1])

4×5 Array{Float64,2}:
  0.0115569    0.00549805  -0.015176    -0.0161394    0.00147656
  0.00306045   0.0017063   -0.00454535  -0.00473592   0.00038049
 -0.0276811   -0.0140677    0.0394598    0.0418369   -0.00394034
 -0.0251779   -0.0112138    0.0325317    0.0351862   -0.00343662