In [1]:
using LinearAlgebra
import Base: \
"""
    Bidiagonal(dv::Vector{T}, ev::Vector{S}, uplo::Symbol)

Build a `Bidiagonal` from a diagonal `dv` and an off-diagonal `ev` whose element
types differ. Both vectors are stored with the promoted element type
`promote_type(T, S)`; `uplo` is forwarded unchanged.
"""
function LinearAlgebra.Bidiagonal(dv::Vector{T}, ev::Vector{S}, uplo::Symbol) where {T,S}
    # Common element type able to hold entries from both vectors.
    W = promote_type(T, S)
    return Bidiagonal{W,Vector{W}}(dv, ev, uplo)
end


## The base method narrows the element type too much. We have to ensure that the
## result is at least as wide as the input.
function Base.:\(adjA::Adjoint{<:Any,<:Union{UnitUpperTriangular,UnitLowerTriangular}}, B::AbstractVector)
    tri = adjA.parent
    za = zero(eltype(tri))
    zb = zero(eltype(B))
    # Element type wide enough for A's entries, B's entries, and sums of their products.
    TAB = promote_type(eltype(tri), eltype(B), typeof(za * zb + za * zb))
    # Solve on a widened copy so that B itself is not overwritten.
    rhs = similar(B, TAB, size(B))
    copyto!(rhs, B)
    return ldiv!(adjoint(convert(AbstractArray{TAB}, tri)), rhs)
end

\ (generic function with 152 methods)

In [2]:
"""
    h(x)

Activation function: `exp(-x)`.
"""
function h(x)
    return exp(-x)
end

"""
    h′(x, y)

Derivative of `h` written in terms of its output `y = h(x)`: for `h(x) = exp(-x)`
the derivative is `-y`. The argument `x` is accepted but not used.
"""
function h′(x, y)
    return -y
end

"""
    𝓁(x, y)

Loss: half of the sum of squared entries of `x - y`.
"""
function 𝓁(x, y)
    residual = x - y
    return sum(abs2, residual) / 2
end

"""
    𝓁′(x, y)

Derivative of `𝓁` with respect to `x`, i.e. `x - y`.
"""
function 𝓁′(x, y)
    return x - y
end

"""
    init(sizes...)

Draw standard-normal samples of the requested shape (a scalar when no sizes are
given) and scale them by `0.1`.
"""
function init(sizes...)
    return 0.1 * randn(sizes...)
end

init (generic function with 1 method)

In [3]:
# Step size used by the finite-difference gradient checks
𝜀 = 0.0001
# Layer widths, from the input layer to the output layer
n = [5, 4, 3, 1]
# Number of layers (one fewer than the number of widths)
N = length(n) - 1
# Number of columns in the input/target matrices built with `init(…, B)`
B = 7

7

### Scalar Neural Network

In [4]:
"""
    neural_net(params, input; h=h, h′=h′, N=length(params))

Run the scalar network forward through the first `N` layers.

`params[i]` holds the weight and the bias of layer `i`. Returns `(X, δ)` where
`X` is the input followed by every layer output and `δ[i]` is
`h′(pre_activation, output)` for layer `i`.
"""
function neural_net(params, input; h=h, h′=h′, N=length(params))
    X = [input]
    δ = []
    for layer in 1:N
        # weight * previous activation + bias * 1
        pre = sum(params[layer] .* [X[layer], 1])
        push!(X, h(pre))
        push!(δ, h′.(pre, X[layer + 1]))
    end
    return X, δ
end


neural_net (generic function with 1 method)

In [5]:
# One [weight, bias] pair of scalars per layer
params = [[init(), init()] for _ in 1:N]
# Scalar input and target output
x, y = init(), init()

(0.042904888226299086, -0.15626856145304235)

In [6]:
# Forward pass: activations X and the per-layer h′ values δ
X, δ = neural_net(params, x)
# Lower bidiagonal matrix: zero diagonal, δ[i] * weight of layer i (i = 2..N) below it
L = Bidiagonal(zeros(N), [δ[i] * params[i][1] for i in 2:N], :L)
# Diagonal whose i-th entry is δ[i] times the row vector [X[i], 1]'
D = Diagonal(δ .* [[X[i], 1]' for i in 1:N])
# Right-hand side: zeros except for the loss derivative at the network output
g = vcat(zeros(N - 1), 𝓁′(X[N + 1], y))
# Gradient with respect to every [weight, bias] pair
∇J = adjoint(D) * ((I - adjoint(L)) \ g)

3-element Array{Array{Float64,1},1}:
 [8.47879e-5, 0.00197618]
 [0.0639805, 0.0654314]  
 [-0.866442, -0.933566]  

In [7]:
# ∇Jfd is the gradient estimated with central finite differences
∇Jfd = ∇J * 0
ϵ = ∇J * 0
for i in 1:N
    for j in 1:2
        # Perturb a single parameter by ±𝜀 and difference the two losses.
        ϵ[i][j] = 𝜀
        loss_hi = 𝓁(neural_net(params .+ ϵ, x)[1][N + 1], y)
        loss_lo = 𝓁(neural_net(params .- ϵ, x)[1][N + 1], y)
        ∇Jfd[i][j] = (loss_hi - loss_lo) / 2𝜀
        # Reset the perturbation before moving to the next parameter.
        ϵ[i][j] = 0.0
    end
end
∇Jfd

3-element Array{Array{Float64,1},1}:
 [8.47879e-5, 0.00197618]
 [0.0639805, 0.0654314]  
 [-0.866442, -0.933566]  

### Matrix Neural Network

In [8]:
import Base: +,-,*,/,∘

"""
    LinearMatrixOp(f, fadj)

A linear operator stored as two callables: `f` applies the operator and `fadj`
applies its adjoint.
"""
struct LinearMatrixOp # Is a parametric type necessary? It causes unreadable error messages and some other issues.
    f
    fadj
end

# An operator given by a single function uses that function as its own adjoint.
LinearMatrixOp(f::Function) = LinearMatrixOp(f, f)

# Elementary operators
LeftMul(A::Matrix) = LinearMatrixOp(Y -> A * Y, Y -> A' * Y)   # Y ↦ A*Y
RightMul(A::Matrix) = LinearMatrixOp(Y -> Y * A, Y -> Y * A')  # Y ↦ Y*A
HadMul(A::Matrix) = LinearMatrixOp(Y -> Y .* A)                # Y ↦ Y.*A (elementwise)
ZeroMul() = LinearMatrixOp(Y -> Zero())                        # maps everything to Zero()
IdentMul() = LinearMatrixOp(Y -> Y) # not necessary, can be commented

Base.zero(::Type{LinearMatrixOp}) = ZeroMul()
Base.one(::Type{LinearMatrixOp}) = IdentMul()

# Taking the adjoint swaps the forward and adjoint callables.
function Base.adjoint(A::LinearMatrixOp)
    return LinearMatrixOp(A.fadj, A.f)
end

function Base.copy(A::LinearMatrixOp)
    return LinearMatrixOp(A.f, A.fadj)
end

# Applying an operator is written as multiplication.
function Base.:*(A::LinearMatrixOp, X::Union{AbstractArray,Number})
    return A.f(X)
end

# Negation negates both the forward and the adjoint action.
function Base.:-(A::LinearMatrixOp)
    return LinearMatrixOp(Y -> -A.f(Y), Y -> -A.fadj(Y))
end

# Composition: forward is A after B, adjoint is B's adjoint after A's adjoint.
function Base.:∘(A::LinearMatrixOp, B::LinearMatrixOp)
    return LinearMatrixOp(A.f ∘ B.f, B.fadj ∘ A.fadj)
end

# A zero
"""
    Zero

Singleton standing in for an additive zero of unknown shape. It is returned by
`zero(Any)` so that generic linear-algebra code can run on containers with
element type `Any`.

Adding or subtracting `Zero()` on either side leaves the other operand
unchanged (up to sign), and multiplying anything by `Zero()` on the right gives
`Zero()`.
"""
struct Zero end
Base.zero(::Type{Any}) = Zero()

# Addition: Zero is an identity on either side.
Base.:+(::Zero, ::Zero) = Zero()
Base.:+(::Zero, A) = A
Base.:+(A, ::Zero) = A

# Negation and subtraction. Unary minus and the Zero - Zero method are required:
# without them `Zero() - Zero()` fell through to `-A` with `A::Zero` and raised
# a MethodError.
Base.:-(::Zero) = Zero()
Base.:-(::Zero, ::Zero) = Zero()
Base.:-(::Zero, A) = -A
Base.:-(A, ::Zero) = A

# Multiplication by Zero on the right annihilates.
Base.:*(::Zero, ::Zero) = Zero()
Base.:*(X, ::Zero) = Zero()

* (generic function with 349 methods)

In [9]:
"""
    neural_net(params, input; h=h, h′=h′)

Run the matrix network forward. `params[i][1]` is the weight and `params[i][2]`
the bias of layer `i`; the bias is broadcast-added to the weighted input.

Returns `(X, δ)`: `X` is the input followed by every layer output, and `δ[i]`
is `h′` broadcast over the pre-activation and output of layer `i`.
"""
function neural_net(params, input; h=h, h′=h′)
    X = [input]
    δ = []
    for i in 1:length(params)
        weight = params[i][1]
        bias = params[i][2]
        pre = weight * X[i] .+ bias
        push!(X, h.(pre))
        push!(δ, h′.(pre, X[i + 1]))
    end
    return X, δ
end
"""
    array(x)

Wrap `x` as the single entry of a 1×1 matrix.
"""
function array(x)
    return fill(x, 1, 1)
end

array (generic function with 1 method)

In [10]:
# params: `W_i` and `b_i`s: x_{i+1} <- Wi*x_i .+ b_i
params = [[init(n[k + 1], n[k]), init(n[k + 1])] for k in 1:N]
# Input is an n[1]×B matrix, target is a 1×B matrix
x, y = init(n[1], B), init(1, B);

In [11]:
# Backpropagation for the matrix network, written as an operator-valued linear solve.
# Forward pass: X is the input followed by each layer output, δ the per-layer h′ values.
X,δ = neural_net(params,x)
# Diagonal entry i is a 1×2 row of operators: [HadMul(δ[i]) ∘ RightMul(X[i])   HadMul(δ[i])].
D = Diagonal([[HadMul(δ[i]) ∘ RightMul(X[i]) HadMul(δ[i])] for i=1:N])
# Lower bidiagonal: I on the diagonal, negated HadMul(δ[i]) ∘ LeftMul(params[i][1]) (i=2:N) below it.
ImL = Bidiagonal([I for i in 1:N], -[HadMul(δ[i]) ∘ LeftMul(params[i][1]) for i=2:N] , :L)
# Right-hand side: N-1 Zero() placeholders followed by 𝓁′ at the network output X[N+1].
g = push!(Any[Zero() for i=1:N-1],𝓁′(X[N+1],y))
# Solve against the adjoint of ImL, wrap every entry as a 1×1 array, then apply D'.
∇J = D'*array.(ImL'\g);

In [12]:
# ∇Jfd is the gradient estimated with central finite differences
∇Jfd = params*0
ϵ = params*0
for i in 1:length(params)
    for wb in 1:2                       # 1 = weight, 2 = bias
        for j in eachindex(ϵ[i][wb])
            # Perturb a single entry by ±𝜀 and difference the two losses.
            ϵ[i][wb][j] = 𝜀
            loss_hi = 𝓁(neural_net(params+ϵ, x)[1][N + 1], y)
            loss_lo = 𝓁(neural_net(params-ϵ, x)[1][N + 1], y)
            ∇Jfd[i][wb][j] = (loss_hi - loss_lo) / 2𝜀
            # Reset the perturbation before moving to the next entry.
            ϵ[i][wb][j] = 0.0
        end
    end
end
∇Jfd;

In [13]:
∇Jfd[1][1]

4×5 Array{Float64,2}:
 -0.00407991    0.00285509   -0.0428952    0.000825005  -0.0118724  
  0.000410254  -0.000251142   0.00349368  -2.9138e-5     0.000856534
  0.000198321  -0.000139445   0.00190502   2.56003e-6    0.000499563
 -0.00191193    0.00135827   -0.0205804    0.000203086  -0.00526457 

In [14]:
∇J[1][1]

4×5 Array{Float64,2}:
 -0.00407991    0.00285509   -0.0428952    0.000825005  -0.0118724  
  0.000410254  -0.000251142   0.00349368  -2.9138e-5     0.000856534
  0.000198321  -0.000139445   0.00190502   2.56003e-6    0.000499563
 -0.00191193    0.00135827   -0.0205804    0.000203086  -0.00526457 

### A Showcase: Densely Connected Matrix Network

In [15]:
"""
    neural_net(params, input; h=h, h′=h′)

Run the densely connected network forward. Layer `i` sees the input and every
earlier layer output: `params[i]` is multiplied elementwise against
`[X..., I]`, so its last entry (paired with `I`) acts as the bias term, and the
products are summed with broadcasting.

Returns `(X, δ)`: `X` is the input followed by every layer output, and `δ[i]`
is `h′` broadcast over the pre-activation and output of layer `i`.
"""
function neural_net(params, input; h=h, h′=h′)
    X = [input]
    δ = []
    for i in 1:length(params)
        # One product per available activation, plus the trailing block times I.
        terms = params[i] .* [X..., I]
        pre = broadcast(+, terms...)
        push!(X, h.(pre))
        push!(δ, h′.(pre, X[i + 1]))
    end
    return X, δ
end;
"""
    array(x)

Wrap `x` as the single entry of a 1×1 matrix.
"""
function array(x)
    return fill(x, 1, 1)
end;

In [16]:
# Layer i has one weight block per earlier activation (n[i+1]×n[j], j = 1..i)
# followed by a single bias column (n[i+1]×1).
params = [[init(n[i + 1], j == i + 1 ? 1 : n[j]) for j in 1:(i + 1)] for i in 1:N]
# Input is an n[1]×B matrix, target is a 1×B matrix
x, y = init(n[1], B), init(1, B);

In [17]:
# Backpropagation for the densely connected network, written as an operator-valued linear solve.
# Forward pass: X is the input followed by each layer output, δ the per-layer h′ values.
X,δ = neural_net(params,x)
# Diagonal entry i is a row of operators: HadMul(δ[i]) ∘ RightMul(X[j]) for j=1:i, then HadMul(δ[i]).
# The operators are adjointed inside the comprehension and the resulting vector is adjointed again
# so that it can be concatenated horizontally with the final HadMul(δ[i]).
D = Diagonal([[[(HadMul(δ[i]) ∘ RightMul(X[j]))' for j=1:i]' HadMul(δ[i])] for i=1:N])
# Unit lower triangular wrapper around an uninitialised N×N buffer of element type Any.
ImL = UnitLowerTriangular(Matrix{Any}(undef,N,N))
# Fill every strictly-lower entry (i,j) with the operator linking layer j's output to layer i.
for i=2:N, j=1:i-1
    ImL[i,j] = -HadMul(δ[i]) ∘ LeftMul(params[i][j+1]) 
end
# Right-hand side: N-1 Zero() placeholders followed by 𝓁′ at the network output X[N+1].
g = push!(Any[Zero() for i=1:N-1],𝓁′(X[N+1],y))
# Solve against the adjoint of ImL, wrap every entry as a 1×1 array, then apply D'.
∇J = D'*array.(ImL'\g)

3-element Array{Array{Any,2},1}:
 [[0.0379171 0.0596779 … -0.00351366 -0.0183997; 0.0176358 0.0267665 … -0.000936798 -0.00829942; -0.00426545 -0.00667903 … 0.000503375 0.00207844; -0.0298646 -0.0460088 … 0.00486409 0.0139724]; [0.132619 0.158898 … 0.122803 0.140189; 0.0590691 0.0733649 … 0.0568234 0.0645881; -0.0149404 -0.0176034 … -0.0135742 -0.0151078; -0.106533 -0.120959 … -0.0929069 -0.103307]]                                                
 [[-0.0270448 -0.0421929 … 0.00362485 0.0128705; 0.00335261 0.00530938 … -0.000566726 -0.00162885; -0.00223139 -0.00341808 … 0.000263075 0.00107243]; [-0.740224 -0.735038 -0.880599 -0.81063; 0.0912298 0.0905783 0.108544 0.0999296; -0.0587999 -0.0583911 -0.0699529 -0.0643888]; [-0.0958499 -0.111914 … -0.086044 -0.0965746; 0.012141 0.0137523 … 0.0105494 0.0116968; -0.00767671 -0.00905481 … -0.00697657 -0.00763726]]
 [[-0.20336 -0.315236 … 0.0260609 0.0969537]; [-5.50542 -5.46695 -6.54949 -6.02892]; [-6.25354 -5.21931 -5.53321]; [-0.713717 -0.8373

In [18]:
# ∇Jfd is the gradient estimated with central finite differences
∇Jfd = params*0
ϵ = params*0
for i in 1:length(ϵ)
    for j in 1:length(ϵ[i])
        for k in 1:length(ϵ[i][j])
            # Perturb a single entry by ±𝜀 and difference the two losses.
            ϵ[i][j][k] = 𝜀
            loss_hi = 𝓁(neural_net(params+ϵ, x)[1][N + 1], y)
            loss_lo = 𝓁(neural_net(params-ϵ, x)[1][N + 1], y)
            ∇Jfd[i][j][k] = (loss_hi - loss_lo) / 2𝜀
            # Reset the perturbation before moving to the next entry.
            ϵ[i][j][k] = 0.0
        end
    end
end

In [19]:
∇Jfd[1][1]

4×5 Array{Float64,2}:
  0.0379171    0.0596779   -0.00049355   -0.00351366   -0.0183997 
  0.0176358    0.0267665   -0.000829261  -0.000936798  -0.00829942
 -0.00426545  -0.00667903  -1.16189e-5    0.000503375   0.00207844
 -0.0298646   -0.0460088   -0.000608528   0.00486409    0.0139724 

In [20]:
∇J[1][1]

4×5 Array{Float64,2}:
  0.0379171    0.0596779   -0.00049355   -0.00351366   -0.0183997 
  0.0176358    0.0267665   -0.000829261  -0.000936798  -0.00829942
 -0.00426545  -0.00667903  -1.16189e-5    0.000503375   0.00207844
 -0.0298646   -0.0460088   -0.000608528   0.00486409    0.0139724 