In [None]:
using Pkg,Statistics,Random,Printf,GZip,Knet,Plots,LinearAlgebra,Distributions #,Interact,StatsBase

## Generating samples from a noisy Gaussian (training data)

In [None]:
# ENV["GRDIR"]=""
# Pkg.build("GR")

In [None]:
Random.seed!(4); # seed the global RNG so the noise drawn below is reproducible
Range=3.0; # range of the x values for the target Gaussian function
Incr = 0.03; # spacing of the x grid; determines the number of samples from which we'll learn
Noise_std=0.1; # standard deviation of the noise added on the Gaussian

In [None]:
# generate the data from which we'll learn the Gaussian function
# obligatory arguments listed before ";" while optional arguments come after ";".
# Sample exp(-x^2) on the grid -range:incr:range and add zero-mean Gaussian noise.
#   range : half-width of the sampled x interval
#   noise : standard deviation of the additive noise
#   incr  : grid spacing (defaults to the notebook-level Incr)
# Returns the tuple (x,y) of equally long vectors.
function gen_noisy_gaussian(;range=1.0,noise=0.1,incr=Incr)
    # build the grid from the keyword arguments; the global Range is no longer read here,
    # so the `range` keyword actually takes effect
    x = collect(-range:incr:range)
    y = exp.(-x.^2) .+ noise .* randn(length(x)) # additive gaussian noise
    return (x,y)
end
# output is two vectors x,y

In [None]:
# draw the noisy training set
(x_train,y_train) = gen_noisy_gaussian(range=Range,noise=Noise_std);
# drop the last sample of each vector
# (presumably so that the sample count is a multiple of Batchsize - confirm)
pop!(x_train);pop!(y_train);

In [None]:
Ntrain =length(x_train) # number of training data points

In [None]:
plot(x_train,[y_train,exp.(-x_train.^2)])

## Construct the network, the loss function, etc.

In [None]:
HiddenSize = 2; # number of neurons in the hidden layer
Batchsize = 10; # number of training samples per mini-batch
RegWeight=0.001; # lambda for L2 regularization
InitNorm = 0.5; # initial weight norm

LearningRate = 0.01; # step size of the gradient updates (passed to mytrain! as lr)

In [None]:
# The deep learning package requires a certain array structure for the weights
# but it is easier for the later analysis to dump them all into a single column vector
# Stack the four parameter groups w[1], w[2], w[3], w[4] into one column vector
flat(w) = vcat(w[1], w[2], w[3], w[4])

In [None]:
# reconstruct the weight array from the flat weight vector
# Inverse of flat(): split wf into three chunks of HiddenSize entries each,
# followed by the last entry as a scalar
function unflat(wf)
    h = HiddenSize
    first_chunk = wf[1:h]
    second_chunk = wf[h+1:2*h]
    third_chunk = wf[2*h+1:3*h]
    return [first_chunk, second_chunk, third_chunk, wf[end]]
end

In [None]:
# one layer network performing: tanh.(w[hidden,input] * x[input,batchsize] .+ b[hidden,1])
# The dot "." is for "broadcasting": performing the operation pointwise on each input element

In [None]:
# Change this seed to try different initial weights w/o changing the training data
Random.seed!(2);

In [None]:
# initialize weights: w = [w1,w2,w3,w4] -> output = w3*tanh.(w1*x .+ w2) .+ w4
# w1, w2, w3 are vectors of length HiddenSize, w4 is a scalar
w = [rand(HiddenSize),rand(HiddenSize),rand(HiddenSize),rand()];
w = InitNorm*w/norm(flat(w)); # rescale w so that the norm is InitNorm
Nweights = length(flat(w)); # total number of scalar parameters

In [None]:
# both x and y are ordered in columns per training data point
# Mean squared error between the targets y and the network output
# w[3]'*tanh.(w[1]*x' .+ w[2]) .+ w[4], evaluated on all samples in x at once
function sqloss(w,x,y)
    hidden = tanh.(w[1]*x' .+ w[2])   # one column of hidden activations per sample
    guesses = w[3]'*hidden .+ w[4]    # row of predictions
    return mean(abs2, y' - guesses)
end

# L2 penalty: RegWeight times the summed squared norms of w[1]..w[4]
function reg(w)
    penalty = sum(norm(w[k])^2 for k=1:4)
    return RegWeight*penalty
end

# Full training objective: data term plus L2 regularization
function loss(w,x,y)
    data_term = sqloss(w,x,y)
    penalty = reg(w)
    return data_term + penalty
end

### Construct gradient-calculating functions

In [None]:
# grad() is a "functional" whose input and output
# is a function. Note that grad() requires loss to be a scalar function
lossgradient = grad(loss)
sqlossgradient = grad(sqloss)
reggradient = grad(reg)

In [None]:
# calculate gradient at the initial w
# dw has dimensions of w: each weight w_i is replaced with the gradient wrt w_i
dw = lossgradient(w,x_train,y_train)

### Effective loss function for type-II

In [None]:
# Effective loss: n times the full loss minus a correction proportional to lr.
#   w    : weight array [w1,w2,w3,w4]
#   x, y : data on which the loss and its gradients are evaluated
#   lr   : learning rate entering the correction term
# Reads the globals Ntrain and Batchsize; the per-sample sum below runs over i=1:Ntrain,
# so x and y are assumed to hold at least Ntrain entries - TODO confirm for other callers.
function eloss(w,x,y;lr=LearningRate)
    # n is the number of whole batches per epoch
    n = floor(Int,Ntrain/Batchsize);
    # leading term
    losslead = n*loss(w,x,y)
    # squared norm of the gradient of the full loss
    losscorr = norm(flat(lossgradient(w,x,y)))^2
    # squared norm of the gradient of the data term alone, scaled by 1/(Ntrain-1)
    losscorr += norm(flat(sqlossgradient(w,x,y)))^2/(Ntrain-1)
    # minus the summed squared norms of the single-sample data-term gradients
    losscorr -= sum(norm(flat(sqlossgradient(w,[x[i]],[y[i]])))^2 for i=1:Ntrain)/(Ntrain*(Ntrain-1))
    # overall prefactor of the correction
    losscorr *= 0.25*lr*n*(n-1)
    return (losslead - losscorr)
end

In [None]:
elossgradient=grad(eloss)

In [None]:
eloss(w,x_train,y_train)

In [None]:
elossgradient(w,x_train,y_train)

### Training function

In [None]:
# THE TRAINING FUNCTION THAT PRESENTS THE TRAINING SET IN RANDOM BATCHES (WITH NO REPLACEMENT)
# For random batches WITH replacement, move the line "perm = .." inside the for loop

# Run one epoch of mini-batch gradient descent on w (modified in place and returned).
# Batches are consecutive chunks of a fresh random permutation of the training indices.
function mytrain!(w;lr=0.1)
    # samples beyond the last whole batch are left out when Ntrain/Batchsize is not an integer
    nbatches = floor(Int,Ntrain/Batchsize)
    order = randperm(Ntrain)
    for b in 1:nbatches
        members = order[(b-1)*Batchsize+1:b*Batchsize]
        xbatch = [x_train[k] for k in members]
        ybatch = [y_train[k] for k in members]

        # gradient over this batch, then a plain gradient-descent step on every parameter group
        g = lossgradient(w,xbatch,ybatch)
        for k in 1:length(w)
            w[k] -= lr*g[k]
        end
    end
    return w
end

## Initial training run

In [None]:
Nepoch = 1000; # For a quick training run

In [None]:
# Collect weights after each epoch in an array (trajectory).
# deepcopy is needed because mytrain! keeps modifying the same w; copy would only copy the top layer.
@time w_training = [ deepcopy(mytrain!(w,lr=LearningRate)) for epoch=1:Nepoch ];
# one row of flat weights per epoch
wf_training = zeros(Nepoch,Nweights)
for (i, wi) in enumerate(w_training)
    wf_training[i,:] = flat(wi)
end

In [None]:
# Compare the learned function with the actual gaussian
xplot=collect(-Range:0.01:Range) # create an array of x values within the range
# same forward pass as in sqloss; the result is a row, hence the transpose when plotting
y_pred = w[3]'*tanh.(w[1]*xplot' .+ w[2]) .+ w[4] # generate the predicted y values
# plot the converged function, the noise-free gaussian and the noisy training samples
plot(xplot,[y_pred',exp.(-xplot.^2)]); scatter!(x_train,y_train,leg=false)

In [None]:
# Plot loss vs epoch
SamplingRate=10; # evaluate the loss only at every SamplingRate-th epoch
x = collect(1:SamplingRate:Nepoch); # epoch indices that are plotted
y = [loss(w_training[i],x_train,y_train) for i in x]; # full-training-set loss at those epochs
plot(x,y)
#plot(x,y,xaxis=:log10,yaxis=:log10) # can also plot in log-log scale

## Diffusion tensor, Hessian, Covariance Matrix

In [None]:
# Calculate the diffusion tensor by sampling the noise
# Estimate the diffusion tensor at w by sampling the noise of a single training epoch.
#   w  : weight array [w1,w2,w3,w4] used as the common starting point (not modified)
#   n  : number of independent one-epoch samples used for the estimate
#   lr : learning rate handed to mytrain!
# Returns half the covariance (normalised by n) of the flat weights reached after one epoch.
function diffusiontensor_num(w,n;lr=LearningRate) # n: number of samples used for estimation
    wstart = deepcopy(w)
    nw = length(flat(wstart))
    wlist = zeros(nw,n) # column i: flat weights after the i-th independent epoch

    for i=1:n
        ww = deepcopy(wstart) # mytrain! works in place, so every sample needs a fresh copy
        mytrain!(ww,lr=lr)    # honour the lr keyword instead of the global LearningRate
        wlist[:,i] = flat(ww)
    end

    # subtract mean
    wlist .-= sum(wlist,dims=2)./n

    # D[α,β] = (1/n) * Σ_i wlist[α,i]*wlist[β,i]
    D = (wlist*wlist')./n
    return 0.5*D
end

In [None]:
# To calculate the Hessian,
# define a function returning elements of the loss-gradient vector dL/dw_j

# j-th component of the flattened loss gradient
function lossgradj(w,x,y,j)
    g = flat(lossgradient(w,x,y))
    return g[j]
end

In [None]:
# Define a function calculating a column of Hessian matrix:
# Returns d^2(L)/dw_idw_j for all i and given j
lossgradgrad = grad(lossgradj)

In [None]:
# Assemble the Hessian of loss at w column by column:
# column j is the flattened gradient of the j-th loss-gradient component
function hessianmatrix(w,x,y)
    nw = length(flat(w))
    H = zeros(nw,nw)
    for j in axes(H,2)
        H[:,j] = flat(lossgradgrad(w,x,y,j))
    end
    return H
end

In [None]:
# This calculation is from Michael's overleaf notes:
# https://www.overleaf.com/2523873322bvvnxpwnskfk
# Return the matrix C that satisfies H*C + C*H = (2/lr)*D,
# obtained in the eigenbasis of H where the equation decouples entry by entry
function covariancematrix(D,H;lr=LearningRate)
    h, O = eigen(H)        # eigenvalues and eigenvectors of H
    Nw = length(h)
    Drot = O'*D*O          # D expressed in the eigenbasis of H
    Delta = zeros(Nw,Nw)
    for j in 1:Nw, i in 1:Nw
        Delta[i,j] = Drot[i,j]/(h[i]+h[j])
    end
    return (2/lr)*O*Delta*O'
end

### Use Newton's Method to find the minimum of loss (not eloss)

In [None]:
# Using Newton's method, get the true minimum of the full loss function,
# starting from the last point of the training trajectory
wf = flat(w_training[end]);
# implement Newton's method to find the true minimum
for n=1:10
    Hess = hessianmatrix(unflat(wf),x_train,y_train)
    gradwf = flat(lossgradient(unflat(wf),x_train,y_train))
    # Newton step: solve Hess*step = gradwf rather than forming inv(Hess) explicitly
    wf = wf - Hess\gradwf
end

wminf = wf[:,1]       # flat weights at the loss minimum
wmin = unflat(wminf)  # same point in the [w1,w2,w3,w4] layout

In [None]:
lossgradient(wmin,x_train,y_train)

### Hessian at the loss minimum

In [None]:
# Hessian at the loss minimum
Hessmin = hessianmatrix(wmin,x_train,y_train)

### Diffusion tensor at the loss minimum

In [None]:
# diffusion tensor at the loss minimum (not the minimum of effective loss)
Dmin = diffusiontensor_num(wmin,10000;lr=LearningRate)

### Covariance matrix as a function of Hessian and Diffusion matrix

In [None]:
Covmin = covariancematrix(Dmin,Hessmin,lr=LearningRate)

In [None]:
# test: must be zero
Hessmin*Covmin + Covmin*Hessmin - (2/LearningRate)*Dmin

## Steady-state

In [None]:
Random.seed!(2) # Verified that the results don't change for different seeds.
Nepoch2 = 100000; # will take about 10 min for 100,000 epochs
# Reassigns the global learning rate: every later call that relies on the default
# lr=LearningRate (e.g. eloss) sees 0.001, whereas Dmin and Covmin above were computed with 0.01.
LearningRate = 0.001;

In [None]:
w = deepcopy(wmin); # start from the minimum of the potential

In [None]:
@time w_ss = [ deepcopy(mytrain!(w,lr=LearningRate)) for epoch=1:Nepoch2 ];

In [None]:
# Construct the flat trajectory: one row of flat weights per steady-state epoch
wf_ss = zeros(Nepoch2,Nweights)
for (i, wi) in enumerate(w_ss)
    wf_ss[i,:] = flat(wi)
end

### Visualize the steady-state distribution

In [None]:
x_index = 5
y_index = 6;

In [None]:
using StatsBase
trans=30000 # earlier points are transient from loss minimum to the "effective loss" minimum
# row indices of the trajectory that are kept (from trans to the last epoch)
hrange=collect(trans:length(wf_ss[:,1]))
resxy=(200,200) # number of histogram bins requested along each of the two weights
# 2d histogram of the two selected weights over the kept part of the trajectory
fith = fit(Histogram,(wf_ss[hrange,x_index],wf_ss[hrange,y_index]),nbins=resxy)
# the next two lines are not the last expression of the cell, so they display nothing
fith.weights
fith.edges
maxhist=maximum(fith.weights) # largest bin count; reused later to scale the contour plots
#heatmap(fith.weights)
#histogram2d(wf_ss[1:end,x_index],wf_ss[1:end,y_index],bins=resxy)
histogram2d(wf_ss[trans:end,x_index],wf_ss[trans:end,y_index],bins=resxy)
scatter!([wminf[x_index,1]],[wminf[y_index,1]],leg=false,markercolor="cyan",markersize=4) # loss minimum

### Fit a Mv-Gaussian to the equilibrium data

In [None]:
Fit_ss = fit_mle(MvNormal,wf_ss[trans:end,:]')

### Steady-state mean

In [None]:
meanw = Distributions.mean(Fit_ss)

### Covariance matrix

In [None]:
Cov_ss = Distributions.cov(Fit_ss)

In [None]:
eigvals(Cov_ss)

In [None]:
Cov_ss_inv = inv(Cov_ss)

In [None]:
Cov_xy_inv = inv(Cov_ss[[x_index,y_index],[x_index,y_index]])

### Diffusion tensor (numerical)

In [None]:
# diffusion tensor at the steady-state mean meanw, estimated from 10,000 samples
# (Dmin_ef is assigned again further down, from the estimate taken at wmin_e)
Dmin_ef = diffusiontensor_num(unflat(meanw),10000;lr=LearningRate)

In [None]:
# Using gradient descent, get to the minimum of the effective loss function
# starting from the steady-state mean
wf = copy(meanw);
delta = 0.03; # step size of the descent
# 300 fixed-size gradient-descent steps on eloss
for n=1:300
    gradwf = flat(elossgradient(unflat(wf),x_train,y_train))
    wf = wf - delta*gradwf # the prefactor needs to be chosen properly by trial-and-error
end

wmin_ef = wf[:,1] # flat weights after the descent
wmin_e = unflat(wmin_ef) # same point in the [w1,w2,w3,w4] layout

In [None]:
elossgradient(wmin_e,x_train,y_train)

In [None]:
# diffusion tensor at the minimum of effective loss
Dmin_ef = diffusiontensor_num(wmin_e,10000;lr=LearningRate)

### Visualize the steady-state distribution on top of the loss landscape

In [None]:
# Construct a grid enclosing the steady-state trajectory
Lx = (maximum(wf_ss[trans:end,x_index])-minimum(wf_ss[trans:end,x_index]))/2
Ly = (maximum(wf_ss[trans:end,y_index])-minimum(wf_ss[trans:end,y_index]))/2
xrange = 1.5*Lx
yrange = 1.5*Ly
Nx = Ny = 10
dx = xrange/Nx
dy = yrange/Ny
xx = -xrange:dx:xrange;
yy = -yrange:dy:yrange;
x = collect(xx) .+ meanw[x_index]
y = collect(yy) .+ meanw[y_index]
Identity = Diagonal(ones(Nweights,Nweights));
# Imask zeroes the two plotted coordinates; work on a copy so Identity stays the identity
Imask = copy(Identity); Imask[x_index,x_index]=0;Imask[y_index,y_index]=0;
xmask = zeros(Nweights); xmask[x_index]=1.0;
ymask = zeros(Nweights); ymask[y_index]=1.0;

histogram2d(wf_ss[trans:end,x_index],wf_ss[trans:end,y_index],bins=200)

# Contours of the fitted Gaussian. Each landscape gets its own helper name so that a later
# definition cannot silently change what an earlier contour function computes.
gaussexp(xi,yi) = -(([xi yi]-meanw[[x_index y_index]])*Cov_xy_inv*([xi yi]-meanw[[x_index y_index]])')[1]
gaussref = gaussexp(x[end],y[end]) # reference value, computed once instead of at every grid point
ffit(xi,yi) =  maxhist * gaussexp(xi,yi)/gaussref
contour!(x,y,ffit,linestyle=:dash)

## Actual loss contours
#lossxy(x,y) = 5e9*(loss(unflat(Imask*meanw + x*xmask + y*ymask),x_train,y_train)-loss(unflat(meanw),x_train,y_train))
#contour!(x,y,lossxy)
lossmin = loss(wmin,x_train,y_train)
lossexcess(xi,yi) = loss(unflat(Imask*meanw + xi*xmask + yi*ymask),x_train,y_train) - lossmin
lossref = log(lossexcess(x[Nx],y[Ny])) # reference value, computed once
flossxy(xi,yi) = maxhist * log(lossexcess(xi,yi))/lossref
contour!(x,y,flossxy)

## Effective loss contours - takes a while to compute since eloss() is not simple
elossmin = eloss(wmin_e,x_train,y_train)
elossexcess(xi,yi) = eloss(unflat(Imask*meanw + xi*xmask + yi*ymask),x_train,y_train) - elossmin
elossref = log(elossexcess(x[Nx],y[Ny])) # reference value, computed once (eloss is expensive)
elossxy(xi,yi) = maxhist * log(elossexcess(xi,yi))/elossref
contour!(x,y,elossxy,linestyle=:dashdot)


### Move to the eigen-coordinates

In [None]:
# most relevant directions
# NOTE(review): picking the last two columns assumes eigvecs orders the eigenvectors by
# increasing eigenvalue (largest variance last) - confirm for Cov_ss
Xidx = Nweights
Yidx = Nweights-1

O = eigvecs(Cov_ss); # columns are the eigenvectors of the fitted covariance
W_ss = wf_ss*O; # sample weights are row vectors
COV_ss = O'*Cov_ss*O # covariance expressed in the eigenvector basis
COV_xy_inv = inv(COV_ss[[Xidx,Yidx],[Xidx,Yidx]]) # inverse of the 2x2 block of the two plotted directions

# the same change of basis applied to the reference points (column vectors)
meanW = O'*meanw
Wmin_ef = O'*wmin_ef
Wminf = O'*wminf


In [None]:
histogram2d(W_ss[trans:end,Xidx],W_ss[trans:end,Yidx],bins=200,aspect_ratio=1)

# grid enclosing the steady-state cloud in the two selected eigen-directions
Lx = (maximum(W_ss[trans:end,Xidx])-minimum(W_ss[trans:end,Xidx]))/2
Ly = (maximum(W_ss[trans:end,Yidx])-minimum(W_ss[trans:end,Yidx]))/2
xrange = 1.5*Lx
yrange = 1.5*Ly
Nx = Ny = 10
dx = xrange/Nx
dy = yrange/Ny
xx = -xrange:dx:xrange;
yy = -yrange:dy:yrange;
x = collect(xx) .+ meanW[Xidx]
y = collect(yy) .+ meanW[Yidx]
Identity = Array(Diagonal(ones(Nweights,Nweights)));
# Imask zeroes the two plotted coordinates; work on a copy so Identity stays the identity
Imask = copy(Identity); Imask[Xidx,Xidx]=0;Imask[Yidx,Yidx]=0;
xmask = zeros(Nweights); xmask[Xidx]=1.0;
ymask = zeros(Nweights); ymask[Yidx]=1.0;

# Contours of the fit mv-Gaussian. A dedicated helper name keeps Ffit tied to the Gaussian
# even after the loss-based helper below has been defined.
Fgaussexp(xi,yi) = -(([xi yi]-meanW[[Xidx Yidx]])*COV_xy_inv*([xi yi]-meanW[[Xidx Yidx]])')[1]
Fgaussref = Fgaussexp(x[end],y[end]) # reference value, computed once
Ffit(xi,yi) = maxhist* Fgaussexp(xi,yi)/Fgaussref
#contour!(x,y,Ffit,linestyle=:dash)

# contours of the original loss
Flossmin = loss(wmin,x_train,y_train)
Flossexcess(xi,yi) = loss(unflat(O*(Imask*meanW + xi*xmask + yi*ymask)),x_train,y_train) - Flossmin
Flossref = log(Flossexcess(x[Nx],y[Ny])) # reference value, computed once instead of at every grid point
Flossxy(xi,yi) = (maxhist/5) * (log(Flossexcess(xi,yi)) - Flossref)
contour!(x,y,Flossxy)

# contours of effective loss - takes a while since eloss() is not simple
#elossmin = eloss(wmin_e,x_train,y_train)
#fexp(xi,yi) = eloss(unflat(O*(Imask*meanW + xi*xmask + yi*ymask)),x_train,y_train) - elossmin
#Felossxy(xi,yi) = (maxhist/2) * log(fexp(xi,yi))/log(fexp(x[Nx],y[Ny]))
#contour!(x,y,Felossxy,linestyle=:dashdot)

## Test Mike's calculations for effective loss and diffusion matrix in type-II

### First test the effective loss function

In [None]:
# visual confirmation (minimum of effective loss is in the middle of the cloud)
histogram2d(W_ss[trans:end,Xidx],W_ss[trans:end,Yidx],bins=200)#,aspect_ratio=1)

scatter!([Wmin_ef[Xidx,1]],[Wmin_ef[Yidx,1]],leg=false,markercolor="white",markersize=6)
scatter!([meanW[Xidx]],[meanW[Yidx]],leg=false,markercolor="blue",markersize=4)
# scatter!([Wminf[Xidx]],[Wminf[Yidx]],leg=false,markercolor="yellow",markersize=4)

### Next test the diffusion tensor expression

In [None]:
# test the theory for the diffusion matrix of type-II
# j-th component of the flattened gradient of the data term sqloss
function sqlossgradj(w,x,y,j)
    g = flat(sqlossgradient(w,x,y))
    return g[j]
end
sqlossgradgrad = grad(sqlossgradj)

# j-th component of the flattened gradient of the regularization term
function reggradj(w,j)
    g = flat(reggradient(w))
    return g[j]
end
reggradgrad = grad(reggradj)

In [None]:
# V matrix: column i is the flattened sqloss gradient on training sample i alone, divided by Ntrain
# NOTE(review): the gradients are taken at the current global w (the end point of the
# steady-state run), not at wmin_e where Dmin_ef was sampled - confirm this is intended.
V = zeros(Nweights,Ntrain)
for i=1:Ntrain
    # index the training data directly so the notebook-level x and y are not rebound to scalars
    V[:,i] = flat(sqlossgradient(w,[x_train[i]],[y_train[i]]))
end
V /= Ntrain;

In [None]:
# U matrix: U[:,α,i] is the flattened gradient of the α-th sqloss-gradient component
# on training sample i alone, divided by Ntrain
# NOTE(review): evaluated at the current global w, like V - confirm the intended evaluation point.
U = zeros(Nweights,Nweights,Ntrain)
for i=1:Ntrain
    # one-sample batch; named so that the notebook-level x and y are not rebound to scalars
    xsample = [x_train[i]]
    ysample = [y_train[i]]
    for α=1:Nweights
        U[:,α,i] = flat(sqlossgradgrad(w,xsample,ysample,α))
    end
end
U /= Ntrain;

In [None]:
# X vector
X = flat(reggradient(w));

In [None]:
# Y matrix: column j is the flattened gradient of the j-th component of the
# regularization gradient, evaluated at w
Y = zeros(Nweights,Nweights);
for j=1:Nweights
    Y[:,j] = flat(reggradgrad(w,j))
end

In [None]:
Nbatch = floor(Int,Ntrain/Batchsize)

In [None]:
# Z matrix
Z = Nbatch * (Y*V - sum(U[:,i,:]*X[i] for i=1:Nweights));

In [None]:
# S matrix: S[α,i,j] = Nbatch^2 * Σ_β U[α,β,i]*V[β,j]
S = zeros(Nweights,Ntrain,Ntrain)
for j=1:Ntrain
    S[:,:,j] = sum(U[:,β,:]*V[β,j] for β=1:Nweights)
end
S *= Nbatch^2;

# SDelta: copy of S with the i == j entries set to zero
SDelta = copy(S)
for i=1:Ntrain
    SDelta[:,i,i]=zeros(Nweights)
end

In [None]:
# B matrix: SDelta summed over its second index, B[α,k] = Σ_j SDelta[α,j,k]
B = sum(SDelta[:,j,:] for j=1:Ntrain);

In [None]:
# C matrix: SDelta summed over its third index, C[α,k] = Σ_j SDelta[α,k,j]
C = sum(SDelta[:,:,j] for j=1:Ntrain);

In [None]:
# F matrix: sum over all (i,j) of the outer product of SDelta[:,i,j] with itself
F = sum(sum(SDelta[:,i,j]*SDelta[:,i,j]' for i=1:Ntrain) for j=1:Ntrain);

In [None]:
# G matrix: sum over all (i,j) of the outer product of SDelta[:,i,j] with SDelta[:,j,i]
G = sum(sum(SDelta[:,i,j]*SDelta[:,j,i]' for i=1:Ntrain) for j=1:Ntrain);

In [None]:
# Coefficients a_i
# short aliases used in the formulas below (these become notebook-level globals)
M = Ntrain    # number of training samples
m = Batchsize # samples per batch
n = Nbatch    # batches per epoch

# a numeric literal directly in front of a name or parenthesis multiplies it: 12m^2 is 12*(m^2)
a0 = (M-m)*(M+m)/(12m^2)
a1 = (n+1)*(M-m)/(12(M-1))
a2 = (M-m)*((M+m)*(M-4)+6)/(12(M-2)*(M-1)^2)
a3 = (M-m)*(M+m-2)/(4(M-1)^2)
a4 = -(0.5*(M-m)/(M-1))^2
a5 = -(M-m)*(12M+(M-4)*(M+6)*m)/(12M*(M-2)*(M-1)^2);

In [None]:
Dtheory = 0.5*LearningRate^4*(
    a0*Z*Z' +
    a1*(Z*(B'-C') + (B-C)*Z') +
    a2*(B*B') +
    a2*(C*C') +
    (a3-2a2)*F +
   (a4-2a5)*G +
    a5*(B*C' + C*B')
    )

In [None]:
Dtheory./Dmin_ef

### An approximation for Diffusion and Covariance matrices

In [None]:
Hessmin_e = hessianmatrix(wmin_e,x_train,y_train)

In [None]:
# check the approximation for D
coefD = a2*(n*LearningRate)^4/2
Dapprx = coefD * (Hessmin_e*V*V'*Hessmin_e)

In [None]:
Dapprx./Dmin_ef

In [None]:
# Use the FD relation to calculate the covariance matrix for D ∼ HVV'H

coefD = a2*(n*LearningRate)^4/2
# a single decomposition yields matching eigenvalues h and eigenvectors O of the Hessian
h, O = eigen(Hessmin_e);
W = O'*V*V'*O # V*V' expressed in the eigenbasis of the Hessian
# hmat[α,β] = (2/LearningRate)*coefD*h[α]*h[β]/(h[α]+h[β])
hmat = zeros(Nweights,Nweights)
for α=1:Nweights
    for β=1:Nweights
        hmat[α,β] = (2/LearningRate)*coefD*h[α]*h[β]/(h[α]+h[β])
    end
end
Cov_transformed = hmat.*W
Covapprx = O*Cov_transformed*O' # rotate back to the original weight coordinates
# NOTE(review): with Range=3.0, Incr=0.03 and Batchsize=10, n = Nbatch is presumably 20 as well -
# check whether a factor of n is unaccounted for in coefD.
Covapprx./Cov_ss ./20 # why the extra factor of 20?