In [None]:
using Pkg,Statistics,Random,Printf,GZip,Knet,Plots,LinearAlgebra,Distributions

In [None]:
# GENERATING THE NOISY GAUSSIAN (TRAINING DATA)

In [None]:
# Seed the global RNG so the noisy training set drawn below is reproducible.
Random.seed!(4);
Range=3.0; # half-width of the sampling interval for the target Gaussian: x runs over -Range:Incr:Range
Incr = 0.03; # grid spacing in x; together with Range it determines the number of samples from which we'll learn
Noise_std=0.1; # passed as `noise` to gen_noisy_gaussian, where it scales the randn() term added to exp(-x^2)

In [None]:
# Generate the data from which we'll learn the Gaussian function.
# All arguments are keyword arguments (they follow the ";" in the signature):
#   range - half-width of the sampling interval; x runs over -range:incr:range
#   noise - factor multiplying the standard-normal noise added to exp(-x^2)
#   incr  - grid spacing in x (defaults to the global Incr)
# Returns the tuple (x,y) of two equal-length vectors.
# The grid is built from the keywords themselves, so the values given in the
# call (and not the globals Range/Incr) decide which x values are produced.
function gen_noisy_gaussian(;range=1.0,noise=0.1,incr=Incr)
    x = collect(-range:incr:range)
    y = exp.(-x.^2) + randn(length(x))*noise; # additive gaussian noise
    return (x,y)
end

In [None]:
# Draw the training set once; the cells below (gradient, diffusion tensor, training, plots) reuse these two vectors.
(x_train,y_train) = gen_noisy_gaussian(range=Range,noise=Noise_std);

In [None]:
Ntrain =length(x_train) # number of training data points

In [None]:
# Noisy training samples and the noise-free target exp(-x^2) drawn on the same axes
plot(x_train,[y_train,exp.(-x_train.^2)])

In [None]:
# CONSTRUCT THE NETWORK AND THE LOSS FUNCTION

In [None]:
HiddenSize = 2; # number of neurons in the hidden layer (length of w[1], w[2] and w[3])
Batchsize = 10; # number of training samples per mini-batch in mytrain!; also passed to diffusiontensor as Nb
RegWeight=0.001; # lambda for L2 regularization: multiplies the sum of squared weight norms in loss()
InitNorm = 0.5; # norm of the flattened initial weight vector after rescaling

In [None]:
# The deep learning package wants the weights as a nested array, but the later
# analysis is easier when all of them sit in one column vector.
# Concatenates w[1], w[2], w[3] and w[4], in that order, into a single vector.
function flat(w)
    parts = (w[1], w[2], w[3], w[4])
    return vcat(parts...)
end

In [None]:
# Reconstruct the nested weight array [w1,w2,w3,w4] from the flat weight vector,
# i.e. the inverse of flat(): three consecutive chunks of equal length followed
# by one scalar (the output bias).
# The chunk length is inferred from length(wf) (= 3*hidden + 1) rather than read
# from the global HiddenSize, so a vector that does not fit this layout raises
# an ArgumentError instead of being split at the wrong places.
function unflat(wf)
    n = length(wf)
    (n >= 1 && (n - 1) % 3 == 0) || throw(ArgumentError(
        "flat weight vector must have length 3*hidden+1, got length $n"))
    h = div(n - 1, 3) # number of hidden units encoded in wf
    return [wf[1:h],wf[h+1:2*h],wf[2*h+1:3*h],wf[end]]
end

In [None]:
# one layer network performing: tanh.(w[hidden,input] * x[input,batchsize] .+ b[hidden,1])
# The dot "." is for "broadcasting": performing the operation pointwise on each input element

In [None]:
# Change this seed to try different initial weights w/o changing the training data
# (the training set was already drawn above, so reseeding here only affects the weights)
Random.seed!(1);

In [None]:
# initialize weights: w = [w1,w2,w3,w4] -> output = w3*tanh.(w1*x .+ w2) .+ w4
# rand() draws uniformly from [0,1), so every initial weight is non-negative
w = [rand(HiddenSize),rand(HiddenSize),rand(HiddenSize),rand()];
w = InitNorm*w/norm(flat(w)); # rescale w so that the norm of the flattened weight vector is InitNorm
Nweights = length(flat(w)); # total number of scalar weights (3*HiddenSize + 1)

In [None]:
# Mean-squared error of the one-hidden-layer network plus an L2 penalty.
# Shapes as they are used here:
#   x, y : vectors with one entry per training data point (transposed to rows below)
#   w[1] : (input->hidden) weights, one per hidden unit
#   w[2] : (input->hidden) biases, one per hidden unit
#   w[3] : (hidden->output) weights, one per hidden unit
#   w[4] : (hidden->output) bias, a scalar
# The global RegWeight multiplies the sum of the squared norms of w[1]..w[4].
function loss(w,x,y)
    hidden = tanh.(w[1]*x' .+ w[2]) # (hidden x batch) activations
    output = w[3]'*hidden .+ w[4] # (1 x batch) network predictions
    residual = y' - output
    penalty = sum(norm(w[i])^2  for i=1:4)
    return mean(abs2,residual) + RegWeight*penalty
end


In [None]:
# Construct the gradient-calculating function. grad() is a "functional": it takes a function
# and returns a function. Note that grad() requires loss to be a scalar-valued function.
# lossgradient is called with the same arguments as loss: lossgradient(w,x,y).
lossgradient = grad(loss)

In [None]:
# calculate the gradient at the initial w over the full training set
# dw has dimensions of w: each weight w_i is replaced with the gradient wrt w_i
dw = lossgradient(w,x_train,y_train)

In [None]:
# The function calculating the diffusion tensor at the point w for a given batchsize.
# The prefactor (learning_rate)^2 is NOT INCLUDED.
#   w  - nested weight array, as accepted by flat() and lossgradient()
#   xt - training inputs  (x_train)
#   yt - training targets (y_train), same length as xt
#   Nb - batch size (Batchsize)
# Returns the (Nweights x Nweights) matrix
#   ((1/Nb - 1/Nt) / (2*Nt)) * sum_i g_i * g_i'
# where g_i is the flattened gradient of loss on training sample i alone and
# Nt = length(xt).
# Throws DimensionMismatch if xt and yt differ in length, and ArgumentError if
# they are empty (the prefactor would otherwise divide by zero).
function diffusiontensor(w,xt,yt,Nb) # xt=x_train, yt=y_train, Nb=Batchsize
    Nt = length(xt) # number of training examples to be summed over
    length(yt) == Nt || throw(DimensionMismatch(
        "xt has $Nt samples but yt has $(length(yt))"))
    Nt > 0 || throw(ArgumentError("need at least one training sample"))
    nw = length(flat(w)) # number of weights, that is, dimensions of the diffusion tensor
    D = zeros(nw,nw) # initialize the diffusion matrix
    for (x,y) in zip(xt,yt)
        g = flat(lossgradient(w,[x],[y])) # gradient of the loss on this single sample
        D .+= g .* g' # add the outer product in place (no new matrix per sample)
    end
    D .*= ((1/Nb)-(1/Nt))/(2*Nt)
    return D
end

# Use the diffusion tensor at the loss minimum.
# wmin is a hard-coded flat weight vector with 7 entries (= 3*HiddenSize+1 for HiddenSize = 2).
# NOTE(review): presumably taken from an earlier converged training run on this data — confirm,
# and regenerate it if the seeds, HiddenSize or RegWeight are changed.
wmin = [-1.1100157770342636,1.2336447839089348,0.6175562778179838,0.6682464780489719,0.9071070290803469,0.8871744628416607,-0.0297615338498045]
Dmin = diffusiontensor(unflat(wmin),x_train,y_train,Batchsize)
# Rescale Dmin to unit norm.
# NOTE(review): for a matrix, norm(Dmin,2) is the entrywise (Frobenius) norm in current Julia;
# opnorm(Dmin) would give the spectral norm — confirm which one is meant.
Dmin = Dmin/norm(Dmin,2)

In [None]:
# THE TRAINING FUNCTION: ONE EPOCH ON RANDOM MINI-BATCHES
# Performs floor(Ntrain/Batchsize) gradient steps on w (updated in place) and returns w.
#   w        - nested weight array [w1,w2,w3,w4]
#   lr       - learning rate
#   resample - true (default, the behaviour this notebook has been run with):
#              draw a fresh random permutation of the training set for every
#              batch, so the same sample may appear in several batches of one
#              epoch (random batches WITH replacement).
#              false: draw one permutation per epoch and cut it into disjoint
#              chunks, so every sample is used at most once per epoch
#              (random batches with NO replacement).
# Each step multiplies the flattened batch gradient by the global matrix Dmin
# before applying it; use `dw = g` instead for plain SGD.
function mytrain!(w;lr=0.1,resample=true)
    Nbatch = floor(Int,Ntrain/Batchsize); # few training samples will be left out if Ntrain/Batchsize != integer
    # only drawn up front when one permutation is shared by the whole epoch,
    # so the default path consumes exactly one randperm per batch
    epochperm = resample ? Int[] : randperm(Ntrain)
    for nb=0:Nbatch-1
        # construct batch: a chunk of a random permutation of [1:Ntrain]
        perm = resample ? randperm(Ntrain) : epochperm
        idx = perm[nb*Batchsize+1:(nb+1)*Batchsize]
        x = x_train[idx]
        y = y_train[idx]
        # calculate gradient over the batch and transform it with Dmin
        g = lossgradient(w,x,y)
        dw = unflat(Dmin*flat(g))
        # update weights
        for i in eachindex(w)
            w[i] -= lr*dw[i]
        end
    end
    return w
end

In [None]:
# TRAIN THE NETWORK

In [None]:
Nepoch = 10000; # number of epochs, i.e. calls to mytrain! (for a quick training run)
LearningRate = 0.5; # passed to mytrain! as lr (ditto)

In [None]:
# Collect weights after each epoch in an array (trajectory).
# mytrain! returns the very array w it updates, so a snapshot is taken after every epoch.
@time weights = [ deepcopy(mytrain!(w,lr=LearningRate)) for epoch=1:Nepoch ];  # deepcopy because copy only copies the top layer, does not descend.

In [None]:
# Checking if the training worked. Compare the learned function with the actual gaussian.
# w holds the weights after the last epoch (mytrain! updated it in place).
xplot=collect(-Range:0.01:Range) # create an array of x values within the range, on a finer grid than the training data
y_pred = w[3]'*tanh.(w[1]*xplot' .+ w[2]) .+ w[4] # generate the predicted y values (same forward pass as in loss; a 1 x N row)
# plot the converged function, the noise-free target gaussian and the noisy training samples
plot(xplot,[y_pred',exp.(-xplot.^2)]); scatter!(x_train,y_train,leg=false)

In [None]:
# Plot loss vs epoch
SamplingRate=10; # evaluate the loss only at every SamplingRate-th epoch
x = collect(1:SamplingRate:Nepoch); # epoch numbers at which the loss is evaluated
y = [loss(weights[i],x_train,y_train) for i in x]; # loss over the full training set (regularization term included)
# linear axes alternative: plot(x,y)
plot(x,y,xaxis=:log10,yaxis=:log10) # log-log axes