In [None]:
using Pkg,Statistics,Random,Printf,GZip,Knet,Plots,LinearAlgebra

In [None]:
# Fix the global RNG seed so that the noisy training data is reproducible.
Random.seed!(1);

In [None]:
Range=3.0; # half-width of the sampling interval: the training x values span [-Range, Range]

In [None]:
Incr = 0.3; # spacing between consecutive x samples; a smaller value yields more training points

In [None]:
Noise_std=0.1; # standard deviation of the additive Gaussian noise put on the target values

In [None]:
# generate the data from which we'll learn the Gaussian function
"""
    gen_noisy_gaussian(; range=1.0, noise=0.1, incr=Incr)

Sample the target function `exp(-x^2)` on an evenly spaced grid and add Gaussian noise.

# Keywords
- `range`: half-width of the grid; `x` runs over `-range:incr:range`.
- `noise`: standard deviation of the additive noise.
- `incr`: grid spacing. Defaults to the notebook-level `Incr`, which is only looked up
  when the keyword is omitted.

# Returns
A tuple `(x, y)` of equally long vectors with `y = exp.(-x.^2)` plus noise.
"""
function gen_noisy_gaussian(; range=1.0, noise=0.1, incr=Incr)
    # Build the grid from the keyword arguments (not from the notebook globals), so the
    # function honours the `range` it is given.
    x = collect(-range:incr:range)
    y = exp.(-x.^2) .+ noise .* randn(length(x))  # additive Gaussian noise
    return (x, y)
end

In [None]:
# Draw the training set: x on a regular grid, y = noisy samples of exp(-x^2).
(x_train,y_train) = gen_noisy_gaussian(range=Range,noise=Noise_std);

In [None]:
N_train =length(x_train) # number of training data points

In [None]:
# Compare the noisy training targets with the noise-free Gaussian on the same grid.
plot(x_train,[y_train,exp.(-x_train.^2)])

In [None]:
# Store the samples as 1×N row matrices (input × batch) for easier manipulation during
# training. For a vector, `reshape(v, 1, :)` gives the same 1×N layout as `permutedims(v)`,
# but it leaves an already 1×N matrix unchanged, so re-running this cell is harmless
# (a second `permutedims` would flip the data back to N×1).
x_train = reshape(x_train, 1, :);
y_train = reshape(y_train, 1, :);

In [None]:
Layersize = 50; # number of neurons in the hidden layer

In [None]:
# Network dimensions used below:
# output = 1
# input = 1
# hidden = 50
# batchsize = 1
# one layer: tanh.(w[hidden,input] * x[input,batchsize] .+ b[hidden,1])

In [None]:
Random.seed!(2); # Modify weight initialization w/o changing the training data after kernel resets.

In [None]:
# initialize weights with small random values (0.1 * rand)
# w[1]: (Layersize x 1) input->hidden weights
# w[2]: (Layersize x 1) hidden-layer bias
# w[3]: (1 x Layersize) hidden->output weights
# w[4]: (1 x 1)         output bias
w = [0.1*rand(Layersize,1),0.1*rand(Layersize,1),0.1*rand(1,Layersize),0.1*rand(1,1)]

In [None]:
# Shapes of the quantities used by `loss`:
# x:    (input x batchsize)  - inputs, one sample per column
# w[1]: (hidden x input)     - input->hidden weights
# w[2]: (hidden x 1)         - hidden-layer bias
# w[3]: (output x hidden)    - hidden->output weights
# w[4]: (output x 1)         - output bias

# Mean squared error between the targets `y` and the network prediction for `x`.
function loss(w,x,y)
    hidden = tanh.(w[1]*x .+ w[2])     # hidden-layer activations
    output = w[3]*hidden .+ w[4]       # linear read-out
    guesses = sum(output, dims=1)      # collapse the output rows: one prediction per sample
    residual = y - guesses
    return mean(abs2, residual)
end

In [None]:
# construct the gradient-calculating function
# NOTE(review): `grad` is not defined in this file; presumably it comes from Knet/AutoGrad
# and differentiates `loss` with respect to its first argument `w` — confirm.
lossgradient = grad(loss)

In [None]:
# Try the gradient on the first training sample only.
dw = lossgradient(w,[x_train[1]],[y_train[1]])  # dw has the dimensions of w
                                                # output is the gradient w.r.t. the corresponding weight

In [None]:
# Run one epoch of per-sample gradient descent: visit the training points in their stored
# order and, for each one, move every weight against its gradient by a step of size `lr`.
# Updates `w` in place and returns it. Reads the globals N_train, x_train, y_train and
# lossgradient.
function mytrain!(w;lr=0.1)
    for sample in 1:N_train
        grads = lossgradient(w,[x_train[sample]],[y_train[sample]])
        for layer in eachindex(w)
            weights_l = w[layer]
            grads_l = grads[layer]
            for k in eachindex(weights_l)
                weights_l[k] -= lr*grads_l[k]
            end
        end
    end
    return w
end

In [None]:
Nepoch = 200000; # number of passes over the training set. This needs to be determined by trial and error, depending on the data size.

In [None]:
Learning_Rate = 0.01; # step size handed to mytrain! as `lr`

In [None]:
# collect weights after each epoch in an array (trajectory):
# weights[k] is a snapshot of w after k epochs, so weights[1] is already one epoch past
# the initial weights. mytrain! updates w in place, hence each snapshot must be a copy.
@time weights = [ deepcopy(mytrain!(w,lr=Learning_Rate)) for epoch=1:Nepoch ];  # copy only copies the top layer, does not descend.

In [None]:
# dense grid on which the fitted function is evaluated
xplot = collect(-Range:0.01:Range);

In [None]:
# network output on the dense grid, using the weights as they stand after training
y_pred = let inputs = permutedims(xplot)
    hidden = tanh.(w[1]*inputs .+ w[2])
    permutedims(sum(w[3]*hidden .+ w[4], dims=1))
end;

In [None]:
# plot the converged function, the initial gaussian and the noisy training samples
plot(xplot, [y_pred, exp.(-xplot.^2)])
scatter!(x_train, y_train, leg=false)

In [None]:
SamplingRate=10; # evaluate the loss only at every SamplingRate-th epoch

x = collect(1:SamplingRate:Nepoch); # epoch indices at which the loss is sampled

y = [loss(weights[i],x_train,y_train) for i in x]; # loss over the whole training set for each sampled snapshot

# loss versus epoch, both axes logarithmic
plot(x,y;yscale=:log,xscale=:log)

In [None]:
# STOP HERE AND GUESS THE "GOOD" MINIMUM FROM THE PLATEAU IN THE LOSS CURVE

In [None]:
Ngoodmin = 1000;  # epoch of the "good" minimum, guessed from the plateau in the loss curve

In [None]:
# check the predicted function after Ngoodmin epochs

In [None]:
y_pred = let wg = weights[Ngoodmin]
    hidden = tanh.(wg[1]*permutedims(xplot) .+ wg[2])
    permutedims(sum(wg[3]*hidden .+ wg[4], dims=1))
end;

In [None]:
# plot the function predicted at epoch Ngoodmin, the initial gaussian and the noisy training samples
plot(xplot, [y_pred, exp.(-xplot.^2)])
scatter!(x_train, y_train, leg=false)

In [None]:
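# Look at the loss along the straight segments (w0 -> wg), (wg -> w*) and (w0 -> w*) in weight space.
# Here w0 = weights[1] (the snapshot after the first epoch), wg = weights[Ngoodmin] is the "good" solution
# and w* = weights[Nepoch] is the final snapshot.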

In [None]:
x = collect(1:100);

# loss on the straight line from weights[1] to weights[Ngoodmin], in steps of 1%
y = let from = weights[1], to = weights[Ngoodmin]
    [loss(from + 0.01*i*(to - from), x_train, y_train) for i in x]
end;

plot(x,y)

In [None]:
x = collect(1:100);

# loss on the straight line from weights[Ngoodmin] to weights[Nepoch], in steps of 1%
y = let from = weights[Ngoodmin], to = weights[Nepoch]
    [loss(from + 0.01*i*(to - from), x_train, y_train) for i in x]
end;

plot(x,y)

In [None]:
x = collect(1:100);

# loss on the straight line from weights[1] to weights[Nepoch], in steps of 1%
y = let from = weights[1], to = weights[Nepoch]
    [loss(from + 0.01*i*(to - from), x_train, y_train) for i in x]
end;

plot(x,y)

In [None]:
# sgd steps: difference between consecutive epoch snapshots, each snapshot stacked into a
# single column (w[3] is transposed so that all four pieces have one column)
deltaweights = let flatten = ws -> vcat(ws[1], ws[2], ws[3]', ws[4])
    [ flatten(weights[i]) - flatten(weights[i-1]) for i=2:Nepoch ]
end;

In [None]:
# coarse sgd steps: add up Ncoarse consecutive steps at a time (an incomplete tail is dropped)
Ncoarse = 100;
coarse_dw = map(1:fld(Nepoch-1, Ncoarse)) do chunk
    last_step = chunk*Ncoarse
    sum(deltaweights[last_step-Ncoarse+1:last_step])
end;

In [None]:
# scale every coarse step to unit norm so that only its direction matters
coarse_normdw = [ v/norm(v) for v in coarse_dw];

In [None]:
# pairwise dot products between the unit-norm coarse steps
# (broadcasting the list against its transpose gives the full square matrix)
dot_normdw = dot.(coarse_normdw,coarse_normdw');

In [None]:
heatmap(dot_normdw) # would be better if I could plot this heatmap in log-scale. I need to sample dw's accordingly.
# Note that each increment in x and y axes corresponds to Ncoarse epochs (the "good" prediction is already there at 10)

In [None]:
# zoom in on the first 5*Ngoodmin epochs (Nzoom is that span expressed in coarse steps)
Nzoom = floor(Int,5*Ngoodmin/Ncoarse);
heatmap(dot_normdw[1:Nzoom,1:Nzoom])

In [None]:
# now try to find the dimensionality of the GD trajectory

In [None]:
"""
    project_out(v, basis)

Return the component of `v` orthogonal to the span of the columns of `basis`.

The columns of `basis` are assumed to be orthonormal (this is not checked); under that
assumption the projection of `v` onto their span is `basis * (basis' * v)`.

# Arguments
- `v`: vector, or single-column matrix, with as many rows as `basis`.
- `basis`: matrix whose columns are the directions to remove; may have zero columns.

# Returns
`v` itself (no copy) when `basis` is empty, otherwise a new array with the shape of `v`.
"""
function project_out(v, basis)
    isempty(basis) && return v
    # Two matrix-vector products are enough; there is no need to build a k×k diagonal
    # matrix and sum the scaled basis columns.
    return v - basis * (basis' * v)
end

In [None]:
"""
    gd_support(gdsteps; minnorm=0.5)

Greedily build a basis for the directions visited by a sequence of steps.

Each step is passed through `project_out` to remove its component along the basis found
so far. If the norm of the remainder exceeds `minnorm`, the normalised remainder is
appended to the basis as a new direction; otherwise the step is counted as lying in the
current subspace.

# Arguments
- `gdsteps`: collection of equally long step vectors (or single-column matrices). The
  callers in this file normalise the steps to unit length before passing them in.
- `minnorm`: threshold beyond which the perpendicular component is considered new.

# Returns
`(basis, strides)`:
- `basis`: matrix with one unit-norm column per direction found.
- `strides`: step counts per subspace. The counter starts at 1, and a value is recorded
  each time a direction is added plus once at the end, so
  `length(strides) == size(basis, 2) + 1`.

An empty `gdsteps` yields a `0×0` basis and `strides == [1]`.
"""
function gd_support(gdsteps; minnorm=0.5)
    # Guard the dimension lookup: indexing the first step would throw on empty input.
    dim = isempty(gdsteps) ? 0 : length(first(gdsteps))
    mybasis = Array{Float64}(undef, dim, 0)  # basis vectors found so far, one per column
    mystrides = Int[]                        # number of gd steps taken in each subspace
    nsteps = 1
    for v in gdsteps
        vperp = project_out(v, mybasis)
        vperp_norm = norm(vperp)
        if vperp_norm > minnorm
            mybasis = hcat(mybasis, vperp / vperp_norm)
            push!(mystrides, nsteps)
            nsteps = 1
        else
            nsteps += 1
        end
    end
    push!(mystrides, nsteps)
    return (mybasis, mystrides)
end

In [None]:
(mybasis,mystrides) = gd_support(coarse_normdw);

In [None]:
# Effective dimension of the gd trajectory = number of basis directions found.
# gd_support records one stride per added direction plus a final one, so
# length(mystrides) is one larger than this number.
size(mybasis,2)

In [None]:
# If the gd steps were random vectors what would be the calculated dimension?

In [None]:
# One random vector per coarse step, each with the same length as that step. Sizing by
# coarse_dw (rather than a literal 100) keeps the comparison like-for-like when Ncoarse changes.
rand_dw = [ randn(length(v)) for v in coarse_dw ];

In [None]:
rand_normdw = [ v/norm(v) for v in rand_dw];

In [None]:
(randbasis,randstrides) = gd_support(rand_normdw);

In [None]:
size(randbasis,2) # effective dimension of the "random" gd steps (same number of steps, same vector size)