# FluxSense

A notebook that works on finding a dynamical trajectory (dubbed "noise") that maximizes either |dJ/dw|^2 or |dJ/dnoise|^2.

Largely superseded by Reverse Time Network, which simply starts from point(s) on the decision boundary and runs time backwards as a method of finding good trajectories that will give us decent gradients.

# Preliminaries

In [1]:
using PyCall
using PyPlot
using ForwardDiff
using DiffBase

pygui(true)

import Base.convert

# Strip the derivative information from a single ForwardDiff Dual and keep only its value.
# The result is a plain Float64, so ForwardDiff can no longer differentiate through it.
convert(::Type{Float64}, x::ForwardDiff.Dual) = Float64(x.value)

# Element-wise version of the above for arrays of Duals.
# The first argument must be the *type* Array{Float64} (mirroring the scalar method), and the
# element type is a parameter because Array types are invariant: a concrete array of Duals is
# an Array{T} for some T<:ForwardDiff.Dual, never an Array{ForwardDiff.Dual}.
function convert(::Type{Array{Float64}}, x::Array{T}) where {T<:ForwardDiff.Dual}
    y = zeros(size(x))
    for i in eachindex(x)
        y[i] = convert(Float64, x[i])
    end
    return y
end

include("general_utils.jl")
include("hessian_utils.jl")

"""
We define functions to convert Duals, the variable types used by ForwardDiff, 
to Floats. This is useful if we want to print out the value of a variable 
(since print doesn't know how to Duals). Note that after being converted to a Float, no
differentiation by ForwardDiff can happen!  e.g. after
    x = convert(Float64, y)
ForwardDiff can still differentiate y, but it can't differentiate x
"""





"We define functions to convert Duals, the variable types used by ForwardDiff, \nto Floats. This is useful if we want to print out the value of a variable \n(since print doesn't know how to Duals). Note that after being converted to a Float, no\ndifferentiation by ForwardDiff can happen!  e.g. after\n    x = convert(Float64, y)\nForwardDiff can still differentiate y, but it can't differentiate x\n"

In [2]:
# Global defaults for the synthetic data set. The make_data call later in this cell passes
# the same values as literals rather than reading these variables.
npoints = 100
data_sigma = 10


"""
data = make_data(; npoints=100, data_sigma=10, seedrand=NaN)

Returns an npoints-by-1 array of Gaussian random numbers scaled by data_sigma.

If seedrand is not NaN it is used to seed the random number generator before drawing,
making the output reproducible. With the default NaN the generator is left untouched.
"""
function make_data(;npoints=100, data_sigma=10, seedrand=NaN)
    # Only reseed when the caller supplied an actual seed
    isnan(seedrand) || srand(seedrand)

    samples = randn(npoints, 1)
    return data_sigma*samples
end

# Reproducible data set: 100 samples, sigma 10, random seed 10
data1 = make_data(npoints=100, data_sigma=10, seedrand=10);


# Trust region method for Hessian minimization

In [3]:
"""
function trust_region_Hessian_minimization(seed, func; start_eta=10, tol=1e-6, maxiter=400,
    verbose=false)

Minimizes func, starting from seed, with Newton steps that are damped by an adaptive
parameter eta.

(below, x stands for delta_x, the step from the current x=x0 position at which the cost = const)

cost = 0.5*x'*H*x + grad*x + const

dcost/dx = H*x + grad  ;   dcost/dx = 0  ==> x =  - inv(H)*grad

Trust-region says have a parameter eta, and replace H with hat{H} = H +  I/eta.
When eta is very large, this is equivalent to a straight Newton method jump,
because hat{H} ~= H.  But when eta is small, this is more like a small gradient
descent step, because for small eta inv(hat{H}) ~= eta and therefore the delta x is like
-eta*grad.  So, if the cost function is going down, make eta larger (x1.1), and if it is
going up, make eta a lot smaller (x0.5). Just like we do in other adaptive methods.

PARAMETERS:
===========

seed        column vector, representing the starting value of the parameters.

func        Function that takes a vector and returns a scalar.  If you want to
            work with a function that takes more parameters and returns more than one
            output, you can use something like

                    x -> orig_func(x, other_params)[1]

            You only need the "[1]" part if the orig_func returns more outputs than a scalar.

OPTIONAL PARAMETERS:
====================

start_eta=10   Starting value of eta.  It's good to start with something biggish, if it is
               too much, it'll quickly get cut down.

tol=1e-6       Numerical tolerance. If a proposed jump produces a change in func that is less than
               this, the minimization stops. The proposed jump that triggered the stop is
               NOT applied.

maxiter=400    Maximum number of iterations to do before stopping

verbose=false   If true, print out a report on each iteration: iteration number, eta (after its
                update), current cost, the cosine of the angle between the accepted jump and the
                gradient (NaN if the jump was rejected), and the current parameters. Nothing
                is printed for the iteration that stops the minimization.

RETURNS:
========

params       A vector the size of seed that has the last accepted values of the minimizing
             parameters for func

"""
function trust_region_Hessian_minimization(seed, func; start_eta=10, tol=1e-6, maxiter=400,
    verbose=false)

    params = seed
    eta = start_eta

    cost, grad, hess = vgh(func, params)

    for i in 1:maxiter
        # Damped Hessian: large eta ~ Newton step, small eta ~ gradient step of size eta
        hathess    = hess + eye(length(grad), length(grad))/eta
        # Solve hathess*step = grad directly instead of forming the explicit inverse
        new_params = params - (hathess \ grad)
        new_cost, new_grad, new_hess = vgh(func, new_params)

        # Converged: the proposed jump barely changes the cost
        if abs(new_cost - cost) < tol
            break
        end

        if new_cost >= cost
            # Reject the jump and shrink the step scale
            eta = eta/2
            costheta = NaN
        else
            # Accept the jump and cautiously grow the step scale
            eta = eta*1.1
            costheta = dot(new_params-params, grad)/(norm(new_params-params)*norm(grad))

            params = new_params
            cost = new_cost
            grad = new_grad
            hess = new_hess
        end

        if verbose
            @printf "%d: eta=%.3f cost=%.4f costheta=%.3f ps=" i eta cost  costheta
            print_vector(params)
            @printf "\n"
        end
    end

    return params
end


trust_region_Hessian_minimization

### Testing the trust region function

In [4]:
# Synthetic test problem: noisy samples of a tanh sigmoid, to be fit below.
npoints = 1000
# Names of the keyword parameters of JJ(x, y; ...), in the order used by params/seed
args = ["baseline", "amplitude", "threshold", "slope"]

# Generating parameter values (a 1x4 row), indexed in the same order as args
params = [1 5 0.5 0.8]
# x is uniform on [-3, 3); y is the sigmoid plus Gaussian noise of standard deviation 2
x = rand(npoints, 1)*6-3
y = params[1] + params[2]*0.5*(tanh((x-params[3])/params[4])+1) + randn(npoints,1)*2


figure(1); clf();
subplot(3,1,1);
plot(x, y, ".")

# Starting guess for the fit; note the slope starts with the opposite sign to the generating value
seed = [8, 3.1, 0, -4]
xx = -3:0.01:3

# Green curve: the sigmoid implied by the starting guess
plot(xx, seed[1] + seed[2]*0.5*(tanh((xx-seed[3])/seed[4])+1), "g-")

# Sum-of-squares error between the observations y and the sigmoid
#     baseline + amplitude*0.5*(tanh((x-threshold)/slope)+1)
# evaluated at x. All arithmetic is broadcast element-wise, so x and y may be scalars or
# arrays of matching shape.
function JJ(x, y; baseline=0, amplitude=1, threshold=0, slope=1)
    yhat = baseline .+ amplitude .* 0.5 .* (tanh.((x .- threshold) ./ slope) .+ 1)
    err  = yhat .- y
    return sum(err .* err)
end

# Fit the four sigmoid parameters by minimizing the squared error JJ.
# make_dict(args, w) is defined outside this notebook section; presumably it pairs each name
# in args with the matching entry of w so they can be splatted as keyword arguments -- TODO confirm.
opars = trust_region_Hessian_minimization(seed, (w) -> JJ(x, y; make_dict(args, w)...), 
verbose=false, start_eta=0.001)

# Red curve: the sigmoid implied by the fitted parameters
plot(xx, opars[1] + opars[2]*0.5*(tanh((xx-opars[3])/opars[4])+1), "r-")

1-element Array{Any,1}:
 PyObject <matplotlib.lines.Line2D object at 0x324626990>

### Defining J()   -- this one is for the single layer

In [5]:
"""
cost = J(data1; noisy=[], threshold=0.5, slope=0.25, theta1=0.15, theta2=0.2, beta=0.005,
    do_plot=true, nderivs=0, difforder=0, verbose=true)

Cost function for the single-layer case.

Each data point is passed through a sigmoid, d1 = tanh((data1 - threshold)*slope)/2 + 0.5.
From d1 two per-point quantities are computed:

    hits = 0.5*(1 + tanh((d1-0.5)/theta1))     soft indicator of d1 being above 0.5
    difs = tanh((d1-0.5)/theta2)^2             grows as d1 moves away from 0.5

and the cost is

    cost = (mean(hits) - 0.75)^2 - beta*mean(difs)

PARAMETERS:
===========

data1       Array of data points.

OPTIONAL PARAMETERS:
====================

noisy       Extra data points; if non-empty they are appended to data1 before anything
            else is computed.

threshold, slope    Parameters of the sigmoid applied to the data.

theta1, theta2      Widths used for hits and difs respectively.

beta        Weight of the difs term in the cost.

do_plot     If true, plots d1, hits and difs in figure 1.

verbose     If true, prints the two cost terms and the means of hits and difs.

nderivs, difforder  Accepted so that the signature matches what bring_the_noise() requires
            of its func; they are not used inside this function.

RETURNS:
========

cost        The scalar cost described above.
"""
function J(data1; noisy=[], threshold=0.5, slope=0.25, theta1 = 0.15, theta2=0.2, beta=0.005,
    do_plot=true, nderivs=0, difforder=0, verbose=true)

    if length(noisy) > 0
        data1 = [data1 ; noisy]
    end
    npoints = length(data1)

    # Everything below is element-wise, so broadcast consistently
    d1 = tanh.((data1 .- threshold) .* slope) ./ 2 .+ 0.5

    hits = 0.5 .* (1 .+ tanh.((d1 .- 0.5) ./ theta1))
    difs = tanh.((d1 .- 0.5) ./ theta2).^2

    if do_plot
        figure(1); clf();
        subplot(3,1,1)
        plot(1:npoints, d1, "b."); # @printf("Plotted %d points\n", npoints)
        ylabel("d1");
        # convert() strips any ForwardDiff Dual wrapper so the values can be formatted
        title(@sprintf("threshold=%.3f slope=%.3f", convert(Float64, threshold), convert(Float64, slope)))
        subplot(3,1,2)
        plot(1:npoints, hits, ".")
        ylabel("hits")
        subplot(3,1,3)
        plot(1:npoints, difs, ".")
        ylabel("difs")
        title(@sprintf("<hits>=%.3f <difs>=%.3f", convert(Float64, mean(hits)), convert(Float64, mean(difs))))
    end

    # cost1 pulls the mean of hits towards 0.75; cost2 rewards points far from d1 = 0.5
    cost1 = (mean(hits) - 0.75)^2
    cost2 = -mean(difs)

    cost = cost1 + beta*cost2

    if verbose
        @printf("        cost1=%g, cost2=%g, mean(hits)=%.4f, mean(difs)=%.4f\n",
            convert(Float64, cost1), beta*convert(Float64, cost2),
            convert(Float64, mean(hits)), convert(Float64, mean(difs)))
    end

    return cost
end



J (generic function with 1 method)

### Defining JJ()   -- this one is for the two-layer network

In [6]:
"""
cost, hits = JJ(inputs; noisy=[], k1=2, k2=-2, th1=0.5, th2=0.5, slope1=0.25, slope2=0.25,
        theta1=0.15, theta2=0.2, beta=0.005, do_plot=true, nderivs=0, difforder=0, verbose=true)

Cost function for the two-layer network.

The inputs go through two sigmoid stages in sequence,

    v1 = g(inputs + k1, slope1, th1)
    v2 = g(v1     + k2, slope2, th2)      with  g(d, slope, thresh) = 0.5 + 0.5*tanh((d-thresh)*slope)

The LAST data point is perturbed by the "noise" vector: noisy[1] is added to its input before
the first stage and noisy[2] is added to its v1 before the second stage. If noisy is empty,
a two-element zero vector from ForwardDiffZeros is used.

The cost is (mean(hits) - 0.75)^2 - beta*mean(difs), with hits and difs computed from v2
using widths theta1 and theta2 respectively.

nderivs and difforder are passed on to ForwardDiffZeros when allocating new arrays.

Returns the scalar cost and the hits array.
"""
function JJ(inputs; noisy=[], k1=2, k2=-2, th1=0.5, th2=0.5, slope1=0.25, slope2=0.25,
        theta1 = 0.15, theta2=0.2, beta=0.005, do_plot=true, nderivs=0, difforder=0, verbose=true)

    # Sigmoid used by both layers
    g(d, slope, thresh) = 0.5 + 0.5*(tanh.((d-thresh)*slope))

    # Copy the inputs into an array allocated through ForwardDiffZeros
    data1 = ForwardDiffZeros(length(inputs), 1, nderivs=nderivs, difforder=difforder)
    for i in 1:length(inputs)
        data1[i] = inputs[i]
    end
    npoints = length(data1)

    # No noise supplied: fall back to a zero noise vector of length two
    if length(noisy) == 0
        noisy = ForwardDiffZeros(2, 1, nderivs=nderivs, difforder=difforder)
    end

    # First layer, with the first noise component injected into the last point's input
    data1[end] += noisy[1]
    v1 = g(data1 + k1, slope1, th1)

    # Second layer sees the last point's v1 shifted by the second noise component;
    # the shift is undone afterwards so that v1 itself is left unperturbed
    v1[end] += noisy[2]
    v2 = g(v1 + k2, slope2, th2)
    v1[end] -= noisy[2]

    hits = 0.5*(1 + tanh.((v2-0.5)/theta1))
    difs = tanh.((v2 - 0.5)/theta2).^2

    if do_plot
        pts = 1:npoints
        figure(1); clf();

        ax1 = subplot(4,1,1)
        plot(pts, v1, "b.")
        ylabel("v1")
        title(@sprintf("th1=%.3f slope1=%.3f", convert(Float64, th1), convert(Float64, slope1)))

        ax2 = subplot(4,1,2)
        plot(pts, v2, "b.")
        ylabel("v2")
        title(@sprintf("th2=%.3f slope2=%.3f", convert(Float64, th2), convert(Float64, slope2)))

        ax3 = subplot(4,1,3)
        plot(pts, hits, ".")
        ylabel("hits")

        ax4 = subplot(4,1,4)
        plot(pts, difs, ".")
        ylabel("difs")
        title(@sprintf("<hits>=%.3f <difs>=%.3f", convert(Float64, mean(hits)), convert(Float64, mean(difs))))

        remove_xtick_labels([ax1, ax2, ax3])
    end

    # Squared distance of the mean hit rate from 0.75, minus a beta-weighted reward for difs
    cost1 = (mean(hits) - 0.75)^2
    cost2 = -mean(difs)
    cost  = cost1 + beta*cost2

    if verbose
        @printf("        cost1=%g, cost2=%g, mean(hits)=%.4f, mean(difs)=%.4f\n",
            convert(Float64, cost1), beta*convert(Float64, cost2),
            convert(Float64, mean(hits)), convert(Float64, mean(difs)))
    end

    return cost, hits
end


# Quick demo of the two-layer JJ() with ten reproducible random inputs.
srand(10); inputs = randn(10,1);
1;
# Evaluate (and plot) the cost without any noise
JJ(inputs)

# NOTE(review): in notebook order bring_the_noise() is defined in a LATER cell, so running the
# cells top to bottom fails here (the recorded output shows "UndefVarError: bring_the_noise
# not defined"). Run the defining cell first.
# bring_the_noise() returns six values; only the first three are kept here. In its return
# order these are noise_val, dJdw_msqu (a squared gradient magnitude, despite the name pgrad
# used here) and dJJdnoise.
noiseval, pgrad, noisegrad = bring_the_noise((;pars...)->JJ(inputs;do_plot=false, verbose=false, pars...)[1], 
["th1", "th2", "slope1", "slope2"], [0.5, 0.5, 0.25, 0.25], 2, verbose=false, ncycles=8000, growth_factor=1.5)


# Re-evaluate (and plot) the cost with the noise that was found
JJ(inputs; noisy=noiseval)

print(noisegrad); print("\n")
print(noiseval); print("\n")
print(pgrad); print("\n")


        cost1=0.481859, cost2=-0.00309028, mean(hits)=0.0558, mean(difs)=0.6181


LoadError: UndefVarError: bring_the_noise not defined

# Function BRING_THE_NOISE()

**Lessons I may have learnt:**

1. The zone of success for bring_the_noise() may be narrow; therefore, for the adaptive eta, it pays to make the growth factor small. If a big jump takes us too far, we can end up in a zero gradient region.
2. It also pays to have enough ncycles that we really finish
3. Finally, in J(), the diffs factor (which applies through theta2, when beta > 0), can actually make for local maxima. There can be a zone where |dJ/dw|^2 goes through a narrow maximum that is actually the one we want; further off, there might be a local minimum and then further positive gradient zones. If we have large jumps and skip over the narrow maximum and the local minimum, then we go all the way off into badland, without a hope of returning.  Reinforcing the importance of a small growth factor.

**New issue revealed:**  FIXED:   ~~If we suddenly move into a region where the gradient is much larger than in the previous step size, we may take an unfortunate big jump. It's not all about |eta|.  We have to fix that.~~

**Another thing:** In our current cost function (I'll call it J() here although above it is defined as J2, because I'm reserving J2 for J2 = |dJ/dw|^2 ), the value of dJ/dw and therefore the value of J2 *does* depend on non-noise data points. They can therefore affect the gradient. This may be a particularly strong problem only when we get close to the optimum (wishful thinking, this, trying to just wish it away?)

In [7]:
"""

noise_val, paramgrad_mag, noise_grad, param_grad, init_noise_grad, dJdnoise =
    bring_the_noise(func, args, seed, n_noise_params;
            init_noise=NaN, verbose=false, ncycles=100, start_eta=1,
            growth_factor=1.2, mode="ngrad")

Given a scalar function that takes some keyword-value arguments, as well as a "noise" vector
(it doesn't really have to represent noise, it could be anything), finds the value of the "noise" vector
that would maximize J2, the squared magnitude of a gradient of func. Which gradient is used
depends on mode (see below): the gradient w.r.t. the noise (default), or the gradient w.r.t.
the keyword-value parameters.

The search is an adaptive ascent: each cycle proposes a step of length step_size along the
direction of d(J2)/dnoise. If J2 increases the step is accepted and step_size is multiplied
by growth_factor; if J2 decreases the step is rejected and step_size is divided by 5; if J2
does not change at all the search stops.

PARAMETERS:
===========

func       A scalar function, with keyword-value parameters.  These MUST include noisy=[], which will be used
           to indicate the value of the noise vector.  They MUST also include nderivs=0 and difforder=0,
           used internally together with ForwardDiffZeros in order to make sure new arrays and vectors are
           differentiable.

args       A list of strings, indicating the keyword parameters for which differentiation is desired.

seed       A list of the initial values (all scalars) of those keyword parameters

n_noise_params    The desired length of the "noise" vector


OPTIONAL PARAMETERS:
====================

init_noise  Default NaN, in which case it is ignored and noise is initialized randomly. If not NaN, it
            should be a column vector, length n_noise_params, representing the initial value of the "noise"

verbose     Default false. If true, prints out debugging information at each cycle of the iterative search for the
            best noise vector value

ncycles     Default 100. Maximum number of iterations of the adaptive search that will be used to find the
            best noise vector value.

start_eta   Default 1. Starting value of the learning rate: the initial step length is
            start_eta times the magnitude of the initial direction vector.

growth_factor    Default 1.2.  Factor by which the step length gets multiplied every time a step
            successfully leads to an increase in J2

mode        Default "ngrad". Selects the quantity J2 that is maximized:
               "ngrad"   J2 = |d(func)/d(noise)|^2
               "wgrad"   J2 = |d(func)/d(params)|^2
            Any other value throws an ArgumentError.

RETURNS:
========

noise_val   The value of the noise at the end of the iterations seeking to maximize J2

paramgrad_mag  The value of J2 (as selected by mode) at noise_val

noise_grad    The ascent direction at noise_val: the relevant block of the Hessian times the
              relevant part of the gradient, which is d(J2)/dnoise up to a factor of 2. Since
              we're trying to find the noise that maximizes J2, if we were successful this
              will be very small at the end of the iterations

param_grad    The params-params block of the Hessian times d(func)/d(params), evaluated at
              noise_val (this is d(|d(func)/d(params)|^2)/dparams up to a factor of 2,
              regardless of mode)

init_noise_grad   Same quantity as noise_grad, but at the beginning (not end) of the iterations

dJdnoise      d(func)/d(noise) at noise_val

"""
function bring_the_noise(func, args, seed, n_noise_params; init_noise = NaN, verbose=false, 
    ncycles=100, start_eta=1, growth_factor = 1.2, mode="ngrad")

    # Fail early on an unknown mode instead of hitting an undefined variable further down
    if !(mode == "wgrad" || mode == "ngrad")
        throw(ArgumentError("mode must be \"wgrad\" or \"ngrad\", got \"$mode\""))
    end

    function unit_vector(vec)
        return vec/sqrt(sum(vec.*vec))
    end

    nparams = length(seed)

    # J stands for func; J2 is the squared gradient magnitude that we want to maximize.
    # Returns J2 together with the ascent direction for the noise (Hessian block times
    # gradient part). g and h are the full gradient and Hessian over [params ; noise].
    function J2_and_direction(g, h)
        if mode == "wgrad"
            # J2 = |d func/d weights|^2
            gw = g[1:nparams]
            return sum(gw.*gw), h[nparams+1:end,1:nparams]*gw
        else
            # J2 = |d func/d noise|^2
            gn = g[nparams+1:end]
            return sum(gn.*gn), h[nparams+1:end,nparams+1:end]*gn
        end
    end

    if length(init_noise)==1 && isnan(init_noise[1])
        noise_val = randn(n_noise_params, 1)
    else
        noise_val = init_noise
    end

    # Argument spec for keyword_vgh: the scalar parameter names, followed by one entry
    # declaring the vector-valued "noisy" keyword and its length
    myargs = Any[args[i] for i in 1:nparams]
    push!(myargs, ["noisy", n_noise_params])

    eta = start_eta

    value, grad, hess = keyword_vgh((;pars...) -> func(; pars...), myargs, [seed ; noise_val])
    dJdw_msqu, dJJdnoise = J2_and_direction(grad, hess)

    # Step length is tracked directly (rather than eta) so that a sudden change in the
    # magnitude of dJJdnoise cannot produce an unexpectedly large jump
    starting_step_size = start_eta*sqrt(sum(dJJdnoise.*dJJdnoise))
    step_size = starting_step_size

    if verbose
        @printf("0: eta is %g, dJJdnoise is ", eta); print_vector_g(dJJdnoise);
        @printf("  |dJdw|^2 is %g\n", dJdw_msqu)
    end

    init_dJJdnoise = dJJdnoise

    for i=1:ncycles
        # With an exactly zero direction the unit vector is NaN and no step could ever be
        # accepted, so there is nothing left to do
        if sum(dJJdnoise.*dJJdnoise) == 0
            break
        end

        new_noise_val = noise_val + step_size*unit_vector(dJJdnoise)

        new_value, new_grad, new_hess = keyword_vgh((;pars...) -> func(; pars...), myargs, [seed ; new_noise_val])
        new_dJdw_msqu, new_dJJdnoise = J2_and_direction(new_grad, new_hess)

        if verbose
            @printf("%d: step_size is %g, |new_dJdw|^2 is %g, delta in |dJdw|^2 is %g\n", i, step_size, 
            new_dJdw_msqu, new_dJdw_msqu-dJdw_msqu)
            @printf("new_noise_val: ");  print_vector(new_noise_val);   @printf("\n")
            @printf("new_dJJdnoise: ");  print_vector_g(new_dJJdnoise); @printf("\n")
        end

        delta_J2 = new_dJdw_msqu - dJdw_msqu
        if delta_J2 > 0
            # J2 went up: accept the step and grow the step length
            step_size *= growth_factor
            noise_val  = new_noise_val
            dJJdnoise  = new_dJJdnoise
            dJdw_msqu  = new_dJdw_msqu

            grad  = new_grad
            hess  = new_hess
        elseif delta_J2 == 0
            # No change at all: we are done
            break
        else
            # J2 went down: reject the step and shrink the step length
            step_size /=5
            if verbose
                @printf("   Going back to noise_val: "); print_vector(noise_val); @printf("\n")
            end
        end
    end

    # Report both of these at the final accepted noise value, consistent with the other
    # returned quantities (grad and hess always correspond to noise_val at this point)
    dJdn  = grad[nparams+1:end]
    dJJdw = hess[1:nparams,1:nparams]*grad[1:nparams]

    return noise_val, dJdw_msqu, dJJdnoise, dJJdw, init_dJJdnoise, dJdn
end


bring_the_noise

### Sometime helper function compute_grad

In [8]:
"""
dJdnoise, dJ/dw, d[ J2=|dJ/dnoise|^2 ]/dnoise, d[ J2=|dJ/dw|^2 ]/dnoise, |dJ/dw|^2 = 
    compute_grads(func, args, seed, init_noise)

One-shot evaluation, at parameter values seed and noise value init_noise, of the gradients
of func and of the two candidate J2 objectives. No optimization is performed.

func() should take only keyword-value args, including a "noisy" keyword for the noise vector.
args lists the names of the scalar keyword parameters whose values are given in seed.

The two d[J2]/dnoise outputs are computed as a Hessian block times the corresponding part
of the gradient (i.e., without the factor of 2 from differentiating the square).
"""
function compute_grads(func, args, seed, init_noise) 

    nparams = length(seed)

    # Argument spec for keyword_vgh: the scalar parameter names, then one entry declaring
    # the vector-valued "noisy" keyword together with its length
    myargs = Any[args[i] for i in 1:nparams]
    push!(myargs, ["noisy", length(init_noise)])

    value, grad, hess = keyword_vgh(func, myargs, [seed ; init_noise])

    # Split the gradient into its parameter part and its noise part
    dJdw = grad[1:nparams]
    dJdn = grad[nparams+1:end]

    # Hessian blocks (noise rows) times the matching gradient parts
    dJ2wDnoise     = hess[nparams+1:end,1:nparams]*dJdw
    dJ2noiseDnoise = hess[nparams+1:end,nparams+1:end]*dJdn

    return dJdn, dJdw, dJ2noiseDnoise, dJ2wDnoise, sum(dJdw.*dJdw)
end





compute_grads

# Adapting code to 2-layer JJ() -- Minimizing with noise to maximize parameter gradient

In [17]:
# Setup for minimizing the two-layer JJ() over its parameters while, at every step, choosing
# the "noise" on the last data point. For reference, the signature of JJ is:
#function JJ(inputs; noisy=[], k1=2, k2=-2, th1=0.5, th2=0.5, slope1=0.25, slope2=0.25,
#        theta1 = 0.15, theta2=0.2, beta=0.005, do_plot=true, nderivs=0, difforder=0, verbose=true)

srand(10); data1 = randn(100, 1)
#    data1 = make_data(npoints=100, data_sigma=10, seedrand=15);

# Names of the JJ keyword parameters being optimized, and their starting values
args = ["th1", "th2", "slope1", "slope2"]

seed = [0.5, 0.5, 0.25, 0.25]

# Pre-declare the globals that the minimization loop below assigns to
params = seed; new_params = 0; new_cost = 0; delta_params=0; new_grad=0; new_noiseval=0; dJdn=0;
eta = 0.001
beta = 0.05

n_noise  = 2
n_params = length(seed)
maxiter  = 200
tol      = 1e-9
verbose  = true
verbose_level = 2
# mode passed to bring_the_noise(): "ngrad" maximizes |dJ/dnoise|^2
bmode   = "ngrad"

init_noise_cycles = 8000

# If true, skip bring_the_noise() and compute the noise in closed form (see below)
use_quick_noise = true

# We currently halt if |dJ2/dnoise|^2 is small  (we're going to have a hard time optimizing the noise).
# But note that in both "ngrad" (J2 = |dJ/dnoise|^2) and "wgrad" (J2 = |dJ/dw|^2) modes, 
# as long as |dJ/dnoise|^2 is large, it shouldn't be so bad if |dJ2/dnoise|^2 is small. We could check
# for that and NOT halt if |dJ/dnoise|^2 is large.   *** note difference between dJ/dnoise and dJ2/dnoise!  :)

# func takes only keyword args (as bring_the_noise requires); func_noisy takes the noise
# value as a positional argument instead
func       = (;pars...)     -> JJ(data1; beta=beta, do_plot=false, verbose=false, pars...)[1]
func_noisy = (nval;pars...) -> func(;noisy=nval, pars...)

# ================================= ALL FUNCTION AND TASK-SPECIFIC STUFF SHOULD BE ABOVE  =========
#
#   We do assume that the function takes the optional Boolean-valued parameters "do_plot" and "verbose"


# One column per iteration: [iteration; eta; params; noiseval]
trajectory = zeros(2 + n_params + n_noise, 0)

if use_quick_noise
    # use threshold minus k; k1=2, k2=-2
    # i.e. choose the noise so that, in JJ(), the last data point sits exactly at threshold
    # in both layers: data1[end] + noisy[1] + k1 == th1, and 0.5 + noisy[2] + k2 == th2
    # (0.5 being the first layer's output for an input at threshold)
    noiseval = [params[1]-2-data1[end], params[2]+2-0.5]
    gradmag = 0;    J2noisegrad=0;    J2parmsgrad=0; initJ2noisegrad=0
else
    @printf("\nAbout to start first bring the noise attempt\n")
    # bring_the_noise() returns six values; the sixth (dJdn) is not kept here
    noiseval, gradmag, J2noisegrad, J2parmsgrad, initJ2noisegrad =  
        bring_the_noise(func, args, seed, n_noise, mode=bmode, ncycles=init_noise_cycles)
    @printf("\nDone with first bringt the noise round\n\n")
end

# Cost, gradient and Hessian w.r.t. the parameters, with the noise held fixed
cost, grad, hess = vgh( (w) -> func_noisy(noiseval; do_plot=true, verbose=true, make_dict(args, w)...), params)

#bring_the_noise((;pars...) -> J2(data1;do_plot=false, verbose=false, beta=beta, pars...), 
#        verbose=false, args, params, n_noise)        
# cost, grad, hess = vgh( (w) -> J2([data1;noiseval]; do_plot=true, verbose=true, beta=beta, make_dict(args, w)...), params)

@printf("|J2_noisegrad|^2 = %g, |J2_parmsgrad|2 = %g, |grad|^2 = %g, |init_J2noisegrad|^2 = %g\n", 
    sum(J2noisegrad.*J2noisegrad), sum(J2parmsgrad.*J2parmsgrad), 
    sum(grad.*grad), sum(initJ2noisegrad.*initJ2noisegrad))
# print_vector_g is given a Symbol here; per the recorded output it prints "name = [values]"
print_vector_g(:params)
print_vector_g(:grad)
print_vector_g(:noiseval)
@printf("cost=%g, eta=%g\n", cost, eta)


# Damped-Newton minimization over the parameters (same scheme as
# trust_region_Hessian_minimization), with the noise re-chosen for every proposed step.
for i in 1:maxiter;
    # Proposed step from the damped Hessian; the cost at the proposed parameters is
    # evaluated with the CURRENT noise value
    hathess    = hess + eye(length(grad), length(grad))/eta        
    new_params = params - inv(hathess)*grad
    new_cost, new_grad, new_hess = vgh((w)->func_noisy(noiseval;do_plot=true, verbose=true, make_dict(args, w)...), new_params)
    # vgh((w) -> J2([data1;noiseval]; do_plot=true, verbose=true, beta=beta, make_dict(args, w)...), new_params)

    # Stop as soon as a proposed step changes the cost by less than tol
    delta_cost = new_cost - cost
    if abs(delta_cost) < tol
        break
    end

# Noise to go with the proposed parameters
if use_quick_noise
    # use threshold minus k; k1=2, k2=-2
        new_noiseval = [new_params[1]-2-data1[end], new_params[2]+2-0.5]
    # Placeholder values of 1 so that none of the "too small" checks below can trigger
    new_gradmag = 1; new_J2mag_noisegrad=1; new_J2mag_parmsgrad=1; new_init_J2mag_noisegrad=1; dJdn=1
else
    new_noiseval, new_gradmag, new_J2mag_noisegrad, new_J2mag_parmsgrad, new_init_J2mag_noisegrad, dJdn = 
    bring_the_noise(func, args, new_params, n_noise; init_noise=noiseval, mode=bmode, ncycles=150, growth_factor=1.5)
#    bring_the_noise((;pars...) -> J2(data1;do_plot=false, verbose=false, beta=beta, pars...), 
#        args, new_params, n_noise, init_noise=noiseval, ncycles=150, growth_factor=1.5)
end
    
#    new_hits = JJ(data1; beta=beta, do_plot=false, verbose=false, 
#        th1=new_params[1], th2=new_params[2], slope1=new_params[3], slope2=new_params[4], 
#        noisy=new_noiseval)[2]
#    print_vector_g(:new_hits)
    print_vector_g(:new_params)
    delta_params = new_params - params; 
    print_vector_g(:delta_params)
    print_vector_g(:new_grad)
    print_vector_g(:new_noiseval)
    print_vector_g(:dJdn)
    @printf("cost=%g, new_cost=%g, Dcost=%g\n", cost, new_cost, new_cost-cost)
    @printf("===\n")
    
    iJ2m_n = sum(new_init_J2mag_noisegrad.*new_init_J2mag_noisegrad)

    # Reject the step if the cost went up, or if the noise search looks degenerate
    # NOTE(review): the condition tests new_gradmag against 1e-15 but the message branch
    # below tests it against 1e-8, so for values in between the wrong message can be
    # printed. Confirm which threshold is intended.
    if new_cost >= cost || new_gradmag < 1e-15 || (iJ2m_n < 1e-15 && sum(dJdn.*dJdn) < 1e-15)
        if verbose
            if new_cost >= cost
                @printf("--- cost went up\n")
            elseif new_gradmag < 1e-8
                @printf("--- new_gradmag was too small, it was %g\n", new_gradmag)
            else
                @printf("--- initial grad of J2 = |dJ/dw| w.r.t. noise was too small, it was %g\n", iJ2m_n)
            end
        end
        eta = eta/2
        costheta = NaN
    else
        # Accept the step: grow eta and adopt the new parameters and noise
        eta = eta*1.1
        costheta = dot(new_params-params, grad)/(norm(new_params-params)*norm(grad))
        
        params = new_params
        noiseval = new_noiseval
        gradmag  = new_gradmag

        # Recompute at the accepted parameters, now using the NEW noise value
        cost, grad, hess = vgh( (w) -> func_noisy(noiseval; do_plot=true, verbose=true, make_dict(args, w)...), params)

#        cost, grad, hess = vgh( (w) -> J2([data1;noiseval]; do_plot=true, verbose=true, beta=beta, make_dict(args, w)...), params)
    end

    if verbose
        @printf("%d: eta=%g cost=%.4f Dcost=%g costheta=%.3f gradmag=%g, ps=", 
            i, eta, cost, delta_cost, costheta, gradmag)
        print_vector(params)
        @printf "\n"
        if verbose_level >= 2
            @printf("    noiseval="); print_vector(noiseval); @printf("\n")
            J2m_n  = sum(new_J2mag_noisegrad.*new_J2mag_noisegrad)
            J2m_p  = sum(new_J2mag_parmsgrad.*new_J2mag_parmsgrad)
            iJ2m_n = sum(new_init_J2mag_noisegrad.*new_init_J2mag_noisegrad)
            
            @printf("    init_J2mag_noisegrad= = %g, J2mag_noisegrad = %g,  J2mag_parmsgrad = %g\n",
                iJ2m_n, J2m_n, J2m_p)
        end
    end

    # Record the state after this iteration (whether or not the step was accepted)
    trajectory = [trajectory [i; eta; params; noiseval]]
end

        cost1=0.476152, cost2=-0.0306745, mean(hits)=0.0600, mean(difs)=0.6135
|J2_noisegrad|^2 = 0, |J2_parmsgrad|2 = 0, |grad|^2 = 0.368516, |init_J2noisegrad|^2 = 0
params = [0.5, 0.5, 0.25, 0.25]
grad = [0.00922776, 0.0884178, -0.0524281, 0.598218]
noiseval = [-2.79607, 2]
cost=0.445478, eta=0.001
        cost1=0.475712, cost2=-0.0306063, mean(hits)=0.0603, mean(difs)=0.6121
new_params = [0.499991, 0.499911, 0.250053, 0.249398]
delta_params = [-9.26122e-06, -8.87293e-05, 5.26249e-05, -0.000601998]
new_grad = [0.00926125, 0.0887295, -0.052625, 0.602009]
new_noiseval = [-2.79608, 1.99991]
dJdn = [1]
cost=0.445478, new_cost=0.445106, Dcost=-0.000371975
===
        cost1=0.475712, cost2=-0.0306063, mean(hits)=0.0391, mean(difs)=0.6121
1: eta=0.0011 cost=0.4451 Dcost=-0.000371975 costheta=-1.000 gradmag=1, ps=[0.500, 0.500, 0.250, 0.249]
    noiseval=[-2.796, 2.000]
    init_J2mag_noisegrad= = 1, J2mag_noisegrad = 1,  J2mag_parmsgrad = 1
        cost1=0.475222, cost2=-0.0305306, mean(hi

In [None]:
# Setup for a step-by-step version of the two-layer minimization: this cell initializes the
# state, and the next cell advances it ten iterations per execution. Here the noise always
# comes from bring_the_noise() (there is no quick-noise option), and only ten data points
# are used. For reference, the signature of JJ is:
#function JJ(inputs; noisy=[], k1=2, k2=-2, th1=0.5, th2=0.5, slope1=0.25, slope2=0.25,
#        theta1 = 0.15, theta2=0.2, beta=0.005, do_plot=true, nderivs=0, difforder=0, verbose=true)

srand(10); data1 = randn(10, 1)
#    data1 = make_data(npoints=100, data_sigma=10, seedrand=15);

# Names of the JJ keyword parameters being optimized, and their starting values
args = ["th1", "th2", "slope1", "slope2"]

seed = [0.5, 0.5, 0.25, 0.25]

params = seed; new_params = 0; new_cost = 0;
eta = 0.001
beta = 0.05

n_noise  = 2
n_params = length(seed)
maxiter  = 200
tol      = 1e-9
verbose  = true
verbose_level = 2
# mode passed to bring_the_noise(): "ngrad" maximizes |dJ/dnoise|^2
bmode   = "ngrad"

init_noise_cycles = 8000

# We currently halt if |dJ2/dnoise|^2 is small  (we're going to have a hard time optimizing the noise).
# But note that in both "ngrad" (J2 = |dJ/dnoise|^2) and "wgrad" (J2 = |dJ/dw|^2) modes, 
# as long as |dJ/dnoise|^2 is large, it shouldn't be so bad if |dJ2/dnoise|^2 is small. We could check
# for that and NOT halt if |dJ/dnoise|^2 is large.   *** note difference between dJ/dnoise and dJ2/dnoise!  :)

# func takes only keyword args (as bring_the_noise requires); func_noisy takes the noise
# value as a positional argument instead
func       = (;pars...)     -> JJ(data1; beta=beta, do_plot=false, verbose=false, pars...)[1]
func_noisy = (nval;pars...) -> func(;noisy=nval, pars...)

# ================================= ALL FUNCTION AND TASK-SPECIFIC STUFF SHOULD BE ABOVE  =========
#
#   We do assume that the function takes the optional Boolean-valued parameters "do_plot" and "verbose"


# One column per iteration: [iteration; eta; params; noiseval]
trajectory = zeros(2 + n_params + n_noise, 0)

@printf("\nAbout to start first bring the noise attempt\n")
# bring_the_noise() returns six values; the sixth (dJdn) is not kept here
noiseval, gradmag, J2noisegrad, J2parmsgrad, initJ2noisegrad =  
    bring_the_noise(func, args, seed, n_noise, mode=bmode, ncycles=init_noise_cycles)
@printf("\nDone with first bringt the noise round\n\n")

# Cost, gradient and Hessian w.r.t. the parameters, with the noise held fixed
cost, grad, hess = vgh( (w) -> func_noisy(noiseval; do_plot=true, verbose=true, make_dict(args, w)...), params)

#bring_the_noise((;pars...) -> J2(data1;do_plot=false, verbose=false, beta=beta, pars...), 
#        verbose=false, args, params, n_noise)        
# cost, grad, hess = vgh( (w) -> J2([data1;noiseval]; do_plot=true, verbose=true, beta=beta, make_dict(args, w)...), params)

@printf("|J2_noisegrad|^2 = %g, |J2_parmsgrad|2 = %g, |grad|^2 = %g, |init_J2noisegrad|^2 = %g\n", 
    sum(J2noisegrad.*J2noisegrad), sum(J2parmsgrad.*J2parmsgrad), 
    sum(grad.*grad), sum(initJ2noisegrad.*initJ2noisegrad))
print_vector_g(:params)
print_vector_g(:grad)
print_vector_g(:noiseval)
@printf("cost=%g, eta=%g\n", cost, eta)

# Global iteration counter, advanced by the stepping cell that follows
i=0

In [None]:
# Advance the minimization by ten iterations; re-run this cell to continue. The global
# counter i keeps the iteration number across executions.
for zz=1:10
    i = i+1
    # Proposed step from the damped Hessian; the cost at the proposed parameters is
    # evaluated with the CURRENT noise value
    hathess    = hess + eye(length(grad), length(grad))/eta        
    new_params = params - inv(hathess)*grad
    new_cost, new_grad, new_hess = vgh((w)->func_noisy(noiseval;do_plot=true, verbose=true, make_dict(args, w)...), new_params)
    # vgh((w) -> J2([data1;noiseval]; do_plot=true, verbose=true, beta=beta, make_dict(args, w)...), new_params)

    # A cost change below tolerance is only reported here; the loop keeps going
    delta_cost = new_cost - cost
    if abs(delta_cost) < tol
        @printf("\n\n============ COST CHANGE BELOW TOLERANCE =======\n\n")
    end

# Find the noise to go with the proposed parameters, starting from the current noise
new_noiseval, new_gradmag, new_J2mag_noisegrad, new_J2mag_parmsgrad, new_init_J2mag_noisegrad, dJdn = 
    bring_the_noise(func, args, new_params, n_noise; init_noise=noiseval, mode=bmode, ncycles=150, growth_factor=1.5)
#    bring_the_noise((;pars...) -> J2(data1;do_plot=false, verbose=false, beta=beta, pars...), 
#        args, new_params, n_noise, init_noise=noiseval, ncycles=150, growth_factor=1.5)
    
    # Second output of JJ: the per-point hits at the proposed parameters and noise
    new_hits = JJ(data1; beta=beta, do_plot=false, verbose=false, 
        th1=new_params[1], th2=new_params[2], slope1=new_params[3], slope2=new_params[4], 
        noisy=new_noiseval)[2]
    print_vector_g(:new_hits)
    print_vector_g(:new_params)
    delta_params = new_params - params; 
    print_vector_g(:delta_params)
    print_vector_g(:new_grad)
    print_vector_g(:new_noiseval)
    print_vector_g(:dJdn)
    @printf("cost=%g, new_cost=%g, Dcost=%g\n", cost, new_cost, new_cost-cost)
    @printf("===\n")
    
    iJ2m_n = sum(new_init_J2mag_noisegrad.*new_init_J2mag_noisegrad)

    # Reject the step if the cost went up, or if the noise search looks degenerate
    if new_cost >= cost || new_gradmag < 1e-8 || (iJ2m_n < 1e-15 && sum(dJdn.*dJdn) < 1e-15)
        if verbose
            if new_cost >= cost
                @printf("--- cost went up\n")
            elseif new_gradmag < 1e-8
                @printf("--- new_gradmag was too small, it was %g\n", new_gradmag)
            else
                @printf("--- initial grad of J2 = |dJ/dw| w.r.t. noise was too small, it was %g\n", iJ2m_n)
            end
        end
        eta = eta/2
        costheta = NaN
    else
        # Accept the step: grow eta and adopt the new parameters and noise
        eta = eta*1.3
        costheta = dot(new_params-params, grad)/(norm(new_params-params)*norm(grad))
        
        params = new_params
        noiseval = new_noiseval
        gradmag  = new_gradmag

        # Recompute at the accepted parameters, now using the NEW noise value
        cost, grad, hess = vgh( (w) -> func_noisy(noiseval; do_plot=true, verbose=true, make_dict(args, w)...), params)

#        cost, grad, hess = vgh( (w) -> J2([data1;noiseval]; do_plot=true, verbose=true, beta=beta, make_dict(args, w)...), params)
    end

    if verbose
        @printf("%d: eta=%g cost=%.4f Dcost=%g costheta=%.3f gradmag=%g, ps=", 
            i, eta, cost, delta_cost, costheta, gradmag)
        print_vector(params)
        @printf "\n"
        if verbose_level >= 2
            @printf("    noiseval="); print_vector(noiseval); @printf("\n")
            J2m_n  = sum(new_J2mag_noisegrad.*new_J2mag_noisegrad)
            J2m_p  = sum(new_J2mag_parmsgrad.*new_J2mag_parmsgrad)
            iJ2m_n = sum(new_init_J2mag_noisegrad.*new_init_J2mag_noisegrad)
            
        @printf("    init_J2mag_noisegrad=%g, J2mag_noisegrad=%g, J2mag_parmsgrad=%g, |dJdn|^2=%g\n",
        iJ2m_n, J2m_n, J2m_p, sum(dJdn.*dJdn))
        end
    end

    # Record the state after this iteration (whether or not the step was accepted)
    trajectory = [trajectory [i; eta; params; noiseval]];
end

In [None]:
# Sanity check: the stored cost next to a fresh evaluation of func at the current params and noise
[cost func(;noisy=noiseval, make_dict(args, params)...)]

In [None]:
# Recompute (and plot/print) cost, gradient and Hessian at the current params and noise
cost, grad, hess = vgh( (w) -> func_noisy(noiseval; do_plot=true, verbose=true, make_dict(args, w)...), params)

In [None]:
# Restore the optimizer state recorded in column 31 of trajectory. Each column is laid out
# as [iteration; eta; params; noiseval], so params occupies rows 3 to length(seed)+2.
i = 31; eta = trajectory[2,i]; params = trajectory[3:length(seed)+2,i]; noiseval = trajectory[length(seed)+3:end,i]
@printf("=======\n")
print_vector_g(trajectory[:,i])
@printf("\n=======\n")

# Recompute cost, gradient and Hessian at the restored state
cost, grad, hess = vgh( (w) -> func_noisy(noiseval; do_plot=true, verbose=true, make_dict(args, w)...), params)


In [None]:
# Display what make_dict builds from the parameter names and their current values
make_dict(args, params)

In [None]:
# Replay a single minimization step from the state recorded in column 31 of trajectory.
# NOTE(review): this cell calls J2 and reads dd, neither of which is defined in the visible
# part of this notebook; it looks like a leftover from an earlier version -- confirm before running.
i = 31; eta = trajectory[2,i]; params = trajectory[3:length(seed)+2,i]; noiseval = trajectory[length(seed)+3:end,i]
@printf("=======\n")
print_vector_g(trajectory[:,i])
@printf("\n=======\n")



cost, grad, hess = vgh( (w) -> J2([dd ; noiseval]; do_plot=true, verbose=true, beta=beta, make_dict(args, w)...), params)

hathess    = hess + eye(length(grad), length(grad))/eta        
new_params = params - inv(hathess)*grad
# The computed step is immediately overwritten by a hard-coded two-element value.
# NOTE(review): the args defined earlier in this notebook name four parameters -- confirm
# which args this cell expects.
new_params = [-8.033, 1.106]

new_cost, new_grad, new_hess = 
    vgh((w) -> J2([dd ; noiseval]; do_plot=true, verbose=true, beta=beta, make_dict(args, w)...), new_params)
            
# Only the first five of bring_the_noise()'s six return values are kept
new_noiseval, new_gradmag, new_J2mag_noisegrad, new_J2mag_parmsgrad, new_init_J2mag_noisegrad = 
    bring_the_noise((;pars...) -> J2(dd;do_plot=true, verbose=false, beta=beta, pars...), 
    args, new_params, n_noise, init_noise=noiseval, verbose=true, ncycles=100, growth_factor=1.5, mode="ngrad")
    
# Squared magnitudes of the noise ascent direction at the start and at the end of the search
iJ2m_n = sum(new_init_J2mag_noisegrad.*new_init_J2mag_noisegrad)
J2m_n = sum(new_J2mag_noisegrad.*new_J2mag_noisegrad)
@printf("iJ2m_n = %g, J2m_n = %g\n", iJ2m_n, J2m_n)

new_noiseval
    

In [None]:
# Evaluate JJ on `data1` at the candidate parameters with the candidate noise passed as
# `noisy`, and keep only the second element of what JJ returns.
# NOTE(review): this indexes new_params[1:4] (th1, th2, slope1, slope2), whereas the
# neighbouring cells assign a 2-element `new_params` -- presumably this cell belongs to
# a four-parameter experiment; confirm before running.
new_hits = JJ(data1; beta=beta, do_plot=false, verbose=false, 
    th1=new_params[1], th2=new_params[2], slope1=new_params[3], slope2=new_params[4], 
    noisy=new_noiseval)[2]


# Real-life testing of bring_the_noise()

In [None]:
# Restore the state stored in column `i` of `trajectory`
# (column layout: [i; eta; params; noiseval]; two parameter rows are assumed here).
i = 70
eta = trajectory[2, i]
params = trajectory[3:4, i]
noiseval = trajectory[5:end, i]

@printf("=======\n")
print_vector_g(trajectory[:, i])
@printf("\n=======\n")


# Candidate datasets for the test. Each assignment replaces the previous one, so only
# the last (the full `data1`) is the one actually used below.
dd = data1[1:99]
dd = [data1[1:10] ; data1[91:100]]
dd = -8.01*ones(30,1)
#dd = []
dd = data1

# Evaluate J2 on the data `dd` with the noise samples appended to it.
cost, grad, hess = vgh(
    (w) -> J2([dd ; noiseval]; do_plot=true, verbose=true, beta=beta, make_dict(args, w)...),
    params)

# Step computed from the Hessian with eye/eta added to it ...
hathess    = hess + eye(length(grad), length(grad))/eta
new_params = params - inv(hathess)*grad
# ... which is then overridden by a hand-picked point for this experiment.
new_params = [-8.033, 1.106]

# Same objective, evaluated at the candidate parameters.
new_cost, new_grad, new_hess =
    vgh(
        (w) -> J2([dd ; noiseval]; do_plot=true, verbose=true, beta=beta, make_dict(args, w)...),
        new_params)

# Re-optimize the noise at the candidate parameters, warm-started from `noiseval`.
new_noiseval, new_gradmag, new_J2mag_noisegrad, new_J2mag_parmsgrad, new_init_J2mag_noisegrad =
    bring_the_noise(
        (;pars...) -> J2(dd;do_plot=true, verbose=false, beta=beta, pars...),
        args, new_params, n_noise,
        init_noise=noiseval, verbose=true, ncycles=100, growth_factor=1.5, mode="ngrad")

# Squared magnitudes of the noise-gradient vectors returned by bring_the_noise.
iJ2m_n = sum(new_init_J2mag_noisegrad.*new_init_J2mag_noisegrad)
J2m_n = sum(new_J2mag_noisegrad.*new_J2mag_noisegrad)
@printf("iJ2m_n = %g, J2m_n = %g\n", iJ2m_n, J2m_n)

# Cell output: the re-optimized noise values.
new_noiseval
    

# For single-layer J() -- Minimizing with noise to maximize parameter gradient

In [None]:
# ---- Data -------------------------------------------------------------------------
data1 = make_data(npoints=100, data_sigma=10, seedrand=15);

# ---- Parameters being optimized ---------------------------------------------------
# `args` holds the keyword names the parameter vector is mapped onto (via make_dict);
# `bbox` has one [low high] row per entry of `args`.
args = ["threshold", "slope"]
bbox = [-20.1 20.1 ; 0.01 200]

# Two candidate starting points; the second assignment is the one that takes effect.
seed = [0.5, 10.1]
seed = [-5.758, 1.1]

# ---- Optimizer state --------------------------------------------------------------
params     = seed
new_params = 0
new_cost   = 0
eta        = 1
beta       = 0.05

# ---- Run settings -----------------------------------------------------------------
n_noise       = 5
n_params      = length(seed)
maxiter       = 200
tol           = 1e-9
verbose       = true
verbose_level = 2
bmode         = "ngrad"

# We currently halt if |dJ2/dnoise|^2 is small (we're going to have a hard time
# optimizing the noise). But note that in both "ngrad" (J2 = |dJ/dnoise|^2) and "wgrad"
# (J2 = |dJ/dw|^2) modes, as long as |dJ/dnoise|^2 is large, it shouldn't be so bad if
# |dJ2/dnoise|^2 is small. We could check for that and NOT halt if |dJ/dnoise|^2 is
# large.   *** note the difference between dJ/dnoise and dJ2/dnoise!  :)

# ---- Objective --------------------------------------------------------------------
# `func` takes the parameters as keywords; `func_noisy` additionally takes the noise
# values as its positional argument and forwards them to `func` as `noisy`.
func       = (;pars...) -> J(data1; beta=beta, do_plot=false, verbose=false, pars...)
func_noisy = (nval; pars...) -> func(; noisy=nval, pars...)

# ================================= ALL FUNCTION AND TASK-SPECIFIC STUFF SHOULD BE ABOVE  =========
#
#   We do assume that the function takes the optional Boolean-valued parameters "do_plot" and "verbose"


# Optimization history. A column [i; eta; params; noiseval] is appended at the end of
# every completed loop iteration, hence 2 + n_params + n_noise rows and zero columns
# to start with.
trajectory = zeros(2 + n_params + n_noise, 0)

# Find the initial noise values for the starting parameters. Besides the noise itself,
# bring_the_noise returns a gradient magnitude and three gradient vectors whose squared
# norms are reported below.
noiseval, gradmag, J2noisegrad, J2parmsgrad, initJ2noisegrad =  
    bring_the_noise(func, args, seed, n_noise, mode=bmode, ncycles=200)

# Initial evaluation of the noisy objective at the starting parameters; `cost`, `grad`
# and `hess` seed the first iteration of the optimization loop.
cost, grad, hess = vgh( (w) -> func_noisy(noiseval; do_plot=true, verbose=true, make_dict(args, w)...), params)

#bring_the_noise((;pars...) -> J2(data1;do_plot=false, verbose=false, beta=beta, pars...), 
#        verbose=false, args, params, n_noise)        
# cost, grad, hess = vgh( (w) -> J2([data1;noiseval]; do_plot=true, verbose=true, beta=beta, make_dict(args, w)...), params)

# Squared norms of the starting gradients. (The second label previously read
# "|J2_parmsgrad|2"; the missing "^" is restored so all four labels match.)
@printf("|J2_noisegrad|^2 = %g, |J2_parmsgrad|^2 = %g, |grad|^2 = %g, |init_J2noisegrad|^2 = %g\n", 
    sum(J2noisegrad.*J2noisegrad), sum(J2parmsgrad.*J2parmsgrad), 
    sum(grad.*grad), sum(initJ2noisegrad.*initJ2noisegrad))

# Main optimization loop: steps on the parameters using the Hessian with eye/eta added,
# re-optimizing the noise at every candidate point.
#
#   * A candidate is rejected (eta halved, state unchanged) if the cost did not go
#     down, if the new gradient magnitude is below 1e-8, or if the squared norm of the
#     initial noise gradient reported by bring_the_noise is below 1e-15.
#   * Otherwise the candidate is accepted, eta grows by 10%, and cost/grad/hess are
#     recomputed at the new parameters with the new noise.
#   * The loop stops early once |new_cost - cost| < tol.
#
# Reads and updates the notebook globals params, noiseval, gradmag, eta, cost, grad,
# hess and appends one column per completed iteration to `trajectory`.
for i in 1:maxiter
    hathess    = hess + eye(length(grad), length(grad))/eta
    # Solve hathess * step = grad directly instead of forming inv(hathess) explicitly.
    new_params = params - (hathess \ grad)
    new_cost, new_grad, new_hess = vgh((w)->func_noisy(noiseval;do_plot=true, verbose=true, make_dict(args, w)...), new_params)
    # vgh((w) -> J2([data1;noiseval]; do_plot=true, verbose=true, beta=beta, make_dict(args, w)...), new_params)

    delta_cost = new_cost - cost
    if abs(delta_cost) < tol
        break
    end

    # Re-optimize the noise at the candidate parameters, warm-started from the current noise.
    new_noiseval, new_gradmag, new_J2mag_noisegrad, new_J2mag_parmsgrad, new_init_J2mag_noisegrad = 
    bring_the_noise(func, args, new_params, n_noise; init_noise=noiseval, mode=bmode)
#    bring_the_noise((;pars...) -> J2(data1;do_plot=false, verbose=false, beta=beta, pars...), 
#        args, new_params, n_noise, init_noise=noiseval, ncycles=150, growth_factor=1.5)

    iJ2m_n = sum(new_init_J2mag_noisegrad.*new_init_J2mag_noisegrad)

    if new_cost >= cost || new_gradmag < 1e-8 || iJ2m_n < 1e-15
        # Rejected step: report the first reason that applies, then shrink eta.
        if verbose
            if new_cost >= cost
                @printf("--- cost went up\n")
            elseif new_gradmag < 1e-8
                @printf("--- new_gradmag was too small, it was %g\n", new_gradmag)
            else
                @printf("--- initial grad of J2 = |dJ/dw| w.r.t. noise was too small, it was %g\n", iJ2m_n)
            end
        end
        eta = eta/2
        costheta = NaN
    else
        # Accepted step: grow eta and move to the candidate.
        eta = eta*1.1
        # Cosine of the angle between the step taken and the gradient it was computed from.
        costheta = dot(new_params-params, grad)/(norm(new_params-params)*norm(grad))

        params = new_params
        noiseval = new_noiseval
        gradmag  = new_gradmag

        cost, grad, hess = vgh( (w) -> func_noisy(noiseval; do_plot=true, verbose=true, make_dict(args, w)...), params)

#        cost, grad, hess = vgh( (w) -> J2([data1;noiseval]; do_plot=true, verbose=true, beta=beta, make_dict(args, w)...), params)
    end

    if verbose
        @printf("%d: eta=%g cost=%.4f Dcost=%g costheta=%.3f gradmag=%g, ps=", 
            i, eta, cost, delta_cost, costheta, gradmag)
        print_vector(params)
        @printf "\n"
        if verbose_level >= 2
            @printf("    noiseval="); print_vector(noiseval); @printf("\n")
            # These describe the latest candidate returned by bring_the_noise, whether or
            # not it was accepted. iJ2m_n was already computed above from the same vector.
            J2m_n  = sum(new_J2mag_noisegrad.*new_J2mag_noisegrad)
            J2m_p  = sum(new_J2mag_parmsgrad.*new_J2mag_parmsgrad)

            @printf("    init_J2mag_noisegrad = %g, J2mag_noisegrad = %g,  J2mag_parmsgrad = %g\n",
                iJ2m_n, J2m_n, J2m_p)
        end
    end

    # Record this iteration as a new column: [i; eta; params; noiseval].
    trajectory = [trajectory [i; eta; params; noiseval]]
end