**Wed 2017-09-06 1pm:** Time to go to ProAnti. I don't think it's worth stopping to examine further whether fluxSense() is worth it, just doesn't seem the issue right now. Can revisit if necessary.

**Wed 2017-09-06 11am:** Current status: everythign working, bbox_hessian_minimization fully debugged and report diagnostic info. Basic MGO minimizations are doing their thing.  Have now added self-connection weights, and can very successfully train the MGO network.



<h1 id="tocheading">TABLE OF CONTENTS</h1>
<div id="toc"></div>

**Updates to the table of contents are periodic, but run the cell below to first start or force an update.**

In [49]:
macro javascript_str(s) display("text/javascript", s); end

javascript"""
$.getScript('https://sites.google.com/site/brodylabhome/files/make_table_of_contents.js')
"""



In [2]:

using PyCall
using PyPlot
using ForwardDiff
using DiffBase
using MAT

pygui(true)

import Base.convert
convert(::Type{Float64}, x::ForwardDiff.Dual) = Float64(x.value)
function convert(::Array{Float64}, x::Array{ForwardDiff.Dual}) 
    y = zeros(size(x)); 
    for i in 1:prod(size(x)) 
        y[i] = convert(Float64, x[i]) 
    end
    return y
end

include("general_utils.jl")
include("hessian_utils.jl")

"""
We define functions to convert Duals, the variable types used by ForwardDiff, 
to Floats. This is useful if we want to print out the value of a variable 
(since print doesn't know how to Duals). Note that after being converted to a Float, no
differentiation by ForwardDiff can happen!  e.g. after
    x = convert(Float64, y)
ForwardDiff can still differentiate y, but it can't differentiate x
"""



"We define functions to convert Duals, the variable types used by ForwardDiff, \nto Floats. This is useful if we want to print out the value of a variable \n(since print doesn't know how to Duals). Note that after being converted to a Float, no\ndifferentiation by ForwardDiff can happen!  e.g. after\n    x = convert(Float64, y)\nForwardDiff can still differentiate y, but it can't differentiate x\n"

# Setup -- definitions of forwardModel() and backwardsModel()

In [144]:
"""
o = g(z)    squashing tanh function, running from 0 to 1, is equal to 0.5 when input is 0.
"""
function g(z)
    return 0.5*tanh.(z)+0.5
end
    
"""
z = g^-1(o)    inverse of squashing tanh function, input must be in (0, 1), output is zero when passed 0.5.
"""
function ginverse(z)
    return 0.5*log.(z./(1-z))
end


"""
forwardModel(startU; dt=0.01, tau=0.1, nsteps=100, input=[0.1, 0], noise=[], W=[0 -5;-5 0], 
init_add=0, start_add=0, const_add=0, sigma=0, gleak=1, U_rest=0, 
    do_plot=false, nderivs=0, difforder=0, clearfig=true, fignum=1, dUdt_mag_only=false)

Runs a tanh() style-network forwards in time, given its starting point, using simple Euler integration
    tau dU/dt = -U + W*V + I
    V = 0.5*tanh(U)+ 0.5

**PARAMETERS:**

startU     A column vector, nunits-by-1, indicating the values of U at time zero


**OPTIONAL PARAMETERS**

dt      Scalar, timestep size

tau     Scalar, in seconds

gleak   
        dUdt will have a term equal to gleak*(U_rest - U)
U_rest

nsteps  Number of timesteps to run, including time=0.

input   Either an nunits-by-1 vector, in which case inputs to each unit are constant
        across time, or a matrix, nunits-by-nsteps, indicating input for each unit at each timepoint.

W       Weight matrix, nunits-by-nunits

init_add    DEPRECATED: Vector or scalar that gets added to the input current at very first timestep.
            Deprecated because this made it dt-dependent. Replaced by start_add.

start_add   Vector or scalar that gets added, once, to the initial U[:,1], before the integration process begins.

const_add   Scalar that gets added to U after every timestep

sigma       After each timestep, add sigma*sqrt(dt)*randn() to each element of U

do_plot   Default false, if true, plots V of up to the first two dimensions

fignum     Figure number on which to plot

clrearfig  If true, the figure is first cleared, otherwise any plot ois overlaid

nderivs, difforder     Required for making sure function can create its own arrays and 
                       still be differentiated

dUdt_mag_only  If true, returns |dUdt|^2 from the first timestep only, then stops.

** RETURNS:**

Uend Vend       nunits-by-1 vectors representing the final values of U and V that were found.
U, V            nunits-by-nsteps matrices containing the full trajectories

"""
function forwardModel(startU; dt=0.01, tau=0.1, nsteps=100, input=[], noise=[], W=[0 -5;-5 0], 
    init_add=0, start_add=0, const_add=0, do_plot=false, nderivs=0, difforder=0, clearfig=true, fignum=1,
    dUdt_mag_only=false, sigma=0, g_leak=1, U_rest=0, theta=0, beta=1, 
    warn_if_unused_params=false, other_unused_params...)

    if warn_if_unused_params && length(other_unused_params)>0
        @printf("\n\n=== forwardModel warning, had unused params ")
        for k in keys(Dict(other_unused_params))
            @printf("%s, ", k)
        end
    end
    
    my_input = ForwardDiffZeros(size(input,1), size(input,2), nderivs=nderivs, difforder=difforder)
    for i=1:prod(size(input)); my_input[i] = input[i]; end
    input = my_input;
    
    nunits = length(startU)
    if size(startU,2) > size(startU,1)
        error("startU must be a column vector")
    end
    
    # --- formatting input ---
    if ~(typeof(input)<:Array) || prod(size(input))==1  # was a scalar
        input = input[1]*(1+ForwardDiffZeros(nunits, nsteps, nderivs=nderivs, difforder=difforder))
    elseif length(input)==0 # was the empty matrix
        input = ForwardDiffZeros(nunits, nsteps, nderivs=nderivs, difforder=difforder)
    elseif size(input,2)==1     # was a column vector
        input = input*(1+ForwardDiffZeros(1, nsteps, nderivs=nderivs, difforder=difforder))
    end    
    # --- formatting noise ---
    if ~(typeof(noise)<:Array) || prod(size(noise))==1  # was a scalar
        noise = noise*(1+ForwardDiffZeros(nunits, nsteps, nderivs=nderivs, difforder=difforder))
    elseif length(noise)==0 # was the empty matrix
        noise = ForwardDiffZeros(nunits, nsteps, nderivs=nderivs, difforder=difforder)
    elseif size(noise,2)==1     # was a column vector
        noise = noise*(1+ForwardDiffZeros(1, nsteps, nderivs=nderivs, difforder=difforder))
    end    
    
    U = ForwardDiffZeros(nunits, nsteps, nderivs=nderivs, difforder=difforder)
    V = ForwardDiffZeros(nunits, nsteps, nderivs=nderivs, difforder=difforder)
    
    if ~(typeof(W)<:Array); W = [W]; end

    W     = reshape(W, nunits, nunits)
    U     = reshape(U, nunits, nsteps)
    V     = reshape(V, nunits, nsteps)
    input = reshape(input, nunits, nsteps)
    noise = reshape(noise, nunits, nsteps)

    input[:,1] += init_add
    input      += const_add

    #@printf("size(U) is (%d,%d), and size(startU) is (%d,%d) and size(noise) is (%d,%d)", 
    #    size(U,1), size(U,2), size(startU,1), size(startU,2), size(noise,1), size(noise,2))
    # @printf("U[1]=%g, noise[1]=%g\n", startU, noise[1])
    U[:,1] = startU + noise[:,1] + start_add; # @printf("Resulting U=%g\n", U[1])
    V[:,1] = g((U[:,1]-theta)/beta); # @printf("Resulting V=%g\n", V[1])
    
    for i=2:nsteps
        dUdt = g_leak*(U_rest -U[:,i-1]) + W*V[:,i-1] + input[:,i-1]
        if dUdt_mag_only; return sum(dUdt.*dUdt); end;
        # @printf("dUdt=%g\n", dUdt[1])
        # @printf("i=%g\n", i)
        # @printf("noise[2]=%g\n", noise[2])
        U[:,i] = U[:,i-1] + (dt/tau)*dUdt + noise[:,i] + sigma*sqrt(dt)*randn(size(U,1),1)
        # @printf("Resulting U[2]=%g\n", U[2])
        V[:,i] = g((U[:,i]-theta)/beta)
        # @printf("Resulting V[2]=%g\n", V[2])
    end

    if do_plot
        figure(fignum)
        if length(startU)==1
            if clearfig; clf(); end;
            t = (0:nsteps-1)*dt
            plot(t, V[1,:], "b-")
            plot(t[1], V[1,1], "g.")
            plot(t[end], V[1,end], "r.")
            xlabel("t"); ylabel("V1"); ylim([-0.01, 1.01])
        elseif length(startU)>=2
            if clearfig; clf(); end;
            plot(V[1,:], V[2,:], "b-")
            plot(V[1,1], V[2,1], "g.")
            plot(V[1,end], V[2,end], "r.")
            xlabel("V1"); ylabel("V2"); 
            xlim([-0.01, 1.01]); ylim([-0.01, 1.01])
        end
    end

    return U[:,end], V[:,end], U, V
end


"""
backwardsModel(endU; dt=0.01, tau=0.1, nsteps=100, input=[0],noise=[],  W=[0 -5;-5 0], 
    do_plot=false, nderivs=0, difforder=0, clearfig=true, fignum=1, tol=1e-15, start_eta=10)

Runs a tanh() style-network BACKWARDS in time, given its ending point, by making a backwards
guess at each timepoint and then using Hessian minimization to find the backwards vector that correctly
leads to the current timestep value.  Uses forwardModel() . The forwards equations are:

    tau dU/dt = -U + W*V + I
    V = 0.5*tanh(U)+ 0.5

**PARAMETERS:**

endU     A column vector, nunits-by-1, indicating the values of U at time=end


**OPTIONAL PARAMETERS:**

dt      Scalar, timestep size

tau     Scalar, in seconds

nsteps  Number of timesteps to run, including time=0.

input   Either an nunits-by-1 vector, in which case inputs to each unit are constant
        across time, or a matrix, nunits-by-nsteps, indicating input for each unit at each timepoint.

W       Weight matrix, nunits-by-nunits

do_plot   Default false, if true, plots V of up to the first two dimensions

tol       Tolerance in the minimization procedure for finding each backwards timestep. Passed on
          to trust_region_Hessian_minimization()

start_eta   Passed on to trust_region_Hessian_minimization()

fignum     Figure number on which to plot

clrearfig  If true, the figure is first cleared, otherwise any plot ois overlaid

nderivs, difforder     Required for making sure function can create its own arrays and 
                       still be differentiated



** RETURNS:**

Ustart Vstart   nunits-by-1 vectors representing the starting values of U and V that were found.
U, V            nunits-by-nsteps matrices containing the full trajectories
costs           1-by-nsteps vector with the final cost from the minimization procedure for each
                timestep. This is the squared difference between the U[t+1] produced by the U[t] 
                guess and the actual U[t+1]

"""
function backwardsModel(endU; nsteps=100, start_eta=10, tol=1e-15, maxiter=400, 
    do_plot=false, init_add=0, start_add=0, dt=0.01, 
    input=[], noise=[], nderivs=0, difforder=0, clearfig=false, fignum=1, params...)    

    nunits = length(endU)

    # --- formatting input ---
    if ~(typeof(input)<:Array) || prod(size(input))==1  # was a scalar
        input = input[1]*(1+ForwardDiffZeros(nunits, nsteps, nderivs=nderivs, difforder=difforder))
    elseif length(input)==0 # was the empty matrix
        input = ForwardDiffZeros(nunits, nsteps, nderivs=nderivs, difforder=difforder)
    elseif size(input,2)==1     # was a column vector
        input = input*(1+ForwardDiffZeros(1, nsteps, nderivs=nderivs, difforder=difforder))
    end    
    # --- formatting noise ---
    if ~(typeof(noise)<:Array)  # was a scalar
        noise = noise*(1+ForwardDiffZeros(nunits, nsteps, nderivs=nderivs, difforder=difforder))
    elseif length(noise)==0 # was the empty matrix
        noise = ForwardDiffZeros(nunits, nsteps, nderivs=nderivs, difforder=difforder)
    elseif size(noise,2)==1     # was a column vector
        noise = noise*(1+ForwardDiffZeros(1, nsteps, nderivs=nderivs, difforder=difforder))
    end    
    
    function J(U1, U2; nderivs=0, difforder=0, noise=[], inputs=[], pars...)
        U2hat = forwardModel(U1; nsteps=2, noise=noise, input=input, nderivs=nderivs, difforder=difforder, pars...)[1]
        U2hat = U2hat
        DU = U2hat - U2
    
        return sum(DU.*DU)
    end
    
    if length(noise)==0
        noise = ForwardDiffZeros(nunits, nsteps, nderivs=nderivs, difforder=difforder)
    end

    U = ForwardDiffZeros(nunits, nsteps, nderivs=nderivs, difforder=difforder)
    U = reshape(U, nunits, nsteps)
    costs = ForwardDiffZeros(nsteps, 1, nderivs=nderivs, difforder=difforder)    
    
    U[:,end] = endU
    for i=(nsteps-1):-1:1
        if i==1
            my_init_add = init_add
            my_start_add = start_add
        else
            my_init_add = 0
            my_start_add = 0
        end
                
        U[:,i], costs[i] = trust_region_Hessian_minimization(U[:,i+1], 
            (x) -> J(x, U[:,i+1]; nderivs=length(endU), difforder=2, 
            input=input[:,i:i+1], noise = noise[:,i:i+1], 
            init_add=my_init_add, start_add=my_start_add, params...); 
            verbose=false, start_eta=start_eta, tol=tol, maxiter=maxiter)
        if i>1; U[:,i] += noise[:,i]; end
    end
    
    
    V = g(U)
    
    if do_plot
        figure(fignum)   
        if typeof(params)<:Array; params = Dict(params); end;
        if haskey(params, :dt);     dt     = params[:dt];     end
        if haskey(params, :nsteps); nsteps = params[:nsteps]; end
        if length(endU)==1
            if clearfig; clf(); end;
            t = (0:nsteps-1)*dt
            plot(t, V[1,:], "m-")
            plot(t[1], V[1,1], "go")
            plot(t[end], V[1,end], "ro")            
            ylim([-0.01, 1.01])
        elseif length(endU)>=2
            if clearfig; clf(); end;            
            plot(V[1,:], V[2,:], "m-")
            plot(V[1,1], V[2,1], "go")
            plot(V[1,end], V[2,end], "ro")
            xlim([-0.01, 1.01]); ylim([-0.01, 1.01])
        end
    end
    
    return U[:,1], V[:,1], U, V, costs
end



backwardsModel

### Testing forward and backwards models with only 1 dimension

In [4]:
figure(1); clf();
params = Dict(:noise => [0.1], :W => [-2], :nsteps=>10, :start_add=>-1.9)
Uend = forwardModel([1.1]; do_plot=true, params...)[1]
Ustart = backwardsModel(Uend; do_plot=true, tol=1e-30, params...)[1]
@printf("Ustart came back as %g\n", Ustart[1])

Ustart came back as 1.1


### Testing forward and backwards models now with 2 dimensions

In [5]:
nsteps=50
params = Dict(:noise =>0.03*randn(2,nsteps) + [0.1,0]*ones(1,nsteps), :W => [0 -5; -5 0], :nsteps=>nsteps)

Uend, Vend, U, V              = forwardModel([0.1,0.1]; do_plot=true, params...);
Ustart, Vstart, bU, bV, costs = backwardsModel(Uend; do_plot=true, tol=1e-30, params...)

@printf("Ustart came back as : "); print_vector_g(Ustart); print("\n")

Ustart came back as : [0.1, 0.1]


# Exploring dt-dependence of gradients and hessian

In [6]:
# srand(111)
startU=randn(100,2)-3
startU=randn(100,2)-3
sigma = 0

# startU=0.1*randn(100,2)-3
# startU=zeros(100,2)-3


dt = 0.02
t = 0:dt:1
tau = 0.1
nsteps = length(t)
t = t[1:nsteps]

W = -4
noise = 0
input = 0
sigma = 0


model_params = Dict(:dt=>dt, :tau=>tau, :W=>[0 W; W 0], :nsteps=>nsteps, 
:noise=>noise, :input=>input, :sigma=>sigma, :const_add=>0, :init_add=>0)

forward = (startU; pars...) -> begin
    pars = Dict(pars)
    if haskey(pars, :W); 
        W=pars[:W];   # mess with it only if it is not already a matrix:
        if length(W)==1; pars=make_dict(["W"], [[0 W;W 0]], pars); end;
    end;     
    forwardModel(startU; pars...)
end



# clf();
# func = (;pars...) -> forward([-0.2, 0.3]; do_plot=true, merge(model_params, Dict(pars))...)
# func(;W=-4)

args = ["W", "const_add", ["start_add" 2], "sigma"]
params = [-4.01, 0.5, 0.2, -0.2, 0.01]

figure(1); clf();
value1, grad1, hess1 = keyword_vgh((;pars...)->forward([-0.2, 0.3]; do_plot=true, merge(model_params, Dict(pars))...)[1][1], args, params)
title(@sprintf("Running with dt=%g", dt))

dt = 0.005
t = 0:dt:1
tau = 0.1
nsteps = length(t)
t = t[1:nsteps]

model_params = Dict(:dt=>dt, :tau=>tau, :W=>[0 W; W 0], :nsteps=>nsteps, 
:noise=>noise, :input=>input, :sigma=>sigma, :const_add=>0, :init_add=>0)

figure(2); clf();
value2, grad2, hess2 = keyword_vgh((;pars...)->forward([-0.2, 0.3]; do_plot=true, fignum=2, merge(model_params, Dict(pars))...)[1][1], args, params)
title(@sprintf("Running with dt=%g", dt))

[grad1[:]' ; grad2[:]']

2×5 Array{Float64,2}:
 0.92859   -1.0449   1.11301  -1.12663  0.164904
 0.918901  -1.01352  1.03901  -1.06345  0.512789

# TO-DOs

1. ~~Be able to use W as an optimizable parameter (including configs like "all horizontal weights are the same")~~ DONE!
2. ~~Check out what is going on with the weird trajectories in the function-based MGO example~~  DONE: it's just the strong, single-timestep initial_add
3. ~~Check out whether reducing beta solves the sticking issue even without extra finalFluxPoint locations~~. It does. Reducing beta from 0.01 to 0.003 was enough.  (We also needed dto change the cost_limit to -0.00288, since the range of costs changes when beta changes.)
3. Find the saddle points and use those as the finalFluxPoint locations
4. ===
5. ~~Run a ProAnti model with noise only in initial conditions, and thus with the framework as we have it~~ (skipped, went straight to next step)
6. ~~Make a cost function with frozen noise, and figure out how frozen noise will interact with the backwards trajectory in the minimizations~~
7. ~~Make a forwards and backwards model with Urest, etc., just like in ProAnti()~~
6. ===
7. ~~Make sure that minimization procedures that use tanh() walls report the model parameter, not the control parameter~~ DONE
7. ~~Figure out what is going on with the change in gradient and Hessian upon change of dt~~ DONE: it was just the init step
7. ~~To really follow bbox_Hessian, printouts needs to be for the walled params, the trajectory should be for the walled params, and the seed should also take the walls into account.~~ DONE
7. ~~Have one_d_minimization return the number of iterations it did and why it stopped; then have constrained_parabolic_minimization return the cost, maxiters, and stopping reason; then have bbox_Hessian_minimization return the trajectory of those, as a trace of what was going on.~~ DONE
7. POSTPONED: Clean up examples of forward and backwards models and of 1-d use of fluxSense() function
8. POSTPONED: Find a 2-d example where flux points are actually needed -- when beta=0, it is not so clear.
8. POSTPONED: Measure gradient sensitivity to each of the endpoints in a set of trajectories, as a measure of whether fluxSense is needed or not.
12. POSTPONED: Try to combinee fluxSense with bbox_Hessian_minimization9. ~~Fix the walls issue in bbox_Hessian_minimization using tanh encoding.~~


1. ===
2. ~~Optimize either an MGO or a ProAnti~~ DONE with MGO. Now on to ProAnti
3. ~~Set up so we can easily change task period durations in JJ as we run the model to evaluate the results of model-fitting~~ DONE
4. Have different task period durations while model-fitting
5. Set up to do searches over parameter space
6. Incorporate RT into fits?
10. If fluxSense is needed in ProAnti, could try choosing the Anti unit endpoint values by maximizing the |dJ/dw|^2 over those values.
11. Clean up the notebooks and write up what we've been doing!


# Starting on ProAnti

In [182]:
"""
    plot_PA(t, U, V; fignum=1, clearfig=true, rule_and_delay_period=1, target_period=1, post_target_period=1,
        other_unused_params...)

Helper function for plotting ProAnti results
"""
function plot_PA(t, U, V; fignum=1, clearfig=true, rule_and_delay_period=1, target_period=1, post_target_period=1,
    other_unused_params...)
    figure(fignum)
    if clearfig; clf(); end
    
    ax1 = subplot(3,1,1)
    h = plot(t, V'); 
    setp(h[1], color=[0, 0, 1])
    setp(h[2], color=[1, 0, 0])
    setp(h[3], color=[1, 0.5, 0.5])
    setp(h[4], color=[0, 1, 1])
    ylabel("V")

    ax = gca()
    yl = [ylim()[1], ylim()[2]]
    vlines([rule_and_delay_period, 
            rule_and_delay_period+target_period,
            rule_and_delay_period+target_period+post_target_period], 
            -0.05, 1.05, linewidth=2)
    if yl[1]<0.02
        yl[1] = -0.02
    end
    if yl[2]>0.98
        yl[2] = 1.02
    end
    ylim(yl)
    grid(true)
    remove_xtick_labels(ax1)
        
    ax2 = subplot(3,1,2)
    hu = plot(t, U')
    setp(hu[1], color=[0, 0, 1])
    setp(hu[2], color=[1, 0, 0])
    setp(hu[3], color=[1, 0.5, 0.5])
    setp(hu[4], color=[0, 1, 1])
    ylabel("U"); ylim(minimum(U[:])-0.1, maximum(U[:])+0.1)
    vlines([rule_and_delay_period, 
            rule_and_delay_period+target_period,
            rule_and_delay_period+target_period+post_target_period], 
            ylim()[1], ylim()[2], linewidth=2)
    remove_xtick_labels(ax2)

    grid(true)
    
    subplot(3,1,3)
    delta = V[1,:] - V[4,:]
    hr = plot(t, delta)
    oldlims = [ylim()[1]+0.1, ylim()[2]-0.1]
    ylim(minimum([delta[:];oldlims[1]])-0.1, maximum([delta[:];oldlims[2]])+0.1)
    vlines([rule_and_delay_period, 
            rule_and_delay_period+target_period,
            rule_and_delay_period+target_period+post_target_period], 
            ylim()[1], ylim()[2], linewidth=2)
    xlabel("t"); ylabel("Pro R - Pro L")
    grid(true)
        
end





plot_PA

In [145]:
model_params = Dict(
:dt     =>  0.02, 
:tau    =>  0.1, 
:vW     =>  -1.7,
:hW     =>  -1.7,
:sW     =>  0.2,
:dW     =>  0,
:nsteps =>  2, 
:noise  =>  [], 
:sigma  =>  0.08, 
:input  =>  0, 
:g_leak =>  0.25, 
:U_rest =>  -1,
:theta  =>  1, 
:beta   =>  1, 
:sw     =>  0.2,
:hw     =>  -1.7,
:vw     =>  -1.7,
:constant_excitation      => 0.19, 
:anti_rule_strength       => 0.1,
:pro_rule_strength        => 0.1, 
:target_period_excitation => 1,
:right_light_excitation   => 0.5, 
:right_light_pro_extra    => 0,
:const_add => 0, 
:init_add  => 0, 
:rule_and_delay_period    => 0.4,
:target_period            => 0.1,
:post_target_period       => 0.5,
:const_pro_bias           => 0,
)


function make_input(trial_type; dt=0.02, nderivs=0, difforder=0, constant_excitation=0.19, anti_rule_strength=0.1, 
    pro_rule_strength=0.1, target_period_excitation=1, right_light_excitation=0.5, right_light_pro_extra=0, 
    rule_and_delay_period=0.4, target_period=0.1, post_target_period=0.4, const_pro_bias=0,
    other_unused_params...)

    T = rule_and_delay_period + target_period + post_target_period
    t = 0:dt:T
    nsteps = length(t)

    input = constant_excitation + ForwardDiffZeros(4, nsteps, nderivs=nderivs, difforder=difforder)
    if trial_type=="Anti"
        input[2:3, t.<rule_and_delay_period] += anti_rule_strength
    elseif trial_type=="Pro"
        input[[1,4], t.<rule_and_delay_period] += pro_rule_strength
    else
        error("make_input: I don't recognize input type \"" * trial_type * "\"")
    end
    
    input[:,     (rule_and_delay_period.<=t) & (t.<rule_and_delay_period+target_period)] += target_period_excitation
    input[1:2,   (rule_and_delay_period.<=t) & (t.<rule_and_delay_period+target_period)] += right_light_excitation
    input[1,     (rule_and_delay_period.<=t) & (t.<rule_and_delay_period+target_period)] += right_light_pro_extra
    
    input[[1,4],:] += const_pro_bias
    
    return input, t, nsteps
end


function run_ntrials(nPro, nAnti; plot_list=[], nderivs=0, difforder=0, model_params...)
    pro_input,  t, nsteps = make_input("Pro" ; model_params...)
    anti_input, t, nsteps = make_input("Anti"; model_params...)

    model_params = Dict(model_params)
    sW = model_params[:sW]
    hW = model_params[:hW]
    vW = model_params[:vW]
    dW = model_params[:dW]
    model_params = make_dict(["nsteps", "W"], [nsteps, [sW vW dW hW; vW sW hW dW; dW hW sW vW; hW dW vW sW]], 
        model_params)
    model_params = make_dict(["nderivs", "difforder"], [nderivs, difforder], model_params)
    
    proVs  = ForwardDiffZeros(4, nPro, nderivs=nderivs, difforder=difforder)
    antiVs = ForwardDiffZeros(4, nAnti, nderivs=nderivs, difforder=difforder)

    # --- PRO ---
    figure(1); clf();
    model_params = make_dict(["input"], [pro_input], model_params)
    for i=1:nPro
        startU = [-0.3, -0.7, -0.7, -0.3]
        Uend, Vend, U, V = forwardModel(startU, do_plot=false; model_params...)
        proVs[:,i] = Vend
        if any(plot_list.==i) 
            plot_PA(t, U, V; fignum=1, clearfig=false, model_params...)
            subplot(3,1,1); title("PRO")
        end
    end

    # --- ANTI ---
    figure(2); clf();
    model_params = make_dict(["input"], [anti_input], model_params)
    for i=1:nAnti
        startU = [-0.7, -0.3, -0.3, -0.7]
        Uend, Vend, U, V = forwardModel(startU, do_plot=false; model_params...)
        antiVs[:,i] = Vend
        if any(plot_list.==i) 
            plot_PA(t, U, V; fignum=2, clearfig=false, model_params...)
            subplot(3,1,1); title("ANTI")
        end
    end
    
    return proVs, antiVs
end

nPro = 10; nAnti = 5;
proVs, antiVs = @time(run_ntrials(nPro, nAnti; plot_list=[1:5;], model_params...))

@printf("Pro %% correct = %g%%\n", 100*length(find(proVs[1,:].>proVs[4,:]))/ntrials)
@printf("Anti %% correct = %g%% \n", 100*length(find(antiVs[1,:].<antiVs[4,:]))/ntrials)


  1.366172 seconds (196.73 k allocations: 9.023 MB)
Pro % correct = 90%




Anti % correct = 40% 


In [251]:
function JJ(nPro, nAnti; pro_target=0.9, anti_target=0.7, 
    theta1=0.025, theta2=0.035, cbeta=0.003, verbose=false, 
    pre_string="", zero_last_sigmas=0, seedrand=NaN, 
    rule_and_delay_periods = [0.4], target_periods = [0.1], post_target_periods = [0.5],
    nderivs=0, difforder=0, model_params...)

    nruns = length(rule_and_delay_periods)*length(target_periods)*length(post_target_periods)
    
    cost1s = ForwardDiffZeros(1, nruns, nderivs=nderivs, difforder=difforder)
    cost2s = ForwardDiffZeros(1, nruns, nderivs=nderivs, difforder=difforder)

    if ~isnan(seedrand); srand(seedrand); end
    
    n = totHitsP = totHitsA = totDiffsP = totDiffsA = 0
    for i in rule_and_delay_periods
        for j in target_periods
            for k = post_target_periods
                n += 1
                
                my_params = make_dict(["rule_and_delay_period", "target_period", "post_target_period"],
                [i, j, k], Dict(model_params))
    
                # print("model params is " ); print(model_params); print("\n")
                proVs, antiVs = run_ntrials(nPro, nAnti; nderivs=nderivs, difforder=difforder, my_params...)

                hitsP  = 0.5*(1 + tanh.((proVs[1,:]-proVs[4,:,])/theta1))
                diffsP = tanh.((proVs[1,:,]-proVs[4,:])/theta2).^2
                hitsA  = 0.5*(1 + tanh.((antiVs[4,:]-antiVs[1,:,])/theta1))
                diffsA = tanh.((antiVs[4,:,]-antiVs[1,:])/theta2).^2

                if nPro>0 && nAnti>0
                    cost1s[n] = (nPro*(mean(hitsP) - pro_target).^2  + nAnti*(mean(hitsA) - anti_target).^2)/(nPro+nAnti)
                    cost2s[n] = -cbeta*(nPro*mean(diffsP) + nAnti*mean(diffsA))/(nPro+nAnti)
                elseif nPro>0
                    cost1s[n] = (mean(hitsP) - pro_target).^2
                    cost2s[n] = -cbeta*mean(diffsP)
                else
                    cost1s[n] = (mean(hitsA) - anti_target).^2
                    cost2s[n] = -cbeta*mean(diffsA)
                end

                totHitsP  += mean(hitsP);  totHitsA  += mean(hitsA); 
                totDiffsP += mean(diffsP); totDiffsA += mean(diffsA);
            end
        end
    end
    
    cost1 = mean(cost1s)
    cost2 = mean(cost2s)

    hitsP = totHitsP/n; hitsA = totHitsA/n; diffsP = totDiffsP/n; diffsA = totDiffsA/n
    
    if verbose
        @printf("%s", pre_string)
        @printf("     -- cost=%g,   cost1=%g, cost2=%g\n", 
            convert(Float64, cost1+cost2), convert(Float64, cost1), convert(Float64, cost2))
        if nPro>0 && nAnti>0
            @printf("     -- mean(hitsP)=%g, mean(diffsP)=%g mean(hitsA)=%g, mean(diffsA)=%g\n", 
                convert(Float64, mean(hitsP)), convert(Float64, mean(diffsP)),
                convert(Float64, mean(hitsA)), convert(Float64, mean(diffsA)))
        elseif nPro>0
            @printf("     -- mean(hitsP)=%g, mean(diffsP)=%g (nAnti=0)\n", 
                convert(Float64, mean(hitsP)), convert(Float64, mean(diffsP)))
        else
            @printf("     -- (nPro=0) mean(hitsA)=%g, mean(diffsA)=%g\n", 
                convert(Float64, mean(hitsA)), convert(Float64, mean(diffsA)))
        end        
    end
    
    return cost1 + cost2
end



JJ(2, 10; plot_list=1:5, verbose=true, model_params...)




     -- cost=0.0145009,   cost1=0.0150714, cost2=-0.000570531
     -- mean(hitsP)=0.649618, mean(diffsP)=0.116621 mean(hitsA)=0.625519, mean(diffsA)=0.204888


0.014500908183137585

In [275]:
func = (;params...) -> JJ(100, 0; rule_and_delay_periods = [0.4, 0.8], seedrand=30, cbeta=0.01, 
plot_list = 1:5, verbose=false, merge(model_params, Dict(params))...)

cost, grad, hess = keyword_vgh(func, args, seed)

# func(;make_dict(args, seed+ [1,0.2,0,0,0,0,0])...) - func(;make_dict(args, seed)...)


(0.10535699311851135,[0.02585,-0.0078272,-0.0213399,0.342989,-0.367236,0.0262403,0.650505],
[-0.14336 0.0472937 … -0.0471095 -0.260981; 0.0472937 -0.0329522 … 0.0179526 0.0201735; … ; -0.0471095 0.0179526 … -0.00211476 -0.0168496; -0.260981 0.0201735 … -0.0168496 -10.5855])

In [282]:
args = ["sW", "vW", "hW", "constant_excitation", "right_light_excitation", "target_period_excitation", "sigma"]
seed = [0.2,   -1.7, -1.7,      0.19,                0.5,                       1,                       0.1]
seed = [0.2,   0.17,  0.17,      0.19,                0.5,                       1,                       0.1]
seed = [0.2,   -1.7, -1.7,      -0.19,                0.5,                       1,                       0.1]
seed = [0.2,   -1.7, -1.7,      0.39,                0.15,                       0.1,                       0.1]
bbox = Dict(:sW=>[0 3], :vW=>[-3 3], :hW=>[-3 3], :constant_excitation=>[-2 2],
:right_light_excitation=>[0.05 4], :target_period_excitation=>[0.05 4], :sigma=>[0.05 1])
model_params = merge(model_params, Dict(:post_target_period=>0.5))


# Now with constant_pro_bias and a fixed sigma=0.1
args = ["sW", "vW", "hW", "constant_excitation", "right_light_excitation", "target_period_excitation", "const_pro_bias"]
seed = [0.2,   -1.7, -1.7,      0.39,                0.15,                       0.1,                       0.1]
model_params = merge(model_params, Dict(:post_target_period=>0.5, :sigma=>0.1))
bbox = Dict(:sW=>[0 3], :vW=>[-3 3], :hW=>[-3 3], :constant_excitation=>[-2 2],
:right_light_excitation=>[0.05 4], :target_period_excitation=>[0.05 4], :const_pro_bias=>[-2 2])

# ==========

nPro=100; nAnti=100

rule_and_delay_periods = [0.4, 0.8]
post_target_periods    = [0.5, 1]

pars, traj, cost, cpm_traj, Dlambda = bbox_Hessian_keyword_minimization(seed, args, bbox, 
(;params...) -> JJ(nPro, nAnti; rule_and_delay_periods=rule_and_delay_periods,
post_target_periods=post_target_periods,
seedrand=31, cbeta=0.01, verbose=true, merge(model_params, Dict(params))...),
start_eta = 0.01, tol=1e-12, verbose=true)[1]

pars'

0: eta=0.01 ps=[0.200, -1.700, -1.700, 0.390, 0.150, 0.100, 0.100]
     -- cost=0.0609354,   cost1=0.0702293, cost2=-0.0092939
     -- mean(hitsP)=0.574693, mean(diffsP)=0.952638 mean(hitsA)=0.522901, mean(diffsA)=0.906143
     -- cost=0.0594435,   cost1=0.0686243, cost2=-0.00918085
     -- mean(hitsP)=0.57821, mean(diffsP)=0.941545 mean(hitsA)=0.525837, mean(diffsA)=0.894625
1: eta=0.011 cost=0.0594435 jtype=constrained costheta=-0.985 ps=[0.199944, -1.70014, -1.69931, 0.382009, 0.150148, 0.0999991, 0.0947458]
     -- cost=0.0586714,   cost1=0.067794, cost2=-0.0091226
     -- mean(hitsP)=0.583192, mean(diffsP)=0.934959 mean(hitsA)=0.522194, mean(diffsA)=0.889561
2: eta=0.0121 cost=0.0586714 jtype=constrained costheta=-0.428 ps=[0.200019, -1.70054, -1.69829, 0.373021, 0.150443, 0.0999975, 0.0994822]
     -- cost=0.05822,   cost1=0.0672907, cost2=-0.00907067
     -- mean(hitsP)=0.589098, mean(diffsP)=0.93218 mean(hitsA)=0.515129, mean(diffsA)=0.881954
3: eta=0.01331 cost=0.05822 jtype=c

0.08344536755256632

In [302]:
func = (;params...) -> JJ(nPro, nAnti; rule_and_delay_periods=rule_and_delay_periods,
post_target_periods=post_target_periods,
seedrand=31, cbeta=0.01, verbose=true, merge(model_params, Dict(params))...)

func(;make_dict(args, pars)...)

     -- cost=-0.00884341,   cost1=0.000469729, cost2=-0.00931314
     -- mean(hitsP)=0.90076, mean(diffsP)=0.953981 mean(hitsA)=0.69218, mean(diffsA)=0.908647


-0.008843411106774281

In [312]:
# --------------------
# NOW EVALUATE RESULTS HERE
# --------------------

my_params = make_dict([args; "plot_list"; "post_target_period"; "rule_and_delay_period"; "dt"], 
[pars; [[1:10;]]; 1.5; 0.5; 0.02], model_params)

run_factor = 10

proVs, antiVs = @time(run_ntrials(nPro*run_factor, nAnti*run_factor; plot_list=[], my_params...))

if nPro>0;  @printf("Pro %% correct = %g%%\n", 100*length(find(proVs[1,:].>proVs[4,:]))/(nPro*run_factor)); end;
if nAnti>0; @printf("Anti %% correct = %g%% \n", 100*length(find(antiVs[1,:].<antiVs[4,:]))/(nAnti*run_factor)); end;


figure(3); clf();
ax1 = subplot(2,1,1)
h = plt[:hist](proVs[1,:]-proVs[4,:],-1:0.02:1)
title("PRO Vr - Vl")
remove_xtick_labels(ax1)
vlines(0, ylim()[1], ylim()[2])

ax2 = subplot(2,1,2)
h = plt[:hist](antiVs[1,:]-antiVs[4,:],-1:0.02:1)
title("ANTI Vr - Vl")
vlines(0, ylim()[1], ylim()[2])

figure(1); clf(); figure(2); clf();

JJ(nPro, nAnti; rule_and_delay_periods=my_params[:rule_and_delay_period], 
post_target_periods=my_params[:post_target_period], my_params...)


  8.740874 seconds (13.19 M allocations: 798.834 MB, 1.98% gc time)
Pro % correct = 90.7%
Anti % correct = 67.1% 


-0.0016377281161004287

In [279]:
model_params

Dict{Symbol,Any} with 29 entries:
  :target_period          => 0.1
  :right_light_pro_extra  => 0
  :const_pro_bias         => 0
  :beta                   => 1
  :sigma                  => 0.08
  :dW                     => 0
  :anti_rule_strength     => 0.1
  :init_add               => 0
  :pro_rule_strength      => 0.1
  :vw                     => -1.7
  :sW                     => 0.2
  :vW                     => -1.7
  :noise                  => Any[]
  :tau                    => 0.1
  :theta                  => 1
  :right_light_excitation => 0.5
  :hW                     => -1.7
  :hw                     => -1.7
  :sw                     => 0.2
  :constant_excitation    => 0.19
  :rule_and_delay_period  => 0.4
  :const_add              => 0
  :nsteps                 => 2
  :post_target_period     => 0.5
  :input                  => 0
  ⋮                       => ⋮

In [247]:
for i in [3]
    print(i)
end

3

In [159]:
good_pro_pars = pars

7-element Array{Float64,1}:
  0.764685
 -1.84917 
 -0.961235
  0.399476
  1.33592 
  0.13114 
  0.232522

In [92]:
######################################################
#                                                    #
#         BBOX_HESSIAN_KEYWORD_MINIMIZATION          #
#                                                    #
######################################################




"""
pdict = wallwrap(bdict, pdict)
Given bdict, a dictionary of symbols to [minval, maxval] vectors, and pdict, a dictionary of symbols
to values (or, alternatively, an Array of (Symbol, value) tuples], goes through each of the symbols in 
bdict and modifies the corresponding value in pdict putting it through a tanh so the final output lies 
within the limits in bdict.  Returns the new pdict.  Makes a copy of pdict so as not to modify the original.
"""
function wallwrap(bdict, epdict)
    local pdict = two_level_copy(epdict)
    if typeof(pdict)<:Array
        pdict = Dict(pdict)
    end

    allkeys = keys(bdict)

    for k in allkeys
        local bbox = bdict[k]
        d = 0.5*(bbox[2] - bbox[1])
        m = 0.5*(bbox[2] + bbox[1])

        pdict[k] = bbox[1] + d*(tanh((pdict[k]-m)/d)+1)
    end
    return pdict
end

    
"""
params = vector_wrap(bbox, args, eparams)
Given bdict, a dictionary of symbols to [minval, maxval] vectors, args, an array of strings representing
symbols, and params, an array of values corresponding to the args list, puts each param that has an entry 
in bdict through the tanh-walling mechanism, and returns the result. Does not modify the contents of the 
original params vector (or bdict or args).
"""
function vector_wrap(bbox, args, eparams)
    local params = two_level_copy(eparams)
    pdict = wallwrap(bbox, make_dict(args, params))
    i=1; j=1
    for i=1:length(args)
        if typeof(args[i])<:Array
            params[j:j+args[i][2]-1] = pdict[Symbol(args[i][1])]
            j += args[i][2]-1
        else
            params[j] = pdict[Symbol(args[i])]
        end
    j = j+1
    end
    return params
end


"""
params = inverse_wall(bdict, args, wparams)
Given bdict, a dictionary of symbols to [minval, maxval] vectors, args, an array of strings representing
symbols, and wparams, an array of values corresponding to the args list where each param that has an entry 
in bdict has alreadt been through the tanh-walling mechanism, UNwalls the ones that have a bdict entry and
returns the result. Does not modify the contents of the original params vector (or bdict or args).
"""
function inverse_wall(bdict, args, wparams)
    local params = two_level_copy(wparams)
    pdict = inverse_wall(bdict, make_dict(args, params))
    i=1; j=1
    for i=1:length(args)
        if typeof(args[i])<:Array
            params[j:j+args[i][2]-1] = pdict[Symbol(args[i][1])]
            j += args[i][2]-1
        else
            params[j] = pdict[Symbol(args[i])]
        end
        j = j+1
    end
    return params    
end

    
"""
pdict = inverse_wall(bdict, wdict)
Given bdict, a dictionary of symbols to [minval, maxval] vectors, and wdict, a dictionary of symbols to values
(or vectors of values)  UNwalls the ones that have a bdict entry and
returns the result. Does not modify the contents of any dictionaries.
"""
function inverse_wall(bdict, wdict)
    local pdict = two_level_copy(wdict)

    allkeys = keys(bdict)
    for k in allkeys
        local bbox = bdict[k]
        d = 0.5*(bbox[2] - bbox[1])
        m = 0.5*(bbox[2] + bbox[1])

        pdict[k] = m + d*0.5*log((pdict[k]-bbox[1])./(2*d - pdict[k] + bbox[1]))
    end
    return(pdict)
end
  


"""
function bbox_Hessian_keyword_minimization(seed, args, bbox, func; wallwidth=NaN, start_eta=10, tol=1e-6, 
    maxiter=400, verbose=false)

Like constrained_Hessian_minimization, but uses keyword_hessian!(). 

PARAMETERS:
===========

seed        column vector, representing the starting value of the parameters.

args        List of strings identifying parameters for differentiation, e.g., ["const_E", "w_self]

bbox        If softbox=true (the default), should then be a Dict of Symbol=>[minval maxval] entries. An entry
            in this Dict indicates that the corresponding parameter is to be bounded, as indicated by the associated 
            [minval maxval] vector. The bbox dictionary can have fewer entries than the number of parameters, and its
            default value is Dict(), indicating an unbounded search.

            If softbox=false, then bbox should be an nargs-by-2 matrix indicating the range for each argument,
            with the minima (first column) and maxima (second column), and entries for ALL parameters.

func        func must take only optional keyword args, and must 
            take nderivs=0, difforder=0  and declare any new matrices using ForwardDiffZeros() instead of zeros()


OPTIONAL PARAMETERS:
====================

start_eta    Starting value of the radius.  It's good to start with somethibg biggish, if it is
             too much, it'll quickly get cut down.

tol=1e-6     Numerical tolerance. If a proposed jump produces a change in func that is less than
             this, the minimization stops.

maxiter=400  Maximum number of iterations to do before stopping

verbose=false   If true, print out a report on each iteration of iteration number, radius size (eta),
                what type jump was proposed ("Newton" means going straight to global min, "constrained" means jump has 
                norm eta, failed means that finding the minimum at a given radius somehow didn't work). Will also
                print out the cosine of the angle between the proposed jump and the gradient.

verbose_level   If less than 2, regular verbose output, if 2 or greater, very verbose, for debugging.

softbox         If true, then bbox must be a Dict() and we use the tanh() mechanism for putting a fixed limit
                on the parameters.

hardbox=false   If true, ignores wallwidth, and just rests parameter values to the bounding box if they go outside it.
                If false, adds cost function "walls" to implement the bounding box.

walldith=NaN     Used for putting up cost function "walls" that implement the bounding box limits. Can be NaN.
                If it is NaN, then the wallwidth is a constant factor of the range width for each argument. If not NaN, must
                be an nargs-long vector that indicates the actual wall widths.

wallwidth_factor=0.18   Only relevant if wallwidth is NaN, otherwise ignored. For each arg, the wall width
                is going to be wall_width_factor*(bbox[i,2] - bbox[i,1])


RETURNS:
========

params       A vector the size of seed that has the last values of the minimizing parameters for func
trajectory   A (2+length(params))-by-nsteps matrix. Each column corresponds to an iteration step, and contains
                 the value of eta used, the cost, and the value of the parameters at that iteration
cost         Final value of objective function
cpm_traj     A 2-by-nsteps matrix, containing reports from the contrained parabolic minimization at each timestep.
             The first row is niters (how many iterations cpm's 1-d minimization ran for) and the second row is
             Dlambda, the last change in the parameter being minimized in cpm's internal search


EXAMPLE:
========

function tester(;x=5, y=10, z=20, nderivs=0, difforder=0)
    return x^2*y + z/tanh(y)
end

params, trajectory = bbox_Hessian_keyword_minimization([0.5, 0.5], ["x", "y"], [1.1 2 ; 1.1 4], tester, 
    verbose=true, tol=1e-12, start_eta=1);



"""
function bbox_Hessian_keyword_minimization(seed, args, bbox, func; start_eta=0.1, tol=1e-6, maxiter=400,
    verbose=false, verbose_level=1, verbose_every=1, 
    softbox=true, hardbox=false, wallwidth=NaN, wallwidth_factor=0.18)

      
    """
    Given args, a list of string representing the arguments of interest, a bounding box for each,
    and a Symbol=>value dictionary with the corresponding parameters, computes and returns a high cost for 
    being outside the bounding box
    """
    function wall_cost(args, bbox; wallwidth=NaN, nderivs=0, difforder=0, pars...) 
        myparams = ForwardDiffZeros(length(pars), 1, nderivs=nderivs, difforder=difforder)
        pars2 = Dict()
        for i in [1:length(pars);]
            pars2[string(pars[i][1])] = pars[i][2]
        end
        for i in [1:length(args);]
            myparams[i] = pars2[args[i]]
        end
        
        if isnan(wallwidth)
            # We know that we're going to be taking hessian for params, so declare zeros accordingly:
            wallwidth = ForwardDiffZeros(length(myparams), 1, nderivs=nderivs, difforder=difforder)

            for i in [1:length(myparams);]
                wallwidth[i] = wallwidth_factor*(bbox[i,2]-bbox[i,1])
            end
        end

        retval = 0
        for i in [1:length(myparams);]
            if myparams[i]<bbox[i,1]
                retval += cosh((bbox[i,1]-myparams[i])/wallwidth[i])-1.0
            elseif bbox[i,2] < myparams[i]
                retval += cosh((myparams[i]-bbox[i,2])/wallwidth[i])-1.0                
            end
        end

        return 2*retval
    end

    traj_increment = 100
    params = 0  # Make sure to have this here so that params stays defined beyond the try/catch
    if ( !(typeof(bbox)<:Dict) ); error("Currently only supporting softbox=true, bbox must be a Dict"); end;
    try
        params = copy(inverse_wall(bbox, args, seed))
    catch
        error("Were all initial param values within the indicated walls?")
    end
    eta = start_eta
    trajectory = zeros(2+length(params), traj_increment); cpm_traj = zeros(2, traj_increment)

    if verbose
        @printf "%d: eta=%g ps=" 0 eta 
        print_vector(vector_wrap(bbox, args, params))
        @printf "\n"
    end
    
    if softbox
        if !(typeof(bbox)<:Dict); error("bhm: If softbox=true, then bbox must eb a Dict"); end
        cost, grad, hess = keyword_vgh((;pars...)->func(;wallwrap(bbox, pars)...), args, params)
    elseif hardbox
        cost, grad, hess = keyword_vgh((;pars...) -> func(;pars...), args, params)
    else
        cost, grad, hess = keyword_vgh((;pars...) -> func(;pars...) + wall_cost(args, bbox; wallwidth=wallwidth, pars...),
            args, params)        
    end
        
    chessdelta = zeros(size(params))

    for i in [1:maxiter;]
        if i > size(trajectory, 2)
            trajectory = [trajectory zeros(2+length(params), traj_increment)]
            cpm_traj   = [cpm_traj   zeros(2, traj_increment)]
        end
        trajectory[1:2, i]   = [eta;cost]
        trajectory[3:end, i] = vector_wrap(bbox, args, params)
        
        hessdelta  = - inv(hess)*grad
        try
            if verbose && verbose_level >= 2
                @printf("bhm: about to try cpm with grad : "); print_vector_g(grad); print("\n")
                @printf("bhm:   hess :"); print_vector_g(hess[:]); print("\n");
            end
            if verbose && verbose_level >= 2
                cpm_out = constrained_parabolic_minimization(hess, grad'', eta, 
                    maxiter=500, tol=1e-20, do_plot=true, verbose=true)                
            else
                cpm_out = constrained_parabolic_minimization(hess, grad'', eta, maxiter=500, tol=1e-20)
            end
            chessdelta = cpm_out[1]; cpm_traj[1,i] = cpm_out[5]; cpm_traj[2,i] = cpm_out[6]
            jumptype = "not failed"
        catch y
            jumptype = "failed"
            if verbose
                @printf "Constrained parabolic minimization failed with error %s\n" y
                @printf "\n"
                @printf "eta was %g\n" eta
                @printf "grad was\n"
                print_vector(grad)
                @printf "\n\nhess was\n"
                for k in [1:length(grad);]
                    print_vector(hess[k,:])
                    @printf "\n"
                end
                @printf "\n"
                matwrite("error_report.mat", Dict("grad"=>grad, "hess"=>hess, "eta"=>eta))
            end
            break
        end

        if norm(hessdelta) <= eta
            new_params = params + hessdelta
            jumptype = "Newton"
        elseif jumptype != "failed" 
            new_params = params + chessdelta
            jumptype  = "constrained"
        end

        if jumptype != "failed"
            if softbox
                new_cost, new_grad, new_hess = 
                    keyword_vgh((;pars...) -> func(;wallwrap(bbox, pars)...), args, new_params)
                if verbose && verbose_level >=2
                    @printf("bhm: had new_params = : "); print_vector_g(vector_wrap(bbox, args, params)); print("\n");
                    @printf("bhm: and my bbox was : "); print(bbox); print("\n")
                    @printf("bhm: and my wallwrap output was : "); print(wallwrap(bbox, make_dict(args, new_params))); print("\n")
                    @printf("bhm: and this produced new_grad : "); print_vector_g(new_grad); print("\n")
                    @printf("bhm:   new_hess :"); print_vector_g(new_hess[:]); print("\n");                                        
                end
            elseif hardbox
                for p in [1:length(new_params);]
                    if new_params[p] < bbox[p,1]; new_params[p] = bbox[p,1]; end
                    if bbox[p,2] < new_params[p]; new_params[p] = bbox[p,2]; end
                 end        
                
                new_cost, new_grad, new_hess = keyword_vgh((;pars...) -> func(;pars...), args, new_params)
            else
                new_cost, new_grad, new_hess = keyword_vgh((;pars...) -> func(;pars...) + 
                        wall_cost(args, bbox; wallwidth=wallwidth, pars...),
                    args, new_params)                
            end
            
            if abs(new_cost - cost) < tol || eta < tol
                trajectory = trajectory[:,1:i]; cpm_traj = cpm_traj[:,1:i]
                if verbose
                    @printf("About to break -- tol=%g, new_cost-cost=%g, eta=%g\n", tol, new_cost-cost, eta)
                end
                break
            end
        end

        if jumptype == "failed" || new_cost >= cost  
            if verbose
                @printf("eta going down: new_cost-cost=%g and jumptype='%s'\n", new_cost-cost, jumptype)
                if verbose_level >= 2
                    nwp = vector_wrap(bbox, args, new_params); wp = vector_wrap(bbox, args, params)
                    @printf("   vvv: proposed new params were : "); print_vector_g(nwp); print("\n")
                    @printf("   vvv: proposed delta params was : "); print_vector_g(nwp-wp); print("\n")
                    @printf("   vvv: grad was : "); print_vector_g(grad); print("\n")
                    costheta = dot(new_params-params, grad)/(norm(new_params-params)*norm(grad))
                    @printf("   vvv: costheta of proposed jump was %g\n", costheta)
                end
            end
            eta = eta/2
            costheta = NaN
            if eta < tol
                trajectory = trajectory[:,1:i]; cpm_traj = cpm_traj[:,1:i]
                if verbose
                    @printf("About to break -- tol=%g, new_cost-cost=%g, eta=%g\n", tol, new_cost-cost, eta)
                end
                break
            end
        else
            eta = eta*1.1
            costheta = dot(new_params-params, grad)/(norm(new_params-params)*norm(grad))

            params = new_params
            cost = new_cost
            grad = new_grad
            hess = new_hess
        end

        if verbose
            if rem(i, verbose_every)==0
                @printf "%d: eta=%g cost=%g jtype=%s costheta=%.3f ps=" i eta cost jumptype costheta
                print_vector_g(vector_wrap(bbox, args, params))
                @printf "\n"
                if verbose_level >= 3
                    @printf "    At this point, grad is ="
                    print_vector_g(grad)
                    @printf "\n"                
                end
            end
        end
    end
    
    return vector_wrap(bbox, args, params), trajectory, cost, cpm_traj
end




bbox_Hessian_keyword_minimization

In [None]:
# WORKS:
# keyword_gradient((;params...) -> JJ(100; merge(model_params, Dict(params))...), ["sw", "vw", "hw"], [0.2, -1.7, -1.7])

args = ["sw", "vw", "hw", "constant_excitation", "right_light_excitation", "target_period_excitation", "sigma"]
seed = [0.2,   -1.7, -1.7,      0.19,                0.5,                       1,                       0.1]
seed = [0.2,   0.17,  0.17,      0.19,                0.5,                       1,                       0.1]
seed = [0.2,   -1.7, -1.7,      -0.19,                0.5,                       1,                       0.1]
seed = [0.2,   -1.7, -1.7,      0.19,                0.15,                       0.1,                       0.1]

bbox = [
    -3    3 ;
    -3    3 ; 
    -3    3 ;
    0.1   0.4 ;
    0.1   2.0 ;
    0.1   2.0 ;
    0.05  2.0 ;
]

pars = bbox_Hessian_keyword_minimization(seed, args, bbox, 
(;params...) -> JJ(100; seedrand=30, cbeta=0.001, verbose=true, merge(model_params, Dict(params))...),
start_eta = 0.01, verbose=true)[1]

# --------------------
# NOW EVALUATE RESULTS
# --------------------

ntrials = 1000
proVs, antiVs = @time(run_ntrials(ntrials; plot_list=[], make_dict(args, pars, model_params)...))

@printf("Pro %% correct = %g%%\n", 100*length(find(proVs[1,:].>proVs[4,:]))/ntrials)
@printf("Anti %% correct = %g%% \n", 100*length(find(antiVs[1,:].<antiVs[4,:]))/ntrials)


figure(3); clf();
ax1 = subplot(2,1,1)
h = plt[:hist](proVs[1,:]-proVs[4,:],-1:0.02:0.1)
title("PRO Vr - Vl")
remove_xtick_labels(ax1)
vlines(0, ylim()[1], ylim()[2])

ax2 = subplot(2,1,2)
h = plt[:hist](antiVs[1,:]-antiVs[4,:],-1:0.02:1)
title("ANTI Vr - Vl")
vlines(0, ylim()[1], ylim()[2])

JJ(100; make_dict([args; "plot_list"], [pars; [[1:5;]]], model_params)...)


In [None]:
[args pars]

In [None]:

ntrials = 1000
proVs, antiVs = @time(run_ntrials(ntrials; plot_list=[], make_dict(args, pars, model_params)...))

@printf("Pro %% correct = %g%%\n", 100*length(find(proVs[1,:].>proVs[4,:]))/ntrials)
@printf("Anti %% correct = %g%% \n", 100*length(find(antiVs[1,:].<antiVs[4,:]))/ntrials)


figure(3); clf();
ax1 = subplot(2,1,1)
h = plt[:hist](proVs[1,:]-proVs[4,:],-1:0.02:0.1)
title("PRO Vr - Vl")
remove_xtick_labels(ax1)
vlines(0, ylim()[1], ylim()[2])

ax2 = subplot(2,1,2)
h = plt[:hist](antiVs[1,:]-antiVs[4,:],-1:0.02:1)
title("ANTI Vr - Vl")
vlines(0, ylim()[1], ylim()[2])

JJ(100; make_dict([args; "plot_list"; "dt"], [pars; [[1:5;]]; 0.01], model_params)...)


In [None]:

JJ(100; make_dict([args; "plot_list"; "dt"], [pars; [[1:5;]]; 0.005], model_params)...)


In [None]:
@time(ForwardDiff.hessian((x)->JJ(100; nderivs=length(x), difforder=2, 
make_dict(["sw", "hw", "vw", "sigma", "gleak", "constant_excitation", "plot_list"], 
[x[1], x[2], x[3], x[4], x[5], x[6], []], model_params)...), 
[0.2, -1.7, -1.7, 0.08, 0.25, 0.19]))

In [None]:
@time(JJ(100; make_dict(["sw", "hw", "vw", "plot_list"], [0.2, -1.7, -1.7, []], model_params)...))

In [None]:
ntrials = 100
proVs, antiVs = @time(run_ntrials(ntrials; plot_list=[], model_params...))

@printf("Pro %% correct = %g%%\n", 100*length(find(proVs[1,:].>proVs[4,:]))/ntrials)
@printf("Anti %% correct = %g%% \n", 100*length(find(antiVs[1,:].<antiVs[4,:]))/ntrials)


### Looking at the distribution of VR - VL

In [None]:
ntrials = 500
proVs, antiVs = @time(run_ntrials(ntrials; plot_list=[], model_params...))

@printf("Pro %% correct = %g%%\n", 100*length(find(proVs[1,:].>proVs[4,:]))/ntrials)
@printf("Anti %% correct = %g%% \n", 100*length(find(antiVs[1,:].<antiVs[4,:]))/ntrials)

figure(1); clf();
ax1 = subplot(2,1,1)
h = plt[:hist](proVs[1,:]-proVs[4,:],-0.1:0.002:0.1)
title("PRO Vr - Vl")
remove_xtick_labels(ax1)
vlines(0, ylim()[1], ylim()[2])

ax2 = subplot(2,1,2)
h = plt[:hist](antiVs[1,:]-antiVs[4,:],-0.1:0.002:0.1)
title("ANTI Vr - Vl")
vlines(0, ylim()[1], ylim()[2])


In [None]:
dt = 0.0002
t = 0:dt:1
tau = 0.1
nsteps = length(t)
t = t[1:nsteps]

W = -4
noise = 0
sigma = 0.2

model_params = Dict(:dt=>dt, :tau=>tau, :W=>[0 W; W 0], :nsteps=>nsteps, 
    :noise=>0, :input=>0, :const_add=>0, :init_add=>0, :sigma=>sigma)

clf()
srand(20)
startUs = randn(20,2)-3
startUs = [-2 -4.1]

for i=1:size(startUs,1)
    forwardModel(startUs[i,:]; do_plot=true, clearfig=false, model_params...)
end

In [None]:
typeof(model_params[:input])

### --- END of ProAnti section

# Playing around with minimizing 2-d model

In [48]:
# The following sequence leads to a situation where having only [-0.8, -0.8] as the single finalFluxPoint 
# leads to the minimization getting stuck.  Adding further finalFluxPoints solves the problem

dt = 0.02
t = 0:dt:1
tau = 0.1
nsteps = length(t)
t = t[1:nsteps]

mW = -4
sW = 0.2
noise = 0
input = 0
sigma = 0.1


model_params = Dict(:dt=>dt, :tau=>tau, :sW=>sW, :mW=>mW, :W=>[sW mW ; mW sW], :nsteps=>nsteps, 
:noise=>noise, :input=>input, :sigma=>sigma, :const_add=>0, :init_add=>0)


# WORKING gradient:
# ForwardDiff.gradient((x)->JJ(startU; do_plot=true, nderivs=length(x), difforder=1, 
#    make_dict([["init_add" 2], "const_add"], x, model_params)...), [2.9, -2.9, 0.1])



# The backward and costfunc functions should turn a single-scalar parameter W into the matrix W
# backward always runs with no within-forward noise, i.e., sigma=0
backward = (endpoint; do_plot=false, pars...) -> begin
    pars = Dict(pars)
    if ~haskey(pars, :sW) || ~haskey(pars, :mW) 
        error("Need both sW and mW to determine weight matrix")
    end
    pars=make_dict(["W"], [[pars[:sW] pars[:mW] ; pars[:mW] pars[:sW]]], pars);
    backwardsModel(endpoint; do_plot=do_plot, make_dict(["sigma"], [0], pars)...)[1]
end

forward = (startpoint; pars...) -> begin
    pars = Dict(pars)
    if ~haskey(pars, :sW) || ~haskey(pars, :mW) 
        error("Need both sW and mW to determine weight matrix")
    end
    pars=make_dict(["W"], [[pars[:sW] pars[:mW] ; pars[:mW] pars[:sW]]], pars);
    forwardsModel(startpoint; pars...)
end



costfunc = (startpoints; do_plot=false, verbose=false, beta=0.003, nderivs=0, difforder=0, sr=26, pars...) -> begin
    pars = Dict(pars)
    if ~haskey(pars, :sW) || ~haskey(pars, :mW) 
        error("Need both sW and mW to determine weight matrix")
    end
    pars=make_dict(["W"], [[pars[:sW] pars[:mW] ; pars[:mW] pars[:sW]]], pars);
    JJ(startpoints; seedrand=sr, beta=beta, 
        do_plot=do_plot, verbose=verbose, nderivs=nderivs, difforder=difforder, pars...)
end

function JJ(initUs; theta1=0.15, theta2=0.2, beta=0.003, verbose=false, nderivs=0, difforder=0, 
    do_plot=false, pre_string="", zero_last_sigmas=0, seedrand=NaN, params...)

    if ~isnan(seedrand); srand(seedrand); end
    
    Vend = ForwardDiffZeros(size(initUs,1), size(initUs,2), nderivs=nderivs, difforder=difforder)

    if do_plot; clf(); end;
    
    for i=1:size(initUs,1)
        if false # i>size(initUs,1) - zero_last_sigmas
            Ue, Ve, U, V = forwardModel(initUs[i,:]; sigma=0, nderivs=nderivs, difforder=difforder, 
                do_plot=do_plot, clearfig=false, params...)            
        else
            Ue, Ve, U, V = forwardModel(initUs[i,:]; nderivs=nderivs, difforder=difforder, 
                do_plot=do_plot, clearfig=false, params...)
        end
        Vend[i,:] = Ve
    end
    
    hits = 0.5*(1 + tanh.((Vend[:,1]-Vend[:,2])/theta1))
    diffs = tanh.((Vend[:,1]-Vend[:,2])/theta2).^2
    
    cost1 = (mean(hits) - 0.75).^2 
    cost2 = -beta*mean(diffs)

    if do_plot
        title(@sprintf("mean(hits)=%g, mean(diffs)=%g", convert(Float64, mean(hits)), convert(Float64, mean(diffs))))
    end
    
    if verbose
        @printf("%s", pre_string)
        @printf("-- cost=%g,   cost1=%g, cost2=%g :  mean(hits)=%g, mean(diffs)=%g\n", 
            convert(Float64, cost1+cost2), convert(Float64, cost1), convert(Float64, cost2),
            convert(Float64, mean(hits)), convert(Float64, mean(diffs)))
    end
    
    return cost1 + cost2, mean(hits), mean(diffs)
end
  


beta = 0.003
args = [["start_add" 2], "const_add", "mW", "sW"] 
seed = [-0.2, 0.2, 0.2, -1.5, 0.1] 
walls = Dict(:start_add=>[-5.1, 5.1], :sW=>[0, 5.1], :mW=>[-5.1, 5.1]) # 
# sr =  gives 


new_random_seed = true; if new_random_seed
    sr = convert(Int64, round(time()))
else
    sr = old_sr
end
# sr = 1504716566
old_sr = sr

srand(sr)

startU=randn(50,2)-3


clf()
print("seed = "); print_vector_g(seed); print("\n")
ocost, omhits, omdiffs = costfunc(startU; do_plot=true, sr=sr, verbose=true, make_dict(args, seed, model_params)...)


params, traj, zz, cpm_traj = bbox_Hessian_keyword_minimization(seed, args, walls,  
(;params...) -> costfunc(startU; beta=beta, sr=sr, do_plot=true, verbose=true, merge(model_params, Dict(params))...)[1], 
verbose=true, start_eta=1, tol=1e-8, softbox=true, maxiter=400 )

# params, cost, ptraj, gtraj = fluxSense(costfunc, backward, model_params, startU, fluxFinalPoint, args, seed; 
#    start_eta=0.01, tol=1e-15, maxiter=400, verbose=true, report_every=1, do_plot=false, cost_limit=cost_limit) # cost_limit=-0.000935) # for beta=0.01

# And show the final position
figure(1); clf()
cost, mhits, mdiffs = 
    costfunc(startU; beta=beta, do_plot=true, sr=sr, verbose=true, make_dict(args, params, model_params)...)

repeat_results_in_fig2 = true; if repeat_results_in_fig2
    figure(2); clf()
    costfunc(startU; beta=beta, do_plot=true, sr=sr, verbose=true, 
        make_dict(args, params, merge(Dict(:fignum=>2), model_params))...)
    figure(1); 
end
params'

# For beta=0, and ntrials=20, we collected a bunch of results and observed that it failes about half the time
# WHEN the initial mean)hits) is below 0.5.  It never fails if the initial mean(hits) is above 0.5. 
# Seems like when it starts below 0.5, the fastest way to increase mean(hits) is to push it to 0.5 and floor it there.
# The results were collected in "Results.mat"
# res = [res ; omhits mhits]

[omhits mhits]

seed = [-0.2, 0.2, 0.2, -1.5, 0.1]




-- cost=0.0686324,   cost1=0.0687511, cost2=-0.00011865 :  mean(hits)=0.487796, mean(diffs)=0.0395499
0: eta=1 ps=[-0.200, 0.200, 0.200, -1.500, 0.100]
-- cost=0.0686324,   cost1=0.0687511, cost2=-0.00011865 :  mean(hits)=0.487796, mean(diffs)=0.0395499
-- cost=0.0662608,   cost1=0.0663306, cost2=-6.98297e-05 :  mean(hits)=0.492453, mean(diffs)=0.0232766
1: eta=1.1 cost=0.0662608 jtype=Newton costheta=-0.343 ps=[-0.112938, 0.349709, 0.101748, -1.26536, 0.103245]
-- cost=0.0656768,   cost1=0.0657292, cost2=-5.24826e-05 :  mean(hits)=0.493623, mean(diffs)=0.0174942
2: eta=1.21 cost=0.0656768 jtype=Newton costheta=-0.188 ps=[0.0772162, 0.616916, 0.0618108, -0.989811, 0.103702]
-- cost=0.0658205,   cost1=0.0658659, cost2=-4.54283e-05 :  mean(hits)=0.493357, mean(diffs)=0.0151428
eta going down: new_cost-cost=0.000143696 and jumptype='Newton'
3: eta=0.605 cost=0.0656768 jtype=Newton costheta=NaN ps=[0.0772162, 0.616916, 0.0618108, -0.989811, 0.103702]
-- cost=0.0658205,   cost1=0.0658659, c

1×2 Array{Float64,2}:
 0.487796  0.749922

# Example of successful minimization including self-connection weights

Amazing. Adding self-connection weights seems to make the minimization process even more robust: we can now start from start_add = [-0.2, 0.2], which usually gives mean(hits) < 0.5, and **still** reach a successful solution.  Without the self-connection weights that rarely happened, mean(hits) < 0.5 was a bad start, and start_add = [-0.2, 0.2] most often did not lead to success.

In [48]:
# The following sequence leads to a situation where having only [-0.8, -0.8] as the single finalFluxPoint 
# leads to the minimization getting stuck.  Adding further finalFluxPoints solves the problem

dt = 0.02
t = 0:dt:1
tau = 0.1
nsteps = length(t)
t = t[1:nsteps]

mW = -4
sW = 0.2
noise = 0
input = 0
sigma = 0.1


model_params = Dict(:dt=>dt, :tau=>tau, :sW=>sW, :mW=>mW, :W=>[sW mW ; mW sW], :nsteps=>nsteps, 
:noise=>noise, :input=>input, :sigma=>sigma, :const_add=>0, :init_add=>0)


# WORKING gradient:
# ForwardDiff.gradient((x)->JJ(startU; do_plot=true, nderivs=length(x), difforder=1, 
#    make_dict([["init_add" 2], "const_add"], x, model_params)...), [2.9, -2.9, 0.1])



# The backward and costfunc functions should turn a single-scalar parameter W into the matrix W
# backward always runs with no within-forward noise, i.e., sigma=0
backward = (endpoint; do_plot=false, pars...) -> begin
    pars = Dict(pars)
    if ~haskey(pars, :sW) || ~haskey(pars, :mW) 
        error("Need both sW and mW to determine weight matrix")
    end
    pars=make_dict(["W"], [[pars[:sW] pars[:mW] ; pars[:mW] pars[:sW]]], pars);
    backwardsModel(endpoint; do_plot=do_plot, make_dict(["sigma"], [0], pars)...)[1]
end

forward = (startpoint; pars...) -> begin
    pars = Dict(pars)
    if ~haskey(pars, :sW) || ~haskey(pars, :mW) 
        error("Need both sW and mW to determine weight matrix")
    end
    pars=make_dict(["W"], [[pars[:sW] pars[:mW] ; pars[:mW] pars[:sW]]], pars);
    forwardsModel(startpoint; pars...)
end



costfunc = (startpoints; do_plot=false, verbose=false, beta=0.003, nderivs=0, difforder=0, sr=26, pars...) -> begin
    pars = Dict(pars)
    if ~haskey(pars, :sW) || ~haskey(pars, :mW) 
        error("Need both sW and mW to determine weight matrix")
    end
    pars=make_dict(["W"], [[pars[:sW] pars[:mW] ; pars[:mW] pars[:sW]]], pars);
    JJ(startpoints; seedrand=sr, beta=beta, 
        do_plot=do_plot, verbose=verbose, nderivs=nderivs, difforder=difforder, pars...)
end

function JJ(initUs; theta1=0.15, theta2=0.2, beta=0.003, verbose=false, nderivs=0, difforder=0, 
    do_plot=false, pre_string="", zero_last_sigmas=0, seedrand=NaN, params...)

    if ~isnan(seedrand); srand(seedrand); end
    
    Vend = ForwardDiffZeros(size(initUs,1), size(initUs,2), nderivs=nderivs, difforder=difforder)

    if do_plot; clf(); end;
    
    for i=1:size(initUs,1)
        if false # i>size(initUs,1) - zero_last_sigmas
            Ue, Ve, U, V = forwardModel(initUs[i,:]; sigma=0, nderivs=nderivs, difforder=difforder, 
                do_plot=do_plot, clearfig=false, params...)            
        else
            Ue, Ve, U, V = forwardModel(initUs[i,:]; nderivs=nderivs, difforder=difforder, 
                do_plot=do_plot, clearfig=false, params...)
        end
        Vend[i,:] = Ve
    end
    
    hits = 0.5*(1 + tanh.((Vend[:,1]-Vend[:,2])/theta1))
    diffs = tanh.((Vend[:,1]-Vend[:,2])/theta2).^2
    
    cost1 = (mean(hits) - 0.75).^2 
    cost2 = -beta*mean(diffs)

    if do_plot
        title(@sprintf("mean(hits)=%g, mean(diffs)=%g", convert(Float64, mean(hits)), convert(Float64, mean(diffs))))
    end
    
    if verbose
        @printf("%s", pre_string)
        @printf("-- cost=%g,   cost1=%g, cost2=%g :  mean(hits)=%g, mean(diffs)=%g\n", 
            convert(Float64, cost1+cost2), convert(Float64, cost1), convert(Float64, cost2),
            convert(Float64, mean(hits)), convert(Float64, mean(diffs)))
    end
    
    return cost1 + cost2, mean(hits), mean(diffs)
end
  


beta = 0.003
args = [["start_add" 2], "const_add", "mW", "sW"] 
seed = [-0.2, 0.2, 0.2, -1.5, 0.1] 
walls = Dict(:start_add=>[-5.1, 5.1], :sW=>[0, 5.1], :mW=>[-5.1, 5.1]) # 
# sr =  gives 


new_random_seed = true; if new_random_seed
    sr = convert(Int64, round(time()))
else
    sr = old_sr
end
# sr = 1504716566
old_sr = sr

srand(sr)

startU=randn(50,2)-3


clf()
print("seed = "); print_vector_g(seed); print("\n")
ocost, omhits, omdiffs = costfunc(startU; do_plot=true, sr=sr, verbose=true, make_dict(args, seed, model_params)...)


params, traj, zz, cpm_traj = bbox_Hessian_keyword_minimization(seed, args, walls,  
(;params...) -> costfunc(startU; beta=beta, sr=sr, do_plot=true, verbose=true, merge(model_params, Dict(params))...)[1], 
verbose=true, start_eta=1, tol=1e-8, softbox=true, maxiter=400 )

# params, cost, ptraj, gtraj = fluxSense(costfunc, backward, model_params, startU, fluxFinalPoint, args, seed; 
#    start_eta=0.01, tol=1e-15, maxiter=400, verbose=true, report_every=1, do_plot=false, cost_limit=cost_limit) # cost_limit=-0.000935) # for beta=0.01

# And show the final position
figure(1); clf()
cost, mhits, mdiffs = 
    costfunc(startU; beta=beta, do_plot=true, sr=sr, verbose=true, make_dict(args, params, model_params)...)

repeat_results_in_fig2 = true; if repeat_results_in_fig2
    figure(2); clf()
    costfunc(startU; beta=beta, do_plot=true, sr=sr, verbose=true, 
        make_dict(args, params, merge(Dict(:fignum=>2), model_params))...)
    figure(1); 
end
params'

# For beta=0, and ntrials=20, we collected a bunch of results and observed that it failes about half the time
# WHEN the initial mean)hits) is below 0.5.  It never fails if the initial mean(hits) is above 0.5. 
# Seems like when it starts below 0.5, the fastest way to increase mean(hits) is to push it to 0.5 and floor it there.
# The results were collected in "Results.mat"
# res = [res ; omhits mhits]

[omhits mhits]

seed = [-0.2, 0.2, 0.2, -1.5, 0.1]




-- cost=0.0686324,   cost1=0.0687511, cost2=-0.00011865 :  mean(hits)=0.487796, mean(diffs)=0.0395499
0: eta=1 ps=[-0.200, 0.200, 0.200, -1.500, 0.100]
-- cost=0.0686324,   cost1=0.0687511, cost2=-0.00011865 :  mean(hits)=0.487796, mean(diffs)=0.0395499
-- cost=0.0662608,   cost1=0.0663306, cost2=-6.98297e-05 :  mean(hits)=0.492453, mean(diffs)=0.0232766
1: eta=1.1 cost=0.0662608 jtype=Newton costheta=-0.343 ps=[-0.112938, 0.349709, 0.101748, -1.26536, 0.103245]
-- cost=0.0656768,   cost1=0.0657292, cost2=-5.24826e-05 :  mean(hits)=0.493623, mean(diffs)=0.0174942
2: eta=1.21 cost=0.0656768 jtype=Newton costheta=-0.188 ps=[0.0772162, 0.616916, 0.0618108, -0.989811, 0.103702]
-- cost=0.0658205,   cost1=0.0658659, cost2=-4.54283e-05 :  mean(hits)=0.493357, mean(diffs)=0.0151428
eta going down: new_cost-cost=0.000143696 and jumptype='Newton'
3: eta=0.605 cost=0.0656768 jtype=Newton costheta=NaN ps=[0.0772162, 0.616916, 0.0618108, -0.989811, 0.103702]
-- cost=0.0658205,   cost1=0.0658659, c

1×2 Array{Float64,2}:
 0.487796  0.749922

In [45]:
old_sr

1504716566

# Example that works about half the time-- seesm to follow the patter where initial mean(hits) < 0.5 means half the time go to mean(hits)=0.5 and get stuck there, whereas initial mean(hits)>=0.5 means success

In [19]:
# The following sequence leads to a situation where having only [-0.8, -0.8] as the single finalFluxPoint 
# leads to the minimization getting stuck.  Adding further finalFluxPoints solves the problem

dt = 0.02
t = 0:dt:1
tau = 0.1
nsteps = length(t)
t = t[1:nsteps]

W = -4
noise = 0
input = 0
sigma = 0.1


model_params = Dict(:dt=>dt, :tau=>tau, :W=>[0 W; W 0], :nsteps=>nsteps, 
:noise=>noise, :input=>input, :sigma=>sigma, :const_add=>0, :init_add=>0)


# WORKING gradient:
# ForwardDiff.gradient((x)->JJ(startU; do_plot=true, nderivs=length(x), difforder=1, 
#    make_dict([["init_add" 2], "const_add"], x, model_params)...), [2.9, -2.9, 0.1])



# The backward and costfunc functions should turn a single-scalar parameter W into the matrix W
# backward always runs with no within-forward noise, i.e., sigma=0
backward = (endpoint; do_plot=false, pars...) -> begin
    pars = Dict(pars)
    if haskey(pars, :W); 
        W=pars[:W];   # mess with it only if it is not already a matrix:
        if length(W)==1; pars=make_dict(["W"], [[0 W;W 0]], pars); end;
    end;     
    backwardsModel(endpoint; do_plot=do_plot, make_dict(["sigma"], [0], pars)...)[1]
end

forward = (startpoint; pars...) -> begin
    pars = Dict(pars)
    if haskey(pars, :W); 
        W=pars[:W];   # mess with it only if it is not already a matrix:
        if length(W)==1; pars=make_dict(["W"], [[0 W;W 0]], pars); end;
    end;     
    forwardsModel(startpoint; pars...)
end



costfunc = (startpoints; do_plot=false, verbose=false, beta=0.003, nderivs=0, difforder=0, sr=26, pars...) -> begin
    pars = Dict(pars)
    if haskey(pars, :W); 
        W=pars[:W];   # mess with it only if it is not already a matrix:
        if length(W)==1; pars=make_dict(["W"], [[0 W;W 0]], pars); end;
    end;         
    JJ(startpoints; seedrand=sr, beta=beta, 
        do_plot=do_plot, verbose=verbose, nderivs=nderivs, difforder=difforder, pars...)
end

function JJ(initUs; theta1=0.15, theta2=0.2, beta=0.003, verbose=false, nderivs=0, difforder=0, 
    do_plot=false, pre_string="", zero_last_sigmas=0, seedrand=NaN, params...)

    if ~isnan(seedrand); srand(seedrand); end
    
    Vend = ForwardDiffZeros(size(initUs,1), size(initUs,2), nderivs=nderivs, difforder=difforder)

    if do_plot; clf(); end;
    
    for i=1:size(initUs,1)
        if false # i>size(initUs,1) - zero_last_sigmas
            Ue, Ve, U, V = forwardModel(initUs[i,:]; sigma=0, nderivs=nderivs, difforder=difforder, 
                do_plot=do_plot, clearfig=false, params...)            
        else
            Ue, Ve, U, V = forwardModel(initUs[i,:]; nderivs=nderivs, difforder=difforder, 
                do_plot=do_plot, clearfig=false, params...)
        end
        Vend[i,:] = Ve
    end
    
    hits = 0.5*(1 + tanh.((Vend[:,1]-Vend[:,2])/theta1))
    diffs = tanh.((Vend[:,1]-Vend[:,2])/theta2).^2
    
    cost1 = (mean(hits) - 0.75).^2 
    cost2 = -beta*mean(diffs)

    if do_plot
        title(@sprintf("mean(hits)=%g, mean(diffs)=%g", convert(Float64, mean(hits)), convert(Float64, mean(diffs))))
    end
    
    if verbose
        @printf("%s", pre_string)
        @printf("-- cost=%g,   cost1=%g, cost2=%g :  mean(hits)=%g, mean(diffs)=%g\n", 
            convert(Float64, cost1+cost2), convert(Float64, cost1), convert(Float64, cost2),
            convert(Float64, mean(hits)), convert(Float64, mean(diffs)))
    end
    
    return cost1 + cost2, mean(hits), mean(diffs)
end
  


fluxFinalPoint = [-0.8 -0.8; -0.6 -0.6 ; -0.4 -0.4; -0.2 -0.2; 0 0; 0.2 0.2]
fluxFinalPoint = zeros(0,2);


beta = 0.05
args = [["start_add" 2], "const_add", "W", "sigma"]
seed = [0.1, 0.1, 2.1, -1, 0.1]
walls = Dict(:start_add=>[-5.1, 5.1], :W=>[-5.1, 5.1], :sigma=>[-0.5, 0.5])
# sr = 1504432803 causes a total mess with everything around the decision boundary; 
# sr = 1504432962 gets stuck at mean(hits)=0.66 but if we reduce the bounds of sigma, reaches 0.74


beta = 0.0000001
args = [["start_add" 2], "const_add", "W"] 
seed = [0.1, 0.1, 2.1, -1] 
# This seed proves deadly, and always starts with mean(hits)<0.5:  seed = [-0.1, 0.1, 2.1, -1] 
walls = Dict(:start_add=>[-5.1, 5.1], :W=>[-5.1, 5.1]) # 
# sr = 1504433892 gives a mean(hits)=0.5 mess
# sr = 1504433983 gives a mean(hits)=0.5 mess
# sr = 1504434067 gives a mean(hits)=0.5 mess
# sr = 1504434114 gives a mean(hits)=0.5 mess


beta = 0.003
args = [["start_add" 2], "const_add", "W"] 
seed = [-0.2, 0.2, 0.2, -1.5] 
walls = Dict(:start_add=>[-5.1, 5.1], :W=>[-5.1, 5.1]) # 
# sr = 1504713552 gives a mean(hits)=0.5 mess
# sr = 1504713626 gives a mean(hits)=0.5 mess
# sr = 1504713708 gives a mean(hits)=0.5 mess
# sr =  gives a mean(hits)=0.5 mess


new_random_seed = true; if new_random_seed
    sr = convert(Int64, round(time()))
else
    sr = old_sr
end
# sr = 1504649431
old_sr = sr

srand(sr)

# THIS IS THE GOOD ONE FOR ALL THE COMMENTS ON sr NUMBERS ABOVE: startU=randn(50,2)-3
startU=randn(50,2)-3


clf()
print("seed = "); print_vector_g(seed); print("\n")
ocost, omhits, omdiffs = costfunc(startU; do_plot=true, sr=sr, verbose=true, make_dict(args, seed, model_params)...)


# :sigma=>[-0.3, 0.3] does fine but :sigma=>[-0.2, 0.2] gets stuck.
# If we fix sigma at 0 it also gets stuck, but dynamics kind of odd, W a bit to big, or decrease dt
params, traj, zz, cpm_traj = bbox_Hessian_keyword_minimization(seed, args, walls, # , :sigma=>[-0.2, 0.2]), 
(;params...) -> costfunc(startU; beta=beta, sr=sr, do_plot=false, verbose=true, merge(model_params, Dict(params))...)[1], 
verbose=true, start_eta=1, tol=1e-8, softbox=true, maxiter=400 )

# params, cost, ptraj, gtraj = fluxSense(costfunc, backward, model_params, startU, fluxFinalPoint, args, seed; 
#    start_eta=0.01, tol=1e-15, maxiter=400, verbose=true, report_every=1, do_plot=false, cost_limit=cost_limit) # cost_limit=-0.000935) # for beta=0.01

# And show the final position
figure(1); clf()
cost, mhits, mdiffs = 
    costfunc(startU; beta=beta, do_plot=true, sr=sr, verbose=true, make_dict(args, params, model_params)...)

repeat_results_in_fig2 = true; if repeat_results_in_fig2
    figure(2); clf()
    costfunc(startU; beta=beta, do_plot=true, sr=sr, verbose=true, 
        make_dict(args, params, merge(Dict(:fignum=>2), model_params))...)
    figure(1); 
end
params'

# For beta=0, and ntrials=20, we collected a bunch of results and observed that it failes about half the time
# WHEN the initial mean)hits) is below 0.5.  It never fails if the initial mean(hits) is above 0.5. 
# Seems like when it starts below 0.5, the fastest way to increase mean(hits) is to push it to 0.5 and floor it there.
# The results were collected in "Results.mat"
# res = [res ; omhits mhits]

[omhits mhits]

seed = [-0.2, 0.2, 0.2, -1.5]




-- cost=0.0618702,   cost1=0.0619307, cost2=-6.04596e-05 :  mean(hits)=0.501141, mean(diffs)=0.0201532
0: eta=1 ps=[-0.200, 0.200, 0.200, -1.500]
-- cost=0.0618702,   cost1=0.0619307, cost2=-6.04596e-05 :  mean(hits)=0.501141, mean(diffs)=0.0201532
-- cost=0.0609813,   cost1=0.0610149, cost2=-3.36142e-05 :  mean(hits)=0.502988, mean(diffs)=0.0112047
1: eta=1.1 cost=0.0609813 jtype=Newton costheta=-0.368 ps=[-0.187942, 0.181484, -0.0195354, -1.39132]
-- cost=0.0613172,   cost1=0.0613271, cost2=-9.90029e-06 :  mean(hits)=0.502357, mean(diffs)=0.0033001
eta going down: new_cost-cost=0.000335981 and jumptype='Newton'
2: eta=0.55 cost=0.0609813 jtype=Newton costheta=NaN ps=[-0.187942, 0.181484, -0.0195354, -1.39132]
-- cost=0.0533101,   cost1=0.0533805, cost2=-7.04243e-05 :  mean(hits)=0.518958, mean(diffs)=0.0234748
3: eta=0.605 cost=0.0533101 jtype=constrained costheta=-0.697 ps=[0.148346, -0.140224, 0.230154, -1.53175]
-- cost=0.00648721,   cost1=0.00724268, cost2=-0.000755464 :  mean(hi

1×2 Array{Float64,2}:
 0.501141  0.75008

In [18]:
old_sr

1504713708

In [13]:
figure(2); clf();
plot(cpm_traj[1,:], ".")

1-element Array{Any,1}:
 PyObject <matplotlib.lines.Line2D object at 0x33088fad0>

In [260]:
function param_path(ppath; k=1, do_plot=true, fignum=1, clearfig=true)
    costhetas = zeros(1, size(ppath,2)-k-1)
    deltas = ppath[:,2:end] - ppath[:,1:end-1]
    for i=1:length(costhetas)
        costhetas[i] = dot(deltas[:,i], deltas[:,i+k])/(norm(deltas[:,i])*norm(deltas[:,i+k]))
    end;
    
    if do_plot
        figure(fignum); if clearfig; clf(); end;
        plot(costhetas', ".")
    end
    return costhetas
end
    



param_path (generic function with 1 method)

In [269]:
param_path(traj[3:end,:]; k=30)


1×69 Array{Float64,2}:
 NaN  0.850328  0.775369  -0.952955  …  NaN  NaN  NaN  NaN  NaN  NaN  NaN

In [273]:
a = matread("error_report.mat")
hess = a["hess"]
eta = a["eta"]
grad = a["grad"]

chessdelta = constrained_parabolic_minimization(hess, grad'', eta, maxiter=500, max_efactor_tries=20, tol=1e-20, do_plot=true, verbose=true)[1]

cpm: g (candidate indices) are : [1, 333, 444, 667]
cpm: and their corresponding costs are : [0.018, 0.018, 0.018, 0.018]
cpm: and their corresponding lambdas are : [-7654187.550, -30616.750, 2518227.704, 7638879.175]
cpm: the minimum cost was : 0.0182817
1 : After searching for lambdas with efactor=3, we found these : []
cpm: g (candidate indices) are : [1, 361, 667]
cpm: and their corresponding costs are : [0.018, 0.018, 0.018]
cpm: and their corresponding lambdas are : [-30616750.199, 2449340.016, 30555516.699]
cpm: the minimum cost was : 0.0182817
2 : After searching for lambdas with efactor=12, we found these : []
cpm: g (candidate indices) are : [1, 340, 667]
cpm: and their corresponding costs are : [0.018, 0.018, 0.018]
cpm: and their corresponding lambdas are : [-122467000.798, 2081939.014, 122222066.796]
cpm: the minimum cost was : 0.0182817
3 : After searching for lambdas with efactor=48, we found these : []
cpm: g (candidate indices) are : [1, 335, 667]
cpm: and their corres

LoadError: ArgumentError: collection must be non-empty

# Example doing a successful minimization of a 2d model

In [None]:
function JJ(initUs; theta1=0.15, theta2=0.2, beta=0.003, verbose=false, nderivs=0, difforder=0, 
    do_plot=false, pre_string="", zero_last_sigmas=0, seedrand=NaN, params...)

    if ~isnan(seedrand); srand(seedrand); end
    
    Vend = ForwardDiffZeros(size(initUs,1), size(initUs,2), nderivs=nderivs, difforder=difforder)

    if do_plot; clf(); end;
    
    for i=1:size(initUs,1)
        if i>size(initUs,1) - zero_last_sigmas
            Ue, Ve, U, V = forwardModel(initUs[i,:]; sigma=0, nderivs=nderivs, difforder=difforder, 
                do_plot=do_plot, clearfig=false, params...)            
        else
            Ue, Ve, U, V = forwardModel(initUs[i,:]; nderivs=nderivs, difforder=difforder, 
                do_plot=do_plot, clearfig=false, params...)
        end
        Vend[i,:] = Ve
    end
    
    hits = 0.5*(1 + tanh.((Vend[:,1]-Vend[:,2])/theta1))
    diffs = tanh.((Vend[:,1]-Vend[:,2])/theta2).^2
    
    cost1 = (mean(hits) - 0.75).^2 
    cost2 = -beta*mean(diffs)
    
    if verbose
        @printf("%s", pre_string)
        @printf("-- cost=%g,   cost1=%g, cost2=%g :  mean(hits)=%g, mean(diffs)=%g\n", 
            convert(Float64, cost1+cost2), convert(Float64, cost1), convert(Float64, cost2),
            convert(Float64, mean(hits)), convert(Float64, mean(diffs)))
    end
    
    return cost1 + cost2
end
  


In [None]:
# The following sequence leads to a situation where having only [-0.8, -0.8] as the single finalFluxPoint 
# leads to the minimization getting stuck.  Adding further finalFluxPoints solves the problem
#
srand(12)   # 12 is perfect success; srand(11) gets stuck at mean(hits)=0.72
startU=randn(100,2)-3
startU=randn(100,2)-3
sigma = 0

# startU=0.1*randn(100,2)-3
# startU=zeros(100,2)-3


dt = 0.02
t = 0:dt:1
tau = 0.1
nsteps = length(t)
t = t[1:nsteps]

W = -4
noise = 0
input = 0
sigma = 0


model_params = Dict(:dt=>dt, :tau=>tau, :W=>[0 W; W 0], :nsteps=>nsteps, 
:noise=>noise, :input=>input, :sigma=>sigma, :const_add=>0, :init_add=>0)


# WORKING gradient:
# ForwardDiff.gradient((x)->JJ(startU; do_plot=true, nderivs=length(x), difforder=1, 
#    make_dict([["init_add" 2], "const_add"], x, model_params)...), [2.9, -2.9, 0.1])



# The backward and costfunc functions should turn a single-scalar parameter W into the matrix W
# backward always runs with no within-forward noise, i.e., sigma=0
backward = (endpoint; do_plot=false, pars...) -> begin
    pars = Dict(pars)
    if haskey(pars, :W); 
        W=pars[:W];   # mess with it only if it is not already a matrix:
        if length(W)==1; pars=make_dict(["W"], [[0 W;W 0]], pars); end;
    end;     
    backwardsModel(endpoint; do_plot=do_plot, make_dict(["sigma"], [0], pars)...)[1]
end


beta = 0.003;

costfunc = (startpoints; do_plot=false, verbose=false, nderivs=0, difforder=0, sr=26, pars...) -> begin
    pars = Dict(pars)
    if haskey(pars, :W); 
        W=pars[:W];   # mess with it only if it is not already a matrix:
        if length(W)==1; pars=make_dict(["W"], [[0 W;W 0]], pars); end;
    end;         
    JJ(startpoints; seedrand=sr, beta=beta, 
        do_plot=do_plot, verbose=verbose, nderivs=nderivs, difforder=difforder, pars...)
end

function JJ(initUs; theta1=0.15, theta2=0.2, beta=0.003, verbose=false, nderivs=0, difforder=0, 
    do_plot=false, pre_string="", zero_last_sigmas=0, seedrand=NaN, params...)

    if ~isnan(seedrand); srand(seedrand); end
    
    Vend = ForwardDiffZeros(size(initUs,1), size(initUs,2), nderivs=nderivs, difforder=difforder)

    if do_plot; clf(); end;
    
    for i=1:size(initUs,1)
        if false # i>size(initUs,1) - zero_last_sigmas
            Ue, Ve, U, V = forwardModel(initUs[i,:]; sigma=0, nderivs=nderivs, difforder=difforder, 
                do_plot=do_plot, clearfig=false, params...)            
        else
            Ue, Ve, U, V = forwardModel(initUs[i,:]; nderivs=nderivs, difforder=difforder, 
                do_plot=do_plot, clearfig=false, params...)
        end
        Vend[i,:] = Ve
    end
    
    hits = 0.5*(1 + tanh.((Vend[:,1]-Vend[:,2])/theta1))
    diffs = tanh.((Vend[:,1]-Vend[:,2])/theta2).^2
    
    cost1 = (mean(hits) - 0.75).^2 
    cost2 = -beta*mean(diffs)
    
    if verbose
        @printf("%s", pre_string)
        @printf("-- cost=%g,   cost1=%g, cost2=%g :  mean(hits)=%g, mean(diffs)=%g\n", 
            convert(Float64, cost1+cost2), convert(Float64, cost1), convert(Float64, cost2),
            convert(Float64, mean(hits)), convert(Float64, mean(diffs)))
    end
    
    return cost1 + cost2
end
  


if beta==0.003;     cost_limit = -0.00288
elseif beta<0.001;  cost_limit = -0.0008
elseif beta==0.001; cost_limit = -0.000935
elseif beta==0.05;  cost_limit = -0.0485
else
    error("Don't know what cost limit goes with beta %g\n", beta)
end

fluxFinalPoint = [-0.8 -0.8; -0.6 -0.6 ; -0.4 -0.4; -0.2 -0.2; 0 0; 0.2 0.2]
fluxFinalPoint = zeros(0,2);



args = [["init_add" 2], "const_add", "W", "sigma"]
seed = [2, 2, 2.1, -1, 0.1]



clf()
print("seed = "); print_vector_g(seed); print("\n")
costfunc(startU; do_plot=true, verbose=true, make_dict(args, seed, model_params)...)

# :sigma=>[-0.3, 0.3] does fine but :sigma=>[-0.2, 0.2] gets stuck.
# If we fix sigma at 0 it also gets stuck, but dynamics kind of odd, W a bit to big, or decrease dt
params, traj = bbox_Hessian_keyword_minimization(seed, args, Dict(:init_add=>[-5.1, 5.1], :sigma=>[-0.3, 0.3]), 
(;params...) -> costfunc(startU; do_plot=false, verbose=true, merge(model_params, Dict(params))...), 
 verbose=true, start_eta=1, tol=1e-6, hardbox=true )

# params, cost, ptraj, gtraj = fluxSense(costfunc, backward, model_params, startU, fluxFinalPoint, args, seed; 
#    start_eta=0.01, tol=1e-15, maxiter=400, verbose=true, report_every=1, do_plot=false, cost_limit=cost_limit) # cost_limit=-0.000935) # for beta=0.01

# And show the final position
clf()
costfunc(startU; do_plot=true, verbose=true, make_dict(args, params, model_params)...)
params'

In [None]:
clf()
startU=randn(4000,2)-3
costfunc(startU; sr=NaN, do_plot=false, verbose=true, make_dict(args, params, model_params)...)
params'

# Example of getting stuck

In [None]:
function JJ(initUs; theta1=0.15, theta2=0.2, beta=0.003, verbose=false, nderivs=0, difforder=0, 
    do_plot=false, pre_string="", zero_last_sigmas=0, seedrand=NaN, params...)

    if ~isnan(seedrand); srand(seedrand); end
    
    Vend = ForwardDiffZeros(size(initUs,1), size(initUs,2), nderivs=nderivs, difforder=difforder)

    if do_plot; clf(); end;
    
    for i=1:size(initUs,1)
        if i>size(initUs,1) - zero_last_sigmas
            Ue, Ve, U, V = forwardModel(initUs[i,:]; sigma=0, nderivs=nderivs, difforder=difforder, 
                do_plot=do_plot, clearfig=false, params...)            
        else
            Ue, Ve, U, V = forwardModel(initUs[i,:]; nderivs=nderivs, difforder=difforder, 
                do_plot=do_plot, clearfig=false, params...)
        end
        Vend[i,:] = Ve
    end
    
    hits = 0.5*(1 + tanh.((Vend[:,1]-Vend[:,2])/theta1))
    diffs = tanh.((Vend[:,1]-Vend[:,2])/theta2).^2
    
    cost1 = (mean(hits) - 0.75).^2 
    cost2 = -beta*mean(diffs)
    
    if verbose
        @printf("%s", pre_string)
        @printf("-- cost=%g,   cost1=%g, cost2=%g :  mean(hits)=%g, mean(diffs)=%g\n", 
            convert(Float64, cost1+cost2), convert(Float64, cost1), convert(Float64, cost2),
            convert(Float64, mean(hits)), convert(Float64, mean(diffs)))
    end
    
    return cost1 + cost2
end
  


In [None]:
function JJ(initUs; theta1=0.15, theta2=0.2, beta=0.003, verbose=false, nderivs=0, difforder=0, 
    do_plot=false, pre_string="", zero_last_sigmas=0, seedrand=NaN, params...)

    if ~isnan(seedrand); srand(seedrand); end
    
    Vend = ForwardDiffZeros(size(initUs,1), size(initUs,2), nderivs=nderivs, difforder=difforder)

    if do_plot; clf(); end;
    
    for i=1:size(initUs,1)
        if false # i>size(initUs,1) - zero_last_sigmas
            Ue, Ve, U, V = forwardModel(initUs[i,:]; sigma=0, nderivs=nderivs, difforder=difforder, 
                do_plot=do_plot, clearfig=false, params...)            
        else
            Ue, Ve, U, V = forwardModel(initUs[i,:]; nderivs=nderivs, difforder=difforder, 
                do_plot=do_plot, clearfig=false, params...)
        end
        Vend[i,:] = Ve
    end
    
    hits = 0.5*(1 + tanh.((Vend[:,1]-Vend[:,2])/theta1))
    diffs = tanh.((Vend[:,1]-Vend[:,2])/theta2).^2
    
    cost1 = (mean(hits) - 0.75).^2 
    cost2 = -beta*mean(diffs)
    
    if verbose
        @printf("%s", pre_string)
        @printf("-- cost=%g,   cost1=%g, cost2=%g :  mean(hits)=%g, mean(diffs)=%g\n", 
            convert(Float64, cost1+cost2), convert(Float64, cost1), convert(Float64, cost2),
            convert(Float64, mean(hits)), convert(Float64, mean(diffs)))
    end
    
    return cost1 + cost2
end


# The following sequence leads to a situation where having only [-0.8, -0.8] as the single finalFluxPoint 
# leads to the minimization getting stuck.  Adding further finalFluxPoints solves the problem
#
srand(11)
startU=randn(100,2)-3
startU=randn(100,2)-3

dt = 0.02
t = 0:dt:1
tau = 0.1
nsteps = length(t)
t = t[1:nsteps]

W = -4
noise = 0
input = 0
sigma = 0


model_params = Dict(:dt=>dt, :tau=>tau, :W=>[0 W; W 0], :nsteps=>nsteps, 
:noise=>noise, :input=>input, :sigma=>sigma, :const_add=>0, :init_add=>0)


# WORKING gradient:
# ForwardDiff.gradient((x)->JJ(startU; do_plot=true, nderivs=length(x), difforder=1, 
#    make_dict([["init_add" 2], "const_add"], x, model_params)...), [2.9, -2.9, 0.1])



# The backward and costfunc functions should turn a single-scalar parameter W into the matrix W
# backward always runs with no within-forward noise, i.e., sigma=0
backward = (endpoint; do_plot=false, pars...) -> begin
    pars = Dict(pars)
    if haskey(pars, :W); 
        W=pars[:W];   # mess with it only if it is not already a matrix:
        if length(W)==1; pars=make_dict(["W"], [[0 W;W 0]], pars); end;
    end;     
    backwardsModel(endpoint; do_plot=do_plot, make_dict(["sigma"], [0], pars)...)[1]
end


beta = 0.0001;
beta = 0.003;
beta = 0.003;
beta=0

costfunc = (startpoints; do_plot=false, verbose=false, nderivs=0, difforder=0, sr=26, pars...) -> begin
    pars = Dict(pars)
    if haskey(pars, :W); 
        W=pars[:W];   # mess with it only if it is not already a matrix:
        if length(W)==1; pars=make_dict(["W"], [[0 W;W 0]], pars); end;
    end;         
    JJ(startpoints; seedrand=sr, beta=beta, 
        do_plot=do_plot, verbose=verbose, nderivs=nderivs, difforder=difforder, pars...)
end

  


if beta==0.003;     cost_limit = -0.00288
elseif beta<0.001;  cost_limit = -0.0008
elseif beta==0.001; cost_limit = -0.000935
elseif beta==0.05;  cost_limit = -0.0485
else
    error("Don't know what cost limit goes with beta %g\n", beta)
end

fluxFinalPoint = [-0.8 -0.8; -0.6 -0.6 ; -0.4 -0.4; -0.2 -0.2; 0 0; 0.2 0.2]
fluxFinalPoint = zeros(0,2);


args = [["init_add" 2], "const_add", "W"] # , "sigma"]
seed = [2, 2, 2.1, -1] # , 0.1]



clf()
print("seed = "); print_vector_g(seed); print("\n")
costfunc(startU; do_plot=true, verbose=true, make_dict(args, seed, model_params)...)

# :sigma=>[-0.3, 0.3] does fine but :sigma=>[-0.2, 0.2] gets stuck.
# If we fix sigma at 0 it also gets stuck, but dynamics kind of odd, W a bit to big, or decrease dt
params, traj = bbox_Hessian_keyword_minimization(seed, args, Dict(:init_add=>[-5.1, 5.1]), # , :sigma=>[-0.2, 0.2]), 
(;params...) -> costfunc(startU; do_plot=false, verbose=true, merge(model_params, Dict(params))...), 
 verbose=true, start_eta=1, tol=1e-16, hardbox=true )

# params, cost, ptraj, gtraj = fluxSense(costfunc, backward, model_params, startU, fluxFinalPoint, args, seed; 
#    start_eta=0.01, tol=1e-15, maxiter=400, verbose=true, report_every=1, do_plot=false, cost_limit=cost_limit) # cost_limit=-0.000935) # for beta=0.01

# And show the final position
clf()
costfunc(startU; do_plot=true, verbose=true, make_dict(args, params, model_params)...)
params'

In [None]:
function JJ(initUs; theta1=0.15, theta2=0.2, beta=0.003, verbose=false, nderivs=0, difforder=0, 
    do_plot=false, pre_string="", zero_last_sigmas=0, seedrand=NaN, params...)

    if ~isnan(seedrand); srand(seedrand); end
    
    Vend = ForwardDiffZeros(size(initUs,1), size(initUs,2), nderivs=nderivs, difforder=difforder)

    if do_plot; clf(); end;
    
    for i=1:size(initUs,1)
        if false # i>size(initUs,1) - zero_last_sigmas
            Ue, Ve, U, V = forwardModel(initUs[i,:]; sigma=0, nderivs=nderivs, difforder=difforder, 
                do_plot=do_plot, clearfig=false, params...)            
        else
            Ue, Ve, U, V = forwardModel(initUs[i,:]; nderivs=nderivs, difforder=difforder, 
                do_plot=do_plot, clearfig=false, params...)
        end
        Vend[i,:] = Ve
    end
    
    hits = 0.5*(1 + tanh.((Vend[:,1]-Vend[:,2])/theta1))
    diffs = tanh.((Vend[:,1]-Vend[:,2])/theta2).^2
    
    cost1 = (mean(hits) - 0.75).^2 
    cost2 = -beta*mean(diffs)
    
    if verbose
        @printf("%s", pre_string)
        @printf("-- cost=%g,   cost1=%g, cost2=%g :  mean(hits)=%g, mean(diffs)=%g\n", 
            convert(Float64, cost1+cost2), convert(Float64, cost1), convert(Float64, cost2),
            convert(Float64, mean(hits)), convert(Float64, mean(diffs)))
    end
    
    return cost1 + cost2
end


# The following sequence leads to a situation where having only [-0.8, -0.8] as the single finalFluxPoint 
# leads to the minimization getting stuck.  Adding further finalFluxPoints solves the problem
#
srand(11)
startU=randn(100,2)-3
startU=randn(100,2)-3

dt = 0.02
t = 0:dt:1
tau = 0.1
nsteps = length(t)
t = t[1:nsteps]

W = -4
noise = 0
input = 0
sigma = 0


model_params = Dict(:dt=>dt, :tau=>tau, :W=>[0 W; W 0], :nsteps=>nsteps, 
:noise=>noise, :input=>input, :sigma=>sigma, :const_add=>0, :init_add=>0)


# WORKING gradient:
# ForwardDiff.gradient((x)->JJ(startU; do_plot=true, nderivs=length(x), difforder=1, 
#    make_dict([["init_add" 2], "const_add"], x, model_params)...), [2.9, -2.9, 0.1])



# The backward and costfunc functions should turn a single-scalar parameter W into the matrix W
# backward always runs with no within-forward noise, i.e., sigma=0
backward = (endpoint; do_plot=false, pars...) -> begin
    pars = Dict(pars)
    if haskey(pars, :W); 
        W=pars[:W];   # mess with it only if it is not already a matrix:
        if length(W)==1; pars=make_dict(["W"], [[0 W;W 0]], pars); end;
    end;     
    backwardsModel(endpoint; do_plot=do_plot, make_dict(["sigma"], [0], pars)...)[1]
end


beta = 0.0001;
beta = 0.003;
beta = 0.003;
beta=0

costfunc = (startpoints; do_plot=false, verbose=false, nderivs=0, difforder=0, sr=26, pars...) -> begin
    pars = Dict(pars)
    if haskey(pars, :W); 
        W=pars[:W];   # mess with it only if it is not already a matrix:
        if length(W)==1; pars=make_dict(["W"], [[0 W;W 0]], pars); end;
    end;         
    JJ(startpoints; seedrand=sr, beta=beta, 
        do_plot=do_plot, verbose=verbose, nderivs=nderivs, difforder=difforder, pars...)
end

  


if beta==0.003;     cost_limit = -0.00288
elseif beta<0.001;  cost_limit = -0.0008
elseif beta==0.001; cost_limit = -0.000935
elseif beta==0.05;  cost_limit = -0.0485
else
    error("Don't know what cost limit goes with beta %g\n", beta)
end

fluxFinalPoint = [-0.8 -0.8; -0.6 -0.6 ; -0.4 -0.4; -0.2 -0.2; 0 0; 0.2 0.2]
fluxFinalPoint = zeros(0,2);


args = [["init_add" 2], "const_add", "W"] # , "sigma"]
seed = [2, 2, 2.1, -1] # , 0.1]



clf()
print("seed = "); print_vector_g(seed); print("\n")
costfunc(startU; do_plot=true, verbose=true, make_dict(args, seed, model_params)...)

# :sigma=>[-0.3, 0.3] does fine but :sigma=>[-0.2, 0.2] gets stuck.
# If we fix sigma at 0 it also gets stuck, but dynamics kind of odd, W a bit to big, or decrease dt
params, traj = bbox_Hessian_keyword_minimization(seed, args, Dict(:init_add=>[-5.1, 5.1], :W=>[-5.1, 5.1]), # , :sigma=>[-0.2, 0.2]), 
(;params...) -> costfunc(startU; do_plot=false, verbose=true, merge(model_params, Dict(params))...), 
 verbose=true, start_eta=1, tol=1e-16, hardbox=true )

# params, cost, ptraj, gtraj = fluxSense(costfunc, backward, model_params, startU, fluxFinalPoint, args, seed; 
#    start_eta=0.01, tol=1e-15, maxiter=400, verbose=true, report_every=1, do_plot=false, cost_limit=cost_limit) # cost_limit=-0.000935) # for beta=0.01

# And show the final position
clf()
costfunc(startU; do_plot=true, verbose=true, make_dict(args, params, model_params)...)
params'

In [None]:
print(args)

params'

In [None]:
clf()
costfunc(startU; do_plot=true, verbose=true, make_dict(args, [2, 1.7, 1.3, -1.4, -0.5], model_params)...)


In [None]:
figure(2)
clf()
plot(traj[1,:], ".")

In [None]:
clf()
cost, grad, hess = keyword_gradient((;params...) -> 
costfunc(startU; do_plot=true, verbose=true, merge(model_params, Dict(params))...), args, [0.6, -0.6, 1.5, -2])


# costfunc(startU; do_plot=true, verbose=true, make_dict(args, [0.6, -0.6, 1.5, -2], model_params)...)




In [None]:
clf()
cost, gradB, hessB = keyword_vgh((;params...) -> 
costfunc(startU; do_plot=true, verbose=true, merge(merge(model_params, Dict(params)), Dict(:dt=>0.005, :nsteps=>201))...), 
args, [0.6, -0.6, 1.5, -2])


In [None]:
[gradA gradB]

In [None]:
clf()
costfunc(startU; do_plot=true, verbose=true, 
merge(make_dict(args, params, model_params), Dict(:dt=>0.005, :nsteps=>201))...)


In [None]:
model_params


In [None]:
figure(2);
clf()
subplot(2,1,1); plot(cost', ".")
subplot(2,1,2); 
guys = 4:5
ng = sqrt(sum(gtraj[guys,:].*gtraj[guys,:],1))
plot(sum(gtraj[guys,1:end-1].*gtraj[guys,2:end],1)'./(ng[1:end-1].*ng[2:end]), ".")
grid(true)


### Beginning of attempt at finding saddle points

In [None]:
dt = 0.02
t = 0:dt:1
tau = 0.1
nsteps = length(t)
t = t[1:nsteps]

W = -4
noise = 0


model_params = Dict(:dt=>dt, :tau=>tau, :W=>[0 W; W 0], :nsteps=>nsteps, 
    :noise=>noise, :noise=>noise, :const_add=>0, :init_add=>0)

@time(trust_region_Hessian_minimization([-2.1, -2.1], 
    (x)->forwardModel(x; do_plot=false, nderivs=2, difforder=1, dUdt_mag_only=true, model_params...), 
verbose=false, start_eta=0.1, tol=1e-6))

In [None]:
backward = (endpoint; do_plot=false, pars...) -> begin
    pars = Dict(pars)
    if haskey(pars, :W); 
        W=pars[:W];   # mess with it only if it is not already a matrix:
        if length(W)==1; pars=make_dict(["W"], [[0 W;W 0]], pars); end;
    end;     
    backwardsModel(endpoint; do_plot=do_plot, make_dict(["sigma"], [0], pars)...)[1]
end


# hessian_fluxSense()

In [None]:

"""
function bbox_Hessian_keyword_minimization(seed, args, bbox, func; wallwidth=NaN, start_eta=10, tol=1e-6, 
    maxiter=400, verbose=false)

Like constrained_Hessian_minimization, but uses keyword_hessian!(). 

PARAMETERS:
===========

seed        column vector, representing the starting value of the parameters.

args        List of strings identifying parameters for differentiation, e.g., ["const_E", "w_self]

bbox        An nargs-by-2 matrix indicating the range for each argument,
            with the minima (first column) and maxima (second column).

func        func must take only optional keyword args, and must 
            take nderivs=0, difforder=0  and declare any new matrices using ForwardDiffZeros() instead of zeros()


OPTIONAL PARAMETERS:
====================

start_eta=10 Starting value of the radius.  It's good to start with somethibg biggish, if it is
             too much, it'll quickly get cut down.

tol=1e-6     Numerical tolerance. If a proposed jump produces a change in func that is less than
             this, the minimization stops.

maxiter=400  Maximum number of iterations to do before stopping

verbose=false   If true, print out a report on each iteration of iteration number, radius size (eta),
                what type jump was proposed ("Newton" means going straight to global min, "constrained" means jump has 
                norm eta, failed means that finding the minimum at a given radius somehow didn't work). Will also
                print out the cosine of the angle between the proposed jump and the gradient.

hardbox=false   If true, ignores wallwidth, and just rests parameter values to the bounding box if they go outside it.
                If false, adds cost function "walls" to implement the bounding box.

walldith=NaN     Used for putting up cost function "walls" that implement the bounding box limits. Can be NaN.
                If it is NaN, then the wallwidth is a constant factor of the range width for each argument. If not NaN, must
                be an nargs-long vector that indicates the actual wall widths.

wallwidth_factor=0.18   Only relevant if wallwidth is NaN, otherwise ignored. For each arg, the wall width
                is going to be wall_width_factor*(bbox[i,2] - bbox[i,1])


RETURNS:
========

params       A vector the size of seed that has the last values of the minimizing parameters for func
trajectory   A (2+length(params))-by-nsteps matrix. Each column corresponds to an iteration step, and contains
                 the value of eta used, the cost, and the value of the parameters at that iteration
cost         Final value of objective function


EXAMPLE:
========

function tester(;x=5, y=10, z=20, nderivs=0, difforder=0)
    return x^2*y + z/tanh(y)
end

params, trajectory = bbox_Hessian_keyword_minimization([0.5, 0.5], ["x", "y"], [1.1 2 ; 1.1 4], tester, 
    verbose=true, tol=1e-12, start_eta=1);



"""
# function bbox_Hessian_keyword_minimization(seed, args, bbox, func; 
    
start_eta=10 
tol=1e-6 
maxiter=400
verbose=false
verbose_every=1 
wallwidth=NaN 
wallwidth_factor=0.18
hardbox=false


    
    traj_increment = 100
    params = seed
    eta = start_eta
    trajectory = zeros(2+length(params), traj_increment)

    if verbose
        @printf "%d: eta=%g ps=" 0 eta 
        print_vector_g(params)
        @printf "\n"
    end
    
    if hardbox
        cost, grad, hess = keyword_vgh((;pars...) -> func(;pars...), args, params)
    else
        cost, grad, hess = keyword_vgh((;pars...) -> func(;pars...) + wall_cost(args, bbox; wallwidth=wallwidth, pars...),
            args, params)        
    end
        
    chessdelta = zeros(size(params))

    for i=1:maxiter
        if i > size(trajectory, 2)
            trajectory = [trajectory zeros(2+length(params), traj_increment)]
        end
        trajectory[1:2, i]   = [eta;cost]
        trajectory[3:end, i] = params
        
        hessdelta  = - inv(hess)*grad
        try
            chessdelta = constrained_parabolic_minimization(hess, grad'', eta)[1]
            jumptype = "not failed"
        catch y
            jumptype = "failed"
            if verbose
                @printf "Constrained parabolic minimization failed with error %s\n" y
                @printf "\n"
                @printf "eta was %g\n" eta
                @printf "grad was\n"
                print_vector(grad)
                @printf "\n\nhess was\n"
                for k in [1:length(grad);]
                    print_vector(hess[k,:])
                    @printf "\n"
                end
                @printf "\n"
            end
        end

        if norm(hessdelta) <= eta
            new_params = params + hessdelta
            jumptype = "Newton"
        elseif jumptype != "failed" 
            new_params = params + chessdelta
            jumptype  = "constrained"
        end

        if jumptype != "failed"
            if hardbox
                for p in [1:length(new_params);]
                    if new_params[p] < bbox[p,1]; new_params[p] = bbox[p,1]; end
                    if bbox[p,2] < new_params[p]; new_params[p] = bbox[p,2]; end
                 end        
                
                new_cost, new_grad, new_hess = keyword_vgh((;pars...) -> func(;pars...), args, new_params)
            else
                new_cost, new_grad, new_hess = keyword_vgh((;pars...) -> func(;pars...) + 
                        wall_cost(args, bbox; wallwidth=wallwidth, pars...),
                    args, new_params)                
            end
            
            if abs(new_cost - cost) < tol || eta < tol
                trajectory = trajectory[:,1:i]
                break
            end
        end

        if jumptype == "failed" || new_cost >= cost  
            eta = eta/2
            costheta = NaN
            if eta < tol
                trajectory = trajectory[:,1:i]
                break
            end
        else
            eta = eta*1.1
            costheta = dot(new_params-params, grad)/(norm(new_params-params)*norm(grad))

            params = new_params
            cost = new_cost
            grad = new_grad
            hess = new_hess
        end

        if verbose
            if rem(i, verbose_every)==0
                @printf "%d: eta=%g cost=%.4f jtype=%s costheta=%.3f ps=" i eta cost jumptype costheta
                print_vector(params)
                @printf "\n"
            end
        end
    end
    
    return params, trajectory, cost
end


In [None]:
    """
    Given args, a list of string representing the arguments of interest, a bounding box for each,
    and a Symbol=>value dictionary with the corresponding parameters, computes and returns a high cost for 
    being outside the bounding box
    """
    function wall_cost(args, bbox; wallwidth=NaN, nderivs=0, difforder=0, pars...) 
        myparams = ForwardDiffZeros(length(pars), 1, nderivs=nderivs, difforder=difforder)
        pars2 = Dict()
        for i in [1:length(pars);]
            pars2[string(pars[i][1])] = pars[i][2]
        end
        for i in [1:length(args);]
            myparams[i] = pars2[args[i]]
        end
        
        if isnan(wallwidth)
            # We know that we're going to be taking hessian for params, so declare zeros accordingly:
            wallwidth = ForwardDiffZeros(length(myparams), 1, nderivs=nderivs, difforder=difforder)

            for i in [1:length(myparams);]
                wallwidth[i] = wallwidth_factor*(bbox[i,2]-bbox[i,1])
            end
        end

        retval = 0
        for i in [1:length(myparams);]
            if myparams[i]<bbox[i,1]
                retval += cosh((bbox[i,1]-myparams[i])/wallwidth[i])-1.0
            elseif bbox[i,2] < myparams[i]
                retval += cosh((myparams[i]-bbox[i,2])/wallwidth[i])-1.0                
            end
        end

        return 2*retval
    end


In [None]:
clf()
costfunc(startU; do_plot=true, verbose=true, make_dict(["sigma"], [0.1], make_dict(args, params, model_params))...)


Working on figuring out the weird trajectories. Probably a dt thing.  The fourth one is the weird one.

In [None]:
forward = (startpoint; do_plot=false, pars...) -> begin
    pars = Dict(pars)
    if haskey(pars, :W); 
        W=pars[:W];   # mess with it only if it is not already a matrix:
        if length(W)==1; pars=make_dict(["W"], [[0 W;W 0]], pars); end;
    end;     
    forwardModel(startpoint; do_plot=do_plot, pars...)
end


# FUNCTION DEFINITION: fluxSense()

### The main fluxSense() function containing the main minimization loop

In [None]:
function fluxSense(costfunc, backward, paramsDict, startUs, ends, args, seed; start_eta=0.01, tol=1e-15, 
    maxiter=400, verbose=true, do_plot=false, cost_limit=[], report_fluxless_grad=false, report_every=1)
   
    if do_plot; clf(); end;

    params = seed
    eta    = start_eta

    if ~(typeof(ends)<:Array); ends = [ends]; end
    U0 = zeros(size(ends))
    for j in 1:size(ends,1)
        # @printf("model params is "); print(model_params); print("\n")
        # @printf("ends[j,:] is "); print(ends[j,:]); print("\n")        
        U0[j,:] = backward(ends[j,:]; tol=1e-25, do_plot=false, make_dict(args, params, model_params)...)'
        # @printf("U0[j,:] is "); print(U0[j,:]); print("\n")        
    end
    
    if length(ends)>0
        @printf("U0[end,:] is "); print_vector_g(U0[end,:]); @printf("\n")
    end
    
    cost, grad, hess = 
        vgh((x)->costfunc([startUs;U0]; do_plot=do_plot, nderivs=length(x), difforder=2, 
            make_dict(args, x, model_params)...), params)
    
    if verbose && report_fluxless_grad
        fcost, fgrad, fhess = 
        vgh((x)->costfunc(startUs; do_plot=false, nderivs=length(x), difforder=2, 
                make_dict(args, x, model_params)...), params)
        @printf("      ### grad without flux track = "); print_vector_g(fgrad); @printf("\n")    
    end

    if verbose
        @printf("Initial cost, grad, hess:\n")
        @printf("   cost = %g\n", cost)
        @printf("   grad = "); print_vector_g(grad); print("\n")
        @printf("   hess = "); print_vector_g(hess); print("\n")
    end

    delta_params=0
    ptrajectory = zeros(length(seed), maxiter); 
    gtrajectory = zeros(length(seed), maxiter); 
    ctrajectory = zeros(1, maxiter);
    
    for i in [1:maxiter;]         
        my_verbose = verbose && rem(i, report_every)==0

        new_params = params - eta*grad/(sqrt(sum(grad.*grad)))
        delta_params = new_params - params

        new_cost, new_grad, new_hess = 
        vgh((x)->costfunc([startUs;U0]; do_plot=false, verbose=my_verbose, pre_string="   newpars>> ",
                zero_last_sigmas=size(U0,1), nderivs=length(x), difforder=2, make_dict(args, x, model_params)...), 
                new_params)

        if my_verbose
            @printf("delta_params="); print_vector_g(delta_params); @printf("\n"); 
            @printf("new_cost=%g  cost=%g   delta_cost=%g\n", new_cost, cost, new_cost-cost)
        end
        
        if abs(new_cost - cost) < tol
            @printf("\n===\nChange in cost was less than the tolerance %g\n===\n", tol)
            ptrajectory=ptrajectory[:,1:i-1]; gtrajectory=gtrajectory[:,1:i-1]; ctrajectory=ctrajectory[1:i-1]
            break
        end
        if (length(cost_limit)>0 && cost < cost_limit)
            @printf("\n===\nCost was less than the cost limit %g\n===\n", cost_limit)
            ptrajectory=ptrajectory[:,1:i-1]; gtrajectory=gtrajectory[:,1:i-1]; ctrajectory=ctrajectory[1:i-1]
            break
        end
        
        if new_cost >= cost
            eta = eta/2
            costheta = NaN
        else
            eta = eta*1.2
            costheta = dot(new_params-params, grad)/(norm(new_params-params)*norm(grad))

            params = new_params
    
            for j in 1:size(ends,1)
                U0[j,:] = backward(ends[j,:]; do_plot=false, make_dict(args, params, model_params)...)'
            end
            if my_verbose && length(ends)>0
                @printf("U0[end,:] is "); print_vector_g(U0[end,:]); @printf("\n")
            end
            cost, grad, hess = 
                vgh((x)->costfunc([startUs;U0]; do_plot=do_plot, verbose=my_verbose, nderivs=length(x), difforder=2, 
                    zero_last_sigmas=size(U0,1), make_dict(args, x, model_params)...), params)

        end
        
        ptrajectory[:,i] = params
        gtrajectory[:,i] = grad
        ctrajectory[i]   = cost

        if my_verbose
            @printf "%d: eta=%g cost=%g costheta=%g ps=" i eta cost  costheta
            print_vector(params)
            @printf "\n"
            @printf("grad="); print_vector_g(grad); @printf("\n")
            if report_fluxless_grad
                fcost, fgrad, fhess = 
                vgh((x)->costfunc(startUs; do_plot=false, verbose=false, nderivs=length(x), difforder=2, 
                        make_dict(args, x, model_params)...), params)
            @printf("      ### grad without flux track = "); print_vector_g(fgrad); @printf("\n")    
            end            
        end
    end    

    return params, ctrajectory, ptrajectory, gtrajectory
end


# WORKING EXAMPLE:   1-D example of using fluxSense()

## after defining fluxSense(), run the next two cells in order

### Define the cost function. It passes most keyword params down to the forward and backward models

In [None]:
function J(initUs; theta1=0.15, theta2=0.2, beta=0.01, verbose=false, nderivs=0, difforder=0, 
    do_plot=false, params...)

    Vend = ForwardDiffZeros(length(initUs), 1, nderivs=nderivs, difforder=difforder)

    if do_plot; clf(); end;
    
    for i=1:length(initUs)
        Ue, Ve, U, V = forwardModel(initUs[i]; nderivs=nderivs, difforder=difforder, 
            do_plot=do_plot, clearfig=false, params...)
        Vend[i] = Ve[1]
    end
    
    hits = 0.5*(1 + tanh.((Vend-0.5)/theta1))
    diffs = tanh((Vend-0.5)/theta2).^2
    
    cost1 = (mean(hits) - 0.75).^2 
    cost2 = -beta*mean(diffs)
    
    if verbose
        @printf("-- cost=%g,   cost1=%g, cost2=%g :  mean(hits)=%g, mean(diffs)=%g\n", 
            convert(Float64, cost1+cost2), convert(Float64, cost1), convert(Float64, cost2),
            convert(Float64, mean(hits)), convert(Float64, mean(diffs)))
    end
    
    return cost1 + cost2
end
    
    

### Now setup, run fluxSense(), and display results. Example is only 1-d so far.

In [None]:

backward = (endpoint; do_plot=false, pars...) -> backwardsModel(endpoint; do_plot=do_plot, pars...)[1]

costfunc = (startpoints; do_plot=false, verbose=false, nderivs=0, difforder=0, pars...) -> 
J(startpoints; do_plot=do_plot, verbose=verbose, beta=0.01, nderivs=nderivs, difforder=difforder, pars...)

dt = 0.01
t = 0:dt:2
tau = 0.1
nsteps = length(t)
t = t[1:nsteps]

noise = 0*randn(1, nsteps)
noise = 0.02*sin(2*pi*3*t); noise=reshape(noise, 1, nsteps)

W = 4.1
const_add = -2
init_add=0

model_params = Dict(:dt=>dt, :tau=>tau, :W=>W, :nsteps=>nsteps, 
    :noise=>noise, :noise=>noise, :const_add=>const_add, :init_add=>init_add)

srand(20)  
startUs = randn(20, 1)       # The starting values
# startUs = [randn(10,1)+2;randn(10,1)-2]


args = ["init_add", "const_add", "W"]
seed = [0, -2, 4.1]

# Do an initial run plotting to show the starting position
clf()
costfunc(startUs; do_plot=true, verbose=true, model_params...)


fluxFinalPoint = convert(Float64, 0)  # The final value of the pinned output
#
# If you remove the fluxFinalPoint, by un-commenting the following line, it gets stuck. But
# it is also true that if you make beta=0 (in the constfunc() definition in line 4 above) then t
# that also solves the sticking problem.  If we had beta=0 until after our hits are what we want, 
# would we ever need fluxPoint?
#
fluxFinalPoint = [];

params, cost = fluxSense(costfunc, backward, model_params, startUs, fluxFinalPoint, args, seed; 
start_eta=0.01, tol=1e-15, maxiter=400, verbose=true, report_fluxless_grad=false, do_plot=true, cost_limit=-0.00959)

# And show the final position
clf()
costfunc(startUs; do_plot=true, verbose=true, make_dict(args, params, model_params)...)


### ---  END --- 1d example of using fluxSense()

# Example of forwards and backwards models

### Inverting time even through a sinusoidal noise, with added noise

In [None]:
dt = 0.01
t = 0:dt:1
tau = 0.1
nsteps = length(t)
t = t[1:nsteps]

noise = 0*randn(1, nsteps)
noise = 0.2*sin(2*pi*3*t); noise=reshape(noise, 1, nsteps)
W = [0.5]


model_params = Dict(:dt=>dt, :tau=>tau, :W=>W, :nsteps=>nsteps, 
    :noise=>noise, :noise=>noise, :const_add=>-0.15, :init_add=>0.3)
clf();

srand(10)

startUs = -0.5
Uend, Vend, U, V =forwardModel(startUs; do_plot=true, clearfig=false, model_params...)
Ustart, Vstart = backwardsModel(Uend;  do_plot=true, clearfig=false, tol=1e-15, model_params...)

[startUs Ustart]




# OLD: scripts on the path to writing fluxSense() as a function 

### Define the cost function. It passes most keyword params down to the forward and backward models

In [None]:
function J(initUs; theta1=0.15, theta2=0.2, beta=0.01, verbose=false, nderivs=0, difforder=0, 
    do_plot=false, params...)

    Vend = ForwardDiffZeros(length(initUs), 1, nderivs=nderivs, difforder=difforder)

    if do_plot; clf(); end;
    
    for i=1:length(initUs)
        Ue, Ve, U, V = forwardModel(initUs[i]; nderivs=nderivs, difforder=difforder, 
            do_plot=do_plot, clearfig=false, params...)
        Vend[i] = Ve[1]
    end
    
    hits = 0.5*(1 + tanh.((Vend-0.5)/theta1))
    diffs = tanh((Vend-0.5)/theta2).^2
    
    cost1 = (mean(hits) - 0.75).^2 
    cost2 = -beta*mean(diffs)
    
    if verbose
        @printf("-- cost=%g,   cost1=%g, cost2=%g :  mean(hits)=%g, mean(diffs)=%g\n", 
            convert(Float64, cost1+cost2), convert(Float64, cost1), convert(Float64, cost2),
            convert(Float64, mean(hits)), convert(Float64, mean(diffs)))
    end
    
    return cost1 + cost2
end
    
    

#### An example of some code that does differentiation. This cell not necessary for running the others.

In [None]:
# An example of a standard setup which we'll try to modify to try to get 75% correct

dt = 0.01
t = 0:dt:1
tau = 0.1
nsteps = length(t)
t = t[1:nsteps]

noise = 0*randn(1, nsteps)
noise = 0.02*sin(2*pi*3*t); noise=reshape(noise, 1, nsteps)

W = 4.1
const_add = -2
init_add=0

model_params = Dict(:dt=>dt, :tau=>tau, :W=>W, :nsteps=>nsteps, 
:noise=>noise, :noise=>noise, :const_add=>const_add, :init_add=>init_add)

srand(10)
startUs = randn(40, 1)
J(startUs; do_plot=true, verbose=true, model_params...)

# --- now while taking the derivative ---
args = ["init_add", "const_add", "W"]
seed = [init_add, const_add, W]

ForwardDiff.gradient((x)->J(startUs; do_plot=true, nderivs=length(x), difforder=1, verbose=true, make_dict(args, x, model_params)...), seed)



### Main adaptive step with gradient and keywords loop

In [None]:
#####################################################
#                                                   #
#                                                   #
#     ADAPTIVE GRADIENT FOR KEYWORD VERSION         #
#                                                   #
#                                                   #
#####################################################


# -----------------  FORWARD MODEL SETUP ---------------
dt = 0.01
t = 0:dt:1
tau = 0.1
nsteps = length(t)
t = t[1:nsteps]

noise = 0*randn(1, nsteps)
noise = 0.02*sin(2*pi*3*t); noise=reshape(noise, 1, nsteps)

W = 4.1
const_add = -2
init_add=0

model_params = Dict(:dt=>dt, :tau=>tau, :W=>W, :nsteps=>nsteps, 
:noise=>noise, :noise=>noise, :const_add=>const_add, :init_add=>init_add)


# ----------------  CRITICAL INDICATION OF PARAMETERS TO OPTIMIZE IS HERE: -----
args = ["init_add", "const_add", "W"]
seed = [0, -2, 4.1]

fluxFinalPoint = convert(Float64, 0)  # The final value of the pinned output

srand(10)  
startUs = randn(200, 1)       # The starting values

# -----------------------------------------------------------------------------

clf()

start_eta = 0.01
tol = 1e-15
maxiter = 400
verbose = true
do_plot=false

# -------

params = seed
eta    = start_eta


U0 = backwardsModel(fluxFinalPoint; do_plot=false, make_dict(args, params, model_params)...)[1]
J([startUs;U0]; verbose=true, do_plot=true, make_dict(args, params, model_params)...)


cost, grad, hess = 
    vgh((x)->J([startUs;U0]; do_plot=false, nderivs=length(x), difforder=2, make_dict(args, x, model_params)...), params)

@printf("Initial cost, grad, hess:\n")
print_vector_g(:cost)
print_vector_g(:grad)
print_vector_g(:hess)
delta_params=0


for i in 1:maxiter         
        new_params = params - eta*grad/(sqrt(sum(grad.*grad)))
        delta_params = new_params - params
        print_vector_g(:delta_params)
        new_cost, new_grad, new_hess = 
            vgh((x)->J([startUs;U0]; do_plot=false, verbose=false,
                nderivs=length(x), difforder=2, make_dict(args, x, model_params)...), new_params)
        @printf("new_cost=%g  cost=%g   delta_cost=%g\n", new_cost, cost, new_cost-cost)
        if abs(new_cost - cost) < tol
            break
        end

        if new_cost >= cost
            eta = eta/2
            costheta = NaN
        else
            eta = eta*1.2
            costheta = dot(new_params-params, grad)/(norm(new_params-params)*norm(grad))

            params = new_params
    
            U0 = backwardsModel(fluxFinalPoint; do_plot=false, make_dict(args, params, model_params)...)[1]
            cost, grad, hess = 
                vgh((x)->J([startUs;U0]; do_plot=do_plot, verbose=true,
                    nderivs=length(x), difforder=2, make_dict(args, x, model_params)...), params)

        end

        if verbose
            @printf "%d: eta=%g cost=%g costheta=%g ps=" i eta cost  costheta
            print_vector(params)
            @printf "\n"
        end
end

### A cell to plot the results

In [None]:
# J(params; initUs=[startUs;U0], verbose=true, do_plot=true)
clf()
Ve = zeros(length(startUs),1)
for i=1:length(startUs)
    Ue, Vee, U, V = forwardModel(startUs[i]; do_plot=true, clearfig=false, make_dict(args, params, model_params)...)
    Ve[i] = Vee[1]
end

@printf("\n\nFinal result produces %d hits out of %d trials for %.1f per cent correct\n\n", length(find(Ve.>0.5)), 
    length(Ve), 100*length(find(Ve.>0.5))/length(Ve))

## --- END Complete keyword-driven adaptive gradient version of FluxSense minimizing

### Complete adaptive gradient version of FluxSense minimizing

In [None]:
#####################################################
#                                                   #
#                                                   #
#     In this cell we define J(x::Vector[3])        #
#     Not full keyword version yet.                 #
#                                                   #
#     Next cell has the adaptive gradient procedure #
#                                                   #
#     Run the third cell to see results             #
#                                                   #
#                                                   #
#####################################################

dt = 0.01
t = 0:dt:1
tau = 0.1
nsteps = length(t)

W = [4]
k = -2
init_k = 0

noise = 0.2*sin(2*pi*3*t); noise = reshape(noise, 1, nsteps)

mypars = Dict(:dt=>dt, :tau=>tau, :nsteps=>nsteps)

srand(10)
startUs = 2*randn(200,1)
# for i=1:length(startUs)
#    Uend, Vend, U, V = forwardModel(startUs[i]; noise=noise+k, W=W, do_plot=true, clearfig=false, params...)
#end

# backwardsModel([1.2*0]; do_plot=true, clearfig=false, params...)

function J(x; initUs=startUs, theta1=0.15, theta2=0.2, beta=0.05, verbose=false,
    nderivs=0, difforder=0, do_plot=true)
    
    Vend = ForwardDiffZeros(length(initUs), 1, nderivs=nderivs, difforder=difforder)

    if do_plot; clf(); end;
    k = x[1]
    W = x[2]
    init_k = x[3]
    
    for i=1:length(initUs)
        Ue, Ve, U, V = forwardModel(initUs[i]+init_k; noise=noise+k, W=[W], 
        nderivs=nderivs, difforder=difforder, do_plot=do_plot, clearfig=false, mypars...)
        Vend[i] = Ve[1]
    end
    
    hits = 0.5*(1 + tanh.((Vend-0.5)/theta1))
    diffs = tanh((Vend-0.5)/theta2).^2
    
    cost1 = (mean(hits) - 0.75).^2 
    cost2 = -beta*mean(diffs)
    
    if verbose
        @printf("-- cost=%g,   cost1=%g, cost2=%g :  mean(hits)=%g, mean(diffs)=%g\n", 
            convert(Float64, cost1+cost2), convert(Float64, cost1), convert(Float64, cost2),
            convert(Float64, mean(hits)), convert(Float64, mean(diffs)))
    end
    
    return cost1 + cost2
end


# WORKS:
# ForwardDiff.gradient((x)->forwardModel(x[1]; noise=noise+k, W=[x[2]], 
#    do_plot=true, clearfig=true, nderivs=2, difforder=1, params...)[1][1], [-2.1, 4])

ForwardDiff.gradient((x)->J(x; nderivs=3, difforder=1), [-2, 4.1, 0])
# J([-2.1, 4])


# ForwardDiff.derivative((x)->forwardModel(startUs[1]; noise=noise+k, W=[x], 
#    do_plot=true, clearfig=true, nderivs=1, difforder=1, params...)[1], 4.5995)


In [None]:
#####################################################
#                                                   #
#                                                   #
#     ADAPTIVE GRADIENT VERSION                     #
#                                                   #
#                                                   #
#####################################################

# This is all BEFORE makign J fully keyword-value driven

seed = [-2, 4.1, 0]   # params are constant add, W, and init_add.
start_eta = 0.01
tol = 1e-15
maxiter = 400
verbose = true

params = seed
eta = start_eta

U0 = backwardsModel([1.2*0]; noise=noise+params[1], W=[params[2]], do_plot=true, mypars...)[1] - params[3]

J(params; initUs=[startUs;U0], verbose=true)

cost, grad, hess = vgh((x)->J(x; initUs=[startUs;U0], nderivs=length(params), difforder=2, verbose=true), params)

@printf("Initial cost, grad, hess:\n")
print_vector_g(:cost)
print_vector_g(:grad)
print_vector_g(:hess)
delta_params=0

for i in 1:maxiter         
        new_params = params - eta*grad/(sqrt(sum(grad.*grad)))
        delta_params = new_params - params
        print_vector_g(:delta_params)
        new_cost, new_grad, new_hess = 
            vgh((x)->J(x; initUs=[startUs;U0], nderivs=length(params), difforder=2, verbose=false, do_plot=false), 
                new_params)        
        @printf("new_cost=%g  cost=%g   delta_cost=%g\n", new_cost, cost, new_cost-cost)
        if abs(new_cost - cost) < tol
            break
        end

        if new_cost >= cost
            eta = eta/2
            costheta = NaN
        else
            eta = eta*1.2
            costheta = dot(new_params-params, grad)/(norm(new_params-params)*norm(grad))

            params = new_params
    
            U0 = backwardsModel([1.2*0]; noise=noise+params[1], W=[params[2]], do_plot=false, mypars...)[1] - params[3]
            cost, grad, hess = 
                vgh((x)->J(x; initUs=[startUs;U0], nderivs=length(params), difforder=2, verbose=true, do_plot=false), params)

        end

        if verbose
            @printf "%d: eta=%g cost=%g costheta=%g ps=" i eta cost  costheta
            print_vector(params)
            @printf "\n"
        end
end

In [None]:
# J(params; initUs=[startUs;U0], verbose=true, do_plot=true)
clf()
Ve = zeros(length(startUs),1)
for i=1:length(startUs)
    Ue, Vee, U, V = forwardModel(startUs[i]+params[3]; noise=params[1], W=[params[2]],     
        do_plot=true, clearfig=false, tau=0.1, nsteps=201, dt=0.01)
    Ve[i] = Vee[1]
end


### -------END OF: complete adaptive gradient version of FluxSense minimizing

In [None]:
clf()
J(params; initUs=[startUs;U0], verbose=true, do_plot=true)

In [None]:
length(find(Ve.>0.5))/200

In [None]:
mypars


In [None]:
#####################################################
#                                                   #
#                                                   #
#     HESSIAN VERSION                               #
#                                                   #
#                                                   #
#####################################################


seed = [-2, 4.1]
start_eta = 0.0000001
tol = 1e-15
maxiter = 400
verbose = true

params = seed
eta = start_eta

U0 = backwardsModel([1.2*0]; noise=noise+params[1], W=[params[2]], do_plot=true, mypars...)[1]

J(params; initUs=[startUs;U0], verbose=true)

cost, grad, hess = vgh((x)->J(x; initUs=[startUs;U0], nderivs=length(params), difforder=2, verbose=true), params)

@printf("Initial cost, grad, hess:\n")
print_vector_g(:cost)
print_vector_g(:grad)
print_vector_g(:hess)

for i in 1:maxiter
        hathess    = hess + eye(length(grad), length(grad))/eta        
        new_params = params - inv(hathess)*grad
        new_cost, new_grad, new_hess = 
            vgh((x)->J(x; initUs=[startUs;U0], nderivs=length(params), difforder=2, verbose=true), new_params)
            
        if abs(new_cost - cost) < tol
            # break
        end

        if new_cost >= cost
            eta = eta/2
            costheta = NaN
        else
            eta = eta*1.1
            costheta = dot(new_params-params, grad)/(norm(new_params-params)*norm(grad))

            params = new_params
    
            U0 = backwardsModel([1.2*0]; noise=noise+params[1], W=[params[2]], do_plot=false, mypars...)[1]
            cost, grad, hess = 
                vgh((x)->J(x; initUs=[startUs;U0], nderivs=length(params), difforder=2, verbose=true), params)

        end

        if verbose
            @printf "%d: eta=%g cost=%g costheta=%g ps=" i eta cost  costheta
            print_vector(params)
            @printf "\n"
        end
end

In [None]:
new_params

In [None]:
seed = [-2, 4.1]
start_eta = 10
tol = 1e-15
maxiter = 400

params = seed
eta = start_eta

# backwardsModel([0.5]; noise=noise+params[1], W=params[2], params...)

cost, grad, hess = vgh(func, params)


    if verbose && verbose_level >= 2
        @printf("Initial cost, grad, hess:\n")
        print_vector_g(:cost)
        print_vector_g(:grad)
        print_vector_g(:hess)
    end
    

    for i in [1:maxiter;]
        hathess    = hess + eye(length(grad), length(grad))/eta        
        new_params = params - inv(hathess)*grad
        new_cost, new_grad, new_hess = vgh(func, new_params)
            
        if abs(new_cost - cost) < tol
            break
        end

        if new_cost >= cost
            eta = eta/2
            costheta = NaN
        else
            eta = eta*1.1
            costheta = dot(new_params-params, grad)/(norm(new_params-params)*norm(grad))

            params = new_params
            cost = new_cost
            grad = new_grad
            hess = new_hess
        end

        if verbose
            @printf "%d: eta=%.3f cost=%.4f costheta=%.3f ps=" i eta cost  costheta
            print_vector(params)
            @printf "\n"
        end
    end
    
    return params, cost
end



In [None]:
trust_region_Hessian_minimization([-2, 4.1], (x)->J(x; nderivs=2, difforder=2, verbose=true), verbose=true)

In [None]:
dt = 0.01
t = 0:dt:1
tau = 0.1
nsteps = length(t)

noise = 3.5*sin(2*pi*3*t); noise=reshape(noise, 1, nsteps)
W = [0.5]

params = Dict(:dt=>dt, :tau=>tau, :nsteps=>nsteps, :noise=>noise)

function J(x; nderivs=0, difforder=0)
    startU = x[1]
    W = x[2]
    
    Uend, Vend, U, V = forwardModel(startU; do_plot=true, W=[W], nderivs=nderivs, difforder=difforder, params...)
    
    return (Vend[1]-0.5)^2
end



In [None]:
trust_region_Hessian_minimization([-0.5, 0.5], (x) -> J(x;nderivs=2, difforder=2), verbose=true)

# OLD STUFF

### --- BEGIN --- Old example that gets stuck: too large beta

In [None]:
# The following sequence leads to a situation where having only [-0.8, -0.8] as the single finalFluxPoint 
# leads to the minimization getting stuck.  Adding further finalFluxPoints solves the problem.
#
# Reducing beta in the cost function J() from 0.01 to 0.003 also eliminated the problem.  
#
srand(10)
startU=randn(100,2)-3
startU=randn(100,2)-3

# startU=randn(100,2)-3

dt = 0.02
t = 0:dt:1
tau = 0.1
nsteps = length(t)
t = t[1:nsteps]

W = -4
noise = 0


model_params = Dict(:dt=>dt, :tau=>tau, :W=>[0 W; W 0], :nsteps=>nsteps, 
    :noise=>noise, :noise=>noise, :const_add=>0, :init_add=>0)

clf(); subplot(2,1,1)
for i in 1:size(startU,1)
    Uend, Vend, U, V = forwardModel(startU[i,:]; do_plot=true, clearfig=false, model_params...)
end

Ustarthat, Vstarthat, Uhatm, Vhat, costs = backwardsModel([-0.8, -0.8]; do_plot=true, clearfig=false, 
tol=1e-50, maxiter=800, model_params...)

subplot(2,1,2)
plot(t, costs, ".-")


function JJ(initUs; theta1=0.15, theta2=0.2, beta=0.01, verbose=false, nderivs=0, difforder=0, 
    do_plot=false, pre_string="", params...)

    Vend = ForwardDiffZeros(size(initUs,1), size(initUs,2), nderivs=nderivs, difforder=difforder)

    if do_plot; clf(); end;
    
    for i=1:size(initUs,1)
        Ue, Ve, U, V = forwardModel(initUs[i,:]; nderivs=nderivs, difforder=difforder, 
            do_plot=do_plot, clearfig=false, params...)
        Vend[i,:] = Ve
    end
    
    hits = 0.5*(1 + tanh.((Vend[:,1]-Vend[:,2])/theta1))
    diffs = tanh.((Vend[:,1]-Vend[:,2])/theta2).^2
    
    cost1 = (mean(hits) - 0.75).^2 
    cost2 = -beta*mean(diffs)
    
    if verbose
        @printf("%s", pre_string)
        @printf("-- cost=%g,   cost1=%g, cost2=%g :  mean(hits)=%g, mean(diffs)=%g\n", 
            convert(Float64, cost1+cost2), convert(Float64, cost1), convert(Float64, cost2),
            convert(Float64, mean(hits)), convert(Float64, mean(diffs)))
    end
    
    return cost1 + cost2
end
  
clf();
JJ(startU; do_plot=true, model_params...)

# WORKING gradient:
# ForwardDiff.gradient((x)->JJ(startU; do_plot=true, nderivs=length(x), difforder=1, 
#    make_dict([["init_add" 2], "const_add"], x, model_params)...), [2.9, -2.9, 0.1])



# The backward and costfunc functions should turn a single-scalar parameter W into the matrix W
backward = (endpoint; do_plot=false, pars...) -> begin
    pars = Dict(pars)
    if haskey(pars, :W); 
        W=pars[:W];   # mess with it only if it is not already a matrix:
        if length(W)==1; pars=make_dict(["W"], [[0 W;W 0]], pars); end;
    end;     
    backwardsModel(endpoint; do_plot=do_plot, pars...)[1]
end

costfunc = (startpoints; do_plot=false, verbose=false, nderivs=0, difforder=0, pars...) -> begin
    pars = Dict(pars)
    if haskey(pars, :W); 
        W=pars[:W];   # mess with it only if it is not already a matrix:
        if length(W)==1; pars=make_dict(["W"], [[0 W;W 0]], pars); end;
    end;         
    JJ(startpoints; do_plot=do_plot, verbose=verbose, nderivs=nderivs, difforder=difforder, pars...)
end


fluxFinalPoint = [-0.8 -0.8]  # ; -0.6 -0.6 ; -0.4 -0.4; -0.2 -0.2; 0 0; 0.2 0.2]

args = [["init_add" 2], "const_add"] # , "W"]
seed = [0.001, 0.001, 0] # , -4]
# seed = [1.190, -1.178, 2.000]

params, cost = fluxSense(costfunc, backward, model_params, startU, fluxFinalPoint, args, seed; 
start_eta=0.01, tol=1e-15, maxiter=400, verbose=true, do_plot=false, cost_limit=-0.00935) # cost_limit=-0.000935) # for beta=0.01

# And show the final position
clf()
costfunc(startU; do_plot=true, verbose=true, make_dict(args, params, model_params)...)


### --- END --- Old example that gets stuck: too large beta

# Example of getting stuck without a flux point even with beta=0 

In [None]:
function JJ(initUs; theta1=0.15, theta2=0.2, beta=0.003, verbose=false, nderivs=0, difforder=0, 
    do_plot=false, pre_string="", zero_last_sigmas=0, seedrand=NaN, params...)

    if ~isnan(seedrand); srand(seedrand); end
    
    Vend = ForwardDiffZeros(size(initUs,1), size(initUs,2), nderivs=nderivs, difforder=difforder)

    if do_plot; clf(); end;
    
    for i=1:size(initUs,1)
        if i>size(initUs,1) - zero_last_sigmas
            Ue, Ve, U, V = forwardModel(initUs[i,:]; sigma=0, nderivs=nderivs, difforder=difforder, 
                do_plot=do_plot, clearfig=false, params...)            
        else
            Ue, Ve, U, V = forwardModel(initUs[i,:]; nderivs=nderivs, difforder=difforder, 
                do_plot=do_plot, clearfig=false, params...)
        end
        Vend[i,:] = Ve
    end
    
    hits = 0.5*(1 + tanh.((Vend[:,1]-Vend[:,2])/theta1))
    diffs = tanh.((Vend[:,1]-Vend[:,2])/theta2).^2
    
    cost1 = (mean(hits) - 0.75).^2 
    cost2 = -beta*mean(diffs)
    
    if verbose
        @printf("%s", pre_string)
        @printf("-- cost=%g,   cost1=%g, cost2=%g :  mean(hits)=%g, mean(diffs)=%g\n", 
            convert(Float64, cost1+cost2), convert(Float64, cost1), convert(Float64, cost2),
            convert(Float64, mean(hits)), convert(Float64, mean(diffs)))
    end
    
    return cost1 + cost2
end
  


In [None]:

srand(11)
startU=randn(100,2)-3
startU=randn(100,2)-3


# startU=0.1*randn(100,2)-3
# startU=zeros(100,2)-3


dt = 0.005  # If we go to dt=0.02, it doesn't get stuck
t = 0:dt:1
tau = 0.1
nsteps = length(t)
t = t[1:nsteps]

W = -4
noise = 0
input = 0
sigma = 0.1


model_params = Dict(:dt=>dt, :tau=>tau, :W=>[0 W; W 0], :nsteps=>nsteps, 
:noise=>noise, :input=>input, :sigma=>sigma, :const_add=>0, :init_add=>0)



# The backward and costfunc functions should turn a single-scalar parameter W into the matrix W
# backward always runs with no within-forward noise, i.e., sigma=0
backward = (endpoint; do_plot=false, pars...) -> begin
    pars = Dict(pars)
    if haskey(pars, :W); 
        W=pars[:W];   # mess with it only if it is not already a matrix:
        if length(W)==1; pars=make_dict(["W"], [[0 W;W 0]], pars); end;
    end;     
    backwardsModel(endpoint; do_plot=do_plot, make_dict(["sigma"], [0], pars)...)[1]
end


beta=0

costfunc = (startpoints; do_plot=false, verbose=false, nderivs=0, difforder=0, sr=26, pars...) -> begin
    pars = Dict(pars)
    if haskey(pars, :W); 
        W=pars[:W];   # mess with it only if it is not already a matrix:
        if length(W)==1; pars=make_dict(["W"], [[0 W;W 0]], pars); end;
    end;         
    JJ(startpoints; seedrand=sr, beta=beta, 
        do_plot=do_plot, verbose=verbose, nderivs=nderivs, difforder=difforder, pars...)
end

if beta==0.003;     cost_limit = -0.00288
elseif beta<0.001;  cost_limit = -0.0008
elseif beta==0.001; cost_limit = -0.000935
elseif beta==0.05;  cost_limit = -0.0485
else
    error("Don't know what cost limit goes with beta %g\n", beta)
end

fluxFinalPoint = zeros(0,2);

args = [["init_add" 2], "const_add", "W"]

seed = [0.001, 0.001, 0, -4]


# Alternatively, start right from the sticking point:
seed = [4.74063,  -4.68228,  2.73165,  -5.6783]

# Walls are big enough that we never hit them, so it is immaterial:
bbox = [
    -15        15  ;
    -15        15  ;
    -15        15  ;
    -20.5  20.5  ; 
]




clf()
costfunc(startU; do_plot=true, verbose=true, make_dict(args, seed, model_params)...)


# YOU CAN DO EITHER THIS:
params, traj = bbox_Hessian_keyword_minimization(seed, args, bbox, 
(;params...) -> costfunc(startU; verbose=true, merge(model_params, Dict(params))...), 
verbose=true, start_eta=0.01, tol=1e-10, hardbox=true )

# OR THIS:  (both get stuck)
# fluxFinalPoint = [-0.1 -0.1]
# params, cost, ptraj, gtraj = fluxSense(costfunc, backward, model_params, startU, fluxFinalPoint, args, seed; 
# start_eta=0.01, tol=1e-15, maxiter=400, verbose=true, report_every=1, do_plot=true, cost_limit=cost_limit) # cost_limit=-0.000935) # for beta=0.01

# And show the final position
clf()
costfunc(startU; do_plot=true, verbose=true, make_dict(args, params, model_params)...)
