In [1]:
using PyCall
using PyPlot
using ForwardDiff
using DiffBase
using MAT

pygui(true)

import Base.convert
convert(::Type{Float64}, x::ForwardDiff.Dual) = Float64(x.value)
function convert(::Array{Float64}, x::Array{ForwardDiff.Dual}) 
    y = zeros(size(x)); 
    for i in 1:prod(size(x)) 
        y[i] = convert(Float64, x[i]) 
    end
    return y
end

include("general_utils.jl")
include("hessian_utils.jl")

"""
We define functions to convert Duals, the variable types used by ForwardDiff, 
to Floats. This is useful if we want to print out the value of a variable 
(since print doesn't know how to Duals). Note that after being converted to a Float, no
differentiation by ForwardDiff can happen!  e.g. after
    x = convert(Float64, y)
ForwardDiff can still differentiate y, but it can't differentiate x
"""




"We define functions to convert Duals, the variable types used by ForwardDiff, \nto Floats. This is useful if we want to print out the value of a variable \n(since print doesn't know how to Duals). Note that after being converted to a Float, no\ndifferentiation by ForwardDiff can happen!  e.g. after\n    x = convert(Float64, y)\nForwardDiff can still differentiate y, but it can't differentiate x\n"

In [60]:
function prob_weight(nflashes; delta=0.5, gamma=3, other_unused_pars...)
    nflashes = nflashes/10
    return delta.*nflashes.^gamma./(delta.*nflashes.^gamma + (1-nflashes).^gamma)
end
# From Christine: delta = .5, gamma = 3, 
# From Christine: lapse = .008, softmax/lambda = 1.8, alpha = .93


function prob_right(clickrateL, clickrateR, nflashesL, nflashesR; 
    lapse=0.008, lambda=1.8, alpha=0.93, bias=0.5, pars...)

    answer = exp(-lambda*(clickrateR.^alpha*prob_weight(nflashesR; pars...) - 
        clickrateL.^alpha*prob_weight(nflashesL; pars...)) + bias)
    
    return lapse + (1 - 2*lapse)./(1 + answer)    
end


function one_trial_cost(rat_went_right, clickrateL, clickrateR, nflashesL, nflashesR; pars...)

    PR = prob_right(clickrateL, clickrateR, nflashesL, nflashesR; pars...)
    
    if abs(1-PR) < 1e-16; PR = 1-1e-16; end
    if PR < 1e-16;        PR =16      ; end
    if rat_went_right==1
        return log(PR)
    else
        return log(1-PR)
    end
end


function J(rat_went_right, clickrateL, clickrateR, nflashesL, nflashesR; 
            nderivs=0, difforder=0, pars...)
     
    answer = ForwardDiffZeros(length(rat_went_right), 1, nderivs=nderivs, difforder=difforder)

    for i=1:length(rat_went_right)
        answer[i] = 
            one_trial_cost(rat_went_right[i], clickrateL[i], clickrateR[i], 
                nflashesL[i], nflashesR[i]; pars...)
    end    
    return sum(answer), answer
end




J (generic function with 1 method)

In [11]:
F = matread("J266.mat")["J266_data"]
nflashesL = F["lflashes"]
nflashesR = F["rflashes"]
clickrateL = F["lclickHz"]
clickrateR = F["rclickHz"]
rat_went_right = F["went_right"];

In [None]:
J(rat_went_right, clickrateL, clickrateR, nflashesL, nflashesR)[1]

In [8]:
ntrials = 1000
rat_went_right[ntrials]

1.0

In [64]:
# From Christine: alpha = .93, delta = .5, gamma = 3, softmax/lambda = 1.8, lapse = .008
args = ["alpha", "gamma", "lambda", "lapse"]
seed = [0.93, 0.5]
bbox = Dict(:alpha=>[0.0001 1], :lambda=>[0.0001 10])

# ntrials = 1000
ntrials = length(rat_went_right)

params, traj = bbox_Hessian_keyword_minimization(seed, args, bbox, (;params...) -> J(rat_went_right[1:ntrials], 
clickrateL[1:ntrials], clickrateR[1:ntrials], nflashesL[1:ntrials], nflashesR[1:ntrials]; params...)[1];
verbose=true, start_eta=0.001, tol=1e-12)


0: eta=0.001 ps=[0.400, 0.500]
1: eta=0.0011 cost=-37388.2 jtype=constrained costheta=-1.000 ps=[0.400951, 0.500027]
2: eta=0.00121 cost=-37397.6 jtype=constrained costheta=-1.000 ps=[0.401998, 0.500056]
3: eta=0.001331 cost=-37408.2 jtype=constrained costheta=-1.000 ps=[0.403151, 0.500088]
4: eta=0.0014641 cost=-37420.1 jtype=constrained costheta=-1.000 ps=[0.40442, 0.500122]
5: eta=0.00161051 cost=-37433.7 jtype=constrained costheta=-1.000 ps=[0.405818, 0.50016]
6: eta=0.00177156 cost=-37449 jtype=constrained costheta=-1.000 ps=[0.407358, 0.500201]
7: eta=0.00194872 cost=-37466.5 jtype=constrained costheta=-1.000 ps=[0.409055, 0.500247]
8: eta=0.00214359 cost=-37486.4 jtype=constrained costheta=-1.000 ps=[0.410923, 0.500296]
9: eta=0.00235795 cost=-37509.2 jtype=constrained costheta=-1.000 ps=[0.412982, 0.500349]
10: eta=0.00259374 cost=-37535.3 jtype=constrained costheta=-1.000 ps=[0.415251, 0.500408]
11: eta=0.00285312 cost=-37565.2 jtype=constrained costheta=-1.000 ps=[0.417752, 0

([1.0,10.0],
[0.001 0.0011 … 24.4132 26.8545; -37379.8 -37388.2 … -77892.5 -77892.5; 0.4 0.400951 … 1.0 1.0; 0.5 0.500027 … 10.0 10.0],

-77892.53968849515,
[148.0 148.0 … 5.0 55.0; 0.0 0.0 … -2.84343e-25 1.13899e-25])

In [66]:
# From Christine: alpha = .93, delta = .5, gamma = 3, softmax/lambda = 1.8, lapse = .008
args = ["alpha", "lambda"]
seed = [0.4, 0.5]

ntrials = 10
# ntrials = length(rat_went_right)

params, cost, traj = trust_region_keyword_Hessian_minimization(seed, args, (;params...) -> J(rat_went_right[1:ntrials], 
clickrateL[1:ntrials], clickrateR[1:ntrials], nflashesL[1:ntrials], nflashesR[1:ntrials]; params...)[1];
verbose=true, start_eta=0.001, tol=1e-12)


1: eta=0.001 cost=-8.0895 costheta=-1.000 ps=[0.410, 0.507]
2: eta=0.001 cost=-8.2630 costheta=-1.000 ps=[0.421, 0.515]
3: eta=0.001 cost=-8.4924 costheta=-1.000 ps=[0.435, 0.525]
4: eta=0.001 cost=-8.8029 costheta=-1.000 ps=[0.453, 0.536]
5: eta=0.002 cost=-9.2312 costheta=-1.000 ps=[0.474, 0.550]
6: eta=0.002 cost=-9.8227 costheta=-1.000 ps=[0.501, 0.567]
7: eta=0.002 cost=-10.5967 costheta=-1.000 ps=[0.534, 0.587]
8: eta=0.002 cost=-11.4519 costheta=-1.000 ps=[0.569, 0.609]
9: eta=0.002 cost=-12.1965 costheta=-1.000 ps=[0.601, 0.631]
10: eta=0.003 cost=-12.8181 costheta=-0.999 ps=[0.631, 0.653]
11: eta=0.003 cost=-13.4380 costheta=-0.999 ps=[0.661, 0.678]
12: eta=0.003 cost=-14.2051 costheta=-1.000 ps=[0.697, 0.707]
13: eta=0.003 cost=-15.3089 costheta=-1.000 ps=[0.745, 0.744]
14: eta=0.004 cost=-16.9658 costheta=-1.000 ps=[0.810, 0.790]
15: eta=0.004 cost=-19.2418 costheta=-1.000 ps=[0.890, 0.843]
16: eta=0.005 cost=-21.4741 costheta=-1.000 ps=[0.973, 0.895]
17: eta=0.005 cost=-22.

([NaN,NaN],NaN,
[0.0011 0.00121 … 3.27855e13 3.6064e13; -8.08949 -8.263 … NaN NaN; 0.409587 0.421136 … NaN NaN; 0.50686 0.514938 … NaN NaN])

In [93]:
ps = traj[3:end,145]; traj[:,143:146]

4×4 Array{Float64,2}:
 830.145    913.16     1004.48     1104.92
 -24.1817   -24.1817    -24.1817    NaN   
   1.63179    1.63389     1.63599   NaN   
   1.23695    1.2379      1.23884   NaN   

In [94]:
func = (x) -> J(rat_went_right[1:ntrials], 
clickrateL[1:ntrials], clickrateR[1:ntrials], nflashesL[1:ntrials], nflashesR[1:ntrials]; 
nderivs=length(x), difforder=2, make_dict(args, x)...)[1]

cost, grad, hess = vgh(func, ps)

(-24.181729489660793,[-2.30333e-6,-1.03767e-6],
[NaN NaN; NaN NaN])

In [84]:
make_dict(args, traj[3:end,145])

Dict{Any,Any} with 2 entries:
  :alpha  => 1.63599
  :lambda => 1.23884

In [52]:
# From Christine: alpha = .93, delta = .5, gamma = 3, softmax/lambda = 1.8, lapse = .008
args = ["alpha", "delta"]
seed = [0.4, 0.5]
bbox = Dict(:alpha=>[0.0001 2], :delta=>[0.0001 10])

ntrials = 10
# ntrials = length(rat_went_right)

params, cost, traj = bbox_Hessian_keyword_minimization(seed, args, bbox, (;params...) -> J(rat_went_right[1:ntrials], 
clickrateL[1:ntrials], clickrateR[1:ntrials], nflashesL[1:ntrials], nflashesR[1:ntrials]; params...)[1];
verbose=true, start_eta=0.001)


LoadError: InterruptException:

In [51]:

#############################################################################
#                                                                           #
#                   TRUST_REGION_KEYWORD_HESSIAN_MINIMIZATION               #
#                                                                           #
#############################################################################



"""
function trust_region_keyword_Hessian_minimization(seed, args, func; start_eta=10, tol=1e-6, maxiter=400,
    verbose=false)

(below, x stands for delta_x, the step from the current x=x0 position at which the cost = const)

cost = 0.5*x'*H*x + grad*x + const

dcost/dx = H*x + grad  ;   dcost/dx = 0  ==> x =  - inv(H)*grad

Trust-region says have a parameter lambda, and replace H with hat{H} = H +  I/eta.  
When eta is very large, this is equivalent to a straight Newton method jump, 
because hat{H} ~= H.  But when eta is small, this is more like a small gradient
descent step, because for small eta inv(hat{H}) ~= eta and therefore the delta x is like 
-eta*grad.  So, if the cost function is going down, make eta larger, and if it is going
up, make eta a lot smaller. Just like we do in other adaptive methods

PARAMETERS:
===========

seed        column vector, representing the starting value of the parameters.

args        List of strings, representing args to be minimized.

func        Function that takes a vector and returns a scalar.  If you want to
            work with a function that tales more parameterrs and returns more than one 
            output, you can use something like

                    x -> orig_func(x, other_params)[1]

            You only need the "[1]" part if the orig_func returns more outputs than a scalar. 

OPTIONAL PARAMETERS:
====================

start_eta=10    Starting value of eta.  It's good to start with somethibg biggish, if it is
                too much, it'll quickly get cut down.

tol=1e-15       Numerical tolerance. If a proposed jump produces a change in func that is less than
               this, the minimization stops.

maxiter=400    Maximum number of iterations to do before stopping

verbose=false   If true, print out a report on each iteration of iteration number, radius size (eta),
                what type jump was proposed ("Newton" means going straight to global min, "constrained" means jump has 
                norm eta, failed means that finding the minimum at a given radius somehow didn't work). Will also
                print out the cosine of the angle between the proposed jump and the gradient.

RETURNS:
========

params       A vector the size of seed that has the last values of the minimizing parameters for func

"""
function trust_region_keyword_Hessian_minimization(seed, args, func; start_eta=10, tol=1e-15, maxiter=400,
    verbose=false, verbose_level=1)

    params = seed
    eta = start_eta

    cost, grad, hess = keyword_vgh(func, args, params)
    if verbose && verbose_level >= 2
        @printf("Initial cost=%g, grad=", cost); print_vector_g(grad); print("\n")
        @printf("hess = "); print_vector_g(hess[:]); print("\n")
    end
    
    traj = zeros(2+length(params),maxiter)

    for i in [1:maxiter;]
        hathess    = hess + eye(length(grad), length(grad))/eta        
        new_params = params - inv(hathess)*grad
        new_cost, new_grad, new_hess = keyword_vgh(func, args, new_params)
            
        if abs(new_cost - cost) < tol
            break
        end

        if new_cost >= cost
            eta = eta/2
            costheta = NaN
        else
            eta = eta*1.1
            costheta = dot(new_params-params, grad)/(norm(new_params-params)*norm(grad))

            params = new_params
            cost = new_cost
            grad = new_grad
            hess = new_hess
        end
            
        traj[1,i] = eta
        traj[2,i] = cost
        traj[3:end,i] = params

        if verbose
            @printf "%d: eta=%.3f cost=%.4f costheta=%.3f ps=" i eta cost  costheta
            print_vector(params)
            @printf "\n"
        end
    end
    
    return params, cost, traj
end





trust_region_keyword_Hessian_minimization

In [39]:
func = (x;params...) -> J(rat_went_right[1:ntrials], 
clickrateL[1:ntrials], clickrateR[1:ntrials], nflashesL[1:ntrials], nflashesR[1:ntrials]; make_dict(args, x)...)[1]

e = 0.000001; d=[0,e]; (func(seed+d) - func(seed))/e

0.0

In [219]:
args = ["alpha"]
seed = [0.9]
func = (x;pars...) -> J(rat_went_right, clickrateL, 
    clickrateR, nflashesL, nflashesR; make_dict(args, x)...)[1]


args = ["delta"]
seed = [0.9]
ntrials = 2783 # 2783
func = (x;pars...) -> J(rat_went_right[ntrials], clickrateL[ntrials], clickrateR[ntrials], nflashesL[ntrials], nflashesR[ntrials]; nderivs=length(args), difforder=2, make_dict(args, x)...)[1]
func = (x;pars...) -> one_trial_cost(rat_went_right[ntrials], clickrateL[ntrials], clickrateR[ntrials], nflashesL[ntrials], nflashesR[ntrials]; make_dict(args, x)...)
func = (x;pars...) -> prob_right(clickrateL[ntrials], clickrateR[ntrials], nflashesL[ntrials], nflashesR[ntrials]; make_dict(args, x)...)
func = (x;pars...) -> prob_weight(nflashesR[ntrials]; make_dict(args, x)...)


cost, grad, hess = vgh(func, seed)

(0.4736842105263158,[0.277008],
[-0.291588])

In [194]:
strial = 27
gu = J(rat_went_right[strial], clickrateL[strial], clickrateR[strial], nflashesL[strial], nflashesR[strial])

(-4.8283137373023015,
[-4.82831])

In [196]:
nflashesR


56265×1 Array{Float64,2}:
 10.0
 10.0
 10.0
 10.0
  0.0
 10.0
  0.0
 10.0
 10.0
  0.0
 10.0
  0.0
  0.0
  ⋮  
  3.0
 10.0
 10.0
  1.0
  1.0
  2.0
 10.0
 10.0
  2.0
  0.0
  0.0
  0.0

In [67]:
seed = [0.93]
args = ["alpha"]
bbox = [0 3]
func = (;pars...) -> J(rat_went_right, clickrateL, clickrateR, nflashesL, nflashesR; pars...)

params, traj = bbox_Hessian_keyword_minimization(seed, args, bbox, func, wallwidth_factor=0.001, verbose=true, start_eta=0.001)
params

0: eta=0.001 ps=[0.930]
Constrained parabolic minimization failed with error ArgumentError("collection must be non-empty")

eta was 0.001
grad was
[NaN]

hess was
[NaN]

1: eta=0.0005 cost=-176622.7834 jtype=failed costheta=NaN ps=[0.930]
Constrained parabolic minimization failed with error ArgumentError("collection must be non-empty")

eta was 0.0005
grad was
[NaN]

hess was
[NaN]

2: eta=0.00025 cost=-176622.7834 jtype=failed costheta=NaN ps=[0.930]
Constrained parabolic minimization failed with error ArgumentError("collection must be non-empty")

eta was 0.00025
grad was
[NaN]

hess was
[NaN]

3: eta=0.000125 cost=-176622.7834 jtype=failed costheta=NaN ps=[0.930]
Constrained parabolic minimization failed with error ArgumentError("collection must be non-empty")

eta was 0.000125
grad was
[NaN]

hess was
[NaN]

4: eta=6.25e-05 cost=-176622.7834 jtype=failed costheta=NaN ps=[0.930]
Constrained parabolic minimization failed with error ArgumentError("collection must be non-empty")

eta w

1-element Array{Float64,1}:
 0.93

In [24]:
keys(matread("J266.mat")["J266_data"])

Base.KeyIterator for a Dict{String,Any} with 12 entries. Keys:
  "rightvolume"
  "went_right"
  "rclickHz"
  "lclickHz"
  "lefthit"
  "rflashes"
  "lflashes"
  "leftvolume"
  "righthit"
  "leftprob"
  "rightprob"
  "cues_lowtohigh"

In [228]:
figure(1); clf()
h = plt[:hist](nflashesL)

([4018.0,2215.0,2045.0,1973.0,1892.0,2749.0,1980.0,1997.0,1951.0,35445.0],[0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0],Any[PyObject <matplotlib.patches.Rectangle object at 0x32690b510>,PyObject <matplotlib.patches.Rectangle object at 0x327b005d0>,PyObject <matplotlib.patches.Rectangle object at 0x327b1ead0>,PyObject <matplotlib.patches.Rectangle object at 0x33943a510>,PyObject <matplotlib.patches.Rectangle object at 0x33943ab90>,PyObject <matplotlib.patches.Rectangle object at 0x33944d250>,PyObject <matplotlib.patches.Rectangle object at 0x33944d8d0>,PyObject <matplotlib.patches.Rectangle object at 0x33944df50>,PyObject <matplotlib.patches.Rectangle object at 0x339459610>,PyObject <matplotlib.patches.Rectangle object at 0x339459c90>])

In [50]:

#############################################################################
#                                                                           #
#                   TRUST_REGION_HESSIAN_MINIMIZATION                       #
#                                                                           #
#############################################################################



"""
function trust_region_Hessian_minimization(seed, func; start_eta=10, tol=1e-6, maxiter=400,
    verbose=false)

(below, x stands for delta_x, the step from the current x=x0 position at which the cost = const)

cost = 0.5*x'*H*x + grad*x + const

dcost/dx = H*x + grad  ;   dcost/dx = 0  ==> x =  - inv(H)*grad

Trust-region says have a parameter lambda, and replace H with hat{H} = H +  I/eta.  
When eta is very large, this is equivalent to a straight Newton method jump, 
because hat{H} ~= H.  But when eta is small, this is more like a small gradient
descent step, because for small eta inv(hat{H}) ~= eta and therefore the delta x is like 
-eta*grad.  So, if the cost function is going down, make eta larger, and if it is going
up, make eta a lot smaller. Just like we do in other adaptive methods

PARAMETERS:
===========

seed        column vector, representing the starting value of the parameters.

func        Function that takes a vector and returns a scalar.  If you want to
            work with a function that tales more parameterrs and returns more than one 
            output, you can use something like

                    x -> orig_func(x, other_params)[1]

            You only need the "[1]" part if the orig_func returns more outputs than a scalar. 

OPTIONAL PARAMETERS:
====================

start_eta=10    Starting value of eta.  It's good to start with somethibg biggish, if it is
                too much, it'll quickly get cut down.

tol=1e-15       Numerical tolerance. If a proposed jump produces a change in func that is less than
               this, the minimization stops.

maxiter=400    Maximum number of iterations to do before stopping

verbose=false   If true, print out a report on each iteration of iteration number, radius size (eta),
                what type jump was proposed ("Newton" means going straight to global min, "constrained" means jump has 
                norm eta, failed means that finding the minimum at a given radius somehow didn't work). Will also
                print out the cosine of the angle between the proposed jump and the gradient.

RETURNS:
========

params       A vector the size of seed that has the last values of the minimizing parameters for func

"""
function trust_region_Hessian_minimization(seed, func; start_eta=10, tol=1e-15, maxiter=400,
    verbose=false, verbose_level=1)

    params = seed
    eta = start_eta

    cost, grad, hess = vgh(func, params)
    if verbose && verbose_level >= 2
        @printf("Initial cost, grad, hess:\n")
        print_vector_g(:cost)
        print_vector_g(:grad)
        print_vector_g(:hess)
    end
    
    traj = zeros(2+length(params),maxiter)

    for i in [1:maxiter;]
        hathess    = hess + eye(length(grad), length(grad))/eta        
        new_params = params - inv(hathess)*grad
        new_cost, new_grad, new_hess = vgh(func, new_params)
            
        if abs(new_cost - cost) < tol
            break
        end

        if new_cost >= cost
            eta = eta/2
            costheta = NaN
        else
            eta = eta*1.1
            costheta = dot(new_params-params, grad)/(norm(new_params-params)*norm(grad))

            params = new_params
            cost = new_cost
            grad = new_grad
            hess = new_hess
        end

        traj[1,i] = eta
        traj[2,i] = cost
        traj[3:end,i] = params
        
        if verbose
            @printf "%d: eta=%.3f cost=%.4f costheta=%.3f ps=" i eta cost  costheta
            print_vector(params)
            @printf "\n"
        end
    end
    
    return params, cost, traj
end





trust_region_Hessian_minimization