From 1456118064683d1d233db11d8b05ac54733dd942 Mon Sep 17 00:00:00 2001 From: alejandro-carderera Date: Thu, 4 Mar 2021 16:14:24 -0500 Subject: [PATCH 01/18] WIP: Starting reorganization of BCG and addition of enhancements. --- examples/bcgTests.jl | 72 ++++--- src/blended_cg.jl | 462 +++++++++++++++++++++++++++++++++++++++++++ src/utils.jl | 8 +- 3 files changed, 513 insertions(+), 29 deletions(-) diff --git a/examples/bcgTests.jl b/examples/bcgTests.jl index 8674e6146..fcdf29fce 100644 --- a/examples/bcgTests.jl +++ b/examples/bcgTests.jl @@ -4,7 +4,7 @@ using Random using DoubleFloats n = Int(1e4) -k = 3000 +k = 1000 s = rand(1:100) @info "Seed $s" @@ -52,37 +52,24 @@ const x00 = FrankWolfe.compute_extreme_point(lmo, zeros(n)) FrankWolfe.benchmark_oracles(x -> cf(x, xp), (str, x) -> cgrad!(str, x, xp), ()->randn(n), lmo; k=100) x0 = deepcopy(x00) -@time x, v, primal, dual_gap, trajectorySs = FrankWolfe.fw( - f, - grad!, - lmo, - x0, - max_iteration=k, - line_search=FrankWolfe.shortstep, - L=2, - print_iter=k / 10, - emphasis=FrankWolfe.memory, - verbose=true, - trajectory=true, -); - -x0 = deepcopy(x00) -@time x, v, primal, dual_gap, trajectoryAda = FrankWolfe.afw( +x, v, primal, dual_gap, trajectoryBCG = FrankWolfe.bcg( f, grad!, lmo, x0, max_iteration=k, line_search=FrankWolfe.adaptive, - L=100, print_iter=k / 10, emphasis=FrankWolfe.memory, + L=2, verbose=true, trajectory=true, -); + Ktolerance=1.00, + goodstep_tolerance=0.95, + weight_purge_threshold=1e-10, +) -x0 = deepcopy(x00) -x, v, primal, dual_gap, trajectoryBCG = FrankWolfe.bcg( +x, v, primal, dual_gap, trajectoryBCG_backup = FrankWolfe.bcg_backup( f, grad!, lmo, @@ -99,7 +86,42 @@ x, v, primal, dual_gap, trajectoryBCG = FrankWolfe.bcg( weight_purge_threshold=1e-10, ) -data = [trajectorySs, trajectoryAda, trajectoryBCG] -label = ["short step", "AFW", "BCG"] - -FrankWolfe.plot_trajectories(data, label) +data = [trajectoryBCG, trajectoryBCG_backup] +label = ["BCG", "BCG backup"] + +FrankWolfe.plot_trajectories(data, label, filename="output_results.png") + +#x0 = deepcopy(x00) +#@time x, v, primal, dual_gap, trajectoryAda = FrankWolfe.afw( +# f, +# grad!, +# lmo, +# x0, +# max_iteration=k, +# line_search=FrankWolfe.adaptive, +# L=100, +# print_iter=k / 10, +# emphasis=FrankWolfe.memory, +# verbose=true, +# trajectory=true, +#); + +#x0 = deepcopy(x00) +#@time x, v, primal, dual_gap, trajectorySs = FrankWolfe.fw( +# f, +# grad!, +# lmo, +# x0, +# max_iteration=k, +# line_search=FrankWolfe.shortstep, +# L=2, +# print_iter=k / 10, +# emphasis=FrankWolfe.memory, +# verbose=true, +# trajectory=true, +#); + +#data = [trajectorySs, trajectoryAda, trajectoryBCG] +#label = ["short step", "AFW", "BCG"] +# +#FrankWolfe.plot_trajectories(data, label) diff --git a/src/blended_cg.jl b/src/blended_cg.jl index 1925001ac..f6eb0cc1d 100644 --- a/src/blended_cg.jl +++ b/src/blended_cg.jl @@ -1,3 +1,42 @@ +function print_header(data) + @printf( + "\n────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n" + ) + @printf( + "%6s %13s %14s %14s %14s %14s %14s %14s\n", + data[1], + data[2], + data[3], + data[4], + data[5], + data[6], + data[7], + data[8], + ) + @printf( + "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n" + ) +end + +function print_footer() + @printf( + 
"────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n\n" + ) +end + +function print_iter_func(data) + @printf( + "%6s %13s %14e %14e %14e %14e %14i %14i\n", + st[Symbol(data[1])], + data[2], + Float64(data[3]), + Float64(data[4]), + Float64(data[5]), + data[6], + data[7], + data[8], + ) +end function bcg( f, @@ -21,6 +60,429 @@ function bcg( gradient=nothing, direction_storage=nothing, lmo_kwargs..., +) + t = 0 + primal = Inf + dual_gap = Inf + active_set = ActiveSet([(1.0, x0)]) + x = x0 + if gradient === nothing + gradient = similar(x0, float(eltype(x0))) + end + primal = f(x) + grad!(gradient, x) + # initial gap estimate computation + vmax = compute_extreme_point(lmo, gradient) + phi = fast_dot(gradient, x0 - vmax) / 2 + dual_gap = phi + traj_data = [] + tt = regular + time_start = time_ns() + v = x0 + if direction_storage === nothing + direction_storage = Vector{float(eltype(x))}() + Base.sizehint!(direction_storage, 100) + end + + if line_search == shortstep && !isfinite(L) + @error("Lipschitz constant not set to a finite value. Prepare to blow up spectacularly.") + end + + if line_search == agnostic || line_search == nonconvex + @error("Lazification is not known to converge with open-loop step size strategies.") + end + + if line_search == fixed && gamma0 == 0 + println("WARNING: gamma0 not set. We are not going to move a single bit.") + end + + if verbose + println("\nBlended Conditional Gradients Algorithm.") + numType = eltype(x0) + println( + "EMPHASIS: $memory STEPSIZE: $line_search EPSILON: $epsilon MAXITERATION: $max_iteration TYPE: $numType", + ) + println("K: $Ktolerance") + println("WARNING: In memory emphasis mode iterates are written back into x0!") + headers = ( + "Type", + "Iteration", + "Primal", + "Dual", + "Dual Gap", + "Time", + "#ActiveSet", + "#non-simplex", + "#forced FW", + ) + print_header(headers) + end + if !isa(x, Union{Array, SparseVector}) + x = convert(Array{float(eltype(x))}, x) + end + non_simplex_iter = 0 + nforced_fw = 0 + force_fw_step = false + if verbose && mod(t, print_iter) == 0 + if t == 0 + tt = initial + end + rep = ( + tt, + string(t), + primal, + primal - dual_gap, + dual_gap, + (time_ns() - time_start) / 1.0e9, + length(active_set), + non_simplex_iter, + nforced_fw, + ) + print_iter_func(rep) + flush(stdout) + end + + while t <= max_iteration && phi ≥ epsilon + # TODO replace with single call interface from function_gradient.jl + #Mininize over the convex hull until strong Wolfe gap is below a given tolerance. + num_simplex_descent_steps = minimize_over_convex_hull( + f, + grad!, + gradient, + active_set::ActiveSet, + phi, + t, + trajectory, + traj_data, + time_start, + non_simplex_iter, + verbose = verbose, + print_iter=print_iter, + hessian = nothing, + L=L, + accelerated = false, + ) + + t = t + num_simplex_descent_steps + #Take a FW step. 
+ x = compute_active_set_iterate(active_set) + primal = f(x) + grad!(gradient, x) + non_simplex_iter += 1 + # compute new atom + (v, value) = lp_separation_oracle( + lmo, + active_set, + gradient, + phi, + Ktolerance; + inplace_loop=(emphasis == memory), + force_fw_step=force_fw_step, + lmo_kwargs..., + ) + force_fw_step = false + xval = fast_dot(x, gradient) + if value > xval - phi/Ktolerance + tt = dualstep + # setting gap estimate as ∇f(x) (x - v_FW) / 2 + phi = (xval - value) / 2 + else + tt = regular + L, gamma = line_search_wrapper(line_search, t, f, grad!, x, x - v, gradient, dual_gap, L, gamma0, linesearch_tol, step_lim, 1.0) + + if gamma == 1.0 + active_set_initialize!(active_set, v) + else + active_set_update!(active_set, gamma, v) + end + end + t = t + 1 + non_simplex_iter += 1 + x = compute_active_set_iterate(active_set) + dual_gap = phi + if trajectory + push!( + traj_data, + ( + t, + primal, + primal - dual_gap, + dual_gap, + (time_ns() - time_start) / 1.0e9, + length(active_set), + ), + ) + end + + if verbose && mod(t, print_iter) == 0 + if t == 0 + tt = initial + end + rep = ( + tt, + string(t), + primal, + primal - dual_gap, + dual_gap, + (time_ns() - time_start) / 1.0e9, + length(active_set), + non_simplex_iter, + ) + print_iter_func(rep) + flush(stdout) + end + + end + if verbose + x = compute_active_set_iterate(active_set) + grad!(gradient, x) + v = compute_extreme_point(lmo, gradient) + primal = f(x) + dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) + rep = ( + last, + string(t - 1), + primal, + primal - dual_gap, + dual_gap, + (time_ns() - time_start) / 1.0e9, + length(active_set), + non_simplex_iter, + ) + print_iter_func(rep) + flush(stdout) + end + active_set_cleanup!(active_set, weight_purge_threshold=weight_purge_threshold) + active_set_renormalize!(active_set) + x = compute_active_set_iterate(active_set) + grad!(gradient, x) + v = compute_extreme_point(lmo, gradient) + primal = f(x) + #dual_gap = 2phi + dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) + if verbose + rep = ( + pp, + string(t - 1), + primal, + primal - dual_gap, + dual_gap, + (time_ns() - time_start) / 1.0e9, + length(active_set), + non_simplex_iter, + ) + print_iter_func(rep) + print_footer() + flush(stdout) + end + return x, v, primal, dual_gap, traj_data +end + + +function minimize_over_convex_hull( + f, + grad!, + gradient, + active_set::ActiveSet, + tolerance, + t, + trajectory, + traj_data, + time_start, + non_simplex_iter; + verbose = true, + print_iter=1000, + hessian = nothing, + L=nothing, + linesearch_tol=10e-10, + step_lim=100, + weight_purge_threshold=1e-12, + storage=nothing, + accelerated = false, +) + #No hessian is known, use simplex gradient descent. + if isnothing(hessian) + number_of_steps = simplex_gradient_descent_over_convex_hull( + f, + grad!, + gradient, + active_set::ActiveSet, + tolerance, + t, + trajectory, + traj_data, + time_start, + non_simplex_iter, + verbose = verbose, + print_iter=print_iter, + L=L, + linesearch_tol=linesearch_tol, + step_lim=step_lim, + weight_purge_threshold=weight_purge_threshold, + ) + else + #Rewrite problem as problem over the simplex and solve using Nesterov's AGD + if accelerated + return + #Rewrite problem as problem over the simplex and solve using gradient descent. 
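        #A sketch of the intended reduction: collect the active atoms as columns of
        #a matrix A; for a quadratic objective with Hessian H, minimizing over the
        #convex hull in barycentric coordinates λ amounts to
        #  min_λ  b^T λ + 0.5 * λ^T (A^T H A) λ   over the probability simplex,
        #with b built from the inner products <∇f(x), a_i>.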
+ else + return + end + end + return number_of_steps +end + + +function simplex_gradient_descent_over_convex_hull( + f, + grad!, + gradient, + active_set::ActiveSet, + tolerance, + t, + trajectory, + traj_data, + time_start, + non_simplex_iter; + verbose = true, + print_iter=1000, + hessian = nothing, + L=nothing, + linesearch_tol=10e-10, + step_lim=100, + weight_purge_threshold=1e-12, +) + number_of_steps = 0 + x = compute_active_set_iterate(active_set) + while true + grad!(gradient, x) + #Check if strong Wolfe gap over the convex hull is small enough. + c = [fast_dot(gradient, a) for a in active_set.atoms] + if maximum(c) - minimum(c) <= tolerance + return number_of_steps + end + #Otherwise perform simplex steps until we get there. + k = length(active_set) + csum = sum(c) + c .-= (csum / k) + # name change to stay consistent with the paper, c is actually updated in-place + d = c + if norm(d) <= 1e-8 + @info "Resetting active set." + # resetting active set to singleton + a0 = active_set.atoms[1] + empty!(active_set) + push!(active_set, (1, a0)) + return false + end + # NOTE: sometimes the direction is non-improving + # usual suspects are floating-point errors when multiplying atoms with near-zero weights + # in that case, inverting the sense of d + @inbounds if fast_dot(sum(d[i] * active_set.atoms[i] for i in eachindex(active_set)), gradient) < 0 + @warn "Non-improving d, aborting simplex descent. We likely reached the limits of the numerical accuracy. + The solution is still valid but we might not be able to converge further from here onwards. + If higher accuracy is required, consider using Double64 (still quite fast) and if that does not help BigFloat (slower) as type for the numbers. + Alternatively, consider using AFW (with lazy = true) instead." + println(fast_dot(sum(d[i] * active_set.atoms[i] for i in eachindex(active_set)), gradient)) + return true + end + + η = eltype(d)(Inf) + rem_idx = -1 + @inbounds for idx in eachindex(d) + if d[idx] > 0 + max_val = active_set.weights[idx] / d[idx] + if η > max_val + η = max_val + rem_idx = idx + end + end + end + # TODO at some point avoid materializing both x and y + x = copy(active_set.x) + η = max(0, η) + @. active_set.weights -= η * d + y = copy(update_active_set_iterate!(active_set)) + if f(x) ≥ f(y) + active_set_cleanup!(active_set, weight_purge_threshold=weight_purge_threshold) + return false + end + linesearch_method = L === nothing || !isfinite(L) ? backtracking : shortstep + if linesearch_method == backtracking + _, gamma = + backtrackingLS(f, gradient, x, x - y, linesearch_tol=linesearch_tol, step_lim=step_lim) + else # == shortstep, just two methods here for now + gamma = fast_dot(gradient, x - y) / (L * norm(x - y)^2) + end + gamma = min(1.0, gamma) + # step back from y to x by (1 - γ) η d + # new point is x - γ η d + if gamma == 1.0 + active_set_cleanup!(active_set, weight_purge_threshold=weight_purge_threshold) + else + @. active_set.weights += η * (1 - gamma) * d + @. 
active_set.x = x + gamma * (y - x) + end + number_of_steps = number_of_steps + 1 + x = compute_active_set_iterate(active_set) + primal = f(x) + dual_gap = tolerance + if trajectory + push!( + traj_data, + ( + t + number_of_steps, + primal, + primal - dual_gap, + dual_gap, + (time_ns() - time_start) / 1.0e9, + length(active_set), + ), + ) + end + tt = simplex_descent + if verbose && mod(t + number_of_steps, print_iter) == 0 + if t == 0 + tt = initial + end + rep = ( + tt, + string(t+ number_of_steps), + primal, + primal - dual_gap, + dual_gap, + (time_ns() - time_start) / 1.0e9, + length(active_set), + non_simplex_iter, + ) + print_iter_func(rep) + flush(stdout) + end + end +end + +function bcg_backup( + f, + grad!, + lmo, + x0; + line_search::LineSearchMethod=adaptive, + L=Inf, + gamma0=0, + step_lim=20, + epsilon=1e-7, + max_iteration=10000, + print_iter=1000, + trajectory=false, + verbose=false, + linesearch_tol=1e-7, + emphasis=nothing, + Ktolerance=1.0, + goodstep_tolerance=1.0, + weight_purge_threshold=1e-9, + gradient=nothing, + direction_storage=nothing, + lmo_kwargs..., ) function print_header(data) @printf( diff --git a/src/utils.jl b/src/utils.jl index f311b3fb9..67d9db340 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -262,7 +262,7 @@ function plot_trajectories(data, label; filename=nothing) x, y, label=label[i], - xaxis=:log, + #xaxis=:log, yaxis=:log, ylabel="Primal", legend=:topright, @@ -284,7 +284,7 @@ function plot_trajectories(data, label; filename=nothing) y, label=label[i], legend=false, - xaxis=:log, + #xaxis=:log, yaxis=:log, yguidefontsize=8, xguidefontsize=8, @@ -303,7 +303,7 @@ function plot_trajectories(data, label; filename=nothing) y, label=label[i], legend=false, - xaxis=:log, + #xaxis=:log, yaxis=:log, ylabel="Dual Gap", xlabel="Iterations", @@ -324,7 +324,7 @@ function plot_trajectories(data, label; filename=nothing) y, label=label[i], legend=false, - xaxis=:log, + #xaxis=:log, yaxis=:log, xlabel="Time", yguidefontsize=8, From 4440a7b9e22b9f499adaa8f1959630b0b7961159 Mon Sep 17 00:00:00 2001 From: alejandro-carderera Date: Wed, 10 Mar 2021 14:31:09 +0100 Subject: [PATCH 02/18] Simplex descent for quadratics. Write the problem in barycentric coordinates in closed form if the objective function is quadratic. Need to verify that the code works well for an active set with dense atoms (right now all my testing is with MaybeHotVectors), and perform more tests with the code. Next step is to implement Nesterov's AGD. --- examples/blended_cg.jl | 52 +++++++++++-- src/blended_cg.jl | 170 +++++++++++++++++++++++++++++++++++++++-- 2 files changed, 210 insertions(+), 12 deletions(-) diff --git a/examples/blended_cg.jl b/examples/blended_cg.jl index 780470bc3..e8d02ef9b 100644 --- a/examples/blended_cg.jl +++ b/examples/blended_cg.jl @@ -2,8 +2,10 @@ import FrankWolfe using LinearAlgebra using Random using DoubleFloats +using FrankWolfe n = Int(1e4) +#n = Int(1e3) k = 1000 s = rand(1:100) @@ -17,10 +19,24 @@ xpi = rand(n); total = sum(xpi); const xp = xpi # ./ total; +#matrix = rand(n, n) +#hessian = matrix * transpose(matrix) +#linear_term = rand(n) +#constant_term = rand() +#f(x) = constant_term + dot(linear_term, x) + 0.5* transpose(x) * hessian * x +#function grad!(storage, x) +# storage .= linear_term + hessian * x +#end +#gradient = zeros(n) + + f(x) = norm(x - xp)^2 function grad!(storage, x) @. 
storage = 2 * (x - xp) end +hessian = Matrix(1.0I, n, n) + + # better for memory consumption as we do coordinate-wise ops @@ -33,11 +49,11 @@ function cgrad!(storage, x, xp) end # this LMO might produce numerical instabilities do demonstrate the recovery feature -const lmo = FrankWolfe.KSparseLMO(100, 1.0) +#const lmo = FrankWolfe.KSparseLMO(100, 1.0) # full upgrade of the lmo (and hence optimization) to Double64. # the same lmo with Double64 is much more numerically robust. costs relatively little in speed. -# const lmo = FrankWolfe.KSparseLMO(100, Double64(1.0)) + #const lmo = FrankWolfe.KSparseLMO(100, Double64(1.0)) # as above but now to bigfloats # the same lmo here with bigfloat. even more robust but much slower @@ -45,11 +61,18 @@ const lmo = FrankWolfe.KSparseLMO(100, 1.0) # other oracles to test / experiment with # const lmo = FrankWolfe.LpNormLMO{Float64,1}(1.0) -# const lmo = FrankWolfe.ProbabilitySimplexOracle(Double64(1.0)); +#const lmo = FrankWolfe.ProbabilitySimplexOracle(Double64(1.0)); +const lmo = FrankWolfe.ProbabilitySimplexOracle(1.0); # const lmo = FrankWolfe.UnitSimplexOracle(1.0); const x00 = FrankWolfe.compute_extreme_point(lmo, zeros(n)) +#active_set = FrankWolfe.ActiveSet([(1.0, x00)]) +#vect = FrankWolfe.compute_extreme_point(lmo, rand(n)) +#FrankWolfe.active_set_update!(active_set, 0.2, vect) +#vect = FrankWolfe.compute_extreme_point(lmo, rand(n)) +#FrankWolfe.active_set_update!(active_set, 0.2, vect) + FrankWolfe.benchmark_oracles(x -> cf(x, xp), (str, x) -> cgrad!(str, x, xp), ()->randn(n), lmo; k=100) # copying here and below the x00 as the algorithms write back into the variables to save memory. @@ -57,6 +80,25 @@ FrankWolfe.benchmark_oracles(x -> cf(x, xp), (str, x) -> cgrad!(str, x, xp), ()- x0 = deepcopy(x00) x, v, primal, dual_gap, trajectoryBCG = FrankWolfe.bcg( + f, + grad!, + lmo, + x0, + max_iteration=k, + line_search=FrankWolfe.adaptive, + print_iter=k / 10, + hessian = hessian, + emphasis=FrankWolfe.memory, + L=2, + verbose=true, + trajectory=true, + Ktolerance=1.00, + goodstep_tolerance=0.95, + weight_purge_threshold=1e-10, +) + +x0 = deepcopy(x00) +x, v, primal, dual_gap, trajectoryBCG2 = FrankWolfe.bcg( f, grad!, lmo, @@ -90,8 +132,8 @@ x, v, primal, dual_gap, trajectoryBCG_backup = FrankWolfe.bcg_backup( weight_purge_threshold=1e-10, ) -data = [trajectoryBCG, trajectoryBCG_backup] -label = ["BCG", "BCG backup"] +data = [trajectoryBCG, trajectoryBCG2, trajectoryBCG_backup] +label = ["BCG", "BCG old", "BCG backup"] FrankWolfe.plot_trajectories(data, label, filename="output_results.png") diff --git a/src/blended_cg.jl b/src/blended_cg.jl index 7691b78df..2121b12b9 100644 --- a/src/blended_cg.jl +++ b/src/blended_cg.jl @@ -46,6 +46,7 @@ function bcg( line_search::LineSearchMethod=adaptive, L=Inf, gamma0=0, + hessian = nothing, step_lim=20, epsilon=1e-7, max_iteration=10000, @@ -158,7 +159,7 @@ function bcg( non_simplex_iter, verbose = verbose, print_iter=print_iter, - hessian = nothing, + hessian = hessian, L=L, accelerated = false, ) @@ -188,7 +189,7 @@ function bcg( phi = (xval - value) / 2 else tt = regular - L, gamma = line_search_wrapper(line_search, t, f, grad!, x, x - v, gradient, dual_gap, L, gamma0, linesearch_tol, step_lim, 1.0) + gamma, L = line_search_wrapper(line_search, t, f, grad!, x, x - v, gradient, dual_gap, L, gamma0, linesearch_tol, step_lim, 1.0) if gamma == 1.0 active_set_initialize!(active_set, v) @@ -321,17 +322,133 @@ function minimize_over_convex_hull( weight_purge_threshold=weight_purge_threshold, ) else - #Rewrite 
problem as problem over the simplex and solve using Nesterov's AGD + x = compute_active_set_iterate(active_set) + grad!(gradient, x) + #Rewrite as problem over the simplex + M, b = build_reduced_problem(active_set.atoms, active_set.weights, gradient, hessian) + L = eigmax(M) + reduced_f(y) = f(x) - dot(gradient, x) + 0.5*transpose(x) * hessian * x + dot(b, y) + 0.5*transpose(y) * M * y + + function reduced_grad!(storage, x) + storage .= b + M*x + end + + function reduced_linesearch(gradient, direction) + return -dot(gradient, direction)/ (transpose(direction)*M*direction) + end + + #Solve using Nesterov's AGD if accelerated return - #Rewrite problem as problem over the simplex and solve using gradient descent. + #Solve using gradient descent. else - return + new_weights, number_of_steps = simplex_gradient_descent_over_probability_simplex( + active_set.weights, + reduced_f, + reduced_grad!, + reduced_linesearch, + tolerance, + t, + trajectory, + traj_data, + time_start, + non_simplex_iter, + verbose = verbose, + print_iter=print_iter, + L = L, + ) + @. active_set.weights = new_weights end end + active_set_cleanup!(active_set, weight_purge_threshold=weight_purge_threshold) return number_of_steps end +function simplex_gradient_descent_over_probability_simplex( + initial_point, + reduced_f, + reduced_grad!, + reduced_linesearch, + tolerance, + t, + trajectory, + traj_data, + time_start, + non_simplex_iter; + verbose = verbose, + print_iter=print_iter, + L = 1.0, +) + number_of_steps = 0 + x = deepcopy(initial_point) + gradient = similar(x) + d = similar(x) + reduced_grad!(gradient, x) + strong_wolfe_gap = Strong_Frank_Wolfe_gap_probability_simplex(gradient) + while strong_wolfe_gap > tolerance + y = projection_simplex_sort(x .- gradient/L) + @. d = y - x + gamma = min(1.0, reduced_linesearch(gradient, d)) + @. x = x + gamma*d + number_of_steps = number_of_steps + 1 + primal = reduced_f(x) + reduced_grad!(gradient, x) + strong_wolfe_gap = Strong_Frank_Wolfe_gap_probability_simplex(gradient) + if trajectory + push!( + traj_data, + ( + t + number_of_steps, + primal, + primal - tolerance, + tolerance, + (time_ns() - time_start) / 1.0e9, + length(initial_point), + ), + ) + end + tt = simplex_descent + if verbose && mod(t + number_of_steps, print_iter) == 0 + if t == 0 + tt = initial + end + rep = ( + tt, + string(t+ number_of_steps), + primal, + primal - tolerance, + tolerance, + (time_ns() - time_start) / 1.0e9, + length(initial_point), + non_simplex_iter, + ) + print_iter_func(rep) + flush(stdout) + end + end + return x, number_of_steps +end + +# Sort projection for the simplex. +function projection_simplex_sort(x) + n = length(x) + if sum(x) == 1.0 && all(>=(0.0), x) + return x + end + v = x .- maximum(x) + u = sort(v, rev=true) + cssv = cumsum(u) + rho = sum(u .* collect(1:1:n).>(cssv .- 1.0)) - 1 + theta = (cssv[rho + 1] - 1.0) / (rho + 1) + w = clamp.(v .- theta, 0.0, Inf) + return w +end + +function Strong_Frank_Wolfe_gap_probability_simplex(gradient) + index_min = argmin(gradient) + index_max = argmax(gradient) + return gradient[index_max] - gradient[index_min] +end function simplex_gradient_descent_over_convex_hull( f, @@ -409,8 +526,8 @@ function simplex_gradient_descent_over_convex_hull( end linesearch_method = L === nothing || !isfinite(L) ? 
backtracking : shortstep if linesearch_method == backtracking - _, gamma = - backtrackingLS(f, gradient, x, x - y, linesearch_tol=linesearch_tol, step_lim=step_lim) + gamma, _ = + backtrackingLS(f, direction, x, x - y, 1.0, linesearch_tol=linesearch_tol, step_lim=step_lim) else # == shortstep, just two methods here for now gamma = fast_dot(gradient, x - y) / (L * norm(x - y)^2) end @@ -461,6 +578,45 @@ function simplex_gradient_descent_over_convex_hull( end end +#In case the matrix is a maybe hot vector +#Returns the problem written in the form: +# reduced_linear^T \lambda + 0.5* \lambda^T reduced_hessian \lambda +#according to the current active set. +function build_reduced_problem(atoms::AbstractVector{<:FrankWolfe.MaybeHotVector}, weights, gradient, hessian) + n = atoms[1].len + k = length(atoms) + aux_matrix = zeros(eltype(atoms[1].active_val), n, k) + reduced_linear = zeros(eltype(atoms[1].active_val), k) + #Compute the intermediate matrix. + for i in 1:k + reduced_linear[i] = dot(atoms[i], gradient) + aux_matrix[:,i] .= atoms[i].active_val*hessian[atoms[i].val_idx, :] + end + #Compute the final matrix. + reduced_hessian = zeros(eltype(atoms[1].active_val), k, k) + for i in 1:k + reduced_hessian[:,i] .= atoms[i].active_val*aux_matrix[atoms[i].val_idx,:] + end + reduced_linear .-= reduced_hessian * weights + return reduced_hessian, reduced_linear +end + +#In case the active set atoms are not MaybeHotVectors +function build_reduced_problem(active_set::AbstractVector{<:Array}, weights, gradient, hessian) + n = length(atoms[1]) + k = length(atoms) + #Construct the matrix of vertices. + vertex_matrix = zeros(n, k) + reduced_linear = zeros(k) + for i in 1:k + reduced_linear[i] = dot(atoms[i], gradient) + vertex_matrix[:, i] .= active_set.atoms[i] + end + reduced_hessian = transpose(vertex_matrix) * hessian * vertex_matrix + reduced_linear .-= reduced_hessian * weights + return reduced_hessian, reduced_linear +end + function bcg_backup( f, grad!, From 5084f095c09e6aa5d2b305cedb840bb72efc3f02 Mon Sep 17 00:00:00 2001 From: alejandro-carderera Date: Wed, 10 Mar 2021 21:04:41 +0100 Subject: [PATCH 03/18] Intermediate commit Added Nesterov's AGD and fixed some bugs. Many tests remaining, and I need to incorporate the comments from Mathieu above. --- examples/blended_cg.jl | 67 ++++----- src/blended_cg.jl | 299 +++++++++++++++++++++++++++++++---------- 2 files changed, 264 insertions(+), 102 deletions(-) diff --git a/examples/blended_cg.jl b/examples/blended_cg.jl index e8d02ef9b..d6a0e6a1f 100644 --- a/examples/blended_cg.jl +++ b/examples/blended_cg.jl @@ -3,10 +3,11 @@ using LinearAlgebra using Random using DoubleFloats using FrankWolfe +using SparseArrays n = Int(1e4) #n = Int(1e3) -k = 1000 +k = 100 s = rand(1:100) @info "Seed $s" @@ -30,13 +31,19 @@ const xp = xpi # ./ total; #gradient = zeros(n) -f(x) = norm(x - xp)^2 +#f(x) = norm(x - xp)^2 +#function grad!(storage, x) +# @. storage = 2 * (x - xp) +#end +#hessian = Matrix(2.0I, n, n) + +matrix = rand(n,n) +hessian = transpose(matrix) * matrix +linear = rand(n) +f(x) = dot(linear, x) + 0.5*transpose(x) * hessian * x function grad!(storage, x) - @. 
storage = 2 * (x - xp) + storage .= linear + hessian * x end -hessian = Matrix(1.0I, n, n) - - # better for memory consumption as we do coordinate-wise ops @@ -49,7 +56,7 @@ function cgrad!(storage, x, xp) end # this LMO might produce numerical instabilities do demonstrate the recovery feature -#const lmo = FrankWolfe.KSparseLMO(100, 1.0) +const lmo = FrankWolfe.KSparseLMO(100, 1.0) # full upgrade of the lmo (and hence optimization) to Double64. # the same lmo with Double64 is much more numerically robust. costs relatively little in speed. @@ -62,7 +69,7 @@ end # other oracles to test / experiment with # const lmo = FrankWolfe.LpNormLMO{Float64,1}(1.0) #const lmo = FrankWolfe.ProbabilitySimplexOracle(Double64(1.0)); -const lmo = FrankWolfe.ProbabilitySimplexOracle(1.0); +#const lmo = FrankWolfe.ProbabilitySimplexOracle(1.0); # const lmo = FrankWolfe.UnitSimplexOracle(1.0); const x00 = FrankWolfe.compute_extreme_point(lmo, zeros(n)) @@ -90,10 +97,10 @@ x, v, primal, dual_gap, trajectoryBCG = FrankWolfe.bcg( hessian = hessian, emphasis=FrankWolfe.memory, L=2, + accelerated = true, verbose=true, trajectory=true, Ktolerance=1.00, - goodstep_tolerance=0.95, weight_purge_threshold=1e-10, ) @@ -111,31 +118,29 @@ x, v, primal, dual_gap, trajectoryBCG2 = FrankWolfe.bcg( verbose=true, trajectory=true, Ktolerance=1.00, - goodstep_tolerance=0.95, - weight_purge_threshold=1e-10, -) - -x, v, primal, dual_gap, trajectoryBCG_backup = FrankWolfe.bcg_backup( - f, - grad!, - lmo, - x0, - max_iteration=k, - line_search=FrankWolfe.adaptive, - print_iter=k / 10, - emphasis=FrankWolfe.memory, - L=2, - verbose=true, - trajectory=true, - Ktolerance=1.00, - goodstep_tolerance=0.95, weight_purge_threshold=1e-10, ) -data = [trajectoryBCG, trajectoryBCG2, trajectoryBCG_backup] -label = ["BCG", "BCG old", "BCG backup"] - -FrankWolfe.plot_trajectories(data, label, filename="output_results.png") +#x, v, primal, dual_gap, trajectoryBCG_backup = FrankWolfe.bcg_backup( +# f, +# grad!, +# lmo, +# x0, +# max_iteration=k, +# line_search=FrankWolfe.adaptive, +# print_iter=k / 10, +# emphasis=FrankWolfe.memory, +# L=2, +# verbose=true, +# trajectory=true, +# Ktolerance=1.00, +# goodstep_tolerance=0.95, +# weight_purge_threshold=1e-10, +#) + +#data = [trajectoryBCG, trajectoryBCG2, trajectoryBCG_backup] +#label = ["BCG", "BCG old", "BCG backup"] +#FrankWolfe.plot_trajectories(data, label, filename="output_results.png") #x0 = deepcopy(x00) #@time x, v, primal, dual_gap, trajectoryAda = FrankWolfe.afw( diff --git a/src/blended_cg.jl b/src/blended_cg.jl index 2121b12b9..96fb032d7 100644 --- a/src/blended_cg.jl +++ b/src/blended_cg.jl @@ -1,3 +1,5 @@ +import Arpack + function print_header(data) @printf( "\n────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n" @@ -55,8 +57,8 @@ function bcg( verbose=false, linesearch_tol=1e-7, emphasis=nothing, + accelerated = false, Ktolerance=1.0, - goodstep_tolerance=1.0, weight_purge_threshold=1e-9, gradient=nothing, direction_storage=nothing, @@ -137,7 +139,6 @@ function bcg( (time_ns() - time_start) / 1.0e9, length(active_set), non_simplex_iter, - nforced_fw, ) print_iter_func(rep) flush(stdout) @@ -161,15 +162,13 @@ function bcg( print_iter=print_iter, hessian = hessian, L=L, - accelerated = false, + accelerated = accelerated, ) - t = t + num_simplex_descent_steps #Take a FW step. 
x = compute_active_set_iterate(active_set) primal = f(x) grad!(gradient, x) - non_simplex_iter += 1 # compute new atom (v, value) = lp_separation_oracle( lmo, @@ -232,7 +231,6 @@ function bcg( print_iter_func(rep) flush(stdout) end - end if verbose x = compute_active_set_iterate(active_set) @@ -324,24 +322,50 @@ function minimize_over_convex_hull( else x = compute_active_set_iterate(active_set) grad!(gradient, x) + c = [fast_dot(gradient, a) for a in active_set.atoms] + if maximum(c) - minimum(c) <= tolerance + return 0 + end + #Rewrite as problem over the simplex - M, b = build_reduced_problem(active_set.atoms, active_set.weights, gradient, hessian) + M, b = build_reduced_problem(active_set.atoms, hessian, active_set.weights, gradient) L = eigmax(M) - reduced_f(y) = f(x) - dot(gradient, x) + 0.5*transpose(x) * hessian * x + dot(b, y) + 0.5*transpose(y) * M * y + #L = Arpack.eigs(M, nev=1, which=:LM) + #L = 2.0 + + mu = eigmin(M) + #mu = Arpack.eigs(M, nev=1, which=:SM) + #mu = 2.0 + reduced_f(y) = f(x) - dot(gradient, x) + 0.5*transpose(x) * hessian * x + dot(b, y) + 0.5*transpose(y) * M * y function reduced_grad!(storage, x) storage .= b + M*x end - function reduced_linesearch(gradient, direction) - return -dot(gradient, direction)/ (transpose(direction)*M*direction) - end #Solve using Nesterov's AGD - if accelerated - return + if accelerated && L / mu > 1.0 + new_weights, number_of_steps = accelerated_simplex_gradient_descent_over_probability_simplex( + active_set.weights, + reduced_f, + reduced_grad!, + tolerance, + t, + trajectory, + traj_data, + time_start, + non_simplex_iter, + verbose = verbose, + print_iter=print_iter, + L = L, + mu = mu, + ) #Solve using gradient descent. else + function reduced_linesearch(gradient, direction) + return -dot(gradient, direction)/ (transpose(direction)*M*direction) + end + new_weights, number_of_steps = simplex_gradient_descent_over_probability_simplex( active_set.weights, reduced_f, @@ -360,10 +384,167 @@ function minimize_over_convex_hull( @. active_set.weights = new_weights end end + number_elements = length(active_set.atoms) active_set_cleanup!(active_set, weight_purge_threshold=weight_purge_threshold) return number_of_steps end +#In case the matrix is a maybe hot vector +#Returns the problem written in the form: +# reduced_linear^T \lambda + 0.5* \lambda^T reduced_hessian \lambda +#according to the current active set. +function build_reduced_problem(atoms::AbstractVector{<:FrankWolfe.MaybeHotVector}, hessian, weights, gradient) + n = atoms[1].len + k = length(atoms) + aux_matrix = zeros(eltype(atoms[1].active_val), n, k) + reduced_linear = zeros(eltype(atoms[1].active_val), k) + #Compute the intermediate matrix. + for i in 1:k + reduced_linear[i] = dot(atoms[i], gradient) + aux_matrix[:,i] .= atoms[i].active_val*hessian[atoms[i].val_idx, :] + end + #Compute the final matrix. + reduced_hessian = zeros(eltype(atoms[1].active_val), k, k) + for i in 1:k + reduced_hessian[:,i] .= atoms[i].active_val*aux_matrix[atoms[i].val_idx,:] + end + reduced_linear .-= reduced_hessian * weights + return reduced_hessian, reduced_linear +end + +#Case where the active set contains sparse arrays +function build_reduced_problem(atoms::AbstractVector{<:SparseArrays.AbstractSparseArray}, hessian, weights, gradient) + n = length(atoms[1]) + k = length(atoms) + #Construct the matrix of vertices. 
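    #Densifying the k atoms into an n×k matrix and forming A^T H A below costs
    #O(n^2 k + n k^2) time and O(nk) extra memory, which stays cheap as long as
    #the active set is small relative to n.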
+ vertex_matrix = zeros(n, k) + reduced_linear = zeros(k) + for i in 1:k + reduced_linear[i] = dot(atoms[i], gradient) + vertex_matrix[:, i] .= atoms[i] + end + reduced_hessian = transpose(vertex_matrix) * hessian * vertex_matrix + reduced_linear .-= reduced_hessian * weights + return reduced_hessian, reduced_linear +end + +#General case where the active set contains normal Julia arrays +function build_reduced_problem(atoms::AbstractVector{<:Array}, hessian, weights, gradient) + n = length(atoms[1]) + k = length(atoms) + #Construct the matrix of vertices. + vertex_matrix = zeros(n, k) + reduced_linear = zeros(k) + for i in 1:k + reduced_linear[i] = dot(atoms[i], gradient) + vertex_matrix[:, i] .= atoms[i] + end + reduced_hessian = transpose(vertex_matrix) * hessian * vertex_matrix + reduced_linear .-= reduced_hessian * weights + return reduced_hessian, reduced_linear +end + +function accelerated_simplex_gradient_descent_over_probability_simplex( + initial_point, + reduced_f, + reduced_grad!, + tolerance, + t, + trajectory, + traj_data, + time_start, + non_simplex_iter; + verbose = verbose, + print_iter=print_iter, + L = 1.0, + mu = 1.0, +) + number_of_steps = 0 + x = deepcopy(initial_point) + x_old = deepcopy(initial_point) + y = deepcopy(initial_point) + gradient_x = similar(x) + gradient_y = similar(x) + d = similar(x) + reduced_grad!(gradient_x, x) + reduced_grad!(gradient_y, x) + strong_wolfe_gap = Strong_Frank_Wolfe_gap_probability_simplex(gradient_x) + q = mu / L + # If the problem is close to convex, simply use the accelerated algorithm for convex objective functions. + if mu < 1.0e-3 + alpha = 0.0 + else + alpha = sqrt(q) + end + alpha_old = 0.0 + while strong_wolfe_gap > tolerance + @. x_old = x + reduced_grad!(gradient_y, y) + @. d = y - gradient_y/L + x = projection_simplex_sort(y .- gradient_y/L) + if mu < 1.0e-3 + alpha_old = alpha_old + alpha = 0.5 * (1 + sqrt(1 + 4 * alpha^2)) + gamma = (alpha_old - 1.0) / alpha + else + alpha_old = alpha + alpha = return_bounded_root_of_square(1, alpha^2 - q, -alpha^2) + gamma = alpha_old * (1 - alpha_old) / (alpha_old^2 - alpha) + end + @. y = x + gamma*(x - x_old) + number_of_steps = number_of_steps + 1 + primal = reduced_f(x) + reduced_grad!(gradient_x, x) + strong_wolfe_gap = Strong_Frank_Wolfe_gap_probability_simplex(gradient_x) + if trajectory + push!( + traj_data, + ( + t + number_of_steps, + primal, + primal - tolerance, + tolerance, + (time_ns() - time_start) / 1.0e9, + length(initial_point), + ), + ) + end + tt = simplex_descent + if verbose && mod(t + number_of_steps, print_iter) == 0 + if t == 0 + tt = initial + end + rep = ( + tt, + string(t+ number_of_steps), + primal, + primal - tolerance, + tolerance, + (time_ns() - time_start) / 1.0e9, + length(initial_point), + non_simplex_iter, + ) + print_iter_func(rep) + flush(stdout) + end + end + return x, number_of_steps +end + +function return_bounded_root_of_square(a,b,c) + root1 = (-b+sqrt(b^2 - 4*a*c))/(2*a) + if root1 >= 0 && root1 < 1.0 + return root1 + else + root2 = (-b - sqrt(b^2 - 4*a*c))/(2*a) + if root2 >= 0 && root2 < 1.0 + return root2 + else + print("\n TODO, introduce assert. Roots are: ", root1, " ", root2,"\n") + end + end +end + function simplex_gradient_descent_over_probability_simplex( initial_point, reduced_f, @@ -429,6 +610,8 @@ function simplex_gradient_descent_over_probability_simplex( return x, number_of_steps end + + # Sort projection for the simplex. 
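# The sort-based projection costs O(k log k) for k weights. For illustration,
# projection_simplex_sort([0.5, 0.8, -0.2]) returns approximately [0.35, 0.65, 0.0]:
# shift and sort the entries, locate the threshold theta, then clamp at zero.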
function projection_simplex_sort(x) n = length(x) @@ -445,9 +628,20 @@ function projection_simplex_sort(x) end function Strong_Frank_Wolfe_gap_probability_simplex(gradient) - index_min = argmin(gradient) - index_max = argmax(gradient) - return gradient[index_max] - gradient[index_min] + val_min = gradient[1] + val_max = gradient[1] + for i in 2:length(gradient) + temp_val = gradient[i] + if temp_val < val_min + val_min = temp_val + else + if temp_val > val_max + val_max = temp_val + end + end + + end + return val_max - val_min end function simplex_gradient_descent_over_convex_hull( @@ -520,27 +714,27 @@ function simplex_gradient_descent_over_convex_hull( η = max(0, η) @. active_set.weights -= η * d y = copy(update_active_set_iterate!(active_set)) + number_of_steps = number_of_steps + 1 if f(x) ≥ f(y) active_set_cleanup!(active_set, weight_purge_threshold=weight_purge_threshold) - return false - end - linesearch_method = L === nothing || !isfinite(L) ? backtracking : shortstep - if linesearch_method == backtracking - gamma, _ = - backtrackingLS(f, direction, x, x - y, 1.0, linesearch_tol=linesearch_tol, step_lim=step_lim) - else # == shortstep, just two methods here for now - gamma = fast_dot(gradient, x - y) / (L * norm(x - y)^2) - end - gamma = min(1.0, gamma) - # step back from y to x by (1 - γ) η d - # new point is x - γ η d - if gamma == 1.0 - active_set_cleanup!(active_set, weight_purge_threshold=weight_purge_threshold) else - @. active_set.weights += η * (1 - gamma) * d - @. active_set.x = x + gamma * (y - x) + linesearch_method = L === nothing || !isfinite(L) ? backtracking : shortstep + if linesearch_method == backtracking + gamma, _ = + backtrackingLS(f, direction, x, x - y, 1.0, linesearch_tol=linesearch_tol, step_lim=step_lim) + else # == shortstep, just two methods here for now + gamma = fast_dot(gradient, x - y) / (L * norm(x - y)^2) + end + gamma = min(1.0, gamma) + # step back from y to x by (1 - γ) η d + # new point is x - γ η d + if gamma == 1.0 + active_set_cleanup!(active_set, weight_purge_threshold=weight_purge_threshold) + else + @. active_set.weights += η * (1 - gamma) * d + @. active_set.x = x + gamma * (y - x) + end end - number_of_steps = number_of_steps + 1 x = compute_active_set_iterate(active_set) primal = f(x) dual_gap = tolerance @@ -564,7 +758,7 @@ function simplex_gradient_descent_over_convex_hull( end rep = ( tt, - string(t+ number_of_steps), + string(t + number_of_steps), primal, primal - dual_gap, dual_gap, @@ -578,44 +772,7 @@ function simplex_gradient_descent_over_convex_hull( end end -#In case the matrix is a maybe hot vector -#Returns the problem written in the form: -# reduced_linear^T \lambda + 0.5* \lambda^T reduced_hessian \lambda -#according to the current active set. -function build_reduced_problem(atoms::AbstractVector{<:FrankWolfe.MaybeHotVector}, weights, gradient, hessian) - n = atoms[1].len - k = length(atoms) - aux_matrix = zeros(eltype(atoms[1].active_val), n, k) - reduced_linear = zeros(eltype(atoms[1].active_val), k) - #Compute the intermediate matrix. - for i in 1:k - reduced_linear[i] = dot(atoms[i], gradient) - aux_matrix[:,i] .= atoms[i].active_val*hessian[atoms[i].val_idx, :] - end - #Compute the final matrix. 
- reduced_hessian = zeros(eltype(atoms[1].active_val), k, k) - for i in 1:k - reduced_hessian[:,i] .= atoms[i].active_val*aux_matrix[atoms[i].val_idx,:] - end - reduced_linear .-= reduced_hessian * weights - return reduced_hessian, reduced_linear -end -#In case the active set atoms are not MaybeHotVectors -function build_reduced_problem(active_set::AbstractVector{<:Array}, weights, gradient, hessian) - n = length(atoms[1]) - k = length(atoms) - #Construct the matrix of vertices. - vertex_matrix = zeros(n, k) - reduced_linear = zeros(k) - for i in 1:k - reduced_linear[i] = dot(atoms[i], gradient) - vertex_matrix[:, i] .= active_set.atoms[i] - end - reduced_hessian = transpose(vertex_matrix) * hessian * vertex_matrix - reduced_linear .-= reduced_hessian * weights - return reduced_hessian, reduced_linear -end function bcg_backup( f, From 48d914ea58a4fa755f9ec15ba02472289eab520e Mon Sep 17 00:00:00 2001 From: alejandro-carderera Date: Wed, 10 Mar 2021 21:07:42 +0100 Subject: [PATCH 04/18] Minor to test file. --- examples/blended_cg.jl | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/blended_cg.jl b/examples/blended_cg.jl index d6a0e6a1f..77096dece 100644 --- a/examples/blended_cg.jl +++ b/examples/blended_cg.jl @@ -44,6 +44,7 @@ f(x) = dot(linear, x) + 0.5*transpose(x) * hessian * x function grad!(storage, x) storage .= linear + hessian * x end +L = eigmax(hessian) # better for memory consumption as we do coordinate-wise ops @@ -96,7 +97,7 @@ x, v, primal, dual_gap, trajectoryBCG = FrankWolfe.bcg( print_iter=k / 10, hessian = hessian, emphasis=FrankWolfe.memory, - L=2, + L=L, accelerated = true, verbose=true, trajectory=true, @@ -114,7 +115,7 @@ x, v, primal, dual_gap, trajectoryBCG2 = FrankWolfe.bcg( line_search=FrankWolfe.adaptive, print_iter=k / 10, emphasis=FrankWolfe.memory, - L=2, + L=L, verbose=true, trajectory=true, Ktolerance=1.00, @@ -130,7 +131,7 @@ x, v, primal, dual_gap, trajectoryBCG2 = FrankWolfe.bcg( # line_search=FrankWolfe.adaptive, # print_iter=k / 10, # emphasis=FrankWolfe.memory, -# L=2, +# L=L, # verbose=true, # trajectory=true, # Ktolerance=1.00, From a9770a55a708fa2ee6a7e11f1cb57756f1cf2142 Mon Sep 17 00:00:00 2001 From: alejandro-carderera Date: Thu, 11 Mar 2021 17:04:21 +0100 Subject: [PATCH 05/18] Fixes for some bugs. Reused some computations and fixed some bugs. Also added some function headers. Still testing... --- examples/blended_cg.jl | 73 +++++++---- src/blended_cg.jl | 275 ++++++++++++++++++++++++++--------------- 2 files changed, 224 insertions(+), 124 deletions(-) diff --git a/examples/blended_cg.jl b/examples/blended_cg.jl index 77096dece..7d80f6ac9 100644 --- a/examples/blended_cg.jl +++ b/examples/blended_cg.jl @@ -5,9 +5,8 @@ using DoubleFloats using FrankWolfe using SparseArrays -n = Int(1e4) -#n = Int(1e3) -k = 100 +n = Int(5e2) +k = 1000 s = rand(1:100) @info "Seed $s" @@ -57,7 +56,7 @@ function cgrad!(storage, x, xp) end # this LMO might produce numerical instabilities do demonstrate the recovery feature -const lmo = FrankWolfe.KSparseLMO(100, 1.0) +const lmo = FrankWolfe.KSparseLMO(50, 1.0) # full upgrade of the lmo (and hence optimization) to Double64. # the same lmo with Double64 is much more numerically robust. costs relatively little in speed. @@ -87,13 +86,13 @@ FrankWolfe.benchmark_oracles(x -> cf(x, xp), (str, x) -> cgrad!(str, x, xp), ()- # as we do multiple runs from the same initial point we do not want this here. 
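# Illustrative sanity check, assuming the internal helper is reachable as
# FrankWolfe.projection_simplex_sort: the sort-based projection used by the BCG
# simplex-descent phase should land on the probability simplex.
w_check = FrankWolfe.projection_simplex_sort(randn(n))
@assert isapprox(sum(w_check), 1.0; atol=1e-10) && all(>=(0.0), w_check)
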
x0 = deepcopy(x00) -x, v, primal, dual_gap, trajectoryBCG = FrankWolfe.bcg( +x, v, primal, dual_gap, trajectoryBCG_accel_simplex = FrankWolfe.bcg( f, grad!, lmo, x0, max_iteration=k, - line_search=FrankWolfe.adaptive, + line_search=FrankWolfe.adaptive, print_iter=k / 10, hessian = hessian, emphasis=FrankWolfe.memory, @@ -106,7 +105,7 @@ x, v, primal, dual_gap, trajectoryBCG = FrankWolfe.bcg( ) x0 = deepcopy(x00) -x, v, primal, dual_gap, trajectoryBCG2 = FrankWolfe.bcg( +x, v, primal, dual_gap, trajectoryBCG_simplex = FrankWolfe.bcg( f, grad!, lmo, @@ -114,34 +113,54 @@ x, v, primal, dual_gap, trajectoryBCG2 = FrankWolfe.bcg( max_iteration=k, line_search=FrankWolfe.adaptive, print_iter=k / 10, + hessian = hessian, emphasis=FrankWolfe.memory, L=L, + accelerated = false, verbose=true, trajectory=true, Ktolerance=1.00, weight_purge_threshold=1e-10, ) -#x, v, primal, dual_gap, trajectoryBCG_backup = FrankWolfe.bcg_backup( -# f, -# grad!, -# lmo, -# x0, -# max_iteration=k, -# line_search=FrankWolfe.adaptive, -# print_iter=k / 10, -# emphasis=FrankWolfe.memory, -# L=L, -# verbose=true, -# trajectory=true, -# Ktolerance=1.00, -# goodstep_tolerance=0.95, -# weight_purge_threshold=1e-10, -#) - -#data = [trajectoryBCG, trajectoryBCG2, trajectoryBCG_backup] -#label = ["BCG", "BCG old", "BCG backup"] -#FrankWolfe.plot_trajectories(data, label, filename="output_results.png") +x0 = deepcopy(x00) +x, v, primal, dual_gap, trajectoryBCG_convex = FrankWolfe.bcg( + f, + grad!, + lmo, + x0, + max_iteration=k, + line_search=FrankWolfe.adaptive, + print_iter=k / 10, + emphasis=FrankWolfe.memory, + L=L, + verbose=true, + trajectory=true, + Ktolerance=1.00, + weight_purge_threshold=1e-10, +) + +x0 = deepcopy(x00) +x, v, primal, dual_gap, trajectoryBCG_convex_backup = FrankWolfe.bcg_backup( + f, + grad!, + lmo, + x0, + max_iteration=k, + line_search=FrankWolfe.adaptive, + print_iter=k / 10, + emphasis=FrankWolfe.memory, + L=L, + verbose=true, + trajectory=true, + Ktolerance=1.00, + goodstep_tolerance=0.95, + weight_purge_threshold=1e-10, +) + +data = [trajectoryBCG_accel_simplex, trajectoryBCG_simplex, trajectoryBCG_convex, trajectoryBCG_convex_backup] +label = ["BCG (accel simplex)", "BCG (simplex)", "BCG (convex)", "BCG (convex backup)"] +FrankWolfe.plot_trajectories(data, label, filename="output_results.png") #x0 = deepcopy(x00) #@time x, v, primal, dual_gap, trajectoryAda = FrankWolfe.afw( diff --git a/src/blended_cg.jl b/src/blended_cg.jl index 96fb032d7..c0361bd46 100644 --- a/src/blended_cg.jl +++ b/src/blended_cg.jl @@ -1,4 +1,5 @@ import Arpack +using DoubleFloats function print_header(data) @printf( @@ -278,6 +279,21 @@ function bcg( end +""" +minimize_over_convex_hull + +Given a function f with gradient grad! and an active set +active_set this function will minimize the function over +the convex hull of the active set until the strong-wolfe +gap over the active set is below tolerance. + +It will either directly minimize over the convex hull using +simplex gradient descent, or it will transform the problem +to barycentric coordinates and minimize over the unit +probability simplex using gradient descent or Nesterov's +accelerated gradient descent. 
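
When a Hessian is supplied, the reduction assembled by build_reduced_problem takes
the form f(λ) = reduced_linear^T λ + 0.5 * λ^T reduced_hessian λ with
reduced_hessian = A^T hessian A, where the columns of A are the active atoms; the
reformulation is exact when the objective is quadratic with this (constant) Hessian.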
+""" + function minimize_over_convex_hull( f, grad!, @@ -314,7 +330,7 @@ function minimize_over_convex_hull( non_simplex_iter, verbose = verbose, print_iter=print_iter, - L=L, + L=nothing, linesearch_tol=linesearch_tol, step_lim=step_lim, weight_purge_threshold=weight_purge_threshold, @@ -322,55 +338,61 @@ function minimize_over_convex_hull( else x = compute_active_set_iterate(active_set) grad!(gradient, x) - c = [fast_dot(gradient, a) for a in active_set.atoms] - if maximum(c) - minimum(c) <= tolerance - return 0 - end + #c = [fast_dot(gradient, a) for a in active_set.atoms] + #if maximum(c) - minimum(c) <= tolerance + # return 0 + #end #Rewrite as problem over the simplex - M, b = build_reduced_problem(active_set.atoms, hessian, active_set.weights, gradient) - L = eigmax(M) - #L = Arpack.eigs(M, nev=1, which=:LM) - #L = 2.0 - - mu = eigmin(M) - #mu = Arpack.eigs(M, nev=1, which=:SM) - #mu = 2.0 - + M, b = build_reduced_problem(active_set.atoms, hessian, active_set.weights, gradient, tolerance) + #Early exit if we have detected that the strong-Wolfe gap is below the desired tolerance while building the reduced problem. + if isnothing(M) && isnothing(b) + return 0 + end + #In case the matrices are DoubleFloats we need to cast them as Float64, because LinearAlgebra does not work with them. + if eltype(M) === Double64 + converted_matrix = convert(Array{Float64}, M) + L_reduced = eigmax(converted_matrix) + else + L_reduced = eigmax(M) + #L_reduced = Arpack.eigs(M, nev=1, which=:LM) + end reduced_f(y) = f(x) - dot(gradient, x) + 0.5*transpose(x) * hessian * x + dot(b, y) + 0.5*transpose(y) * M * y function reduced_grad!(storage, x) storage .= b + M*x end - - #Solve using Nesterov's AGD - if accelerated && L / mu > 1.0 - new_weights, number_of_steps = accelerated_simplex_gradient_descent_over_probability_simplex( - active_set.weights, - reduced_f, - reduced_grad!, - tolerance, - t, - trajectory, - traj_data, - time_start, - non_simplex_iter, - verbose = verbose, - print_iter=print_iter, - L = L, - mu = mu, - ) - #Solve using gradient descent. - else - function reduced_linesearch(gradient, direction) - return -dot(gradient, direction)/ (transpose(direction)*M*direction) + if accelerated + if eltype(M) === Double64 + mu_reduced = eigmin(converted_matrix) + else + mu_reduced = eigmin(M) end - + if L_reduced / mu_reduced > 1.0 + new_weights, number_of_steps = accelerated_simplex_gradient_descent_over_probability_simplex( + active_set.weights, + reduced_f, + reduced_grad!, + tolerance, + t, + trajectory, + traj_data, + time_start, + non_simplex_iter, + verbose = verbose, + print_iter=print_iter, + L = L_reduced, + mu = mu_reduced, + ) + @. active_set.weights = new_weights + end + end + #Solve using gradient descent. + if !accelerated || L_reduced / mu_reduced == 1.0 new_weights, number_of_steps = simplex_gradient_descent_over_probability_simplex( active_set.weights, reduced_f, reduced_grad!, - reduced_linesearch, tolerance, t, trajectory, @@ -379,7 +401,7 @@ function minimize_over_convex_hull( non_simplex_iter, verbose = verbose, print_iter=print_iter, - L = L, + L = L_reduced, ) @. active_set.weights = new_weights end @@ -389,18 +411,28 @@ function minimize_over_convex_hull( return number_of_steps end -#In case the matrix is a maybe hot vector -#Returns the problem written in the form: -# reduced_linear^T \lambda + 0.5* \lambda^T reduced_hessian \lambda -#according to the current active set. 
-function build_reduced_problem(atoms::AbstractVector{<:FrankWolfe.MaybeHotVector}, hessian, weights, gradient) +""" +build_reduced_problem + +Given an active set formed by MaybeHotVector, a (constant) +Hessian and a gradient constructs a quadratic problem +over the unit probability simplex that is equivalent to +minimizing the original function over the convex hull of the +active set. If λ are the barycentric coordinates of dimension +equal to the cardinality of the active set, the objective +function is: + f(λ) = reduced_linear^T λ + 0.5 * λ^T reduced_hessian λ +""" +function build_reduced_problem(atoms::AbstractVector{<:FrankWolfe.MaybeHotVector}, hessian, weights, gradient, tolerance) n = atoms[1].len k = length(atoms) + reduced_linear = [fast_dot(gradient, a) for a in atoms] + if Strong_Frank_Wolfe_gap(reduced_linear) <= tolerance + return nothing, nothing + end aux_matrix = zeros(eltype(atoms[1].active_val), n, k) - reduced_linear = zeros(eltype(atoms[1].active_val), k) #Compute the intermediate matrix. for i in 1:k - reduced_linear[i] = dot(atoms[i], gradient) aux_matrix[:,i] .= atoms[i].active_val*hessian[atoms[i].val_idx, :] end #Compute the final matrix. @@ -412,15 +444,26 @@ function build_reduced_problem(atoms::AbstractVector{<:FrankWolfe.MaybeHotVector return reduced_hessian, reduced_linear end -#Case where the active set contains sparse arrays -function build_reduced_problem(atoms::AbstractVector{<:SparseArrays.AbstractSparseArray}, hessian, weights, gradient) +""" +build_reduced_problem + +Same as the function above, but for the case where the active +set is formed by Sparse Arrays. +""" +function build_reduced_problem(atoms::AbstractVector{<:SparseArrays.AbstractSparseArray}, hessian, weights, gradient, tolerance) n = length(atoms[1]) k = length(atoms) + + reduced_linear = [fast_dot(gradient, a) for a in atoms] + if Strong_Frank_Wolfe_gap(reduced_linear) <= tolerance + return nothing, nothing + end + #Construct the matrix of vertices. vertex_matrix = zeros(n, k) - reduced_linear = zeros(k) + #reduced_linear = zeros(k) for i in 1:k - reduced_linear[i] = dot(atoms[i], gradient) + #reduced_linear[i] = dot(atoms[i], gradient) vertex_matrix[:, i] .= atoms[i] end reduced_hessian = transpose(vertex_matrix) * hessian * vertex_matrix @@ -428,15 +471,26 @@ function build_reduced_problem(atoms::AbstractVector{<:SparseArrays.AbstractSpar return reduced_hessian, reduced_linear end -#General case where the active set contains normal Julia arrays -function build_reduced_problem(atoms::AbstractVector{<:Array}, hessian, weights, gradient) +""" +build_reduced_problem + +Same as the two function above, but for the case where the active +set is formed by normal Julia Arrays. +""" +function build_reduced_problem(atoms::AbstractVector{<:Array}, hessian, weights, gradient, tolerance) n = length(atoms[1]) k = length(atoms) + + reduced_linear = [fast_dot(gradient, a) for a in atoms] + if Strong_Frank_Wolfe_gap(reduced_linear) <= tolerance + return nothing, nothing + end + #Construct the matrix of vertices. vertex_matrix = zeros(n, k) - reduced_linear = zeros(k) + #reduced_linear = zeros(k) for i in 1:k - reduced_linear[i] = dot(atoms[i], gradient) + #reduced_linear[i] = dot(atoms[i], gradient) vertex_matrix[:, i] .= atoms[i] end reduced_hessian = transpose(vertex_matrix) * hessian * vertex_matrix @@ -444,6 +498,31 @@ function build_reduced_problem(atoms::AbstractVector{<:Array}, hessian, weights, return reduced_hessian, reduced_linear end +""" +Checks the strong-Wolfe gap for the reduced problem. 
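The value is the largest minus the smallest entry of the input vector; called on
reduced_linear = [⟨∇f(x), a_i⟩]_i it upper-bounds the primal gap of the barycentric
problem for convex objectives.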
+""" +function Strong_Frank_Wolfe_gap(gradient) + val_min = Inf + val_max = -Inf + for i in 1:length(gradient) + temp_val = gradient[i] + if temp_val < val_min + val_min = temp_val + end + if temp_val > val_max + val_max = temp_val + end + end + return val_max - val_min +end + +""" +accelerated_simplex_gradient_descent_over_probability_simplex + +Minimizes an objective function over the unit probability simplex +until the Strong-Wolfe gap is below tolerance using Nesterov's +accelerated gradient descent. +""" function accelerated_simplex_gradient_descent_over_probability_simplex( initial_point, reduced_f, @@ -468,34 +547,29 @@ function accelerated_simplex_gradient_descent_over_probability_simplex( d = similar(x) reduced_grad!(gradient_x, x) reduced_grad!(gradient_y, x) - strong_wolfe_gap = Strong_Frank_Wolfe_gap_probability_simplex(gradient_x) + strong_wolfe_gap = Strong_Frank_Wolfe_gap_probability_simplex(gradient_x, x) q = mu / L # If the problem is close to convex, simply use the accelerated algorithm for convex objective functions. if mu < 1.0e-3 alpha = 0.0 + alpha_old = 0.0 else - alpha = sqrt(q) + gamma = (1 - sqrt(q))/(1 + sqrt(q)) end - alpha_old = 0.0 while strong_wolfe_gap > tolerance @. x_old = x reduced_grad!(gradient_y, y) - @. d = y - gradient_y/L x = projection_simplex_sort(y .- gradient_y/L) if mu < 1.0e-3 - alpha_old = alpha_old + alpha_old = alpha alpha = 0.5 * (1 + sqrt(1 + 4 * alpha^2)) gamma = (alpha_old - 1.0) / alpha - else - alpha_old = alpha - alpha = return_bounded_root_of_square(1, alpha^2 - q, -alpha^2) - gamma = alpha_old * (1 - alpha_old) / (alpha_old^2 - alpha) end @. y = x + gamma*(x - x_old) number_of_steps = number_of_steps + 1 primal = reduced_f(x) reduced_grad!(gradient_x, x) - strong_wolfe_gap = Strong_Frank_Wolfe_gap_probability_simplex(gradient_x) + strong_wolfe_gap = Strong_Frank_Wolfe_gap_probability_simplex(gradient_x, x) if trajectory push!( traj_data, @@ -531,25 +605,16 @@ function accelerated_simplex_gradient_descent_over_probability_simplex( return x, number_of_steps end -function return_bounded_root_of_square(a,b,c) - root1 = (-b+sqrt(b^2 - 4*a*c))/(2*a) - if root1 >= 0 && root1 < 1.0 - return root1 - else - root2 = (-b - sqrt(b^2 - 4*a*c))/(2*a) - if root2 >= 0 && root2 < 1.0 - return root2 - else - print("\n TODO, introduce assert. Roots are: ", root1, " ", root2,"\n") - end - end -end +""" +simplex_gradient_descent_over_probability_simplex +Minimizes an objective function over the unit probability simplex +until the Strong-Wolfe gap is below tolerance using gradient descent. +""" function simplex_gradient_descent_over_probability_simplex( initial_point, reduced_f, reduced_grad!, - reduced_linesearch, tolerance, t, trajectory, @@ -565,16 +630,13 @@ function simplex_gradient_descent_over_probability_simplex( gradient = similar(x) d = similar(x) reduced_grad!(gradient, x) - strong_wolfe_gap = Strong_Frank_Wolfe_gap_probability_simplex(gradient) + strong_wolfe_gap = Strong_Frank_Wolfe_gap_probability_simplex(gradient, x) while strong_wolfe_gap > tolerance - y = projection_simplex_sort(x .- gradient/L) - @. d = y - x - gamma = min(1.0, reduced_linesearch(gradient, d)) - @. 
x = x + gamma*d + x = projection_simplex_sort(x .- gradient/L) number_of_steps = number_of_steps + 1 primal = reduced_f(x) reduced_grad!(gradient, x) - strong_wolfe_gap = Strong_Frank_Wolfe_gap_probability_simplex(gradient) + strong_wolfe_gap = Strong_Frank_Wolfe_gap_probability_simplex(gradient, x) if trajectory push!( traj_data, @@ -612,7 +674,12 @@ end -# Sort projection for the simplex. +""" +projection_simplex_sort + +Perform a projection onto the unit probability simplex using +a sorting algorithm. +""" function projection_simplex_sort(x) n = length(x) if sum(x) == 1.0 && all(>=(0.0), x) @@ -627,23 +694,37 @@ function projection_simplex_sort(x) return w end -function Strong_Frank_Wolfe_gap_probability_simplex(gradient) - val_min = gradient[1] - val_max = gradient[1] - for i in 2:length(gradient) - temp_val = gradient[i] - if temp_val < val_min - val_min = temp_val - else +""" +Strong_Frank_Wolfe_gap_probability_simplex + +Compute the Strong-Wolfe gap over the unit probability simplex +given a gradient. +""" +function Strong_Frank_Wolfe_gap_probability_simplex(gradient, x) + val_min = Inf + val_max = -Inf + for i in 1:length(gradient) + if x[i] > 0 + temp_val = gradient[i] + if temp_val < val_min + val_min = temp_val + end if temp_val > val_max val_max = temp_val end end - end return val_max - val_min end + +""" +simplex_gradient_descent_over_convex_hull + +Minimizes an objective function over the convex hull of the active set +until the Strong-Wolfe gap is below tolerance using simplex gradient +descent. +""" function simplex_gradient_descent_over_convex_hull( f, grad!, @@ -684,7 +765,7 @@ function simplex_gradient_descent_over_convex_hull( a0 = active_set.atoms[1] empty!(active_set) push!(active_set, (1, a0)) - return false + return number_of_steps end # NOTE: sometimes the direction is non-improving # usual suspects are floating-point errors when multiplying atoms with near-zero weights @@ -695,7 +776,7 @@ function simplex_gradient_descent_over_convex_hull( If higher accuracy is required, consider using Double64 (still quite fast) and if that does not help BigFloat (slower) as type for the numbers. Alternatively, consider using AFW (with lazy = true) instead." println(fast_dot(sum(d[i] * active_set.atoms[i] for i in eachindex(active_set)), gradient)) - return true + return number_of_steps end η = eltype(d)(Inf) @@ -721,7 +802,7 @@ function simplex_gradient_descent_over_convex_hull( linesearch_method = L === nothing || !isfinite(L) ? backtracking : shortstep if linesearch_method == backtracking gamma, _ = - backtrackingLS(f, direction, x, x - y, 1.0, linesearch_tol=linesearch_tol, step_lim=step_lim) + backtrackingLS(f, gradient, x, x - y, 1.0, linesearch_tol=linesearch_tol, step_lim=step_lim) else # == shortstep, just two methods here for now gamma = fast_dot(gradient, x - y) / (L * norm(x - y)^2) end From bd69a589535532dd129d6e741bebe13ea2e3f013 Mon Sep 17 00:00:00 2001 From: alejandro-carderera Date: Thu, 11 Mar 2021 17:44:26 +0100 Subject: [PATCH 06/18] Delete old code and update test file. Add an example over the probability simplex that utilizes MaybeHotVectors, and an example over the K-Sparse LMO that uses sparse arrays. 
--- examples/blended_cg.jl | 169 ++++++++---------- src/blended_cg.jl | 379 ----------------------------------------- 2 files changed, 67 insertions(+), 481 deletions(-) diff --git a/examples/blended_cg.jl b/examples/blended_cg.jl index 7d80f6ac9..20a82823e 100644 --- a/examples/blended_cg.jl +++ b/examples/blended_cg.jl @@ -5,8 +5,8 @@ using DoubleFloats using FrankWolfe using SparseArrays -n = Int(5e2) -k = 1000 +n = Int(1000) +k = 10000 s = rand(1:100) @info "Seed $s" @@ -15,27 +15,7 @@ s = rand(1:100) s = 41 Random.seed!(s) -xpi = rand(n); -total = sum(xpi); -const xp = xpi # ./ total; - -#matrix = rand(n, n) -#hessian = matrix * transpose(matrix) -#linear_term = rand(n) -#constant_term = rand() -#f(x) = constant_term + dot(linear_term, x) + 0.5* transpose(x) * hessian * x -#function grad!(storage, x) -# storage .= linear_term + hessian * x -#end -#gradient = zeros(n) - - -#f(x) = norm(x - xp)^2 -#function grad!(storage, x) -# @. storage = 2 * (x - xp) -#end -#hessian = Matrix(2.0I, n, n) - +""" matrix = rand(n,n) hessian = transpose(matrix) * matrix linear = rand(n) @@ -45,45 +25,9 @@ function grad!(storage, x) end L = eigmax(hessian) -# better for memory consumption as we do coordinate-wise ops - -function cf(x, xp) - return @. norm(x - xp)^2 -end - -function cgrad!(storage, x, xp) - return @. storage = 2 * (x - xp) -end - -# this LMO might produce numerical instabilities do demonstrate the recovery feature -const lmo = FrankWolfe.KSparseLMO(50, 1.0) - -# full upgrade of the lmo (and hence optimization) to Double64. -# the same lmo with Double64 is much more numerically robust. costs relatively little in speed. - #const lmo = FrankWolfe.KSparseLMO(100, Double64(1.0)) - -# as above but now to bigfloats -# the same lmo here with bigfloat. even more robust but much slower -# const lmo = FrankWolfe.KSparseLMO(100, big"1.0") - -# other oracles to test / experiment with -# const lmo = FrankWolfe.LpNormLMO{Float64,1}(1.0) -#const lmo = FrankWolfe.ProbabilitySimplexOracle(Double64(1.0)); -#const lmo = FrankWolfe.ProbabilitySimplexOracle(1.0); -# const lmo = FrankWolfe.UnitSimplexOracle(1.0); - -const x00 = FrankWolfe.compute_extreme_point(lmo, zeros(n)) - -#active_set = FrankWolfe.ActiveSet([(1.0, x00)]) -#vect = FrankWolfe.compute_extreme_point(lmo, rand(n)) -#FrankWolfe.active_set_update!(active_set, 0.2, vect) -#vect = FrankWolfe.compute_extreme_point(lmo, rand(n)) -#FrankWolfe.active_set_update!(active_set, 0.2, vect) - -FrankWolfe.benchmark_oracles(x -> cf(x, xp), (str, x) -> cgrad!(str, x, xp), ()->randn(n), lmo; k=100) - -# copying here and below the x00 as the algorithms write back into the variables to save memory. -# as we do multiple runs from the same initial point we do not want this here. 
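As context for the probability-simplex example introduced next: the linear minimization oracle over the unit probability simplex returns the vertex rhs * e_i, where i is the coordinate of the smallest gradient entry. A simplified stand-in is sketched below; the real `FrankWolfe.ProbabilitySimplexOracle` returns a structured one-hot type (the `MaybeHotVector` mentioned in the commit message) rather than a dense vector, and `probability_simplex_extreme_point` is an illustrative name, not part of the package.

# Simplified stand-in for illustration only.
function probability_simplex_extreme_point(direction::AbstractVector{<:Real}, rhs::Real=1.0)
    i = argmin(direction)          # rhs * e_i minimizes dot(direction, v) over the simplex
    v = zeros(length(direction))
    v[i] = rhs
    return v
end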
+#Run over the probability simplex +lmo = FrankWolfe.ProbabilitySimplexOracle(1.0); +x00 = FrankWolfe.compute_extreme_point(lmo, zeros(n)) x0 = deepcopy(x00) x, v, primal, dual_gap, trajectoryBCG_accel_simplex = FrankWolfe.bcg( @@ -140,8 +84,65 @@ x, v, primal, dual_gap, trajectoryBCG_convex = FrankWolfe.bcg( weight_purge_threshold=1e-10, ) +data = [trajectoryBCG_accel_simplex, trajectoryBCG_simplex, trajectoryBCG_convex] +label = ["BCG (accel simplex)", "BCG (simplex)", "BCG (convex)"] +FrankWolfe.plot_trajectories(data, label) +""" + + +matrix = rand(n,n) +hessian = transpose(matrix) * matrix +linear = rand(n) +f(x) = dot(linear, x) + 0.5*transpose(x) * hessian * x +function grad!(storage, x) + storage .= linear + hessian * x +end +L = eigmax(hessian) + +#Run over the K-sparse polytope +lmo = FrankWolfe.KSparseLMO(100, 100.0) +x00 = FrankWolfe.compute_extreme_point(lmo, zeros(n)) + x0 = deepcopy(x00) -x, v, primal, dual_gap, trajectoryBCG_convex_backup = FrankWolfe.bcg_backup( +x, v, primal, dual_gap, trajectoryBCG_accel_simplex = FrankWolfe.bcg( + f, + grad!, + lmo, + x0, + max_iteration=k, + line_search=FrankWolfe.adaptive, + print_iter=k / 10, + hessian = hessian, + emphasis=FrankWolfe.memory, + L=L, + accelerated = true, + verbose=true, + trajectory=true, + Ktolerance=1.00, + weight_purge_threshold=1e-10, +) + +x0 = deepcopy(x00) +x, v, primal, dual_gap, trajectoryBCG_simplex = FrankWolfe.bcg( + f, + grad!, + lmo, + x0, + max_iteration=k, + line_search=FrankWolfe.adaptive, + print_iter=k / 10, + hessian = hessian, + emphasis=FrankWolfe.memory, + L=L, + accelerated = false, + verbose=true, + trajectory=true, + Ktolerance=1.00, + weight_purge_threshold=1e-10, +) + +x0 = deepcopy(x00) +x, v, primal, dual_gap, trajectoryBCG_convex = FrankWolfe.bcg( f, grad!, lmo, @@ -154,45 +155,9 @@ x, v, primal, dual_gap, trajectoryBCG_convex_backup = FrankWolfe.bcg_backup( verbose=true, trajectory=true, Ktolerance=1.00, - goodstep_tolerance=0.95, weight_purge_threshold=1e-10, ) -data = [trajectoryBCG_accel_simplex, trajectoryBCG_simplex, trajectoryBCG_convex, trajectoryBCG_convex_backup] -label = ["BCG (accel simplex)", "BCG (simplex)", "BCG (convex)", "BCG (convex backup)"] -FrankWolfe.plot_trajectories(data, label, filename="output_results.png") - -#x0 = deepcopy(x00) -#@time x, v, primal, dual_gap, trajectoryAda = FrankWolfe.afw( -# f, -# grad!, -# lmo, -# x0, -# max_iteration=k, -# line_search=FrankWolfe.adaptive, -# L=100, -# print_iter=k / 10, -# emphasis=FrankWolfe.memory, -# verbose=true, -# trajectory=true, -#); - -#x0 = deepcopy(x00) -#@time x, v, primal, dual_gap, trajectorySs = FrankWolfe.fw( -# f, -# grad!, -# lmo, -# x0, -# max_iteration=k, -# line_search=FrankWolfe.shortstep, -# L=2, -# print_iter=k / 10, -# emphasis=FrankWolfe.memory, -# verbose=true, -# trajectory=true, -#); - -#data = [trajectorySs, trajectoryAda, trajectoryBCG] -#label = ["short step", "AFW", "BCG"] -# -#FrankWolfe.plot_trajectories(data, label) +data = [trajectoryBCG_accel_simplex, trajectoryBCG_simplex, trajectoryBCG_convex] +label = ["BCG (accel simplex)", "BCG (simplex)", "BCG (convex)"] +FrankWolfe.plot_trajectories(data, label) \ No newline at end of file diff --git a/src/blended_cg.jl b/src/blended_cg.jl index c0361bd46..98610dcac 100644 --- a/src/blended_cg.jl +++ b/src/blended_cg.jl @@ -853,385 +853,6 @@ function simplex_gradient_descent_over_convex_hull( end end - - -function bcg_backup( - f, - grad!, - lmo, - x0; - line_search::LineSearchMethod=adaptive, - L=Inf, - gamma0=0, - step_lim=20, - epsilon=1e-7, 
- max_iteration=10000, - print_iter=1000, - trajectory=false, - verbose=false, - linesearch_tol=1e-7, - emphasis=nothing, - Ktolerance=1.0, - goodstep_tolerance=1.0, - weight_purge_threshold=1e-9, - gradient=nothing, - direction_storage=nothing, - lmo_kwargs..., -) - function print_header(data) - @printf( - "\n────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n" - ) - @printf( - "%6s %13s %14s %14s %14s %14s %14s %14s %14s\n", - data[1], - data[2], - data[3], - data[4], - data[5], - data[6], - data[7], - data[8], - data[9], - ) - @printf( - "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n" - ) - end - - function print_footer() - @printf( - "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n\n" - ) - end - - function print_iter_func(data) - @printf( - "%6s %13s %14e %14e %14e %14e %14i %14i %14i\n", - st[Symbol(data[1])], - data[2], - Float64(data[3]), - Float64(data[4]), - Float64(data[5]), - data[6], - data[7], - data[8], - data[9], - ) - end - - t = 0 - primal = Inf - dual_gap = Inf - active_set = ActiveSet([(1.0, x0)]) - x = x0 - if gradient === nothing - gradient = similar(x0, float(eltype(x0))) - end - grad!(gradient, x) - # initial gap estimate computation - vmax = compute_extreme_point(lmo, gradient) - phi = fast_dot(gradient, x0 - vmax) / 2 - traj_data = [] - tt = regular - time_start = time_ns() - v = x0 - if direction_storage === nothing - direction_storage = Vector{float(eltype(x))}() - Base.sizehint!(direction_storage, 100) - end - - if line_search == shortstep && !isfinite(L) - @error("Lipschitz constant not set to a finite value. Prepare to blow up spectacularly.") - end - - if line_search == agnostic || line_search == nonconvex - @error("Lazification is not known to converge with open-loop step size strategies.") - end - - if line_search == fixed && gamma0 == 0 - println("WARNING: gamma0 not set. 
We are not going to move a single bit.") - end - - if verbose - println("\nBlended Conditional Gradients Algorithm.") - numType = eltype(x0) - println( - "EMPHASIS: $memory STEPSIZE: $line_search EPSILON: $epsilon MAXITERATION: $max_iteration TYPE: $numType", - ) - println("K: $Ktolerance") - println("WARNING: In memory emphasis mode iterates are written back into x0!") - headers = ( - "Type", - "Iteration", - "Primal", - "Dual", - "Dual Gap", - "Time", - "#ActiveSet", - "#non-simplex", - "#forced FW", - ) - print_header(headers) - end - - if !isa(x, Union{Array, SparseVector}) - x = convert(Array{float(eltype(x))}, x) - end - non_simplex_iter = 0 - nforced_fw = 0 - force_fw_step = false - - while t <= max_iteration && phi ≥ epsilon - # TODO replace with single call interface from function_gradient.jl - primal = f(x) - grad!(gradient, x) - if !force_fw_step - (idx_fw, idx_as, good_progress) = find_minmax_directions( - active_set, gradient, phi, goodstep_tolerance=goodstep_tolerance, - ) - end - if !force_fw_step && good_progress - tt = simplex_descent - force_fw_step = update_simplex_gradient_descent!( - active_set, - gradient, - f, - L=nothing, #don't use the same L as we transform the function - linesearch_tol=linesearch_tol, - weight_purge_threshold=weight_purge_threshold, - storage=direction_storage, - ) - nforced_fw += force_fw_step - else - non_simplex_iter += 1 - # compute new atom - (v, value) = lp_separation_oracle( - lmo, - active_set, - gradient, - phi, - Ktolerance; - inplace_loop=(emphasis == memory), - force_fw_step=force_fw_step, - lmo_kwargs..., - ) - - force_fw_step = false - xval = fast_dot(x, gradient) - if value > xval - phi/Ktolerance - tt = dualstep - # setting gap estimate as ∇f(x) (x - v_FW) / 2 - phi = (xval - value) / 2 - else - tt = regular - gamma, L = line_search_wrapper(line_search,t,f,grad!,x,x - v,gradient,dual_gap,L,gamma0,linesearch_tol,step_lim, 1.0) - - if gamma == 1.0 - active_set_initialize!(active_set, v) - else - active_set_update!(active_set, gamma, v) - end - end - end - x = compute_active_set_iterate(active_set) - dual_gap = phi - if trajectory - push!( - traj_data, - ( - t, - primal, - primal - dual_gap, - dual_gap, - (time_ns() - time_start) / 1.0e9, - length(active_set), - ), - ) - end - - if verbose && mod(t, print_iter) == 0 - if t == 0 - tt = initial - end - rep = ( - tt, - string(t), - primal, - primal - dual_gap, - dual_gap, - (time_ns() - time_start) / 1.0e9, - length(active_set), - non_simplex_iter, - nforced_fw, - ) - print_iter_func(rep) - flush(stdout) - end - t = t + 1 - end - if verbose - x = compute_active_set_iterate(active_set) - grad!(gradient, x) - v = compute_extreme_point(lmo, gradient) - primal = f(x) - dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) - rep = ( - last, - string(t - 1), - primal, - primal - dual_gap, - dual_gap, - (time_ns() - time_start) / 1.0e9, - length(active_set), - non_simplex_iter, - nforced_fw, - ) - print_iter_func(rep) - flush(stdout) - end - active_set_cleanup!(active_set, weight_purge_threshold=weight_purge_threshold) - active_set_renormalize!(active_set) - x = compute_active_set_iterate(active_set) - grad!(gradient, x) - v = compute_extreme_point(lmo, gradient) - primal = f(x) - #dual_gap = 2phi - dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) - if verbose - rep = ( - pp, - string(t - 1), - primal, - primal - dual_gap, - dual_gap, - (time_ns() - time_start) / 1.0e9, - length(active_set), - non_simplex_iter, - nforced_fw, - ) - print_iter_func(rep) - print_footer() - 
flush(stdout) - end - return x, v, primal, dual_gap, traj_data -end - - -""" - update_simplex_gradient_descent!(active_set::ActiveSet, direction, f) - -Performs a Simplex Gradient Descent step and modifies `active_set` inplace. - -Returns boolean flag -> whether next step must be a FW step (if numerical instability). - -Algorithm reference and notation taken from: -Blended Conditional Gradients:The Unconditioning of Conditional Gradients -https://arxiv.org/abs/1805.07311 -""" -function update_simplex_gradient_descent!( - active_set::ActiveSet, - direction, - f; - L=nothing, - linesearch_tol=10e-10, - step_lim=100, - weight_purge_threshold=1e-12, - storage=nothing, -) - c = if storage === nothing - [fast_dot(direction, a) for a in active_set.atoms] - else - if length(storage) == length(active_set) - for (idx, a) in enumerate(active_set.atoms) - storage[idx] = fast_dot(direction, a) - end - storage - elseif length(storage) > length(active_set) - for (idx, a) in enumerate(active_set.atoms) - storage[idx] = fast_dot(direction, a) - end - storage[1:length(active_set)] - else - for idx in 1:length(storage) - storage[idx] = fast_dot(direction, active_set.atoms[idx]) - end - for idx in (length(storage)+1):length(active_set) - push!(storage, fast_dot(direction, active_set.atoms[idx])) - end - storage - end - end - k = length(active_set) - c .-= (sum(c) / k) - # name change to stay consistent with the paper, c is actually updated in-place - d = c - if norm(d) <= 1e-8 - @info "Resetting active set." - # resetting active set to singleton - a0 = active_set.atoms[1] - active_set_initialize!(active_set, a0) - return false - end - # NOTE: sometimes the direction is non-improving - # usual suspects are floating-point errors when multiplying atoms with near-zero weights - @inbounds if fast_dot(sum(d[i] * active_set.atoms[i] for i in eachindex(active_set)), direction) < 0 - defect = fast_dot(sum(d[i] * active_set.atoms[i] for i in eachindex(active_set)), direction) - @warn "Non-improving d ($defect) due to numerical instability. Temporarily upgrading precision to BigFloat for the current iteration. - If the numerical instability is persistent try to run the whole algorithm with Double64 (still quite fast) or BigFloat (slower)." - bdir = big.(direction) - c = [fast_dot(bdir, a) for a in active_set.atoms] - c .-= sum(c) / k - d = c - @inbounds if fast_dot(sum(d[i] * active_set.atoms[i] for i in eachindex(active_set)), direction) < 0 - @warn "d non-improving in large precision, forcing FW" - @warn "dot value: $(fast_dot(sum(d[i] * active_set.atoms[i] for i in eachindex(active_set)), direction))" - return true - end - end - - η = eltype(d)(Inf) - rem_idx = -1 - @inbounds for idx in eachindex(d) - if d[idx] > 0 - max_val = active_set.weights[idx] / d[idx] - if η > max_val - η = max_val - rem_idx = idx - end - end - end - - - - # TODO at some point avoid materializing both x and y - x = copy(active_set.x) - η = max(0, η) - @. active_set.weights -= η * d - y = copy(update_active_set_iterate!(active_set)) - if f(x) ≥ f(y) - active_set_cleanup!(active_set, weight_purge_threshold=weight_purge_threshold) - return false - end - linesearch_method = L === nothing || !isfinite(L) ? 
backtracking : shortstep - if linesearch_method == backtracking - gamma, _ = - backtrackingLS(f, direction, x, x - y, 1.0, linesearch_tol=linesearch_tol, step_lim=step_lim) - else # == shortstep, just two methods here for now - gamma = fast_dot(direction, x - y) / (L * norm(x - y)^2) - end - gamma = min(1.0, gamma) - # step back from y to x by (1 - γ) η d - # new point is x - γ η d - if gamma == 1.0 - active_set_cleanup!(active_set, weight_purge_threshold=weight_purge_threshold) - else - @. active_set.weights += η * (1 - gamma) * d - @. active_set.x = x + gamma * (y - x) - end - return false -end - """ Returns either a tuple `(y, val)` with `y` an atom from the active set satisfying the progress criterion and `val` the corresponding gap `dot(y, direction)` From 84efc1ae384c783ebbd8498baac2020b8efa5dd4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mathieu=20Besan=C3=A7on?= Date: Fri, 12 Mar 2021 12:03:28 +0100 Subject: [PATCH 07/18] Update src/blended_cg.jl --- src/blended_cg.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/blended_cg.jl b/src/blended_cg.jl index 98610dcac..a6cc5e978 100644 --- a/src/blended_cg.jl +++ b/src/blended_cg.jl @@ -578,7 +578,7 @@ function accelerated_simplex_gradient_descent_over_probability_simplex( primal, primal - tolerance, tolerance, - (time_ns() - time_start) / 1.0e9, + (time_ns() - time_start) / 1e9, length(initial_point), ), ) From 6559f0c56a0b6414dc0dde816d2a2745235d490a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mathieu=20Besan=C3=A7on?= Date: Fri, 12 Mar 2021 12:03:43 +0100 Subject: [PATCH 08/18] Update examples/blended_cg.jl --- examples/blended_cg.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/blended_cg.jl b/examples/blended_cg.jl index 20a82823e..04eda74d5 100644 --- a/examples/blended_cg.jl +++ b/examples/blended_cg.jl @@ -160,4 +160,4 @@ x, v, primal, dual_gap, trajectoryBCG_convex = FrankWolfe.bcg( data = [trajectoryBCG_accel_simplex, trajectoryBCG_simplex, trajectoryBCG_convex] label = ["BCG (accel simplex)", "BCG (simplex)", "BCG (convex)"] -FrankWolfe.plot_trajectories(data, label) \ No newline at end of file +FrankWolfe.plot_trajectories(data, label) From 66ea33cb0fb78fdf30f393404b8b1b1833c662ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mathieu=20Besan=C3=A7on?= Date: Fri, 12 Mar 2021 12:20:46 +0100 Subject: [PATCH 09/18] Update examples/blended_cg.jl --- examples/blended_cg.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/blended_cg.jl b/examples/blended_cg.jl index 04eda74d5..dc0fe84c5 100644 --- a/examples/blended_cg.jl +++ b/examples/blended_cg.jl @@ -5,7 +5,7 @@ using DoubleFloats using FrankWolfe using SparseArrays -n = Int(1000) +n = 1000 k = 10000 s = rand(1:100) From 93196102c008f810f5680156d658cf552e0cd79c Mon Sep 17 00:00:00 2001 From: alejandro-carderera <56440520+alejandro-carderera@users.noreply.github.com> Date: Fri, 12 Mar 2021 12:33:11 +0100 Subject: [PATCH 10/18] Update src/blended_cg.jl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Mathieu Besançon --- src/blended_cg.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/blended_cg.jl b/src/blended_cg.jl index a6cc5e978..0f196bcb2 100644 --- a/src/blended_cg.jl +++ b/src/blended_cg.jl @@ -412,7 +412,7 @@ function minimize_over_convex_hull( end """ -build_reduced_problem + build_reduced_problem(atoms::AbstractVector{<:FrankWolfe.MaybeHotVector}, hessian, weights, gradient, tolerance) Given an active set formed by 
MaybeHotVector, a (constant) Hessian and a gradient constructs a quadratic problem From 6220695316e76aef748bd8b5eba85f262b63d864 Mon Sep 17 00:00:00 2001 From: alejandro-carderera <56440520+alejandro-carderera@users.noreply.github.com> Date: Fri, 12 Mar 2021 12:33:23 +0100 Subject: [PATCH 11/18] Update src/blended_cg.jl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Mathieu Besançon --- src/blended_cg.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/blended_cg.jl b/src/blended_cg.jl index 0f196bcb2..31b2fdef4 100644 --- a/src/blended_cg.jl +++ b/src/blended_cg.jl @@ -472,7 +472,7 @@ function build_reduced_problem(atoms::AbstractVector{<:SparseArrays.AbstractSpar end """ -build_reduced_problem + build_reduced_problem(atoms::AbstractVector{<:Array}, hessian, weights, gradient, tolerance) Same as the two function above, but for the case where the active set is formed by normal Julia Arrays. From 9553587213786e1393289a5e6c1aa358322fac64 Mon Sep 17 00:00:00 2001 From: alejandro-carderera <56440520+alejandro-carderera@users.noreply.github.com> Date: Fri, 12 Mar 2021 12:33:32 +0100 Subject: [PATCH 12/18] Update src/blended_cg.jl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Mathieu Besançon --- src/blended_cg.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/blended_cg.jl b/src/blended_cg.jl index 31b2fdef4..65888361f 100644 --- a/src/blended_cg.jl +++ b/src/blended_cg.jl @@ -475,7 +475,7 @@ end build_reduced_problem(atoms::AbstractVector{<:Array}, hessian, weights, gradient, tolerance) Same as the two function above, but for the case where the active -set is formed by normal Julia Arrays. +set is formed by dense Arrays. """ function build_reduced_problem(atoms::AbstractVector{<:Array}, hessian, weights, gradient, tolerance) n = length(atoms[1]) From ba57b071ebe959506d72091d74c40b0f48620400 Mon Sep 17 00:00:00 2001 From: alejandro-carderera <56440520+alejandro-carderera@users.noreply.github.com> Date: Fri, 12 Mar 2021 12:33:41 +0100 Subject: [PATCH 13/18] Update src/blended_cg.jl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Mathieu Besançon --- src/blended_cg.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/blended_cg.jl b/src/blended_cg.jl index 65888361f..e6dc1bcfc 100644 --- a/src/blended_cg.jl +++ b/src/blended_cg.jl @@ -499,7 +499,7 @@ function build_reduced_problem(atoms::AbstractVector{<:Array}, hessian, weights, end """ -Checks the strong-Wolfe gap for the reduced problem. +Checks the strong Frank-Wolfe gap for the reduced problem. 
""" function Strong_Frank_Wolfe_gap(gradient) val_min = Inf From 96c34dc2258dac86a6a58e0f7e85fa27b5539680 Mon Sep 17 00:00:00 2001 From: alejandro-carderera <56440520+alejandro-carderera@users.noreply.github.com> Date: Fri, 12 Mar 2021 12:33:51 +0100 Subject: [PATCH 14/18] Update src/blended_cg.jl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Mathieu Besançon --- src/blended_cg.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/blended_cg.jl b/src/blended_cg.jl index e6dc1bcfc..1552cef01 100644 --- a/src/blended_cg.jl +++ b/src/blended_cg.jl @@ -346,7 +346,7 @@ function minimize_over_convex_hull( #Rewrite as problem over the simplex M, b = build_reduced_problem(active_set.atoms, hessian, active_set.weights, gradient, tolerance) #Early exit if we have detected that the strong-Wolfe gap is below the desired tolerance while building the reduced problem. - if isnothing(M) && isnothing(b) + if isnothing(M) return 0 end #In case the matrices are DoubleFloats we need to cast them as Float64, because LinearAlgebra does not work with them. From 25484e1b3cf69918f3175bbf0d6ae33090a1e922 Mon Sep 17 00:00:00 2001 From: alejandro-carderera <56440520+alejandro-carderera@users.noreply.github.com> Date: Fri, 12 Mar 2021 12:34:04 +0100 Subject: [PATCH 15/18] Update src/blended_cg.jl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Mathieu Besançon --- src/blended_cg.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/blended_cg.jl b/src/blended_cg.jl index 1552cef01..56b21bbf1 100644 --- a/src/blended_cg.jl +++ b/src/blended_cg.jl @@ -357,7 +357,7 @@ function minimize_over_convex_hull( L_reduced = eigmax(M) #L_reduced = Arpack.eigs(M, nev=1, which=:LM) end - reduced_f(y) = f(x) - dot(gradient, x) + 0.5*transpose(x) * hessian * x + dot(b, y) + 0.5*transpose(y) * M * y + reduced_f(y) = f(x) - dot(gradient, x) + 0.5*transpose(x) * hessian * x + dot(b, y) + 0.5*transpose(y) * M * y function reduced_grad!(storage, x) storage .= b + M*x end From 4e02c0145bb05d3b74fd44e81d47edf0a2ceae8e Mon Sep 17 00:00:00 2001 From: alejandro-carderera <56440520+alejandro-carderera@users.noreply.github.com> Date: Fri, 12 Mar 2021 12:34:48 +0100 Subject: [PATCH 16/18] Update src/blended_cg.jl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Mathieu Besançon --- src/blended_cg.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/blended_cg.jl b/src/blended_cg.jl index 56b21bbf1..68d658be8 100644 --- a/src/blended_cg.jl +++ b/src/blended_cg.jl @@ -557,7 +557,7 @@ function accelerated_simplex_gradient_descent_over_probability_simplex( gamma = (1 - sqrt(q))/(1 + sqrt(q)) end while strong_wolfe_gap > tolerance - @. x_old = x + @. x_old = x reduced_grad!(gradient_y, y) x = projection_simplex_sort(y .- gradient_y/L) if mu < 1.0e-3 From 68eeb501a118fe76272348b92d0e04c16dcfa9f5 Mon Sep 17 00:00:00 2001 From: alejandro-carderera Date: Fri, 12 Mar 2021 12:44:16 +0100 Subject: [PATCH 17/18] Rest of Mathieu's comments. Incorporate the rest of Mathieu's comments from Github. 
--- examples/blended_cg.jl | 4 ++-- src/blended_cg.jl | 15 ++++++++------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/examples/blended_cg.jl b/examples/blended_cg.jl index dc0fe84c5..812f8a31c 100644 --- a/examples/blended_cg.jl +++ b/examples/blended_cg.jl @@ -15,7 +15,7 @@ s = rand(1:100) s = 41 Random.seed!(s) -""" + matrix = rand(n,n) hessian = transpose(matrix) * matrix linear = rand(n) @@ -87,7 +87,7 @@ x, v, primal, dual_gap, trajectoryBCG_convex = FrankWolfe.bcg( data = [trajectoryBCG_accel_simplex, trajectoryBCG_simplex, trajectoryBCG_convex] label = ["BCG (accel simplex)", "BCG (simplex)", "BCG (convex)"] FrankWolfe.plot_trajectories(data, label) -""" + matrix = rand(n,n) diff --git a/src/blended_cg.jl b/src/blended_cg.jl index 68d658be8..a72f1eff3 100644 --- a/src/blended_cg.jl +++ b/src/blended_cg.jl @@ -148,7 +148,7 @@ function bcg( while t <= max_iteration && phi ≥ epsilon # TODO replace with single call interface from function_gradient.jl #Mininize over the convex hull until strong Wolfe gap is below a given tolerance. - num_simplex_descent_steps = minimize_over_convex_hull( + num_simplex_descent_steps = minimize_over_convex_hull!( f, grad!, gradient, @@ -280,7 +280,7 @@ end """ -minimize_over_convex_hull +minimize_over_convex_hull! Given a function f with gradient grad! and an active set active_set this function will minimize the function over @@ -294,7 +294,7 @@ probability simplex using gradient descent or Nesterov's accelerated gradient descent. """ -function minimize_over_convex_hull( +function minimize_over_convex_hull!( f, grad!, gradient, @@ -422,6 +422,11 @@ active set. If λ are the barycentric coordinates of dimension equal to the cardinality of the active set, the objective function is: f(λ) = reduced_linear^T λ + 0.5 * λ^T reduced_hessian λ + +In the case where we find that the current iterate has a strong-Wolfe +gap over the convex hull of the active set that is below the tolerance +we return nothing (as there is nothing to do). + """ function build_reduced_problem(atoms::AbstractVector{<:FrankWolfe.MaybeHotVector}, hessian, weights, gradient, tolerance) n = atoms[1].len @@ -461,9 +466,7 @@ function build_reduced_problem(atoms::AbstractVector{<:SparseArrays.AbstractSpar #Construct the matrix of vertices. vertex_matrix = zeros(n, k) - #reduced_linear = zeros(k) for i in 1:k - #reduced_linear[i] = dot(atoms[i], gradient) vertex_matrix[:, i] .= atoms[i] end reduced_hessian = transpose(vertex_matrix) * hessian * vertex_matrix @@ -488,9 +491,7 @@ function build_reduced_problem(atoms::AbstractVector{<:Array}, hessian, weights, #Construct the matrix of vertices. vertex_matrix = zeros(n, k) - #reduced_linear = zeros(k) for i in 1:k - #reduced_linear[i] = dot(atoms[i], gradient) vertex_matrix[:, i] .= atoms[i] end reduced_hessian = transpose(vertex_matrix) * hessian * vertex_matrix From 6c0c9323535ce3758bae9983ea4535dee150f6c1 Mon Sep 17 00:00:00 2001 From: alejandro-carderera Date: Fri, 12 Mar 2021 13:24:38 +0100 Subject: [PATCH 18/18] Modify tests to account for the new changes. Modified tests to account for the new structure of the code. 
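For the strongly convex branch of `accelerated_simplex_gradient_descent_over_probability_simplex` touched in the patches above, the constant momentum (1 - sqrt(q)) / (1 + sqrt(q)) with q = mu / L is the classical accelerated scheme sketched below. This is a simplified, self-contained illustration assuming known mu and L; it is not the package routine, which also handles the nearly non-strongly-convex case mu ≈ 0 with a different momentum rule and stops on the strong-Wolfe gap rather than a fixed iteration budget.

# Simplified sketch of accelerated projected gradient descent over the probability simplex.
function accelerated_pgd_simplex(grad!, x0, mu, L; maxiter=1000)
    # sort-based projection onto {w : w >= 0, sum(w) = 1}
    project(z) = begin
        u = sort(z, rev=true); c = cumsum(u)
        rho = findlast(i -> u[i] * i > c[i] - 1, 1:length(z))
        max.(z .- (c[rho] - 1) / rho, 0.0)
    end
    q = mu / L
    beta = (1 - sqrt(q)) / (1 + sqrt(q))     # constant momentum for the strongly convex case
    x = copy(x0); y = copy(x0); g = similar(x0)
    for _ in 1:maxiter
        grad!(g, y)
        x_new = project(y .- g ./ L)         # projected gradient step from the extrapolated point
        y = x_new .+ beta .* (x_new .- x)    # momentum extrapolation
        x = x_new
    end
    return x
end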
--- test/active_set.jl | 18 +++++++++--------- test/runtests.jl | 4 ++-- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/test/active_set.jl b/test/active_set.jl index f13f17d10..8aafb2845 100644 --- a/test/active_set.jl +++ b/test/active_set.jl @@ -93,18 +93,19 @@ end x = FrankWolfe.compute_active_set_iterate(active_set) @test x ≈ [0, 0.5] f(x) = (x[1] - 1)^2 + (x[2] - 1)^2 - ∇f(x) = [2 * (x[1] - 1), 2 * (x[2] - 1)] - gradient_dir = ∇f([0, 0.5]) - FrankWolfe.update_simplex_gradient_descent!(active_set, gradient_dir, f) + + gradient = similar(x) + function grad!(storage, x) + storage .= [2 * (x[1] - 1), 2 * (x[2] - 1)] + end + FrankWolfe.simplex_gradient_descent_over_convex_hull(f, grad!, gradient, active_set, 1.0e-3, 1, false, [], 0.0, 0) @test length(active_set) == 2 @test [1, 0] ∈ active_set.atoms @test [0, 1] ∈ active_set.atoms - active_set2 = ActiveSet([(0.5, [0, 0]), (0.0, [0, 1]), (0.5, [1, 0])]) x2 = FrankWolfe.compute_active_set_iterate(active_set2) @test x2 ≈ [0.5, 0] - gradient_dir = ∇f(FrankWolfe.compute_active_set_iterate(active_set2)) - FrankWolfe.update_simplex_gradient_descent!(active_set2, gradient_dir, f, L=4.0) + FrankWolfe.simplex_gradient_descent_over_convex_hull(f, grad!, gradient, active_set2, 1.0e-3, 1, false, [], 0.0, 0) @test length(active_set) == 2 @test [1, 0] ∈ active_set.atoms @test [0, 1] ∈ active_set.atoms @@ -112,9 +113,8 @@ end # updating again (at optimum) triggers the active set emptying for as in (active_set, active_set2) x = FrankWolfe.compute_active_set_iterate(as) - gradient_dir = ∇f(x) - FrankWolfe.update_simplex_gradient_descent!(as, gradient_dir, f) - @test length(active_set) == 1 + number_of_steps = FrankWolfe.simplex_gradient_descent_over_convex_hull(f, grad!, gradient, as, 1.0e-3, 1, false, [], 0.0, 0) + @test number_of_steps == 0 end end diff --git a/test/runtests.jl b/test/runtests.jl index 250869f6f..3b7e685a0 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -17,8 +17,8 @@ include("utils.jl") f(x) = norm(x)^2 gradient = similar(a) grad!(gradient, a) - @test FrankWolfe.backtrackingLS(f, gradient, a, a - b, 1.0) == (1, 0.5) - @test abs(FrankWolfe.segment_search(f, grad!, a, a - b, 1.0)[2] - 0.5) < 0.0001 + @test FrankWolfe.backtrackingLS(f, gradient, a, a - b, 1.0) == (0.5, 1) + @test abs(FrankWolfe.segment_search(f, grad!, a, a - b, 1.0)[1] - 0.5) < 0.0001 end @testset "FrankWolfe.jl" begin
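The final two test changes reflect the updated return order of the line search helpers: the step size (0.5 here) is now the first element of the returned tuple. An illustrative decrease-only halving search with that convention is sketched below; `halving_linesearch` is not `FrankWolfe.backtrackingLS`, only a stand-in to show the (gamma, iterations) ordering.

# Illustrative only: halve gamma until f decreases along the step x - gamma * d.
function halving_linesearch(f, x, d; gamma0=1.0, step_lim=20)
    gamma = gamma0
    fx = f(x)
    it = 0
    while f(x .- gamma .* d) > fx && it < step_lim
        gamma /= 2
        it += 1
    end
    return gamma, it    # step size first, then the number of halvings
end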