From 1456118064683d1d233db11d8b05ac54733dd942 Mon Sep 17 00:00:00 2001 From: alejandro-carderera Date: Thu, 4 Mar 2021 16:14:24 -0500 Subject: [PATCH 01/18] WIP: Starting reorganization of BCG and addition of enhancements. --- examples/bcgTests.jl | 72 ++++--- src/blended_cg.jl | 462 +++++++++++++++++++++++++++++++++++++++++++ src/utils.jl | 8 +- 3 files changed, 513 insertions(+), 29 deletions(-) diff --git a/examples/bcgTests.jl b/examples/bcgTests.jl index 8674e6146..fcdf29fce 100644 --- a/examples/bcgTests.jl +++ b/examples/bcgTests.jl @@ -4,7 +4,7 @@ using Random using DoubleFloats n = Int(1e4) -k = 3000 +k = 1000 s = rand(1:100) @info "Seed $s" @@ -52,37 +52,24 @@ const x00 = FrankWolfe.compute_extreme_point(lmo, zeros(n)) FrankWolfe.benchmark_oracles(x -> cf(x, xp), (str, x) -> cgrad!(str, x, xp), ()->randn(n), lmo; k=100) x0 = deepcopy(x00) -@time x, v, primal, dual_gap, trajectorySs = FrankWolfe.fw( - f, - grad!, - lmo, - x0, - max_iteration=k, - line_search=FrankWolfe.shortstep, - L=2, - print_iter=k / 10, - emphasis=FrankWolfe.memory, - verbose=true, - trajectory=true, -); - -x0 = deepcopy(x00) -@time x, v, primal, dual_gap, trajectoryAda = FrankWolfe.afw( +x, v, primal, dual_gap, trajectoryBCG = FrankWolfe.bcg( f, grad!, lmo, x0, max_iteration=k, line_search=FrankWolfe.adaptive, - L=100, print_iter=k / 10, emphasis=FrankWolfe.memory, + L=2, verbose=true, trajectory=true, -); + Ktolerance=1.00, + goodstep_tolerance=0.95, + weight_purge_threshold=1e-10, +) -x0 = deepcopy(x00) -x, v, primal, dual_gap, trajectoryBCG = FrankWolfe.bcg( +x, v, primal, dual_gap, trajectoryBCG_backup = FrankWolfe.bcg_backup( f, grad!, lmo, @@ -99,7 +86,42 @@ x, v, primal, dual_gap, trajectoryBCG = FrankWolfe.bcg( weight_purge_threshold=1e-10, ) -data = [trajectorySs, trajectoryAda, trajectoryBCG] -label = ["short step", "AFW", "BCG"] - -FrankWolfe.plot_trajectories(data, label) +data = [trajectoryBCG, trajectoryBCG_backup] +label = ["BCG", "BCG backup"] + +FrankWolfe.plot_trajectories(data, label, filename="output_results.png") + +#x0 = deepcopy(x00) +#@time x, v, primal, dual_gap, trajectoryAda = FrankWolfe.afw( +# f, +# grad!, +# lmo, +# x0, +# max_iteration=k, +# line_search=FrankWolfe.adaptive, +# L=100, +# print_iter=k / 10, +# emphasis=FrankWolfe.memory, +# verbose=true, +# trajectory=true, +#); + +#x0 = deepcopy(x00) +#@time x, v, primal, dual_gap, trajectorySs = FrankWolfe.fw( +# f, +# grad!, +# lmo, +# x0, +# max_iteration=k, +# line_search=FrankWolfe.shortstep, +# L=2, +# print_iter=k / 10, +# emphasis=FrankWolfe.memory, +# verbose=true, +# trajectory=true, +#); + +#data = [trajectorySs, trajectoryAda, trajectoryBCG] +#label = ["short step", "AFW", "BCG"] +# +#FrankWolfe.plot_trajectories(data, label) diff --git a/src/blended_cg.jl b/src/blended_cg.jl index 1925001ac..f6eb0cc1d 100644 --- a/src/blended_cg.jl +++ b/src/blended_cg.jl @@ -1,3 +1,42 @@ +function print_header(data) + @printf( + "\n────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n" + ) + @printf( + "%6s %13s %14s %14s %14s %14s %14s %14s\n", + data[1], + data[2], + data[3], + data[4], + data[5], + data[6], + data[7], + data[8], + ) + @printf( + "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n" + ) +end + +function print_footer() + @printf( + 
"────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n\n" + ) +end + +function print_iter_func(data) + @printf( + "%6s %13s %14e %14e %14e %14e %14i %14i\n", + st[Symbol(data[1])], + data[2], + Float64(data[3]), + Float64(data[4]), + Float64(data[5]), + data[6], + data[7], + data[8], + ) +end function bcg( f, @@ -21,6 +60,429 @@ function bcg( gradient=nothing, direction_storage=nothing, lmo_kwargs..., +) + t = 0 + primal = Inf + dual_gap = Inf + active_set = ActiveSet([(1.0, x0)]) + x = x0 + if gradient === nothing + gradient = similar(x0, float(eltype(x0))) + end + primal = f(x) + grad!(gradient, x) + # initial gap estimate computation + vmax = compute_extreme_point(lmo, gradient) + phi = fast_dot(gradient, x0 - vmax) / 2 + dual_gap = phi + traj_data = [] + tt = regular + time_start = time_ns() + v = x0 + if direction_storage === nothing + direction_storage = Vector{float(eltype(x))}() + Base.sizehint!(direction_storage, 100) + end + + if line_search == shortstep && !isfinite(L) + @error("Lipschitz constant not set to a finite value. Prepare to blow up spectacularly.") + end + + if line_search == agnostic || line_search == nonconvex + @error("Lazification is not known to converge with open-loop step size strategies.") + end + + if line_search == fixed && gamma0 == 0 + println("WARNING: gamma0 not set. We are not going to move a single bit.") + end + + if verbose + println("\nBlended Conditional Gradients Algorithm.") + numType = eltype(x0) + println( + "EMPHASIS: $memory STEPSIZE: $line_search EPSILON: $epsilon MAXITERATION: $max_iteration TYPE: $numType", + ) + println("K: $Ktolerance") + println("WARNING: In memory emphasis mode iterates are written back into x0!") + headers = ( + "Type", + "Iteration", + "Primal", + "Dual", + "Dual Gap", + "Time", + "#ActiveSet", + "#non-simplex", + "#forced FW", + ) + print_header(headers) + end + if !isa(x, Union{Array, SparseVector}) + x = convert(Array{float(eltype(x))}, x) + end + non_simplex_iter = 0 + nforced_fw = 0 + force_fw_step = false + if verbose && mod(t, print_iter) == 0 + if t == 0 + tt = initial + end + rep = ( + tt, + string(t), + primal, + primal - dual_gap, + dual_gap, + (time_ns() - time_start) / 1.0e9, + length(active_set), + non_simplex_iter, + nforced_fw, + ) + print_iter_func(rep) + flush(stdout) + end + + while t <= max_iteration && phi ≥ epsilon + # TODO replace with single call interface from function_gradient.jl + #Mininize over the convex hull until strong Wolfe gap is below a given tolerance. + num_simplex_descent_steps = minimize_over_convex_hull( + f, + grad!, + gradient, + active_set::ActiveSet, + phi, + t, + trajectory, + traj_data, + time_start, + non_simplex_iter, + verbose = verbose, + print_iter=print_iter, + hessian = nothing, + L=L, + accelerated = false, + ) + + t = t + num_simplex_descent_steps + #Take a FW step. 
+ x = compute_active_set_iterate(active_set) + primal = f(x) + grad!(gradient, x) + non_simplex_iter += 1 + # compute new atom + (v, value) = lp_separation_oracle( + lmo, + active_set, + gradient, + phi, + Ktolerance; + inplace_loop=(emphasis == memory), + force_fw_step=force_fw_step, + lmo_kwargs..., + ) + force_fw_step = false + xval = fast_dot(x, gradient) + if value > xval - phi/Ktolerance + tt = dualstep + # setting gap estimate as ∇f(x) (x - v_FW) / 2 + phi = (xval - value) / 2 + else + tt = regular + L, gamma = line_search_wrapper(line_search, t, f, grad!, x, x - v, gradient, dual_gap, L, gamma0, linesearch_tol, step_lim, 1.0) + + if gamma == 1.0 + active_set_initialize!(active_set, v) + else + active_set_update!(active_set, gamma, v) + end + end + t = t + 1 + non_simplex_iter += 1 + x = compute_active_set_iterate(active_set) + dual_gap = phi + if trajectory + push!( + traj_data, + ( + t, + primal, + primal - dual_gap, + dual_gap, + (time_ns() - time_start) / 1.0e9, + length(active_set), + ), + ) + end + + if verbose && mod(t, print_iter) == 0 + if t == 0 + tt = initial + end + rep = ( + tt, + string(t), + primal, + primal - dual_gap, + dual_gap, + (time_ns() - time_start) / 1.0e9, + length(active_set), + non_simplex_iter, + ) + print_iter_func(rep) + flush(stdout) + end + + end + if verbose + x = compute_active_set_iterate(active_set) + grad!(gradient, x) + v = compute_extreme_point(lmo, gradient) + primal = f(x) + dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) + rep = ( + last, + string(t - 1), + primal, + primal - dual_gap, + dual_gap, + (time_ns() - time_start) / 1.0e9, + length(active_set), + non_simplex_iter, + ) + print_iter_func(rep) + flush(stdout) + end + active_set_cleanup!(active_set, weight_purge_threshold=weight_purge_threshold) + active_set_renormalize!(active_set) + x = compute_active_set_iterate(active_set) + grad!(gradient, x) + v = compute_extreme_point(lmo, gradient) + primal = f(x) + #dual_gap = 2phi + dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) + if verbose + rep = ( + pp, + string(t - 1), + primal, + primal - dual_gap, + dual_gap, + (time_ns() - time_start) / 1.0e9, + length(active_set), + non_simplex_iter, + ) + print_iter_func(rep) + print_footer() + flush(stdout) + end + return x, v, primal, dual_gap, traj_data +end + + +function minimize_over_convex_hull( + f, + grad!, + gradient, + active_set::ActiveSet, + tolerance, + t, + trajectory, + traj_data, + time_start, + non_simplex_iter; + verbose = true, + print_iter=1000, + hessian = nothing, + L=nothing, + linesearch_tol=10e-10, + step_lim=100, + weight_purge_threshold=1e-12, + storage=nothing, + accelerated = false, +) + #No hessian is known, use simplex gradient descent. + if isnothing(hessian) + number_of_steps = simplex_gradient_descent_over_convex_hull( + f, + grad!, + gradient, + active_set::ActiveSet, + tolerance, + t, + trajectory, + traj_data, + time_start, + non_simplex_iter, + verbose = verbose, + print_iter=print_iter, + L=L, + linesearch_tol=linesearch_tol, + step_lim=step_lim, + weight_purge_threshold=weight_purge_threshold, + ) + else + #Rewrite problem as problem over the simplex and solve using Nesterov's AGD + if accelerated + return + #Rewrite problem as problem over the simplex and solve using gradient descent. 
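        #A sketch of the intended reduction: collect the active atoms as columns of
        #a matrix A; for a quadratic objective with Hessian H, minimizing over the
        #convex hull in barycentric coordinates λ amounts to
        #  min_λ  b^T λ + 0.5 * λ^T (A^T H A) λ   over the probability simplex,
        #with b built from the inner products <∇f(x), a_i>.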
+ else + return + end + end + return number_of_steps +end + + +function simplex_gradient_descent_over_convex_hull( + f, + grad!, + gradient, + active_set::ActiveSet, + tolerance, + t, + trajectory, + traj_data, + time_start, + non_simplex_iter; + verbose = true, + print_iter=1000, + hessian = nothing, + L=nothing, + linesearch_tol=10e-10, + step_lim=100, + weight_purge_threshold=1e-12, +) + number_of_steps = 0 + x = compute_active_set_iterate(active_set) + while true + grad!(gradient, x) + #Check if strong Wolfe gap over the convex hull is small enough. + c = [fast_dot(gradient, a) for a in active_set.atoms] + if maximum(c) - minimum(c) <= tolerance + return number_of_steps + end + #Otherwise perform simplex steps until we get there. + k = length(active_set) + csum = sum(c) + c .-= (csum / k) + # name change to stay consistent with the paper, c is actually updated in-place + d = c + if norm(d) <= 1e-8 + @info "Resetting active set." + # resetting active set to singleton + a0 = active_set.atoms[1] + empty!(active_set) + push!(active_set, (1, a0)) + return false + end + # NOTE: sometimes the direction is non-improving + # usual suspects are floating-point errors when multiplying atoms with near-zero weights + # in that case, inverting the sense of d + @inbounds if fast_dot(sum(d[i] * active_set.atoms[i] for i in eachindex(active_set)), gradient) < 0 + @warn "Non-improving d, aborting simplex descent. We likely reached the limits of the numerical accuracy. + The solution is still valid but we might not be able to converge further from here onwards. + If higher accuracy is required, consider using Double64 (still quite fast) and if that does not help BigFloat (slower) as type for the numbers. + Alternatively, consider using AFW (with lazy = true) instead." + println(fast_dot(sum(d[i] * active_set.atoms[i] for i in eachindex(active_set)), gradient)) + return true + end + + η = eltype(d)(Inf) + rem_idx = -1 + @inbounds for idx in eachindex(d) + if d[idx] > 0 + max_val = active_set.weights[idx] / d[idx] + if η > max_val + η = max_val + rem_idx = idx + end + end + end + # TODO at some point avoid materializing both x and y + x = copy(active_set.x) + η = max(0, η) + @. active_set.weights -= η * d + y = copy(update_active_set_iterate!(active_set)) + if f(x) ≥ f(y) + active_set_cleanup!(active_set, weight_purge_threshold=weight_purge_threshold) + return false + end + linesearch_method = L === nothing || !isfinite(L) ? backtracking : shortstep + if linesearch_method == backtracking + _, gamma = + backtrackingLS(f, gradient, x, x - y, linesearch_tol=linesearch_tol, step_lim=step_lim) + else # == shortstep, just two methods here for now + gamma = fast_dot(gradient, x - y) / (L * norm(x - y)^2) + end + gamma = min(1.0, gamma) + # step back from y to x by (1 - γ) η d + # new point is x - γ η d + if gamma == 1.0 + active_set_cleanup!(active_set, weight_purge_threshold=weight_purge_threshold) + else + @. active_set.weights += η * (1 - gamma) * d + @. 
active_set.x = x + gamma * (y - x) + end + number_of_steps = number_of_steps + 1 + x = compute_active_set_iterate(active_set) + primal = f(x) + dual_gap = tolerance + if trajectory + push!( + traj_data, + ( + t + number_of_steps, + primal, + primal - dual_gap, + dual_gap, + (time_ns() - time_start) / 1.0e9, + length(active_set), + ), + ) + end + tt = simplex_descent + if verbose && mod(t + number_of_steps, print_iter) == 0 + if t == 0 + tt = initial + end + rep = ( + tt, + string(t+ number_of_steps), + primal, + primal - dual_gap, + dual_gap, + (time_ns() - time_start) / 1.0e9, + length(active_set), + non_simplex_iter, + ) + print_iter_func(rep) + flush(stdout) + end + end +end + +function bcg_backup( + f, + grad!, + lmo, + x0; + line_search::LineSearchMethod=adaptive, + L=Inf, + gamma0=0, + step_lim=20, + epsilon=1e-7, + max_iteration=10000, + print_iter=1000, + trajectory=false, + verbose=false, + linesearch_tol=1e-7, + emphasis=nothing, + Ktolerance=1.0, + goodstep_tolerance=1.0, + weight_purge_threshold=1e-9, + gradient=nothing, + direction_storage=nothing, + lmo_kwargs..., ) function print_header(data) @printf( diff --git a/src/utils.jl b/src/utils.jl index f311b3fb9..67d9db340 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -262,7 +262,7 @@ function plot_trajectories(data, label; filename=nothing) x, y, label=label[i], - xaxis=:log, + #xaxis=:log, yaxis=:log, ylabel="Primal", legend=:topright, @@ -284,7 +284,7 @@ function plot_trajectories(data, label; filename=nothing) y, label=label[i], legend=false, - xaxis=:log, + #xaxis=:log, yaxis=:log, yguidefontsize=8, xguidefontsize=8, @@ -303,7 +303,7 @@ function plot_trajectories(data, label; filename=nothing) y, label=label[i], legend=false, - xaxis=:log, + #xaxis=:log, yaxis=:log, ylabel="Dual Gap", xlabel="Iterations", @@ -324,7 +324,7 @@ function plot_trajectories(data, label; filename=nothing) y, label=label[i], legend=false, - xaxis=:log, + #xaxis=:log, yaxis=:log, xlabel="Time", yguidefontsize=8, From 4440a7b9e22b9f499adaa8f1959630b0b7961159 Mon Sep 17 00:00:00 2001 From: alejandro-carderera Date: Wed, 10 Mar 2021 14:31:09 +0100 Subject: [PATCH 02/18] Simplex descent for quadratics. Write the problem in barycentric coordinates in closed form if the objective function is quadratic. Need to verify that the code works well for an active set with dense atoms (right now all my testing is with MaybeHotVectors), and perform more tests with the code. Next step is to implement Nesterov's AGD. --- examples/blended_cg.jl | 52 +++++++++++-- src/blended_cg.jl | 170 +++++++++++++++++++++++++++++++++++++++-- 2 files changed, 210 insertions(+), 12 deletions(-) diff --git a/examples/blended_cg.jl b/examples/blended_cg.jl index 780470bc3..e8d02ef9b 100644 --- a/examples/blended_cg.jl +++ b/examples/blended_cg.jl @@ -2,8 +2,10 @@ import FrankWolfe using LinearAlgebra using Random using DoubleFloats +using FrankWolfe n = Int(1e4) +#n = Int(1e3) k = 1000 s = rand(1:100) @@ -17,10 +19,24 @@ xpi = rand(n); total = sum(xpi); const xp = xpi # ./ total; +#matrix = rand(n, n) +#hessian = matrix * transpose(matrix) +#linear_term = rand(n) +#constant_term = rand() +#f(x) = constant_term + dot(linear_term, x) + 0.5* transpose(x) * hessian * x +#function grad!(storage, x) +# storage .= linear_term + hessian * x +#end +#gradient = zeros(n) + + f(x) = norm(x - xp)^2 function grad!(storage, x) @. 
storage = 2 * (x - xp) end +hessian = Matrix(1.0I, n, n) + + # better for memory consumption as we do coordinate-wise ops @@ -33,11 +49,11 @@ function cgrad!(storage, x, xp) end # this LMO might produce numerical instabilities do demonstrate the recovery feature -const lmo = FrankWolfe.KSparseLMO(100, 1.0) +#const lmo = FrankWolfe.KSparseLMO(100, 1.0) # full upgrade of the lmo (and hence optimization) to Double64. # the same lmo with Double64 is much more numerically robust. costs relatively little in speed. -# const lmo = FrankWolfe.KSparseLMO(100, Double64(1.0)) + #const lmo = FrankWolfe.KSparseLMO(100, Double64(1.0)) # as above but now to bigfloats # the same lmo here with bigfloat. even more robust but much slower @@ -45,11 +61,18 @@ const lmo = FrankWolfe.KSparseLMO(100, 1.0) # other oracles to test / experiment with # const lmo = FrankWolfe.LpNormLMO{Float64,1}(1.0) -# const lmo = FrankWolfe.ProbabilitySimplexOracle(Double64(1.0)); +#const lmo = FrankWolfe.ProbabilitySimplexOracle(Double64(1.0)); +const lmo = FrankWolfe.ProbabilitySimplexOracle(1.0); # const lmo = FrankWolfe.UnitSimplexOracle(1.0); const x00 = FrankWolfe.compute_extreme_point(lmo, zeros(n)) +#active_set = FrankWolfe.ActiveSet([(1.0, x00)]) +#vect = FrankWolfe.compute_extreme_point(lmo, rand(n)) +#FrankWolfe.active_set_update!(active_set, 0.2, vect) +#vect = FrankWolfe.compute_extreme_point(lmo, rand(n)) +#FrankWolfe.active_set_update!(active_set, 0.2, vect) + FrankWolfe.benchmark_oracles(x -> cf(x, xp), (str, x) -> cgrad!(str, x, xp), ()->randn(n), lmo; k=100) # copying here and below the x00 as the algorithms write back into the variables to save memory. @@ -57,6 +80,25 @@ FrankWolfe.benchmark_oracles(x -> cf(x, xp), (str, x) -> cgrad!(str, x, xp), ()- x0 = deepcopy(x00) x, v, primal, dual_gap, trajectoryBCG = FrankWolfe.bcg( + f, + grad!, + lmo, + x0, + max_iteration=k, + line_search=FrankWolfe.adaptive, + print_iter=k / 10, + hessian = hessian, + emphasis=FrankWolfe.memory, + L=2, + verbose=true, + trajectory=true, + Ktolerance=1.00, + goodstep_tolerance=0.95, + weight_purge_threshold=1e-10, +) + +x0 = deepcopy(x00) +x, v, primal, dual_gap, trajectoryBCG2 = FrankWolfe.bcg( f, grad!, lmo, @@ -90,8 +132,8 @@ x, v, primal, dual_gap, trajectoryBCG_backup = FrankWolfe.bcg_backup( weight_purge_threshold=1e-10, ) -data = [trajectoryBCG, trajectoryBCG_backup] -label = ["BCG", "BCG backup"] +data = [trajectoryBCG, trajectoryBCG2, trajectoryBCG_backup] +label = ["BCG", "BCG old", "BCG backup"] FrankWolfe.plot_trajectories(data, label, filename="output_results.png") diff --git a/src/blended_cg.jl b/src/blended_cg.jl index 7691b78df..2121b12b9 100644 --- a/src/blended_cg.jl +++ b/src/blended_cg.jl @@ -46,6 +46,7 @@ function bcg( line_search::LineSearchMethod=adaptive, L=Inf, gamma0=0, + hessian = nothing, step_lim=20, epsilon=1e-7, max_iteration=10000, @@ -158,7 +159,7 @@ function bcg( non_simplex_iter, verbose = verbose, print_iter=print_iter, - hessian = nothing, + hessian = hessian, L=L, accelerated = false, ) @@ -188,7 +189,7 @@ function bcg( phi = (xval - value) / 2 else tt = regular - L, gamma = line_search_wrapper(line_search, t, f, grad!, x, x - v, gradient, dual_gap, L, gamma0, linesearch_tol, step_lim, 1.0) + gamma, L = line_search_wrapper(line_search, t, f, grad!, x, x - v, gradient, dual_gap, L, gamma0, linesearch_tol, step_lim, 1.0) if gamma == 1.0 active_set_initialize!(active_set, v) @@ -321,17 +322,133 @@ function minimize_over_convex_hull( weight_purge_threshold=weight_purge_threshold, ) else - #Rewrite 
problem as problem over the simplex and solve using Nesterov's AGD + x = compute_active_set_iterate(active_set) + grad!(gradient, x) + #Rewrite as problem over the simplex + M, b = build_reduced_problem(active_set.atoms, active_set.weights, gradient, hessian) + L = eigmax(M) + reduced_f(y) = f(x) - dot(gradient, x) + 0.5*transpose(x) * hessian * x + dot(b, y) + 0.5*transpose(y) * M * y + + function reduced_grad!(storage, x) + storage .= b + M*x + end + + function reduced_linesearch(gradient, direction) + return -dot(gradient, direction)/ (transpose(direction)*M*direction) + end + + #Solve using Nesterov's AGD if accelerated return - #Rewrite problem as problem over the simplex and solve using gradient descent. + #Solve using gradient descent. else - return + new_weights, number_of_steps = simplex_gradient_descent_over_probability_simplex( + active_set.weights, + reduced_f, + reduced_grad!, + reduced_linesearch, + tolerance, + t, + trajectory, + traj_data, + time_start, + non_simplex_iter, + verbose = verbose, + print_iter=print_iter, + L = L, + ) + @. active_set.weights = new_weights end end + active_set_cleanup!(active_set, weight_purge_threshold=weight_purge_threshold) return number_of_steps end +function simplex_gradient_descent_over_probability_simplex( + initial_point, + reduced_f, + reduced_grad!, + reduced_linesearch, + tolerance, + t, + trajectory, + traj_data, + time_start, + non_simplex_iter; + verbose = verbose, + print_iter=print_iter, + L = 1.0, +) + number_of_steps = 0 + x = deepcopy(initial_point) + gradient = similar(x) + d = similar(x) + reduced_grad!(gradient, x) + strong_wolfe_gap = Strong_Frank_Wolfe_gap_probability_simplex(gradient) + while strong_wolfe_gap > tolerance + y = projection_simplex_sort(x .- gradient/L) + @. d = y - x + gamma = min(1.0, reduced_linesearch(gradient, d)) + @. x = x + gamma*d + number_of_steps = number_of_steps + 1 + primal = reduced_f(x) + reduced_grad!(gradient, x) + strong_wolfe_gap = Strong_Frank_Wolfe_gap_probability_simplex(gradient) + if trajectory + push!( + traj_data, + ( + t + number_of_steps, + primal, + primal - tolerance, + tolerance, + (time_ns() - time_start) / 1.0e9, + length(initial_point), + ), + ) + end + tt = simplex_descent + if verbose && mod(t + number_of_steps, print_iter) == 0 + if t == 0 + tt = initial + end + rep = ( + tt, + string(t+ number_of_steps), + primal, + primal - tolerance, + tolerance, + (time_ns() - time_start) / 1.0e9, + length(initial_point), + non_simplex_iter, + ) + print_iter_func(rep) + flush(stdout) + end + end + return x, number_of_steps +end + +# Sort projection for the simplex. +function projection_simplex_sort(x) + n = length(x) + if sum(x) == 1.0 && all(>=(0.0), x) + return x + end + v = x .- maximum(x) + u = sort(v, rev=true) + cssv = cumsum(u) + rho = sum(u .* collect(1:1:n).>(cssv .- 1.0)) - 1 + theta = (cssv[rho + 1] - 1.0) / (rho + 1) + w = clamp.(v .- theta, 0.0, Inf) + return w +end + +function Strong_Frank_Wolfe_gap_probability_simplex(gradient) + index_min = argmin(gradient) + index_max = argmax(gradient) + return gradient[index_max] - gradient[index_min] +end function simplex_gradient_descent_over_convex_hull( f, @@ -409,8 +526,8 @@ function simplex_gradient_descent_over_convex_hull( end linesearch_method = L === nothing || !isfinite(L) ? 
backtracking : shortstep if linesearch_method == backtracking - _, gamma = - backtrackingLS(f, gradient, x, x - y, linesearch_tol=linesearch_tol, step_lim=step_lim) + gamma, _ = + backtrackingLS(f, direction, x, x - y, 1.0, linesearch_tol=linesearch_tol, step_lim=step_lim) else # == shortstep, just two methods here for now gamma = fast_dot(gradient, x - y) / (L * norm(x - y)^2) end @@ -461,6 +578,45 @@ function simplex_gradient_descent_over_convex_hull( end end +#In case the matrix is a maybe hot vector +#Returns the problem written in the form: +# reduced_linear^T \lambda + 0.5* \lambda^T reduced_hessian \lambda +#according to the current active set. +function build_reduced_problem(atoms::AbstractVector{<:FrankWolfe.MaybeHotVector}, weights, gradient, hessian) + n = atoms[1].len + k = length(atoms) + aux_matrix = zeros(eltype(atoms[1].active_val), n, k) + reduced_linear = zeros(eltype(atoms[1].active_val), k) + #Compute the intermediate matrix. + for i in 1:k + reduced_linear[i] = dot(atoms[i], gradient) + aux_matrix[:,i] .= atoms[i].active_val*hessian[atoms[i].val_idx, :] + end + #Compute the final matrix. + reduced_hessian = zeros(eltype(atoms[1].active_val), k, k) + for i in 1:k + reduced_hessian[:,i] .= atoms[i].active_val*aux_matrix[atoms[i].val_idx,:] + end + reduced_linear .-= reduced_hessian * weights + return reduced_hessian, reduced_linear +end + +#In case the active set atoms are not MaybeHotVectors +function build_reduced_problem(active_set::AbstractVector{<:Array}, weights, gradient, hessian) + n = length(atoms[1]) + k = length(atoms) + #Construct the matrix of vertices. + vertex_matrix = zeros(n, k) + reduced_linear = zeros(k) + for i in 1:k + reduced_linear[i] = dot(atoms[i], gradient) + vertex_matrix[:, i] .= active_set.atoms[i] + end + reduced_hessian = transpose(vertex_matrix) * hessian * vertex_matrix + reduced_linear .-= reduced_hessian * weights + return reduced_hessian, reduced_linear +end + function bcg_backup( f, grad!, From 5084f095c09e6aa5d2b305cedb840bb72efc3f02 Mon Sep 17 00:00:00 2001 From: alejandro-carderera Date: Wed, 10 Mar 2021 21:04:41 +0100 Subject: [PATCH 03/18] Intermediate commit Added Nesterov's AGD and fixed some bugs. Many tests remaining, and I need to incorporate the comments from Mathieu above. --- examples/blended_cg.jl | 67 ++++----- src/blended_cg.jl | 299 +++++++++++++++++++++++++++++++---------- 2 files changed, 264 insertions(+), 102 deletions(-) diff --git a/examples/blended_cg.jl b/examples/blended_cg.jl index e8d02ef9b..d6a0e6a1f 100644 --- a/examples/blended_cg.jl +++ b/examples/blended_cg.jl @@ -3,10 +3,11 @@ using LinearAlgebra using Random using DoubleFloats using FrankWolfe +using SparseArrays n = Int(1e4) #n = Int(1e3) -k = 1000 +k = 100 s = rand(1:100) @info "Seed $s" @@ -30,13 +31,19 @@ const xp = xpi # ./ total; #gradient = zeros(n) -f(x) = norm(x - xp)^2 +#f(x) = norm(x - xp)^2 +#function grad!(storage, x) +# @. storage = 2 * (x - xp) +#end +#hessian = Matrix(2.0I, n, n) + +matrix = rand(n,n) +hessian = transpose(matrix) * matrix +linear = rand(n) +f(x) = dot(linear, x) + 0.5*transpose(x) * hessian * x function grad!(storage, x) - @. 
storage = 2 * (x - xp) + storage .= linear + hessian * x end -hessian = Matrix(1.0I, n, n) - - # better for memory consumption as we do coordinate-wise ops @@ -49,7 +56,7 @@ function cgrad!(storage, x, xp) end # this LMO might produce numerical instabilities do demonstrate the recovery feature -#const lmo = FrankWolfe.KSparseLMO(100, 1.0) +const lmo = FrankWolfe.KSparseLMO(100, 1.0) # full upgrade of the lmo (and hence optimization) to Double64. # the same lmo with Double64 is much more numerically robust. costs relatively little in speed. @@ -62,7 +69,7 @@ end # other oracles to test / experiment with # const lmo = FrankWolfe.LpNormLMO{Float64,1}(1.0) #const lmo = FrankWolfe.ProbabilitySimplexOracle(Double64(1.0)); -const lmo = FrankWolfe.ProbabilitySimplexOracle(1.0); +#const lmo = FrankWolfe.ProbabilitySimplexOracle(1.0); # const lmo = FrankWolfe.UnitSimplexOracle(1.0); const x00 = FrankWolfe.compute_extreme_point(lmo, zeros(n)) @@ -90,10 +97,10 @@ x, v, primal, dual_gap, trajectoryBCG = FrankWolfe.bcg( hessian = hessian, emphasis=FrankWolfe.memory, L=2, + accelerated = true, verbose=true, trajectory=true, Ktolerance=1.00, - goodstep_tolerance=0.95, weight_purge_threshold=1e-10, ) @@ -111,31 +118,29 @@ x, v, primal, dual_gap, trajectoryBCG2 = FrankWolfe.bcg( verbose=true, trajectory=true, Ktolerance=1.00, - goodstep_tolerance=0.95, - weight_purge_threshold=1e-10, -) - -x, v, primal, dual_gap, trajectoryBCG_backup = FrankWolfe.bcg_backup( - f, - grad!, - lmo, - x0, - max_iteration=k, - line_search=FrankWolfe.adaptive, - print_iter=k / 10, - emphasis=FrankWolfe.memory, - L=2, - verbose=true, - trajectory=true, - Ktolerance=1.00, - goodstep_tolerance=0.95, weight_purge_threshold=1e-10, ) -data = [trajectoryBCG, trajectoryBCG2, trajectoryBCG_backup] -label = ["BCG", "BCG old", "BCG backup"] - -FrankWolfe.plot_trajectories(data, label, filename="output_results.png") +#x, v, primal, dual_gap, trajectoryBCG_backup = FrankWolfe.bcg_backup( +# f, +# grad!, +# lmo, +# x0, +# max_iteration=k, +# line_search=FrankWolfe.adaptive, +# print_iter=k / 10, +# emphasis=FrankWolfe.memory, +# L=2, +# verbose=true, +# trajectory=true, +# Ktolerance=1.00, +# goodstep_tolerance=0.95, +# weight_purge_threshold=1e-10, +#) + +#data = [trajectoryBCG, trajectoryBCG2, trajectoryBCG_backup] +#label = ["BCG", "BCG old", "BCG backup"] +#FrankWolfe.plot_trajectories(data, label, filename="output_results.png") #x0 = deepcopy(x00) #@time x, v, primal, dual_gap, trajectoryAda = FrankWolfe.afw( diff --git a/src/blended_cg.jl b/src/blended_cg.jl index 2121b12b9..96fb032d7 100644 --- a/src/blended_cg.jl +++ b/src/blended_cg.jl @@ -1,3 +1,5 @@ +import Arpack + function print_header(data) @printf( "\n────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n" @@ -55,8 +57,8 @@ function bcg( verbose=false, linesearch_tol=1e-7, emphasis=nothing, + accelerated = false, Ktolerance=1.0, - goodstep_tolerance=1.0, weight_purge_threshold=1e-9, gradient=nothing, direction_storage=nothing, @@ -137,7 +139,6 @@ function bcg( (time_ns() - time_start) / 1.0e9, length(active_set), non_simplex_iter, - nforced_fw, ) print_iter_func(rep) flush(stdout) @@ -161,15 +162,13 @@ function bcg( print_iter=print_iter, hessian = hessian, L=L, - accelerated = false, + accelerated = accelerated, ) - t = t + num_simplex_descent_steps #Take a FW step. 
x = compute_active_set_iterate(active_set) primal = f(x) grad!(gradient, x) - non_simplex_iter += 1 # compute new atom (v, value) = lp_separation_oracle( lmo, @@ -232,7 +231,6 @@ function bcg( print_iter_func(rep) flush(stdout) end - end if verbose x = compute_active_set_iterate(active_set) @@ -324,24 +322,50 @@ function minimize_over_convex_hull( else x = compute_active_set_iterate(active_set) grad!(gradient, x) + c = [fast_dot(gradient, a) for a in active_set.atoms] + if maximum(c) - minimum(c) <= tolerance + return 0 + end + #Rewrite as problem over the simplex - M, b = build_reduced_problem(active_set.atoms, active_set.weights, gradient, hessian) + M, b = build_reduced_problem(active_set.atoms, hessian, active_set.weights, gradient) L = eigmax(M) - reduced_f(y) = f(x) - dot(gradient, x) + 0.5*transpose(x) * hessian * x + dot(b, y) + 0.5*transpose(y) * M * y + #L = Arpack.eigs(M, nev=1, which=:LM) + #L = 2.0 + + mu = eigmin(M) + #mu = Arpack.eigs(M, nev=1, which=:SM) + #mu = 2.0 + reduced_f(y) = f(x) - dot(gradient, x) + 0.5*transpose(x) * hessian * x + dot(b, y) + 0.5*transpose(y) * M * y function reduced_grad!(storage, x) storage .= b + M*x end - function reduced_linesearch(gradient, direction) - return -dot(gradient, direction)/ (transpose(direction)*M*direction) - end #Solve using Nesterov's AGD - if accelerated - return + if accelerated && L / mu > 1.0 + new_weights, number_of_steps = accelerated_simplex_gradient_descent_over_probability_simplex( + active_set.weights, + reduced_f, + reduced_grad!, + tolerance, + t, + trajectory, + traj_data, + time_start, + non_simplex_iter, + verbose = verbose, + print_iter=print_iter, + L = L, + mu = mu, + ) #Solve using gradient descent. else + function reduced_linesearch(gradient, direction) + return -dot(gradient, direction)/ (transpose(direction)*M*direction) + end + new_weights, number_of_steps = simplex_gradient_descent_over_probability_simplex( active_set.weights, reduced_f, @@ -360,10 +384,167 @@ function minimize_over_convex_hull( @. active_set.weights = new_weights end end + number_elements = length(active_set.atoms) active_set_cleanup!(active_set, weight_purge_threshold=weight_purge_threshold) return number_of_steps end +#In case the matrix is a maybe hot vector +#Returns the problem written in the form: +# reduced_linear^T \lambda + 0.5* \lambda^T reduced_hessian \lambda +#according to the current active set. +function build_reduced_problem(atoms::AbstractVector{<:FrankWolfe.MaybeHotVector}, hessian, weights, gradient) + n = atoms[1].len + k = length(atoms) + aux_matrix = zeros(eltype(atoms[1].active_val), n, k) + reduced_linear = zeros(eltype(atoms[1].active_val), k) + #Compute the intermediate matrix. + for i in 1:k + reduced_linear[i] = dot(atoms[i], gradient) + aux_matrix[:,i] .= atoms[i].active_val*hessian[atoms[i].val_idx, :] + end + #Compute the final matrix. + reduced_hessian = zeros(eltype(atoms[1].active_val), k, k) + for i in 1:k + reduced_hessian[:,i] .= atoms[i].active_val*aux_matrix[atoms[i].val_idx,:] + end + reduced_linear .-= reduced_hessian * weights + return reduced_hessian, reduced_linear +end + +#Case where the active set contains sparse arrays +function build_reduced_problem(atoms::AbstractVector{<:SparseArrays.AbstractSparseArray}, hessian, weights, gradient) + n = length(atoms[1]) + k = length(atoms) + #Construct the matrix of vertices. 
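    #Densifying the k atoms into an n×k matrix and forming A^T H A below costs
    #O(n^2 k + n k^2) time and O(nk) extra memory, which stays cheap as long as
    #the active set is small relative to n.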
+ vertex_matrix = zeros(n, k) + reduced_linear = zeros(k) + for i in 1:k + reduced_linear[i] = dot(atoms[i], gradient) + vertex_matrix[:, i] .= atoms[i] + end + reduced_hessian = transpose(vertex_matrix) * hessian * vertex_matrix + reduced_linear .-= reduced_hessian * weights + return reduced_hessian, reduced_linear +end + +#General case where the active set contains normal Julia arrays +function build_reduced_problem(atoms::AbstractVector{<:Array}, hessian, weights, gradient) + n = length(atoms[1]) + k = length(atoms) + #Construct the matrix of vertices. + vertex_matrix = zeros(n, k) + reduced_linear = zeros(k) + for i in 1:k + reduced_linear[i] = dot(atoms[i], gradient) + vertex_matrix[:, i] .= atoms[i] + end + reduced_hessian = transpose(vertex_matrix) * hessian * vertex_matrix + reduced_linear .-= reduced_hessian * weights + return reduced_hessian, reduced_linear +end + +function accelerated_simplex_gradient_descent_over_probability_simplex( + initial_point, + reduced_f, + reduced_grad!, + tolerance, + t, + trajectory, + traj_data, + time_start, + non_simplex_iter; + verbose = verbose, + print_iter=print_iter, + L = 1.0, + mu = 1.0, +) + number_of_steps = 0 + x = deepcopy(initial_point) + x_old = deepcopy(initial_point) + y = deepcopy(initial_point) + gradient_x = similar(x) + gradient_y = similar(x) + d = similar(x) + reduced_grad!(gradient_x, x) + reduced_grad!(gradient_y, x) + strong_wolfe_gap = Strong_Frank_Wolfe_gap_probability_simplex(gradient_x) + q = mu / L + # If the problem is close to convex, simply use the accelerated algorithm for convex objective functions. + if mu < 1.0e-3 + alpha = 0.0 + else + alpha = sqrt(q) + end + alpha_old = 0.0 + while strong_wolfe_gap > tolerance + @. x_old = x + reduced_grad!(gradient_y, y) + @. d = y - gradient_y/L + x = projection_simplex_sort(y .- gradient_y/L) + if mu < 1.0e-3 + alpha_old = alpha_old + alpha = 0.5 * (1 + sqrt(1 + 4 * alpha^2)) + gamma = (alpha_old - 1.0) / alpha + else + alpha_old = alpha + alpha = return_bounded_root_of_square(1, alpha^2 - q, -alpha^2) + gamma = alpha_old * (1 - alpha_old) / (alpha_old^2 - alpha) + end + @. y = x + gamma*(x - x_old) + number_of_steps = number_of_steps + 1 + primal = reduced_f(x) + reduced_grad!(gradient_x, x) + strong_wolfe_gap = Strong_Frank_Wolfe_gap_probability_simplex(gradient_x) + if trajectory + push!( + traj_data, + ( + t + number_of_steps, + primal, + primal - tolerance, + tolerance, + (time_ns() - time_start) / 1.0e9, + length(initial_point), + ), + ) + end + tt = simplex_descent + if verbose && mod(t + number_of_steps, print_iter) == 0 + if t == 0 + tt = initial + end + rep = ( + tt, + string(t+ number_of_steps), + primal, + primal - tolerance, + tolerance, + (time_ns() - time_start) / 1.0e9, + length(initial_point), + non_simplex_iter, + ) + print_iter_func(rep) + flush(stdout) + end + end + return x, number_of_steps +end + +function return_bounded_root_of_square(a,b,c) + root1 = (-b+sqrt(b^2 - 4*a*c))/(2*a) + if root1 >= 0 && root1 < 1.0 + return root1 + else + root2 = (-b - sqrt(b^2 - 4*a*c))/(2*a) + if root2 >= 0 && root2 < 1.0 + return root2 + else + print("\n TODO, introduce assert. Roots are: ", root1, " ", root2,"\n") + end + end +end + function simplex_gradient_descent_over_probability_simplex( initial_point, reduced_f, @@ -429,6 +610,8 @@ function simplex_gradient_descent_over_probability_simplex( return x, number_of_steps end + + # Sort projection for the simplex. 
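# The sort-based projection costs O(k log k) for k weights. For illustration,
# projection_simplex_sort([0.5, 0.8, -0.2]) returns approximately [0.35, 0.65, 0.0]:
# shift and sort the entries, locate the threshold theta, then clamp at zero.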
function projection_simplex_sort(x) n = length(x) @@ -445,9 +628,20 @@ function projection_simplex_sort(x) end function Strong_Frank_Wolfe_gap_probability_simplex(gradient) - index_min = argmin(gradient) - index_max = argmax(gradient) - return gradient[index_max] - gradient[index_min] + val_min = gradient[1] + val_max = gradient[1] + for i in 2:length(gradient) + temp_val = gradient[i] + if temp_val < val_min + val_min = temp_val + else + if temp_val > val_max + val_max = temp_val + end + end + + end + return val_max - val_min end function simplex_gradient_descent_over_convex_hull( @@ -520,27 +714,27 @@ function simplex_gradient_descent_over_convex_hull( η = max(0, η) @. active_set.weights -= η * d y = copy(update_active_set_iterate!(active_set)) + number_of_steps = number_of_steps + 1 if f(x) ≥ f(y) active_set_cleanup!(active_set, weight_purge_threshold=weight_purge_threshold) - return false - end - linesearch_method = L === nothing || !isfinite(L) ? backtracking : shortstep - if linesearch_method == backtracking - gamma, _ = - backtrackingLS(f, direction, x, x - y, 1.0, linesearch_tol=linesearch_tol, step_lim=step_lim) - else # == shortstep, just two methods here for now - gamma = fast_dot(gradient, x - y) / (L * norm(x - y)^2) - end - gamma = min(1.0, gamma) - # step back from y to x by (1 - γ) η d - # new point is x - γ η d - if gamma == 1.0 - active_set_cleanup!(active_set, weight_purge_threshold=weight_purge_threshold) else - @. active_set.weights += η * (1 - gamma) * d - @. active_set.x = x + gamma * (y - x) + linesearch_method = L === nothing || !isfinite(L) ? backtracking : shortstep + if linesearch_method == backtracking + gamma, _ = + backtrackingLS(f, direction, x, x - y, 1.0, linesearch_tol=linesearch_tol, step_lim=step_lim) + else # == shortstep, just two methods here for now + gamma = fast_dot(gradient, x - y) / (L * norm(x - y)^2) + end + gamma = min(1.0, gamma) + # step back from y to x by (1 - γ) η d + # new point is x - γ η d + if gamma == 1.0 + active_set_cleanup!(active_set, weight_purge_threshold=weight_purge_threshold) + else + @. active_set.weights += η * (1 - gamma) * d + @. active_set.x = x + gamma * (y - x) + end end - number_of_steps = number_of_steps + 1 x = compute_active_set_iterate(active_set) primal = f(x) dual_gap = tolerance @@ -564,7 +758,7 @@ function simplex_gradient_descent_over_convex_hull( end rep = ( tt, - string(t+ number_of_steps), + string(t + number_of_steps), primal, primal - dual_gap, dual_gap, @@ -578,44 +772,7 @@ function simplex_gradient_descent_over_convex_hull( end end -#In case the matrix is a maybe hot vector -#Returns the problem written in the form: -# reduced_linear^T \lambda + 0.5* \lambda^T reduced_hessian \lambda -#according to the current active set. -function build_reduced_problem(atoms::AbstractVector{<:FrankWolfe.MaybeHotVector}, weights, gradient, hessian) - n = atoms[1].len - k = length(atoms) - aux_matrix = zeros(eltype(atoms[1].active_val), n, k) - reduced_linear = zeros(eltype(atoms[1].active_val), k) - #Compute the intermediate matrix. - for i in 1:k - reduced_linear[i] = dot(atoms[i], gradient) - aux_matrix[:,i] .= atoms[i].active_val*hessian[atoms[i].val_idx, :] - end - #Compute the final matrix. 
- reduced_hessian = zeros(eltype(atoms[1].active_val), k, k) - for i in 1:k - reduced_hessian[:,i] .= atoms[i].active_val*aux_matrix[atoms[i].val_idx,:] - end - reduced_linear .-= reduced_hessian * weights - return reduced_hessian, reduced_linear -end -#In case the active set atoms are not MaybeHotVectors -function build_reduced_problem(active_set::AbstractVector{<:Array}, weights, gradient, hessian) - n = length(atoms[1]) - k = length(atoms) - #Construct the matrix of vertices. - vertex_matrix = zeros(n, k) - reduced_linear = zeros(k) - for i in 1:k - reduced_linear[i] = dot(atoms[i], gradient) - vertex_matrix[:, i] .= active_set.atoms[i] - end - reduced_hessian = transpose(vertex_matrix) * hessian * vertex_matrix - reduced_linear .-= reduced_hessian * weights - return reduced_hessian, reduced_linear -end function bcg_backup( f, From 48d914ea58a4fa755f9ec15ba02472289eab520e Mon Sep 17 00:00:00 2001 From: alejandro-carderera Date: Wed, 10 Mar 2021 21:07:42 +0100 Subject: [PATCH 04/18] Minor to test file. --- examples/blended_cg.jl | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/blended_cg.jl b/examples/blended_cg.jl index d6a0e6a1f..77096dece 100644 --- a/examples/blended_cg.jl +++ b/examples/blended_cg.jl @@ -44,6 +44,7 @@ f(x) = dot(linear, x) + 0.5*transpose(x) * hessian * x function grad!(storage, x) storage .= linear + hessian * x end +L = eigmax(hessian) # better for memory consumption as we do coordinate-wise ops @@ -96,7 +97,7 @@ x, v, primal, dual_gap, trajectoryBCG = FrankWolfe.bcg( print_iter=k / 10, hessian = hessian, emphasis=FrankWolfe.memory, - L=2, + L=L, accelerated = true, verbose=true, trajectory=true, @@ -114,7 +115,7 @@ x, v, primal, dual_gap, trajectoryBCG2 = FrankWolfe.bcg( line_search=FrankWolfe.adaptive, print_iter=k / 10, emphasis=FrankWolfe.memory, - L=2, + L=L, verbose=true, trajectory=true, Ktolerance=1.00, @@ -130,7 +131,7 @@ x, v, primal, dual_gap, trajectoryBCG2 = FrankWolfe.bcg( # line_search=FrankWolfe.adaptive, # print_iter=k / 10, # emphasis=FrankWolfe.memory, -# L=2, +# L=L, # verbose=true, # trajectory=true, # Ktolerance=1.00, From a9770a55a708fa2ee6a7e11f1cb57756f1cf2142 Mon Sep 17 00:00:00 2001 From: alejandro-carderera Date: Thu, 11 Mar 2021 17:04:21 +0100 Subject: [PATCH 05/18] Fixes for some bugs. Reused some computations and fixed some bugs. Also added some function headers. Still testing... --- examples/blended_cg.jl | 73 +++++++---- src/blended_cg.jl | 275 ++++++++++++++++++++++++++--------------- 2 files changed, 224 insertions(+), 124 deletions(-) diff --git a/examples/blended_cg.jl b/examples/blended_cg.jl index 77096dece..7d80f6ac9 100644 --- a/examples/blended_cg.jl +++ b/examples/blended_cg.jl @@ -5,9 +5,8 @@ using DoubleFloats using FrankWolfe using SparseArrays -n = Int(1e4) -#n = Int(1e3) -k = 100 +n = Int(5e2) +k = 1000 s = rand(1:100) @info "Seed $s" @@ -57,7 +56,7 @@ function cgrad!(storage, x, xp) end # this LMO might produce numerical instabilities do demonstrate the recovery feature -const lmo = FrankWolfe.KSparseLMO(100, 1.0) +const lmo = FrankWolfe.KSparseLMO(50, 1.0) # full upgrade of the lmo (and hence optimization) to Double64. # the same lmo with Double64 is much more numerically robust. costs relatively little in speed. @@ -87,13 +86,13 @@ FrankWolfe.benchmark_oracles(x -> cf(x, xp), (str, x) -> cgrad!(str, x, xp), ()- # as we do multiple runs from the same initial point we do not want this here. 
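# Illustrative sanity check, assuming the internal helper is reachable as
# FrankWolfe.projection_simplex_sort: the sort-based projection used by the BCG
# simplex-descent phase should land on the probability simplex.
w_check = FrankWolfe.projection_simplex_sort(randn(n))
@assert isapprox(sum(w_check), 1.0; atol=1e-10) && all(>=(0.0), w_check)
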
x0 = deepcopy(x00) -x, v, primal, dual_gap, trajectoryBCG = FrankWolfe.bcg( +x, v, primal, dual_gap, trajectoryBCG_accel_simplex = FrankWolfe.bcg( f, grad!, lmo, x0, max_iteration=k, - line_search=FrankWolfe.adaptive, + line_search=FrankWolfe.adaptive, print_iter=k / 10, hessian = hessian, emphasis=FrankWolfe.memory, @@ -106,7 +105,7 @@ x, v, primal, dual_gap, trajectoryBCG = FrankWolfe.bcg( ) x0 = deepcopy(x00) -x, v, primal, dual_gap, trajectoryBCG2 = FrankWolfe.bcg( +x, v, primal, dual_gap, trajectoryBCG_simplex = FrankWolfe.bcg( f, grad!, lmo, @@ -114,34 +113,54 @@ x, v, primal, dual_gap, trajectoryBCG2 = FrankWolfe.bcg( max_iteration=k, line_search=FrankWolfe.adaptive, print_iter=k / 10, + hessian = hessian, emphasis=FrankWolfe.memory, L=L, + accelerated = false, verbose=true, trajectory=true, Ktolerance=1.00, weight_purge_threshold=1e-10, ) -#x, v, primal, dual_gap, trajectoryBCG_backup = FrankWolfe.bcg_backup( -# f, -# grad!, -# lmo, -# x0, -# max_iteration=k, -# line_search=FrankWolfe.adaptive, -# print_iter=k / 10, -# emphasis=FrankWolfe.memory, -# L=L, -# verbose=true, -# trajectory=true, -# Ktolerance=1.00, -# goodstep_tolerance=0.95, -# weight_purge_threshold=1e-10, -#) - -#data = [trajectoryBCG, trajectoryBCG2, trajectoryBCG_backup] -#label = ["BCG", "BCG old", "BCG backup"] -#FrankWolfe.plot_trajectories(data, label, filename="output_results.png") +x0 = deepcopy(x00) +x, v, primal, dual_gap, trajectoryBCG_convex = FrankWolfe.bcg( + f, + grad!, + lmo, + x0, + max_iteration=k, + line_search=FrankWolfe.adaptive, + print_iter=k / 10, + emphasis=FrankWolfe.memory, + L=L, + verbose=true, + trajectory=true, + Ktolerance=1.00, + weight_purge_threshold=1e-10, +) + +x0 = deepcopy(x00) +x, v, primal, dual_gap, trajectoryBCG_convex_backup = FrankWolfe.bcg_backup( + f, + grad!, + lmo, + x0, + max_iteration=k, + line_search=FrankWolfe.adaptive, + print_iter=k / 10, + emphasis=FrankWolfe.memory, + L=L, + verbose=true, + trajectory=true, + Ktolerance=1.00, + goodstep_tolerance=0.95, + weight_purge_threshold=1e-10, +) + +data = [trajectoryBCG_accel_simplex, trajectoryBCG_simplex, trajectoryBCG_convex, trajectoryBCG_convex_backup] +label = ["BCG (accel simplex)", "BCG (simplex)", "BCG (convex)", "BCG (convex backup)"] +FrankWolfe.plot_trajectories(data, label, filename="output_results.png") #x0 = deepcopy(x00) #@time x, v, primal, dual_gap, trajectoryAda = FrankWolfe.afw( diff --git a/src/blended_cg.jl b/src/blended_cg.jl index 96fb032d7..c0361bd46 100644 --- a/src/blended_cg.jl +++ b/src/blended_cg.jl @@ -1,4 +1,5 @@ import Arpack +using DoubleFloats function print_header(data) @printf( @@ -278,6 +279,21 @@ function bcg( end +""" +minimize_over_convex_hull + +Given a function f with gradient grad! and an active set +active_set this function will minimize the function over +the convex hull of the active set until the strong-wolfe +gap over the active set is below tolerance. + +It will either directly minimize over the convex hull using +simplex gradient descent, or it will transform the problem +to barycentric coordinates and minimize over the unit +probability simplex using gradient descent or Nesterov's +accelerated gradient descent. 
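
When a Hessian is supplied, the reduction assembled by build_reduced_problem takes
the form f(λ) = reduced_linear^T λ + 0.5 * λ^T reduced_hessian λ with
reduced_hessian = A^T hessian A, where the columns of A are the active atoms; the
reformulation is exact when the objective is quadratic with this (constant) Hessian.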
+""" + function minimize_over_convex_hull( f, grad!, @@ -314,7 +330,7 @@ function minimize_over_convex_hull( non_simplex_iter, verbose = verbose, print_iter=print_iter, - L=L, + L=nothing, linesearch_tol=linesearch_tol, step_lim=step_lim, weight_purge_threshold=weight_purge_threshold, @@ -322,55 +338,61 @@ function minimize_over_convex_hull( else x = compute_active_set_iterate(active_set) grad!(gradient, x) - c = [fast_dot(gradient, a) for a in active_set.atoms] - if maximum(c) - minimum(c) <= tolerance - return 0 - end + #c = [fast_dot(gradient, a) for a in active_set.atoms] + #if maximum(c) - minimum(c) <= tolerance + # return 0 + #end #Rewrite as problem over the simplex - M, b = build_reduced_problem(active_set.atoms, hessian, active_set.weights, gradient) - L = eigmax(M) - #L = Arpack.eigs(M, nev=1, which=:LM) - #L = 2.0 - - mu = eigmin(M) - #mu = Arpack.eigs(M, nev=1, which=:SM) - #mu = 2.0 - + M, b = build_reduced_problem(active_set.atoms, hessian, active_set.weights, gradient, tolerance) + #Early exit if we have detected that the strong-Wolfe gap is below the desired tolerance while building the reduced problem. + if isnothing(M) && isnothing(b) + return 0 + end + #In case the matrices are DoubleFloats we need to cast them as Float64, because LinearAlgebra does not work with them. + if eltype(M) === Double64 + converted_matrix = convert(Array{Float64}, M) + L_reduced = eigmax(converted_matrix) + else + L_reduced = eigmax(M) + #L_reduced = Arpack.eigs(M, nev=1, which=:LM) + end reduced_f(y) = f(x) - dot(gradient, x) + 0.5*transpose(x) * hessian * x + dot(b, y) + 0.5*transpose(y) * M * y function reduced_grad!(storage, x) storage .= b + M*x end - - #Solve using Nesterov's AGD - if accelerated && L / mu > 1.0 - new_weights, number_of_steps = accelerated_simplex_gradient_descent_over_probability_simplex( - active_set.weights, - reduced_f, - reduced_grad!, - tolerance, - t, - trajectory, - traj_data, - time_start, - non_simplex_iter, - verbose = verbose, - print_iter=print_iter, - L = L, - mu = mu, - ) - #Solve using gradient descent. - else - function reduced_linesearch(gradient, direction) - return -dot(gradient, direction)/ (transpose(direction)*M*direction) + if accelerated + if eltype(M) === Double64 + mu_reduced = eigmin(converted_matrix) + else + mu_reduced = eigmin(M) end - + if L_reduced / mu_reduced > 1.0 + new_weights, number_of_steps = accelerated_simplex_gradient_descent_over_probability_simplex( + active_set.weights, + reduced_f, + reduced_grad!, + tolerance, + t, + trajectory, + traj_data, + time_start, + non_simplex_iter, + verbose = verbose, + print_iter=print_iter, + L = L_reduced, + mu = mu_reduced, + ) + @. active_set.weights = new_weights + end + end + #Solve using gradient descent. + if !accelerated || L_reduced / mu_reduced == 1.0 new_weights, number_of_steps = simplex_gradient_descent_over_probability_simplex( active_set.weights, reduced_f, reduced_grad!, - reduced_linesearch, tolerance, t, trajectory, @@ -379,7 +401,7 @@ function minimize_over_convex_hull( non_simplex_iter, verbose = verbose, print_iter=print_iter, - L = L, + L = L_reduced, ) @. active_set.weights = new_weights end @@ -389,18 +411,28 @@ function minimize_over_convex_hull( return number_of_steps end -#In case the matrix is a maybe hot vector -#Returns the problem written in the form: -# reduced_linear^T \lambda + 0.5* \lambda^T reduced_hessian \lambda -#according to the current active set. 
-function build_reduced_problem(atoms::AbstractVector{<:FrankWolfe.MaybeHotVector}, hessian, weights, gradient) +""" +build_reduced_problem + +Given an active set formed by MaybeHotVector, a (constant) +Hessian and a gradient constructs a quadratic problem +over the unit probability simplex that is equivalent to +minimizing the original function over the convex hull of the +active set. If λ are the barycentric coordinates of dimension +equal to the cardinality of the active set, the objective +function is: + f(λ) = reduced_linear^T λ + 0.5 * λ^T reduced_hessian λ +""" +function build_reduced_problem(atoms::AbstractVector{<:FrankWolfe.MaybeHotVector}, hessian, weights, gradient, tolerance) n = atoms[1].len k = length(atoms) + reduced_linear = [fast_dot(gradient, a) for a in atoms] + if Strong_Frank_Wolfe_gap(reduced_linear) <= tolerance + return nothing, nothing + end aux_matrix = zeros(eltype(atoms[1].active_val), n, k) - reduced_linear = zeros(eltype(atoms[1].active_val), k) #Compute the intermediate matrix. for i in 1:k - reduced_linear[i] = dot(atoms[i], gradient) aux_matrix[:,i] .= atoms[i].active_val*hessian[atoms[i].val_idx, :] end #Compute the final matrix. @@ -412,15 +444,26 @@ function build_reduced_problem(atoms::AbstractVector{<:FrankWolfe.MaybeHotVector return reduced_hessian, reduced_linear end -#Case where the active set contains sparse arrays -function build_reduced_problem(atoms::AbstractVector{<:SparseArrays.AbstractSparseArray}, hessian, weights, gradient) +""" +build_reduced_problem + +Same as the function above, but for the case where the active +set is formed by Sparse Arrays. +""" +function build_reduced_problem(atoms::AbstractVector{<:SparseArrays.AbstractSparseArray}, hessian, weights, gradient, tolerance) n = length(atoms[1]) k = length(atoms) + + reduced_linear = [fast_dot(gradient, a) for a in atoms] + if Strong_Frank_Wolfe_gap(reduced_linear) <= tolerance + return nothing, nothing + end + #Construct the matrix of vertices. vertex_matrix = zeros(n, k) - reduced_linear = zeros(k) + #reduced_linear = zeros(k) for i in 1:k - reduced_linear[i] = dot(atoms[i], gradient) + #reduced_linear[i] = dot(atoms[i], gradient) vertex_matrix[:, i] .= atoms[i] end reduced_hessian = transpose(vertex_matrix) * hessian * vertex_matrix @@ -428,15 +471,26 @@ function build_reduced_problem(atoms::AbstractVector{<:SparseArrays.AbstractSpar return reduced_hessian, reduced_linear end -#General case where the active set contains normal Julia arrays -function build_reduced_problem(atoms::AbstractVector{<:Array}, hessian, weights, gradient) +""" +build_reduced_problem + +Same as the two function above, but for the case where the active +set is formed by normal Julia Arrays. +""" +function build_reduced_problem(atoms::AbstractVector{<:Array}, hessian, weights, gradient, tolerance) n = length(atoms[1]) k = length(atoms) + + reduced_linear = [fast_dot(gradient, a) for a in atoms] + if Strong_Frank_Wolfe_gap(reduced_linear) <= tolerance + return nothing, nothing + end + #Construct the matrix of vertices. vertex_matrix = zeros(n, k) - reduced_linear = zeros(k) + #reduced_linear = zeros(k) for i in 1:k - reduced_linear[i] = dot(atoms[i], gradient) + #reduced_linear[i] = dot(atoms[i], gradient) vertex_matrix[:, i] .= atoms[i] end reduced_hessian = transpose(vertex_matrix) * hessian * vertex_matrix @@ -444,6 +498,31 @@ function build_reduced_problem(atoms::AbstractVector{<:Array}, hessian, weights, return reduced_hessian, reduced_linear end +""" +Checks the strong-Wolfe gap for the reduced problem. 
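The value is the largest minus the smallest entry of the input vector; called on
reduced_linear = [⟨∇f(x), a_i⟩]_i it upper-bounds the primal gap of the barycentric
problem for convex objectives.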
+""" +function Strong_Frank_Wolfe_gap(gradient) + val_min = Inf + val_max = -Inf + for i in 1:length(gradient) + temp_val = gradient[i] + if temp_val < val_min + val_min = temp_val + end + if temp_val > val_max + val_max = temp_val + end + end + return val_max - val_min +end + +""" +accelerated_simplex_gradient_descent_over_probability_simplex + +Minimizes an objective function over the unit probability simplex +until the Strong-Wolfe gap is below tolerance using Nesterov's +accelerated gradient descent. +""" function accelerated_simplex_gradient_descent_over_probability_simplex( initial_point, reduced_f, @@ -468,34 +547,29 @@ function accelerated_simplex_gradient_descent_over_probability_simplex( d = similar(x) reduced_grad!(gradient_x, x) reduced_grad!(gradient_y, x) - strong_wolfe_gap = Strong_Frank_Wolfe_gap_probability_simplex(gradient_x) + strong_wolfe_gap = Strong_Frank_Wolfe_gap_probability_simplex(gradient_x, x) q = mu / L # If the problem is close to convex, simply use the accelerated algorithm for convex objective functions. if mu < 1.0e-3 alpha = 0.0 + alpha_old = 0.0 else - alpha = sqrt(q) + gamma = (1 - sqrt(q))/(1 + sqrt(q)) end - alpha_old = 0.0 while strong_wolfe_gap > tolerance @. x_old = x reduced_grad!(gradient_y, y) - @. d = y - gradient_y/L x = projection_simplex_sort(y .- gradient_y/L) if mu < 1.0e-3 - alpha_old = alpha_old + alpha_old = alpha alpha = 0.5 * (1 + sqrt(1 + 4 * alpha^2)) gamma = (alpha_old - 1.0) / alpha - else - alpha_old = alpha - alpha = return_bounded_root_of_square(1, alpha^2 - q, -alpha^2) - gamma = alpha_old * (1 - alpha_old) / (alpha_old^2 - alpha) end @. y = x + gamma*(x - x_old) number_of_steps = number_of_steps + 1 primal = reduced_f(x) reduced_grad!(gradient_x, x) - strong_wolfe_gap = Strong_Frank_Wolfe_gap_probability_simplex(gradient_x) + strong_wolfe_gap = Strong_Frank_Wolfe_gap_probability_simplex(gradient_x, x) if trajectory push!( traj_data, @@ -531,25 +605,16 @@ function accelerated_simplex_gradient_descent_over_probability_simplex( return x, number_of_steps end -function return_bounded_root_of_square(a,b,c) - root1 = (-b+sqrt(b^2 - 4*a*c))/(2*a) - if root1 >= 0 && root1 < 1.0 - return root1 - else - root2 = (-b - sqrt(b^2 - 4*a*c))/(2*a) - if root2 >= 0 && root2 < 1.0 - return root2 - else - print("\n TODO, introduce assert. Roots are: ", root1, " ", root2,"\n") - end - end -end +""" +simplex_gradient_descent_over_probability_simplex +Minimizes an objective function over the unit probability simplex +until the Strong-Wolfe gap is below tolerance using gradient descent. +""" function simplex_gradient_descent_over_probability_simplex( initial_point, reduced_f, reduced_grad!, - reduced_linesearch, tolerance, t, trajectory, @@ -565,16 +630,13 @@ function simplex_gradient_descent_over_probability_simplex( gradient = similar(x) d = similar(x) reduced_grad!(gradient, x) - strong_wolfe_gap = Strong_Frank_Wolfe_gap_probability_simplex(gradient) + strong_wolfe_gap = Strong_Frank_Wolfe_gap_probability_simplex(gradient, x) while strong_wolfe_gap > tolerance - y = projection_simplex_sort(x .- gradient/L) - @. d = y - x - gamma = min(1.0, reduced_linesearch(gradient, d)) - @. 
x = x + gamma*d + x = projection_simplex_sort(x .- gradient/L) number_of_steps = number_of_steps + 1 primal = reduced_f(x) reduced_grad!(gradient, x) - strong_wolfe_gap = Strong_Frank_Wolfe_gap_probability_simplex(gradient) + strong_wolfe_gap = Strong_Frank_Wolfe_gap_probability_simplex(gradient, x) if trajectory push!( traj_data, @@ -612,7 +674,12 @@ end -# Sort projection for the simplex. +""" +projection_simplex_sort + +Perform a projection onto the unit probability simplex using +a sorting algorithm. +""" function projection_simplex_sort(x) n = length(x) if sum(x) == 1.0 && all(>=(0.0), x) @@ -627,23 +694,37 @@ function projection_simplex_sort(x) return w end -function Strong_Frank_Wolfe_gap_probability_simplex(gradient) - val_min = gradient[1] - val_max = gradient[1] - for i in 2:length(gradient) - temp_val = gradient[i] - if temp_val < val_min - val_min = temp_val - else +""" +Strong_Frank_Wolfe_gap_probability_simplex + +Compute the Strong-Wolfe gap over the unit probability simplex +given a gradient. +""" +function Strong_Frank_Wolfe_gap_probability_simplex(gradient, x) + val_min = Inf + val_max = -Inf + for i in 1:length(gradient) + if x[i] > 0 + temp_val = gradient[i] + if temp_val < val_min + val_min = temp_val + end if temp_val > val_max val_max = temp_val end end - end return val_max - val_min end + +""" +simplex_gradient_descent_over_convex_hull + +Minimizes an objective function over the convex hull of the active set +until the Strong-Wolfe gap is below tolerance using simplex gradient +descent. +""" function simplex_gradient_descent_over_convex_hull( f, grad!, @@ -684,7 +765,7 @@ function simplex_gradient_descent_over_convex_hull( a0 = active_set.atoms[1] empty!(active_set) push!(active_set, (1, a0)) - return false + return number_of_steps end # NOTE: sometimes the direction is non-improving # usual suspects are floating-point errors when multiplying atoms with near-zero weights @@ -695,7 +776,7 @@ function simplex_gradient_descent_over_convex_hull( If higher accuracy is required, consider using Double64 (still quite fast) and if that does not help BigFloat (slower) as type for the numbers. Alternatively, consider using AFW (with lazy = true) instead." println(fast_dot(sum(d[i] * active_set.atoms[i] for i in eachindex(active_set)), gradient)) - return true + return number_of_steps end η = eltype(d)(Inf) @@ -721,7 +802,7 @@ function simplex_gradient_descent_over_convex_hull( linesearch_method = L === nothing || !isfinite(L) ? backtracking : shortstep if linesearch_method == backtracking gamma, _ = - backtrackingLS(f, direction, x, x - y, 1.0, linesearch_tol=linesearch_tol, step_lim=step_lim) + backtrackingLS(f, gradient, x, x - y, 1.0, linesearch_tol=linesearch_tol, step_lim=step_lim) else # == shortstep, just two methods here for now gamma = fast_dot(gradient, x - y) / (L * norm(x - y)^2) end From bd69a589535532dd129d6e741bebe13ea2e3f013 Mon Sep 17 00:00:00 2001 From: alejandro-carderera Date: Thu, 11 Mar 2021 17:44:26 +0100 Subject: [PATCH 06/18] Delete old code and update test file. Add an example over the probability simplex that utilizes MaybeHotVectors, and an example over the K-Sparse LMO that uses sparse arrays. 
--- examples/blended_cg.jl | 169 ++++++++---------- src/blended_cg.jl | 379 ----------------------------------------- 2 files changed, 67 insertions(+), 481 deletions(-) diff --git a/examples/blended_cg.jl b/examples/blended_cg.jl index 7d80f6ac9..20a82823e 100644 --- a/examples/blended_cg.jl +++ b/examples/blended_cg.jl @@ -5,8 +5,8 @@ using DoubleFloats using FrankWolfe using SparseArrays -n = Int(5e2) -k = 1000 +n = Int(1000) +k = 10000 s = rand(1:100) @info "Seed $s" @@ -15,27 +15,7 @@ s = rand(1:100) s = 41 Random.seed!(s) -xpi = rand(n); -total = sum(xpi); -const xp = xpi # ./ total; - -#matrix = rand(n, n) -#hessian = matrix * transpose(matrix) -#linear_term = rand(n) -#constant_term = rand() -#f(x) = constant_term + dot(linear_term, x) + 0.5* transpose(x) * hessian * x -#function grad!(storage, x) -# storage .= linear_term + hessian * x -#end -#gradient = zeros(n) - - -#f(x) = norm(x - xp)^2 -#function grad!(storage, x) -# @. storage = 2 * (x - xp) -#end -#hessian = Matrix(2.0I, n, n) - +""" matrix = rand(n,n) hessian = transpose(matrix) * matrix linear = rand(n) @@ -45,45 +25,9 @@ function grad!(storage, x) end L = eigmax(hessian) -# better for memory consumption as we do coordinate-wise ops - -function cf(x, xp) - return @. norm(x - xp)^2 -end - -function cgrad!(storage, x, xp) - return @. storage = 2 * (x - xp) -end - -# this LMO might produce numerical instabilities do demonstrate the recovery feature -const lmo = FrankWolfe.KSparseLMO(50, 1.0) - -# full upgrade of the lmo (and hence optimization) to Double64. -# the same lmo with Double64 is much more numerically robust. costs relatively little in speed. - #const lmo = FrankWolfe.KSparseLMO(100, Double64(1.0)) - -# as above but now to bigfloats -# the same lmo here with bigfloat. even more robust but much slower -# const lmo = FrankWolfe.KSparseLMO(100, big"1.0") - -# other oracles to test / experiment with -# const lmo = FrankWolfe.LpNormLMO{Float64,1}(1.0) -#const lmo = FrankWolfe.ProbabilitySimplexOracle(Double64(1.0)); -#const lmo = FrankWolfe.ProbabilitySimplexOracle(1.0); -# const lmo = FrankWolfe.UnitSimplexOracle(1.0); - -const x00 = FrankWolfe.compute_extreme_point(lmo, zeros(n)) - -#active_set = FrankWolfe.ActiveSet([(1.0, x00)]) -#vect = FrankWolfe.compute_extreme_point(lmo, rand(n)) -#FrankWolfe.active_set_update!(active_set, 0.2, vect) -#vect = FrankWolfe.compute_extreme_point(lmo, rand(n)) -#FrankWolfe.active_set_update!(active_set, 0.2, vect) - -FrankWolfe.benchmark_oracles(x -> cf(x, xp), (str, x) -> cgrad!(str, x, xp), ()->randn(n), lmo; k=100) - -# copying here and below the x00 as the algorithms write back into the variables to save memory. -# as we do multiple runs from the same initial point we do not want this here. 
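As context for the probability-simplex example introduced next: the linear minimization oracle over the unit probability simplex returns the vertex rhs * e_i, where i is the coordinate of the smallest gradient entry. A simplified stand-in is sketched below; the real `FrankWolfe.ProbabilitySimplexOracle` returns a structured one-hot type (the `MaybeHotVector` mentioned in the commit message) rather than a dense vector, and `probability_simplex_extreme_point` is an illustrative name, not part of the package.

# Simplified stand-in for illustration only.
function probability_simplex_extreme_point(direction::AbstractVector{<:Real}, rhs::Real=1.0)
    i = argmin(direction)          # rhs * e_i minimizes dot(direction, v) over the simplex
    v = zeros(length(direction))
    v[i] = rhs
    return v
end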
+#Run over the probability simplex +lmo = FrankWolfe.ProbabilitySimplexOracle(1.0); +x00 = FrankWolfe.compute_extreme_point(lmo, zeros(n)) x0 = deepcopy(x00) x, v, primal, dual_gap, trajectoryBCG_accel_simplex = FrankWolfe.bcg( @@ -140,8 +84,65 @@ x, v, primal, dual_gap, trajectoryBCG_convex = FrankWolfe.bcg( weight_purge_threshold=1e-10, ) +data = [trajectoryBCG_accel_simplex, trajectoryBCG_simplex, trajectoryBCG_convex] +label = ["BCG (accel simplex)", "BCG (simplex)", "BCG (convex)"] +FrankWolfe.plot_trajectories(data, label) +""" + + +matrix = rand(n,n) +hessian = transpose(matrix) * matrix +linear = rand(n) +f(x) = dot(linear, x) + 0.5*transpose(x) * hessian * x +function grad!(storage, x) + storage .= linear + hessian * x +end +L = eigmax(hessian) + +#Run over the K-sparse polytope +lmo = FrankWolfe.KSparseLMO(100, 100.0) +x00 = FrankWolfe.compute_extreme_point(lmo, zeros(n)) + x0 = deepcopy(x00) -x, v, primal, dual_gap, trajectoryBCG_convex_backup = FrankWolfe.bcg_backup( +x, v, primal, dual_gap, trajectoryBCG_accel_simplex = FrankWolfe.bcg( + f, + grad!, + lmo, + x0, + max_iteration=k, + line_search=FrankWolfe.adaptive, + print_iter=k / 10, + hessian = hessian, + emphasis=FrankWolfe.memory, + L=L, + accelerated = true, + verbose=true, + trajectory=true, + Ktolerance=1.00, + weight_purge_threshold=1e-10, +) + +x0 = deepcopy(x00) +x, v, primal, dual_gap, trajectoryBCG_simplex = FrankWolfe.bcg( + f, + grad!, + lmo, + x0, + max_iteration=k, + line_search=FrankWolfe.adaptive, + print_iter=k / 10, + hessian = hessian, + emphasis=FrankWolfe.memory, + L=L, + accelerated = false, + verbose=true, + trajectory=true, + Ktolerance=1.00, + weight_purge_threshold=1e-10, +) + +x0 = deepcopy(x00) +x, v, primal, dual_gap, trajectoryBCG_convex = FrankWolfe.bcg( f, grad!, lmo, @@ -154,45 +155,9 @@ x, v, primal, dual_gap, trajectoryBCG_convex_backup = FrankWolfe.bcg_backup( verbose=true, trajectory=true, Ktolerance=1.00, - goodstep_tolerance=0.95, weight_purge_threshold=1e-10, ) -data = [trajectoryBCG_accel_simplex, trajectoryBCG_simplex, trajectoryBCG_convex, trajectoryBCG_convex_backup] -label = ["BCG (accel simplex)", "BCG (simplex)", "BCG (convex)", "BCG (convex backup)"] -FrankWolfe.plot_trajectories(data, label, filename="output_results.png") - -#x0 = deepcopy(x00) -#@time x, v, primal, dual_gap, trajectoryAda = FrankWolfe.afw( -# f, -# grad!, -# lmo, -# x0, -# max_iteration=k, -# line_search=FrankWolfe.adaptive, -# L=100, -# print_iter=k / 10, -# emphasis=FrankWolfe.memory, -# verbose=true, -# trajectory=true, -#); - -#x0 = deepcopy(x00) -#@time x, v, primal, dual_gap, trajectorySs = FrankWolfe.fw( -# f, -# grad!, -# lmo, -# x0, -# max_iteration=k, -# line_search=FrankWolfe.shortstep, -# L=2, -# print_iter=k / 10, -# emphasis=FrankWolfe.memory, -# verbose=true, -# trajectory=true, -#); - -#data = [trajectorySs, trajectoryAda, trajectoryBCG] -#label = ["short step", "AFW", "BCG"] -# -#FrankWolfe.plot_trajectories(data, label) +data = [trajectoryBCG_accel_simplex, trajectoryBCG_simplex, trajectoryBCG_convex] +label = ["BCG (accel simplex)", "BCG (simplex)", "BCG (convex)"] +FrankWolfe.plot_trajectories(data, label) \ No newline at end of file diff --git a/src/blended_cg.jl b/src/blended_cg.jl index c0361bd46..98610dcac 100644 --- a/src/blended_cg.jl +++ b/src/blended_cg.jl @@ -853,385 +853,6 @@ function simplex_gradient_descent_over_convex_hull( end end - - -function bcg_backup( - f, - grad!, - lmo, - x0; - line_search::LineSearchMethod=adaptive, - L=Inf, - gamma0=0, - step_lim=20, - epsilon=1e-7, 
- max_iteration=10000, - print_iter=1000, - trajectory=false, - verbose=false, - linesearch_tol=1e-7, - emphasis=nothing, - Ktolerance=1.0, - goodstep_tolerance=1.0, - weight_purge_threshold=1e-9, - gradient=nothing, - direction_storage=nothing, - lmo_kwargs..., -) - function print_header(data) - @printf( - "\n────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n" - ) - @printf( - "%6s %13s %14s %14s %14s %14s %14s %14s %14s\n", - data[1], - data[2], - data[3], - data[4], - data[5], - data[6], - data[7], - data[8], - data[9], - ) - @printf( - "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n" - ) - end - - function print_footer() - @printf( - "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n\n" - ) - end - - function print_iter_func(data) - @printf( - "%6s %13s %14e %14e %14e %14e %14i %14i %14i\n", - st[Symbol(data[1])], - data[2], - Float64(data[3]), - Float64(data[4]), - Float64(data[5]), - data[6], - data[7], - data[8], - data[9], - ) - end - - t = 0 - primal = Inf - dual_gap = Inf - active_set = ActiveSet([(1.0, x0)]) - x = x0 - if gradient === nothing - gradient = similar(x0, float(eltype(x0))) - end - grad!(gradient, x) - # initial gap estimate computation - vmax = compute_extreme_point(lmo, gradient) - phi = fast_dot(gradient, x0 - vmax) / 2 - traj_data = [] - tt = regular - time_start = time_ns() - v = x0 - if direction_storage === nothing - direction_storage = Vector{float(eltype(x))}() - Base.sizehint!(direction_storage, 100) - end - - if line_search == shortstep && !isfinite(L) - @error("Lipschitz constant not set to a finite value. Prepare to blow up spectacularly.") - end - - if line_search == agnostic || line_search == nonconvex - @error("Lazification is not known to converge with open-loop step size strategies.") - end - - if line_search == fixed && gamma0 == 0 - println("WARNING: gamma0 not set. 
We are not going to move a single bit.") - end - - if verbose - println("\nBlended Conditional Gradients Algorithm.") - numType = eltype(x0) - println( - "EMPHASIS: $memory STEPSIZE: $line_search EPSILON: $epsilon MAXITERATION: $max_iteration TYPE: $numType", - ) - println("K: $Ktolerance") - println("WARNING: In memory emphasis mode iterates are written back into x0!") - headers = ( - "Type", - "Iteration", - "Primal", - "Dual", - "Dual Gap", - "Time", - "#ActiveSet", - "#non-simplex", - "#forced FW", - ) - print_header(headers) - end - - if !isa(x, Union{Array, SparseVector}) - x = convert(Array{float(eltype(x))}, x) - end - non_simplex_iter = 0 - nforced_fw = 0 - force_fw_step = false - - while t <= max_iteration && phi ≥ epsilon - # TODO replace with single call interface from function_gradient.jl - primal = f(x) - grad!(gradient, x) - if !force_fw_step - (idx_fw, idx_as, good_progress) = find_minmax_directions( - active_set, gradient, phi, goodstep_tolerance=goodstep_tolerance, - ) - end - if !force_fw_step && good_progress - tt = simplex_descent - force_fw_step = update_simplex_gradient_descent!( - active_set, - gradient, - f, - L=nothing, #don't use the same L as we transform the function - linesearch_tol=linesearch_tol, - weight_purge_threshold=weight_purge_threshold, - storage=direction_storage, - ) - nforced_fw += force_fw_step - else - non_simplex_iter += 1 - # compute new atom - (v, value) = lp_separation_oracle( - lmo, - active_set, - gradient, - phi, - Ktolerance; - inplace_loop=(emphasis == memory), - force_fw_step=force_fw_step, - lmo_kwargs..., - ) - - force_fw_step = false - xval = fast_dot(x, gradient) - if value > xval - phi/Ktolerance - tt = dualstep - # setting gap estimate as ∇f(x) (x - v_FW) / 2 - phi = (xval - value) / 2 - else - tt = regular - gamma, L = line_search_wrapper(line_search,t,f,grad!,x,x - v,gradient,dual_gap,L,gamma0,linesearch_tol,step_lim, 1.0) - - if gamma == 1.0 - active_set_initialize!(active_set, v) - else - active_set_update!(active_set, gamma, v) - end - end - end - x = compute_active_set_iterate(active_set) - dual_gap = phi - if trajectory - push!( - traj_data, - ( - t, - primal, - primal - dual_gap, - dual_gap, - (time_ns() - time_start) / 1.0e9, - length(active_set), - ), - ) - end - - if verbose && mod(t, print_iter) == 0 - if t == 0 - tt = initial - end - rep = ( - tt, - string(t), - primal, - primal - dual_gap, - dual_gap, - (time_ns() - time_start) / 1.0e9, - length(active_set), - non_simplex_iter, - nforced_fw, - ) - print_iter_func(rep) - flush(stdout) - end - t = t + 1 - end - if verbose - x = compute_active_set_iterate(active_set) - grad!(gradient, x) - v = compute_extreme_point(lmo, gradient) - primal = f(x) - dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) - rep = ( - last, - string(t - 1), - primal, - primal - dual_gap, - dual_gap, - (time_ns() - time_start) / 1.0e9, - length(active_set), - non_simplex_iter, - nforced_fw, - ) - print_iter_func(rep) - flush(stdout) - end - active_set_cleanup!(active_set, weight_purge_threshold=weight_purge_threshold) - active_set_renormalize!(active_set) - x = compute_active_set_iterate(active_set) - grad!(gradient, x) - v = compute_extreme_point(lmo, gradient) - primal = f(x) - #dual_gap = 2phi - dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) - if verbose - rep = ( - pp, - string(t - 1), - primal, - primal - dual_gap, - dual_gap, - (time_ns() - time_start) / 1.0e9, - length(active_set), - non_simplex_iter, - nforced_fw, - ) - print_iter_func(rep) - print_footer() - 
flush(stdout) - end - return x, v, primal, dual_gap, traj_data -end - - -""" - update_simplex_gradient_descent!(active_set::ActiveSet, direction, f) - -Performs a Simplex Gradient Descent step and modifies `active_set` inplace. - -Returns boolean flag -> whether next step must be a FW step (if numerical instability). - -Algorithm reference and notation taken from: -Blended Conditional Gradients:The Unconditioning of Conditional Gradients -https://arxiv.org/abs/1805.07311 -""" -function update_simplex_gradient_descent!( - active_set::ActiveSet, - direction, - f; - L=nothing, - linesearch_tol=10e-10, - step_lim=100, - weight_purge_threshold=1e-12, - storage=nothing, -) - c = if storage === nothing - [fast_dot(direction, a) for a in active_set.atoms] - else - if length(storage) == length(active_set) - for (idx, a) in enumerate(active_set.atoms) - storage[idx] = fast_dot(direction, a) - end - storage - elseif length(storage) > length(active_set) - for (idx, a) in enumerate(active_set.atoms) - storage[idx] = fast_dot(direction, a) - end - storage[1:length(active_set)] - else - for idx in 1:length(storage) - storage[idx] = fast_dot(direction, active_set.atoms[idx]) - end - for idx in (length(storage)+1):length(active_set) - push!(storage, fast_dot(direction, active_set.atoms[idx])) - end - storage - end - end - k = length(active_set) - c .-= (sum(c) / k) - # name change to stay consistent with the paper, c is actually updated in-place - d = c - if norm(d) <= 1e-8 - @info "Resetting active set." - # resetting active set to singleton - a0 = active_set.atoms[1] - active_set_initialize!(active_set, a0) - return false - end - # NOTE: sometimes the direction is non-improving - # usual suspects are floating-point errors when multiplying atoms with near-zero weights - @inbounds if fast_dot(sum(d[i] * active_set.atoms[i] for i in eachindex(active_set)), direction) < 0 - defect = fast_dot(sum(d[i] * active_set.atoms[i] for i in eachindex(active_set)), direction) - @warn "Non-improving d ($defect) due to numerical instability. Temporarily upgrading precision to BigFloat for the current iteration. - If the numerical instability is persistent try to run the whole algorithm with Double64 (still quite fast) or BigFloat (slower)." - bdir = big.(direction) - c = [fast_dot(bdir, a) for a in active_set.atoms] - c .-= sum(c) / k - d = c - @inbounds if fast_dot(sum(d[i] * active_set.atoms[i] for i in eachindex(active_set)), direction) < 0 - @warn "d non-improving in large precision, forcing FW" - @warn "dot value: $(fast_dot(sum(d[i] * active_set.atoms[i] for i in eachindex(active_set)), direction))" - return true - end - end - - η = eltype(d)(Inf) - rem_idx = -1 - @inbounds for idx in eachindex(d) - if d[idx] > 0 - max_val = active_set.weights[idx] / d[idx] - if η > max_val - η = max_val - rem_idx = idx - end - end - end - - - - # TODO at some point avoid materializing both x and y - x = copy(active_set.x) - η = max(0, η) - @. active_set.weights -= η * d - y = copy(update_active_set_iterate!(active_set)) - if f(x) ≥ f(y) - active_set_cleanup!(active_set, weight_purge_threshold=weight_purge_threshold) - return false - end - linesearch_method = L === nothing || !isfinite(L) ? 
backtracking : shortstep - if linesearch_method == backtracking - gamma, _ = - backtrackingLS(f, direction, x, x - y, 1.0, linesearch_tol=linesearch_tol, step_lim=step_lim) - else # == shortstep, just two methods here for now - gamma = fast_dot(direction, x - y) / (L * norm(x - y)^2) - end - gamma = min(1.0, gamma) - # step back from y to x by (1 - γ) η d - # new point is x - γ η d - if gamma == 1.0 - active_set_cleanup!(active_set, weight_purge_threshold=weight_purge_threshold) - else - @. active_set.weights += η * (1 - gamma) * d - @. active_set.x = x + gamma * (y - x) - end - return false -end - """ Returns either a tuple `(y, val)` with `y` an atom from the active set satisfying the progress criterion and `val` the corresponding gap `dot(y, direction)` From 84efc1ae384c783ebbd8498baac2020b8efa5dd4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mathieu=20Besan=C3=A7on?= Date: Fri, 12 Mar 2021 12:03:28 +0100 Subject: [PATCH 07/18] Update src/blended_cg.jl --- src/blended_cg.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/blended_cg.jl b/src/blended_cg.jl index 98610dcac..a6cc5e978 100644 --- a/src/blended_cg.jl +++ b/src/blended_cg.jl @@ -578,7 +578,7 @@ function accelerated_simplex_gradient_descent_over_probability_simplex( primal, primal - tolerance, tolerance, - (time_ns() - time_start) / 1.0e9, + (time_ns() - time_start) / 1e9, length(initial_point), ), ) From 6559f0c56a0b6414dc0dde816d2a2745235d490a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mathieu=20Besan=C3=A7on?= Date: Fri, 12 Mar 2021 12:03:43 +0100 Subject: [PATCH 08/18] Update examples/blended_cg.jl --- examples/blended_cg.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/blended_cg.jl b/examples/blended_cg.jl index 20a82823e..04eda74d5 100644 --- a/examples/blended_cg.jl +++ b/examples/blended_cg.jl @@ -160,4 +160,4 @@ x, v, primal, dual_gap, trajectoryBCG_convex = FrankWolfe.bcg( data = [trajectoryBCG_accel_simplex, trajectoryBCG_simplex, trajectoryBCG_convex] label = ["BCG (accel simplex)", "BCG (simplex)", "BCG (convex)"] -FrankWolfe.plot_trajectories(data, label) \ No newline at end of file +FrankWolfe.plot_trajectories(data, label) From 66ea33cb0fb78fdf30f393404b8b1b1833c662ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mathieu=20Besan=C3=A7on?= Date: Fri, 12 Mar 2021 12:20:46 +0100 Subject: [PATCH 09/18] Update examples/blended_cg.jl --- examples/blended_cg.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/blended_cg.jl b/examples/blended_cg.jl index 04eda74d5..dc0fe84c5 100644 --- a/examples/blended_cg.jl +++ b/examples/blended_cg.jl @@ -5,7 +5,7 @@ using DoubleFloats using FrankWolfe using SparseArrays -n = Int(1000) +n = 1000 k = 10000 s = rand(1:100) From 93196102c008f810f5680156d658cf552e0cd79c Mon Sep 17 00:00:00 2001 From: alejandro-carderera <56440520+alejandro-carderera@users.noreply.github.com> Date: Fri, 12 Mar 2021 12:33:11 +0100 Subject: [PATCH 10/18] Update src/blended_cg.jl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Mathieu Besançon --- src/blended_cg.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/blended_cg.jl b/src/blended_cg.jl index a6cc5e978..0f196bcb2 100644 --- a/src/blended_cg.jl +++ b/src/blended_cg.jl @@ -412,7 +412,7 @@ function minimize_over_convex_hull( end """ -build_reduced_problem + build_reduced_problem(atoms::AbstractVector{<:FrankWolfe.MaybeHotVector}, hessian, weights, gradient, tolerance) Given an active set formed by 
MaybeHotVector, a (constant) Hessian and a gradient constructs a quadratic problem From 6220695316e76aef748bd8b5eba85f262b63d864 Mon Sep 17 00:00:00 2001 From: alejandro-carderera <56440520+alejandro-carderera@users.noreply.github.com> Date: Fri, 12 Mar 2021 12:33:23 +0100 Subject: [PATCH 11/18] Update src/blended_cg.jl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Mathieu Besançon --- src/blended_cg.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/blended_cg.jl b/src/blended_cg.jl index 0f196bcb2..31b2fdef4 100644 --- a/src/blended_cg.jl +++ b/src/blended_cg.jl @@ -472,7 +472,7 @@ function build_reduced_problem(atoms::AbstractVector{<:SparseArrays.AbstractSpar end """ -build_reduced_problem + build_reduced_problem(atoms::AbstractVector{<:Array}, hessian, weights, gradient, tolerance) Same as the two function above, but for the case where the active set is formed by normal Julia Arrays. From 9553587213786e1393289a5e6c1aa358322fac64 Mon Sep 17 00:00:00 2001 From: alejandro-carderera <56440520+alejandro-carderera@users.noreply.github.com> Date: Fri, 12 Mar 2021 12:33:32 +0100 Subject: [PATCH 12/18] Update src/blended_cg.jl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Mathieu Besançon --- src/blended_cg.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/blended_cg.jl b/src/blended_cg.jl index 31b2fdef4..65888361f 100644 --- a/src/blended_cg.jl +++ b/src/blended_cg.jl @@ -475,7 +475,7 @@ end build_reduced_problem(atoms::AbstractVector{<:Array}, hessian, weights, gradient, tolerance) Same as the two function above, but for the case where the active -set is formed by normal Julia Arrays. +set is formed by dense Arrays. """ function build_reduced_problem(atoms::AbstractVector{<:Array}, hessian, weights, gradient, tolerance) n = length(atoms[1]) From ba57b071ebe959506d72091d74c40b0f48620400 Mon Sep 17 00:00:00 2001 From: alejandro-carderera <56440520+alejandro-carderera@users.noreply.github.com> Date: Fri, 12 Mar 2021 12:33:41 +0100 Subject: [PATCH 13/18] Update src/blended_cg.jl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Mathieu Besançon --- src/blended_cg.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/blended_cg.jl b/src/blended_cg.jl index 65888361f..e6dc1bcfc 100644 --- a/src/blended_cg.jl +++ b/src/blended_cg.jl @@ -499,7 +499,7 @@ function build_reduced_problem(atoms::AbstractVector{<:Array}, hessian, weights, end """ -Checks the strong-Wolfe gap for the reduced problem. +Checks the strong Frank-Wolfe gap for the reduced problem. 
""" function Strong_Frank_Wolfe_gap(gradient) val_min = Inf From 96c34dc2258dac86a6a58e0f7e85fa27b5539680 Mon Sep 17 00:00:00 2001 From: alejandro-carderera <56440520+alejandro-carderera@users.noreply.github.com> Date: Fri, 12 Mar 2021 12:33:51 +0100 Subject: [PATCH 14/18] Update src/blended_cg.jl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Mathieu Besançon --- src/blended_cg.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/blended_cg.jl b/src/blended_cg.jl index e6dc1bcfc..1552cef01 100644 --- a/src/blended_cg.jl +++ b/src/blended_cg.jl @@ -346,7 +346,7 @@ function minimize_over_convex_hull( #Rewrite as problem over the simplex M, b = build_reduced_problem(active_set.atoms, hessian, active_set.weights, gradient, tolerance) #Early exit if we have detected that the strong-Wolfe gap is below the desired tolerance while building the reduced problem. - if isnothing(M) && isnothing(b) + if isnothing(M) return 0 end #In case the matrices are DoubleFloats we need to cast them as Float64, because LinearAlgebra does not work with them. From 25484e1b3cf69918f3175bbf0d6ae33090a1e922 Mon Sep 17 00:00:00 2001 From: alejandro-carderera <56440520+alejandro-carderera@users.noreply.github.com> Date: Fri, 12 Mar 2021 12:34:04 +0100 Subject: [PATCH 15/18] Update src/blended_cg.jl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Mathieu Besançon --- src/blended_cg.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/blended_cg.jl b/src/blended_cg.jl index 1552cef01..56b21bbf1 100644 --- a/src/blended_cg.jl +++ b/src/blended_cg.jl @@ -357,7 +357,7 @@ function minimize_over_convex_hull( L_reduced = eigmax(M) #L_reduced = Arpack.eigs(M, nev=1, which=:LM) end - reduced_f(y) = f(x) - dot(gradient, x) + 0.5*transpose(x) * hessian * x + dot(b, y) + 0.5*transpose(y) * M * y + reduced_f(y) = f(x) - dot(gradient, x) + 0.5*transpose(x) * hessian * x + dot(b, y) + 0.5*transpose(y) * M * y function reduced_grad!(storage, x) storage .= b + M*x end From 4e02c0145bb05d3b74fd44e81d47edf0a2ceae8e Mon Sep 17 00:00:00 2001 From: alejandro-carderera <56440520+alejandro-carderera@users.noreply.github.com> Date: Fri, 12 Mar 2021 12:34:48 +0100 Subject: [PATCH 16/18] Update src/blended_cg.jl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Mathieu Besançon --- src/blended_cg.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/blended_cg.jl b/src/blended_cg.jl index 56b21bbf1..68d658be8 100644 --- a/src/blended_cg.jl +++ b/src/blended_cg.jl @@ -557,7 +557,7 @@ function accelerated_simplex_gradient_descent_over_probability_simplex( gamma = (1 - sqrt(q))/(1 + sqrt(q)) end while strong_wolfe_gap > tolerance - @. x_old = x + @. x_old = x reduced_grad!(gradient_y, y) x = projection_simplex_sort(y .- gradient_y/L) if mu < 1.0e-3 From 68eeb501a118fe76272348b92d0e04c16dcfa9f5 Mon Sep 17 00:00:00 2001 From: alejandro-carderera Date: Fri, 12 Mar 2021 12:44:16 +0100 Subject: [PATCH 17/18] Rest of Mathieu's comments. Incorporate the rest of Mathieu's comments from Github. 
--- examples/blended_cg.jl | 4 ++-- src/blended_cg.jl | 15 ++++++++------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/examples/blended_cg.jl b/examples/blended_cg.jl index dc0fe84c5..812f8a31c 100644 --- a/examples/blended_cg.jl +++ b/examples/blended_cg.jl @@ -15,7 +15,7 @@ s = rand(1:100) s = 41 Random.seed!(s) -""" + matrix = rand(n,n) hessian = transpose(matrix) * matrix linear = rand(n) @@ -87,7 +87,7 @@ x, v, primal, dual_gap, trajectoryBCG_convex = FrankWolfe.bcg( data = [trajectoryBCG_accel_simplex, trajectoryBCG_simplex, trajectoryBCG_convex] label = ["BCG (accel simplex)", "BCG (simplex)", "BCG (convex)"] FrankWolfe.plot_trajectories(data, label) -""" + matrix = rand(n,n) diff --git a/src/blended_cg.jl b/src/blended_cg.jl index 68d658be8..a72f1eff3 100644 --- a/src/blended_cg.jl +++ b/src/blended_cg.jl @@ -148,7 +148,7 @@ function bcg( while t <= max_iteration && phi ≥ epsilon # TODO replace with single call interface from function_gradient.jl #Mininize over the convex hull until strong Wolfe gap is below a given tolerance. - num_simplex_descent_steps = minimize_over_convex_hull( + num_simplex_descent_steps = minimize_over_convex_hull!( f, grad!, gradient, @@ -280,7 +280,7 @@ end """ -minimize_over_convex_hull +minimize_over_convex_hull! Given a function f with gradient grad! and an active set active_set this function will minimize the function over @@ -294,7 +294,7 @@ probability simplex using gradient descent or Nesterov's accelerated gradient descent. """ -function minimize_over_convex_hull( +function minimize_over_convex_hull!( f, grad!, gradient, @@ -422,6 +422,11 @@ active set. If λ are the barycentric coordinates of dimension equal to the cardinality of the active set, the objective function is: f(λ) = reduced_linear^T λ + 0.5 * λ^T reduced_hessian λ + +In the case where we find that the current iterate has a strong-Wolfe +gap over the convex hull of the active set that is below the tolerance +we return nothing (as there is nothing to do). + """ function build_reduced_problem(atoms::AbstractVector{<:FrankWolfe.MaybeHotVector}, hessian, weights, gradient, tolerance) n = atoms[1].len @@ -461,9 +466,7 @@ function build_reduced_problem(atoms::AbstractVector{<:SparseArrays.AbstractSpar #Construct the matrix of vertices. vertex_matrix = zeros(n, k) - #reduced_linear = zeros(k) for i in 1:k - #reduced_linear[i] = dot(atoms[i], gradient) vertex_matrix[:, i] .= atoms[i] end reduced_hessian = transpose(vertex_matrix) * hessian * vertex_matrix @@ -488,9 +491,7 @@ function build_reduced_problem(atoms::AbstractVector{<:Array}, hessian, weights, #Construct the matrix of vertices. vertex_matrix = zeros(n, k) - #reduced_linear = zeros(k) for i in 1:k - #reduced_linear[i] = dot(atoms[i], gradient) vertex_matrix[:, i] .= atoms[i] end reduced_hessian = transpose(vertex_matrix) * hessian * vertex_matrix From 6c0c9323535ce3758bae9983ea4535dee150f6c1 Mon Sep 17 00:00:00 2001 From: alejandro-carderera Date: Fri, 12 Mar 2021 13:24:38 +0100 Subject: [PATCH 18/18] Modify tests to account for the new changes. Modified tests to account for the new structure of the code. 
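For the strongly convex branch of `accelerated_simplex_gradient_descent_over_probability_simplex` touched in the patches above, the constant momentum (1 - sqrt(q)) / (1 + sqrt(q)) with q = mu / L is the classical accelerated scheme sketched below. This is a simplified, self-contained illustration assuming known mu and L; it is not the package routine, which also handles the nearly non-strongly-convex case mu ≈ 0 with a different momentum rule and stops on the strong-Wolfe gap rather than a fixed iteration budget.

# Simplified sketch of accelerated projected gradient descent over the probability simplex.
function accelerated_pgd_simplex(grad!, x0, mu, L; maxiter=1000)
    # sort-based projection onto {w : w >= 0, sum(w) = 1}
    project(z) = begin
        u = sort(z, rev=true); c = cumsum(u)
        rho = findlast(i -> u[i] * i > c[i] - 1, 1:length(z))
        max.(z .- (c[rho] - 1) / rho, 0.0)
    end
    q = mu / L
    beta = (1 - sqrt(q)) / (1 + sqrt(q))     # constant momentum for the strongly convex case
    x = copy(x0); y = copy(x0); g = similar(x0)
    for _ in 1:maxiter
        grad!(g, y)
        x_new = project(y .- g ./ L)         # projected gradient step from the extrapolated point
        y = x_new .+ beta .* (x_new .- x)    # momentum extrapolation
        x = x_new
    end
    return x
end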
--- test/active_set.jl | 18 +++++++++--------- test/runtests.jl | 4 ++-- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/test/active_set.jl b/test/active_set.jl index f13f17d10..8aafb2845 100644 --- a/test/active_set.jl +++ b/test/active_set.jl @@ -93,18 +93,19 @@ end x = FrankWolfe.compute_active_set_iterate(active_set) @test x ≈ [0, 0.5] f(x) = (x[1] - 1)^2 + (x[2] - 1)^2 - ∇f(x) = [2 * (x[1] - 1), 2 * (x[2] - 1)] - gradient_dir = ∇f([0, 0.5]) - FrankWolfe.update_simplex_gradient_descent!(active_set, gradient_dir, f) + + gradient = similar(x) + function grad!(storage, x) + storage .= [2 * (x[1] - 1), 2 * (x[2] - 1)] + end + FrankWolfe.simplex_gradient_descent_over_convex_hull(f, grad!, gradient, active_set, 1.0e-3, 1, false, [], 0.0, 0) @test length(active_set) == 2 @test [1, 0] ∈ active_set.atoms @test [0, 1] ∈ active_set.atoms - active_set2 = ActiveSet([(0.5, [0, 0]), (0.0, [0, 1]), (0.5, [1, 0])]) x2 = FrankWolfe.compute_active_set_iterate(active_set2) @test x2 ≈ [0.5, 0] - gradient_dir = ∇f(FrankWolfe.compute_active_set_iterate(active_set2)) - FrankWolfe.update_simplex_gradient_descent!(active_set2, gradient_dir, f, L=4.0) + FrankWolfe.simplex_gradient_descent_over_convex_hull(f, grad!, gradient, active_set2, 1.0e-3, 1, false, [], 0.0, 0) @test length(active_set) == 2 @test [1, 0] ∈ active_set.atoms @test [0, 1] ∈ active_set.atoms @@ -112,9 +113,8 @@ end # updating again (at optimum) triggers the active set emptying for as in (active_set, active_set2) x = FrankWolfe.compute_active_set_iterate(as) - gradient_dir = ∇f(x) - FrankWolfe.update_simplex_gradient_descent!(as, gradient_dir, f) - @test length(active_set) == 1 + number_of_steps = FrankWolfe.simplex_gradient_descent_over_convex_hull(f, grad!, gradient, as, 1.0e-3, 1, false, [], 0.0, 0) + @test number_of_steps == 0 end end diff --git a/test/runtests.jl b/test/runtests.jl index 250869f6f..3b7e685a0 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -17,8 +17,8 @@ include("utils.jl") f(x) = norm(x)^2 gradient = similar(a) grad!(gradient, a) - @test FrankWolfe.backtrackingLS(f, gradient, a, a - b, 1.0) == (1, 0.5) - @test abs(FrankWolfe.segment_search(f, grad!, a, a - b, 1.0)[2] - 0.5) < 0.0001 + @test FrankWolfe.backtrackingLS(f, gradient, a, a - b, 1.0) == (0.5, 1) + @test abs(FrankWolfe.segment_search(f, grad!, a, a - b, 1.0)[1] - 0.5) < 0.0001 end @testset "FrankWolfe.jl" begin
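The final two test changes reflect the updated return order of the line search helpers: the step size (0.5 here) is now the first element of the returned tuple. An illustrative decrease-only halving search with that convention is sketched below; `halving_linesearch` is not `FrankWolfe.backtrackingLS`, only a stand-in to show the (gamma, iterations) ordering.

# Illustrative only: halve gamma until f decreases along the step x - gamma * d.
function halving_linesearch(f, x, d; gamma0=1.0, step_lim=20)
    gamma = gamma0
    fx = f(x)
    it = 0
    while f(x .- gamma .* d) > fx && it < step_lim
        gamma /= 2
        it += 1
    end
    return gamma, it    # step size first, then the number of halvings
end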