In [3]:
using NLPModels, Krylov, LinearOperators

$$
\min \qquad f(x) \\
\mbox{s. a} \qquad c(x) = 0
$$

Newton
$$
\left[\begin{array}{cc}
B_k & A_k^T \\ A_k & 0
\end{array}\right]
\left[\begin{array}{c}
d_x \\ d_{\lambda}
\end{array}\right]
=
\left[\begin{array}{c}
-\nabla f(x^k) - A_k^T\lambda^k \\ -c^k
\end{array}\right]
$$

que podemos escrever como
$$
\left[\begin{array}{cc}
B_k & A_k^T \\ A_k & 0
\end{array}\right]
\left[\begin{array}{c}
d_x \\ \lambda^{k+1}
\end{array}\right]
=
\left[\begin{array}{c}
-\nabla f(x^k) \\ -c^k
\end{array}\right]
$$
Que pode ser visto como
$$
\min \qquad \frac{1}{2}d^TB_kd + d^Tg^k \\
\mbox{suj. a} \qquad A_kd = -c^k
$$
Usando região de confiança, fica
$$
\min \qquad \frac{1}{2}d^TB_kd + d^Tg^k \\
\mbox{suj. a} \qquad A_kd = -c^k \\
\qquad \qquad \Vert d\Vert \leq \Delta_k
$$

O problema dessa estratégia é resolver esse subproblema, que pode nem ter solução.
Uma opção é relaxar essa condição, mudando para
$$
\min \qquad \frac{1}{2}d^TB_kd + d^Tg^k \\
\mbox{suj. a} \qquad A_kd + c^k = r^k \\
\qquad \qquad \Vert d\Vert \leq \Delta_k,
$$
onde $r^k$ deveria ser apenas pequeno suficiente para deixar esse problema factível.
Uma escolha que além disso, também facilita o método é
$r^k = A_k v^k + c^k$, onde $v^k$ é a solução do problema
$$
\min \qquad \frac{1}{2}\Vert A_k v + c^k \Vert^2 \\
\mbox{suj. a} \qquad \Vert v\Vert \leq 0.8\Delta_k,
$$


Substituindo $r^k = A_kv^k + c^k$ na equação, obtemos $A_k(d - v^k) = 0$,
de maneira que podemos fazer
$d = v^k + s$, obtendo a equação $A_k s = 0$. Dessa maneira,
podemos escrever $s = Z_ku$, onde $Z_k$ é uma matriz que gera o espaço nulo
de $A_k$. Assim, $d = v^k + Z_ku$.
Como $v^k$ está no espaço imagem de $A_k^T$ (não é trivial), então
$$\Vert d\Vert^2 = \Vert v^k\Vert^2 + \Vert Z_ku\Vert^2,$$
por pitágoras.

O subproblema em $u$ fica
$$
\min \qquad \frac{1}{2}u^TZ_k^TB_kZ_k + u^TZ_k^T(B_kv^k + g^k) \\
\mbox{suj. a} \qquad \Vert Z_k u\Vert^2 \leq \sqrt{\Delta_k^2 - \Vert v_k\Vert^2}.
$$

Aqui removemos $A_k s = 0$, mas temos que lidar com $Z_k$.
Na implementação, podemos trabalhar com gradientes conjugados precondicionados por uma
matriz que projete no espaço nulo de $A_k$, e resolvemos esse problema aproximadamente.
A implementação é bastante complicada.

Outro ponto, para encontrar $v^k$, podemos resolver esse problema aproximadamente também.

In [4]:
"""stepsize_to_radius(u, v, Δ)

finds τ such that |u + τv| = Δ, under the hypothesis that |u| < Δ.
"""
function stepsize_to_radius(u, v, Δ; eval::Nullable{Function} = Nullable{Function}())
    utu = dot(u, u)
    @assert utu <= Δ
    vtv = dot(v, v)
    @assert vtv > 0
    utv = dot(u, v)
    δ = sqrt(utv^2 - vtv*(utu-Δ^2))
    τ = 0
    if isnull(eval)
        τ = (-utv + δ)/vtv
        if τ < 0
            τ = -(utv + δ)/vtv
        end
    else
        τ1 = (-utv + δ)/vtv
        τ2 = (-utv - δ)/vtv
        q1, q2 = eval(τ1), eval(τ2)
        τ = q1 < q2 ? τ1 : τ2
    end
    return τ
end

stepsize_to_radius

In [5]:
"""d = dogleg_ls(A, c, Δ)

Solves approximately

min ¹/₂|Ad + c|^2
s.t |d| ≦ Δ

using Dogleg.
"""
function dogleg_ls(A, c, Δ)
    dG = -A' * c
    ndG = norm(dG)
    if ndG > Δ
        return (Δ / ndG) * dG
    end
    dN = -Krylov.cgls(A, c)[1]
    if norm(dN) <= Δ
        return dN
    end
    r = dN - dG
    return dG + r * stepsize_to_radius(dG, r, Δ)
end

dogleg_ls

In [6]:
"""d = steihaug_nullspace(B, g, Z, Δ)

Solves

min ¹/₂uᵀZᵀBZu + gᵀZu
s.t.  |Zu| ≦ Δ
"""
function steihaug_nullspace(B, g, Z::LinearOperator, Δ; ϵ=1e-6, itmax=0)
    n = length(g)
    itmax = max(50, 5n)
    Zu = zeros(n)
    r = Z'*g
    t = Krylov.cg(Z' * Z, -r)[1]
    Zp = Z * t
    θ = dot(r, t)
    if θ < ϵ
        γ = dot(Zp, B*Zp)
        #println("Cauchy: γ: $γ")
        if γ <= 0.0
            return -Zp * (Δ/norm(Zp))
        else
            return -Zp * min(Δ/norm(Zp), dot(Zp, g)/γ)
        end
    end
    θ₀ = θ
    iter = 0
    while iter < itmax
        γ = dot(Zp, B*Zp)
        nZp = norm(Zp)
        if γ <= 1e-8*nZp^2
            #println("Neg curvature")
            return Zu + Zp * stepsize_to_radius(Zu, Zp, Δ)
        end
        α = θ/γ
        if norm(Zu + α * Zp) > Δ
            #println("On the border")
            return Zu + Zp * stepsize_to_radius(Zu, p, Δ)
        end
        Zu += α * Zp
        r -= α * Z' * B * Zp
        t = Krylov.cg(Z' * Z, -r)[1]
        θn = dot(t, r)
        if θn < ϵ * θ₀
            #println("Optimal")
            return Zu
        end
        β = θn/θ
        θ = θn
        Zp = Z * t  + β * Zp
        iter += 1
    end
    return Zu
end

steihaug_nullspace

In [14]:
function nullspace(A)
    (Q,R) = qr(A', thin=false)
    m = size(A, 1)
    return LinearOperator(Q[:,m+1:end])
end

# Péssimo
function nullspace(A::SparseMatrixCSC)
    return nullspace(full(A))
end



nullspace (generic function with 2 methods)

In [32]:
function treq(nlp::AbstractNLPModel; itmax=10000, η₁ = 1e-2,
        η₂ = 0.75, σ₁ = 0.25, σ₂ = 4, ϵ = 1e-6)
    x = nlp.meta.x0
    
    f = obj(nlp, x)
    g = grad(nlp, x)
    B = hess_op(nlp, x)
    c = cons(nlp, x)
    m = length(c)
    A = jac(nlp, x) # Note que nao é op
    Z = nullspace(A)
    
    λ = nlp.meta.y0
    ∇L = g + A'*λ
    Δ = max(1.0, min(100.0, 0.1*norm(g)))
    
    μ = 1.0
    
    iter = 0
    while (norm(∇L) > ϵ || norm(c) > ϵ) && (iter <= itmax)
        ## v^k por Dogleg
        v = dogleg_ls(A, c, 0.8Δ)
        s = steihaug_nullspace(B, B*v + g, Z, sqrt(Δ - norm(v)))
        
        d = v + s
        #println("x = $x")
        #println("|c| = $(norm(c))")
        #println("|gL| = $(norm(∇L))")
        #println("v = $v")
        #println("s = $s")
        xt = x + d
        ft = obj(nlp, xt)
        ct = cons(nlp, xt)
        
        q = dot(d,g) + 0.5*dot(d, B*d)
        difc = norm(c) - norm(A*d + c)
        μplus = abs(difc) < 1e-4 ? μ : max(μ, 0.1 + q/difc)
        μplus = 2μ
        
        Ared = f - ft + μ * (norm(c) - norm(ct))
        Pred = -q + μ * difc
        #println("Ared = $Ared, δf = $(f-ft), δc = $(norm(c)-norm(ct))")
        #println("Pred = $Pred")
        #println("μ+ = $μplus")
        ρ = Ared/Pred
        
        #println("ρ = $ρ")
        
        if ρ > η₁
            x = xt
            f = ft
            c = ct
            g = grad(nlp, x)
            B = hess_op(nlp, x)
            A = jac(nlp, x)
            Z = nullspace(A)
            λ = Krylov.cgls(A', -g)[1]
            ∇L = g + A'*λ
            
            μ = μplus
            if ρ > η₂
                Δ *= σ₂
            end
        else
            Δ = max(0.1Δ, max(σ₁ * min(Δ, norm(d))))
        end
        #println("Δ = $Δ")
        iter += 1
    end
    ef = (norm(∇L) <= ϵ && norm(c) <= ϵ) ? 0 : 1
    return x, f, norm(∇L), norm(c), ef, iter
end



treq (generic function with 1 method)

In [33]:
nlp = ADNLPModel(x->dot(x,[1.0;2.0].*x), zeros(2), c=x->[sum(x)-1], lcon=[0.0], ucon=[0.0])

NLPModels.ADNLPModel(Minimization problem Generic
nvar = 2, ncon = 1 (0 linear)
,NLPModels.Counters(0,0,0,0,0,0,0,0,0,0,0),#51,#50)

In [34]:
treq(nlp, itmax=10)

([0.666667,0.333333],0.6666666666666666,0.0,0.0,0,2)

In [35]:
nlp = ADNLPModel(x->(x[1]-1)^2 + 4*(x[2]-x[1]^2)^2, zeros(2),
    c=x->[sum(x)-1], lcon=[0.0], ucon=[0.0])

NLPModels.ADNLPModel(Minimization problem Generic
nvar = 2, ncon = 1 (0 linear)
,NLPModels.Counters(0,0,0,0,0,0,0,0,0,0,0),#55,#54)

In [36]:
treq(nlp)

([0.635817,0.364183],0.13905496564783668,3.4290819297653647e-13,0.0,0,7)

In [37]:
nlp = ADNLPModel(x->dot(x.^3,[1.0;2.0].*x), zeros(2), c=x->[sum((x-1).^2)-1], lcon=[0.0], ucon=[0.0])

NLPModels.ADNLPModel(Minimization problem Generic
nvar = 2, ncon = 1 (0 linear)
,NLPModels.Counters(0,0,0,0,0,0,0,0,0,0,0),#59,#58)

In [38]:
treq(nlp)

([0.0,0.0],0.0,0.0,1.0,1,10001)

In [24]:
using CUTEst
nlp = CUTEstModel("BT1")
treq(nlp, itmax=1000)

([0.999903,0.0139484],-0.9999024980923041,0.013948412473126942,2.191660904274145e-9,1,1001)

In [25]:
finalize(nlp)