# Restrições não-lineares de igualdade

Buscamos
$$
\min \quad f(x) \qquad \mbox{s. a} \qquad c(x) = 0.
$$

As condições KKT desse problema é
$$
\nabla f(x) + J(x)^T\lambda = 0 \\
c(x) = 0
$$

Definindo $L(x,\lambda) = f(x) + \lambda^Tc(x)$, também podemos escrever
$$ \nabla_{x,\lambda} L(x,\lambda) = 0. $$
Agora, vamos ver o Newton no sistema KKT:
$$
\left[\begin{array}{cc}
\nabla_{xx}^2\ L(x^k,\lambda^k) & J(x^k)^T \\
J(x^k) & 0
\end{array}\right]
\left[\begin{array}{c}
d_x^k \\ d_{\lambda}^k
\end{array}\right]
= -
\left[\begin{array}{c}
\nabla_x L(x^k,\lambda^k) \\ c(x^k)
\end{array}\right]
$$

Chamando $B_k = \nabla_{xx}^2 L(x^k,\lambda^k)$, $\zeta^k = \nabla_x L(x^k,\lambda^k)$,
$A_k = J(x^k)$ e $c^k = c(x^k)$, podemos escrever
$$
B_kd_x^k + \zeta^k + A_k^Td_{\lambda}^k = 0 \\
A_kd_x^k + c^k = 0
$$

Por acaso, $(d_x^k,d_\lambda^k)$ é solução também de
$$
\min\ \frac{1}{2}d^TB_kd + d^T\zeta^k \\
A_kd + c^k = 0.
$$
Mas como
$$
L(x^k + d,\lambda^k) = L(x^k,\lambda^k) + d^T\zeta^k + \frac{1}{2}d^TB_kd + o(\Vert d\Vert^2) \\
c(x^k + d) = c^k + A_kd + o(\Vert d\Vert),
$$
então esse problema pode ser enxergado como um modelo em torno de $x^k$.

In [1]:
using ForwardDiff, Plots
gr()

Plots.GRBackend()

In [2]:
f(x) = (1-x[1])^2 + 100*(x[2]-x[1]^2)^2
c(x) = [(x[1]+0.5)^2 + (x[2]+0.5)^2 - 1]
x = zeros(2)
λ = zeros(1)

1-element Array{Float64,1}:
 0.0

In [3]:
contour(linspace(-1.5,0.8,100), linspace(-2,1,100), (x,y)->f([x;y]), leg=false, levels=100)
contour!(linspace(-1.5,0.8,100), linspace(-2,1,100), (x,y)->c([x;y])[1], levels=[-0.01;0;0.01], leg=false)
scatter!([x[1]], [x[2]], c=:blue)

[Plots.jl] Initializing backend: gr


In [4]:
∇f(x) = ForwardDiff.gradient(f, x)
H(x) = ForwardDiff.hessian(f, x)
J(x) = ForwardDiff.jacobian(c, x)
∇L(x,λ) = ∇f(x) + J(x)'*λ
W(x,λ) = H(x) + λ[1]*ForwardDiff.hessian(x->c(x)[1], x)

W (generic function with 1 method)

In [5]:
d = -[W(x,λ) J(x)'; J(x) zeros(1,1)]\[∇L(x,λ); c(x)]
dx, dλ = d[1:2], d[3:3]

([0.504950495049505,-0.0049504950495049506],[0.9900990099009901])

In [6]:
scatter!([x[1] + d[1]], [x[2] + d[2]], c=:red)

In [7]:
plot(t->f(x+t*dx), 0, 1, lab="f")

In [8]:
plot(t->norm(c(x+t*dx),1), 0, 1, lab="||c||_1")

$$
\Phi(x,\mu) = f(x) + \mu\left\Vert c(x)\right\Vert_1 \\
= f(x) + \mu \sum_{i=1}^m \left\vert c_i(x)\right\vert
$$

Armijo:
$$
\Phi(x+td,\mu) \leq \Phi(x,\mu) + \alpha tD\Phi(x,v,\mu),
$$
onde
$$
D\Phi(x,v,\mu) = \lim_{h\rightarrow 0^+} \frac{\Phi(x+hv,\mu) - \Phi(x,\mu)}{h}.
$$
Daí,
$$
\lim_{h\rightarrow0^+} \frac{\left\vert c_i(x+hv)\right\vert - \left\vert c_i(x)\right\vert}{h}
$$
Se $c_i(x) > 0$, vira $\nabla c_i(x)^Tv$, se $c_i(x) < 0$, vira $-\nabla c_i(x)^Tv$, e se
$c_i(x) = 0$, vira $\left\vert \nabla c_i(x)^Tv\right\vert$.

In [9]:
Φ(x,μ) = f(x) + μ*norm(c(x), 1)
#function DΦ(x,v,μ)
#    d = dot(∇f(x),v)
#    cx = c(x)
#    Jx = J(x)
#    for i = 1:length(cx)
#        if cx[i] != 0
#            d += μ*dot(Jx[i,:][:],v)*sign(cx[i])
#        else
#            d += μ*abs(dot(Jx[i,:][:],v))
#        end
#    end
#    return d
#end
function DΦ(x,v,μ)
    cx = c(x)
    Jxv = J(x)*v
    return dot(∇f(x),v) + μ*dot(Jxv, [cx[i] != 0 ? sign(cx[i]) : sign(Jxv[i]) for i=1:length(cx)])
end

DΦ (generic function with 1 method)

In [10]:
μ = 1.0
plot(t->Φ(x+t*dx,μ), 0, 1, lab="Phi")
plot!(t->Φ(x,μ) + 0.5*t*DΦ(x,dx,μ), 0, 1, ls=:dash)

In [11]:
t = 1.0
while Φ(x+t*dx,μ) > Φ(x,μ) + 0.5*t*DΦ(x,dx,μ)
    t = t * 0.9
end
t

0.43046721000000016

In [12]:
contour(linspace(-1.5,0.8,100), linspace(-2,1,100), (x,y)->f([x;y]), leg=false, levels=100)
contour!(linspace(-1.5,0.8,100), linspace(-2,1,100), (x,y)->c([x;y])[1], levels=[-0.01;0;0.01], leg=false)
scatter!([x[1]], [x[2]], c=:blue)
scatter!([x[1] + t*d[1]], [x[2] + t*d[2]], c=:red)

In [13]:
x = zeros(2)
λ = zeros(1)
d = -[W(x,λ) J(x)'; J(x) zeros(1,1)]\[∇L(x,λ); c(x)]
dx, dλ = d[1:2], d[3:3]
x = t*dx
λ = t*dλ
x, λ

([0.2173646307920793,-0.0021310257920792086],[0.4262051584158417])

In [14]:
d = -[W(x,λ) J(x)'; J(x) zeros(1,1)]\[∇L(x,λ); c(x)]
dx, dλ = d[1:2], d[3:3]

([0.10002521834943136,0.09440773397958697],[-0.8174339532110266])

In [15]:
contour(linspace(-1.5,0.8,100), linspace(-2,1,100), (x,y)->f([x;y]), leg=false, levels=100)
contour!(linspace(-1.5,0.8,100), linspace(-2,1,100), (x,y)->c([x;y])[1], levels=[-0.01;0;0.01], leg=false)
scatter!([x[1]], [x[2]], c=:blue)
scatter!([x[1] + t*d[1]], [x[2] + t*d[2]], c=:red)

In [16]:
function newton_method(f, x0, c; tol = 1e-5, max_iter = 1000, max_time = 60)
    exit_flag = 0
    ∇f(x) = ForwardDiff.gradient(f, x)
    H(x) = ForwardDiff.hessian(f, x)
    J(x) = ForwardDiff.jacobian(c, x)
    μ = 1e-2
    
    x = copy(x0) # Cópia de x0
    iter = 0
    start_time = time()
    elapsed_time = 0.0
    fx = f(x)
    ∇fx = ∇f(x)
    B = H(x)
    cx = c(x)
    Jx = J(x)
    
    n = length(x0)
    m = length(cx)
    λ = zeros(m)

    ∇Lx = ∇fx + Jx'*λ
    
    Φ(x,μ) = f(x) + μ*norm(c(x), 1)
    while norm(∇Lx) > tol || norm(c(x), 1) > tol
        dtil = -[B Jx'; Jx zeros(m,m)]\[∇Lx; cx]
        d = dtil[1:n]
        dλ = dtil[n+1:n+m]
        Jxd = Jx*d
        
        DPx = 0.0
        for i = 1:m
            DPx += Jxd[i]*(cx[i] != 0 ? sign(cx[i]) : sign(Jxd[i]))
        end
        DΦx = dot(∇fx,d)
        #μ = DPx != 0.0 ? max(-DΦx/DPx, norm(λ,Inf)) : norm(λ,Inf)
        while DΦx + μ*DPx >= 0
            μ *= 2
        end
        DΦx += μ*DPx
        Φx = Φ(x,μ)
        
        if DΦx >= 0
            exit_flag = -1
            break
        end

        t = 1.0
        while Φ(x + t*d,μ) > Φx + 0.5*t*DΦx
            t = t*0.9
        end
        x = x + t*d
        
        fx = f(x)
        ∇fx = ∇f(x)
        B = H(x)
        cx = c(x)
        Jx = J(x)
        λ += t*dλ
        ∇Lx = ∇fx + Jx'*λ
        iter = iter + 1
        if iter >= max_iter
            exit_flag = 1
            break
        end
        elapsed_time = time() - start_time
        if elapsed_time >= max_time
            exit_flag = 2
            break
        end
    end
    return x, fx, ∇Lx, cx, exit_flag, iter, elapsed_time # Precisamos retornar o ponto encontrado
end

newton_method (generic function with 1 method)

In [17]:
f(x) = (1-x[1])^2 + 100*(x[2]-x[1]^2)^2
c(x) = [(x[1]+0.5)^2 + (x[2]+0.5)^2 - 1]
x₀ = -ones(2)
x, fx, gx, cx, ef, iter, el_time = newton_method(f, x₀, c)

([0.30690063400691214,0.09068719881658531],0.4816122914401558,[1.294501471016396e-6,-2.7421156956197734e-6],[6.540989971881572e-12],0,43,0.02451300621032715)

In [18]:
contour(linspace(-1.5,0.8,100), linspace(-2,1,100), (x,y)->f([x;y]), leg=false, levels=100)
contour!(linspace(-1.5,0.8,100), linspace(-2,1,100), (x,y)->c([x;y])[1], levels=[-0.01;0;0.01], leg=false)
scatter!([x₀[1]], [x₀[2]], c=:blue)
scatter!([x[1]], [x[2]], c=:red)

In [19]:
c(x) = [x[2] + x[1]^3 - 0.1]
x₀ = [0.2; -1.0]
x, fx, gx, cx, ef, iter, el_time = newton_method(f, x₀, c)

([0.2900471081744514,0.07559911267688425],0.5113061490866652,[9.522937629014017e-7,-2.0585977367204578e-11],[8.95672425116345e-14],0,19,0.005377054214477539)

In [20]:
contour(linspace(-1.5,0.8,100), linspace(-2,1,100), (x,y)->f([x;y]), leg=false, levels=100)
contour!(linspace(-1.5,0.8,100), linspace(-2,1,100), (x,y)->c([x;y])[1], levels=[-0.01;0;0.01], leg=false)
scatter!([x₀[1]], [x₀[2]], c=:blue)
scatter!([x[1]], [x[2]], c=:red)

## O sistema (Quase-)Newton-KKT

Como vimos, os passos obtidos aqui são solução de
$$
\left[\begin{array}{cc}
B_k & A_k^T \\
A_k & 0
\end{array}\right]
\left[\begin{array}{c}
d_x^k \\ d_{\lambda}^k
\end{array}\right]
= -
\left[\begin{array}{c}
\zeta^k \\ c^k
\end{array}\right]
$$

Então a resolução desse sistema é bastante importante. Temos algumas opções.
Uma delas é fazer a decomposição $LDL^T$.

Outra opção é separar esses valores, obtendo
$$
(A_kB_k^{-1}A_k^T)d_\lambda^k = c^k - A_kB_k^{-1}\zeta^k \\
d_x^k = -B_k^{-1}(A_k^Td_{\lambda}^k + \zeta^k)
$$

Essa opção pode ser viável se

- $B_k^{-1}$ é fácil de se obter, e.g., $B_k$ é diagonal ou bloco-diagonal 1x1, 2x2.
- Se $m$ é muito menor que $n$, de modo que a matriz do sistema seja razoavelmente rápida
de se obter, e o sistema fica bastante rápido e pequeno.
- Se $B_k^{-1}$ é conhecida explicitamente, como em DFP ou BFGS. Fazendo $H_k = B_k^{-1}$.

Ainda assim, será necessário resolver um sistema. Nesse caso, no entanto, a matriz é
definida positiva, então é possível utilizar um método iterativo.

# Método de Filtro

Outra estratégia é pensar em $f$ e $c$ como dois objetivos.
Como $c$ não é uma função real, vamos, novamente, penalizá-la,
chamando a penalização de $h$. (e.g. $h(x) = \left\Vert c(x) \right\Vert$.

$$
\min\ f(x) \qquad \mbox{e} \qquad \min\ h(x).
$$

A teoria segue a de minimização multi-objetivo.
A ideia é manter um conjunto de pontos que são bons o suficiente, num
chamado filtro.

In [21]:
f(x) = (1-x[1])^2 + 100*(x[2]-x[1]^2)^2
c(x) = [(x[1]+0.5)^2 + (x[2]+0.5)^2 - 1]
Filtro = Any[[0.5;0.25], [-0.5;-1.6], -ones(2)]

3-element Array{Any,1}:
 [0.5,0.25] 
 [-0.5,-1.6]
 [-1.0,-1.0]

In [22]:
contour(linspace(-1.5,0.8,100), linspace(-2,1,100), (x,y)->f([x;y]), leg=false, levels=100)
contour!(linspace(-1.5,0.8,100), linspace(-2,1,100), (x,y)->c([x;y])[1], levels=[-0.01;0;0.01], leg=false)
xs = [x[1] for x in Filtro]
ys = [x[2] for x in Filtro]
scatter!(xs, ys, c=:blue)

Olhamos para pares $(f_i,h_i)$, onde $f_i = f(x_i)$ e $h_i = h(x_i)$ para
$x_i \in \mathcal{F}$.

In [78]:
h(x) = norm(c(x), 1)
plot()
maxf = 0.0
maxh = 0.0
for (i,x) in enumerate(Filtro)
    fi, hi = f(x), h(x)
    maxf = max(fi, maxf)
    maxh = max(hi, maxh)
    scatter!([fi], [hi], ms=4, c=:blue, leg=false, ann=(fi+5, hi, text(i)))
end
xlabel!("f")
ylabel!("h")
plot!()

O terceiro ponto é pior tanto no valor de $f$ quanto na factibilidade em relação ao ponto 2.
Então podemos removê-lo do nosso filtro.
Dizemos que o ponto 3 está dominado pelo ponto 2 ($f(x_2) < f(x_3)$ e $h(x_2) < h(x_3)$,
e definimos o filtro como **um conjunto de pontos não dominados**.

In [47]:
function eh_dominado(x, F, f, h)
    for y in F
        if f(y) < f(x) && h(y) < h(x)
            return true
        end
    end
    return false
end

eh_dominado (generic function with 1 method)

In [50]:
for x in Filtro
    if eh_dominado(x, Filtro, f, h)
        println("$x é dominado")
    end
end

[-1.0,-1.0] é dominado


Nosso filtro deveria ser apenas

In [74]:
Filtro = Any[[0.5;0.25], [-0.5;-1.6]]

2-element Array{Any,1}:
 [0.5,0.25] 
 [-0.5,-1.6]

Ao invés de remover elementos desnecessários no filtro, vamos colocar apenas os necessários.

In [68]:
function atualiza_filtro(F, x, f, h)
    n = length(F)
    remover = []
    for i = 1:n
        # Se x é dominado, não o adicione
        if f(F[i]) <= f(x) && h(F[i]) <= h(x)
            return false # false pra indicar que x não foi adicionado
        elseif f(x) <= f(F[i]) && h(x) <= h(F[i])
            # Por outro lado, se x domina alguém, remova este outro ponto.
            push!(remover, i)
        end
    end
    deleteat!(F, remover)
    push!(F, x)
    return true
end

atualiza_filtro (generic function with 1 method)

In [69]:
atualiza_filtro(Filtro, -ones(2), f, h)

false

In [70]:
Filtro

2-element Array{Any,1}:
 [0.5,0.25] 
 [-0.5,-1.6]

In [75]:
atualiza_filtro(Filtro, [0.4; 0.3], f, h)

true

In [76]:
Filtro

3-element Array{Any,1}:
 [0.5,0.25] 
 [-0.5,-1.6]
 [0.4,0.3]  

In [79]:
h(x) = norm(c(x), 1)
plot()
maxf = 0.0
maxh = 0.0
for (i,x) in enumerate(Filtro)
    fi, hi = f(x), h(x)
    maxf = max(fi, maxf)
    maxh = max(hi, maxh)
    scatter!([fi], [hi], ms=4, c=:blue, leg=false, ann=(fi+5, hi, text(i)))
end
xlabel!("f")
ylabel!("h")
plot!()

In [80]:
contour(linspace(-1.5,0.8,100), linspace(-2,1,100), (x,y)->f([x;y]), leg=false, levels=100)
contour!(linspace(-1.5,0.8,100), linspace(-2,1,100), (x,y)->c([x;y])[1], levels=[-0.01;0;0.01], leg=false)
xs = [x[1] for x in Filtro]
ys = [x[2] for x in Filtro]
scatter!(xs, ys, c=:blue)

In [83]:
for a = -1.5:0.01:1.0
    for b = -1.5:0.01:1.0
        atualiza_filtro(Filtro, [a;b], f, h)
    end
end

In [84]:
h(x) = norm(c(x), 1)
plot()
maxf = 0.0
maxh = 0.0
for (i,x) in enumerate(Filtro)
    fi, hi = f(x), h(x)
    maxf = max(fi, maxf)
    maxh = max(hi, maxh)
    scatter!([fi], [hi], ms=4, c=:blue, leg=false, ann=(fi+5, hi, text(i)))
end
xlabel!("f")
ylabel!("h")
plot!()

In [85]:
contour(linspace(-1.5,0.8,100), linspace(-2,1,100), (x,y)->f([x;y]), leg=false, levels=100)
contour!(linspace(-1.5,0.8,100), linspace(-2,1,100), (x,y)->c([x;y])[1], levels=[-0.01;0;0.01], leg=false)
xs = [x[1] for x in Filtro]
ys = [x[2] for x in Filtro]
scatter!(xs, ys, c=:blue)

In [89]:
function newton_method_filter(f, x0, c; tol = 1e-5, max_iter = 1000, max_time = 60)
    exit_flag = 0
    ∇f(x) = ForwardDiff.gradient(f, x)
    H(x) = ForwardDiff.hessian(f, x)
    J(x) = ForwardDiff.jacobian(c, x)
    Filter = Any[x0]
    
    x = copy(x0) # Cópia de x0
    iter = 0
    start_time = time()
    elapsed_time = 0.0
    fx = f(x)
    ∇fx = ∇f(x)
    B = H(x)
    cx = c(x)
    Jx = J(x)
    h(x) = norm(c(x), 1)
    
    n = length(x0)
    m = length(cx)
    λ = zeros(m)

    ∇Lx = ∇fx + Jx'*λ
    
    Φ(x,μ) = f(x) + μ*norm(c(x), 1)
    while norm(∇Lx) > tol || norm(c(x), 1) > tol
        dtil = -[B Jx'; Jx zeros(m,m)]\[∇Lx; cx]
        d = dtil[1:n]
        dλ = dtil[n+1:n+m]
        Jxd = Jx*d
        
        t = 1.0
        while atualiza_filtro(Filtro, x + t*d, f, h)
            t = t*0.9
        end
        x = x + t*d
        
        fx = f(x)
        ∇fx = ∇f(x)
        B = H(x)
        cx = c(x)
        Jx = J(x)
        λ += t*dλ
        ∇Lx = ∇fx + Jx'*λ
        iter = iter + 1
        if iter >= max_iter
            exit_flag = 1
            break
        end
        elapsed_time = time() - start_time
        if elapsed_time >= max_time
            exit_flag = 2
            break
        end
    end
    return x, fx, ∇Lx, cx, exit_flag, iter, elapsed_time # Precisamos retornar o ponto encontrado
end

newton_method_filter (generic function with 1 method)

In [90]:
f(x) = (1-x[1])^2 + 100*(x[2]-x[1]^2)^2
c(x) = [(x[1]+0.5)^2 + (x[2]+0.5)^2 - 1]
x₀ = -ones(2)
x, fx, gx, cx, ef, iter, el_time = newton_method_filter(f, x₀, c)

([0.3069006279526664,0.09068720708138664],0.4816122914440067,[7.512613697802095e-8,-1.3177610891368374e-7],[1.6431300764452317e-14],0,19,0.7611989974975586)