In [1]:
using Statistics
using ChainRules
using DiffOpt
using MathOptInterface
using Flux
using Flux: onehotbatch, onecold, crossentropy, throttle
using Base.Iterators: repeated
using OSQP

const MOI = MathOptInterface;

In [2]:
# Load the MNIST images and their digit labels through Flux's dataset helper.
# NOTE(review): presumably the default (no-argument) call returns the training
# split, since the test split is requested explicitly with `:test` later on.
imgs = Flux.Data.MNIST.images()
labels = Flux.Data.MNIST.labels();

In [25]:
## prepare data
# Preprocessing
# Flatten every image into a vector and stack the vectors as matrix columns.
# `reduce(hcat, ...)` builds the same matrix as `hcat(...)` with a splat, but
# avoids passing one function argument per image.
X = reduce(hcat, float.(reshape.(imgs, :)))
Y = onehotbatch(labels, 0:9); # one-hot encode the digit labels 0-9

# Same preprocessing for the held-out test split.
test_X = reduce(hcat, float.(reshape.(Flux.Data.MNIST.images(:test), :)))
test_Y = onehotbatch(Flux.Data.MNIST.labels(:test), 0:9);

In [33]:
# define the NN
# Two dense layers: 28^2 = 784 flattened pixel inputs -> 32 hidden units (relu)
# -> 10 outputs (one per digit class 0-9), followed by softmax.
m = Chain(
  Dense(28^2, 32, relu),
  Dense(32, 10),
  softmax
)

Chain(Dense(784, 32, NNlib.relu), Dense(32, 10), NNlib.softmax)

In [34]:
# Cross-entropy between the network output for `x` and the one-hot targets `y`.
function loss(x, y)
    return crossentropy(m(x), y)
end

opt = ADAM(); # optimiser handed to Flux.train!

# Fraction of samples whose predicted class equals the labelled class.
function accuracy(x, y)
    predicted = onecold(m(x))
    expected = onecold(y)
    return mean(predicted .== expected)
end

dataset = repeated((X, Y), 20) # the same full batch, 20 times over
evalcb = () -> @show(loss(X, Y)) # callback that prints the current training loss

#11 (generic function with 1 method)

In [35]:
# Train `m` on every entry of `dataset` with optimiser `opt`; the loss callback is
# wrapped in `throttle(evalcb, 5)` so it is not invoked on every step.
Flux.train!(loss, params(m), dataset, opt, cb = throttle(evalcb, 5)); #took me ~5 minutes to train on CPU

loss(X, Y) = 2.270676f0 (tracked)
loss(X, Y) = 1.9413667f0 (tracked)
loss(X, Y) = 1.6438714f0 (tracked)
loss(X, Y) = 1.384434f0 (tracked)


In [36]:
# Report classification accuracy on the training data and on the test data.
@show accuracy(X,Y)
@show accuracy(test_X, test_Y);

accuracy(X, Y) = 0.6924
accuracy(test_X, test_Y) = 0.696


In [2]:
"""
    msin(x)

Return `sin(x)`. A thin wrapper used to experiment with custom
differentiation rules.
"""
msin(x) = sin(x)

msin (generic function with 1 method)

In [3]:
# Sanity check: sin(π/2) should evaluate to 1.0.
msin(π/2)

1.0

In [41]:
# Reverse-mode rule for `msin`.
#
# Returns the primal value `y = msin(x)` together with a pullback. The pullback
# maps the output cotangent `ȳ` to `(NO_FIELDS, ȳ * cos(x))`: `msin` itself has no
# fields to differentiate, and d/dx sin(x) = cos(x).
#
# NOTE(review): written as a bare `rrule`, this defines a new function in the
# current module (the cell output reports "1 method"); qualify the name with the
# ChainRules module if the rule is meant to extend the package's `rrule`.
function rrule(::typeof(msin), x)
    y = msin(x)
    function msin_pullback(ȳ)
        # Chain rule: scale the incoming cotangent by the local derivative.
        return NO_FIELDS, ȳ * cos(x)
    end
    # Return the local primal `y`, not the global label matrix `Y`.
    return y, msin_pullback
end

rrule (generic function with 1 method)

In [4]:
# Derivative of `msin` at `x`, taken as the first entry of what `Flux.gradient` returns.
function ∇msin(x)
    grads = Flux.gradient(msin, x)
    return grads[1]
end

∇msin (generic function with 1 method)

In [5]:
# Sanity check: the derivative of sin at π is cos(π) = -1.
∇msin(π) 

-1.0 (tracked)

## Simple linear regression example

In [66]:
W = rand(2, 5)
b = rand(2)

# Affine model: maps a length-5 input to a length-2 output using `W` and `b`.
function predict(x)
    return W * x .+ b
end

# Sum of squared errors between the target `y` and the prediction for `x`.
function loss(x, y)
    residual = y .- predict(x)
    return sum(residual .^ 2)
end

x, y = rand(5), rand(2) # Dummy data
loss(x, y)

8.71916280814926

In [67]:
using Flux.Tracker

# Re-bind the parameters as tracked values so gradients can be taken through them.
W = param(W)
b = param(b)

# Gradient of the loss on the dummy data. Only `W` is passed to `params` here.
# NOTE(review): include `b` as well if the bias is meant to be trained too.
gs = Tracker.gradient(() -> loss(x, y), params(W))

Grads(...)


In [68]:
using Flux.Tracker: update!

# Gradient of the loss with respect to W.
Δ = gs[W]

# Update the parameter and reset the gradient
update!(W, -0.1Δ)

loss(x, y) # should be lower than before the update; exact value depends on the random data

3.1199540483701917 (tracked)

## DiffOpt and Flux

In [72]:
# Data for a small quadratic program that `myLayer` builds from these values:
# quadratic cost matrix Q, linear cost q, and one inequality row G with bound h.
# NOTE(review): `m` was bound to the trained `Chain` earlier in this notebook;
# assigning the integer here rebinds that global, so the earlier
# `loss(x, y)` / `accuracy` definitions that call `m(x)` no longer work afterwards.
n = 2 # variable dimension
m = 1; # no of inequality constraints

Q = [4. 1.;1. 2.]
q = [1.; 1.]
G = [1. 1.;]
h = [-1.;]   # initial values set

# Build and solve a quadratic program with OSQP wrapped in a DiffOpt
# differentiable optimizer.
#
# Arguments
# - `n`: number of decision variables.
# - `m`: number of inequality constraints; rows `1:m` of `G` and entries `1:m`
#        of `h` are used.
# - `Q`: matrix of quadratic cost coefficients (only entries with `i <= j` are read).
# - `q`: vector of linear cost coefficients.
# - `G`, `h`: each row `i` adds the constraint `G[i, :]' * x <= h[i]`.
#
# Returns `(model, x̄)`: the solved optimizer and the primal solution vector.
# Throws an error if the solver does not report `OPTIMAL` or `LOCALLY_SOLVED`.
function myLayer(n,m,Q,q,G,h)
    # create the optimizer
    model = diff_optimizer(OSQP.Optimizer)
    x = MOI.add_variables(model, n)

    # define objective
    quad_terms = MOI.ScalarQuadraticTerm{Float64}[]
    for i in 1:n
        for j in i:n # indexes (i,j), (j,i) will be mirrored. specify only one kind
            push!(quad_terms, MOI.ScalarQuadraticTerm(Q[i, j], x[i], x[j]))
        end
    end

    objective_function = MOI.ScalarQuadraticFunction(
        MOI.ScalarAffineTerm.(q, x),
        quad_terms,
        0.0,
    )
    MOI.set(
        model,
        MOI.ObjectiveFunction{MOI.ScalarQuadraticFunction{Float64}}(),
        objective_function,
    )
    MOI.set(model, MOI.ObjectiveSense(), MOI.MIN_SENSE)

    # add every inequality row, not just the first one
    for i in 1:m
        MOI.add_constraint(
            model,
            MOI.ScalarAffineFunction(MOI.ScalarAffineTerm.(G[i, :], x), 0.0),
            MOI.LessThan(h[i]),
        )
    end

    # solve
    MOI.optimize!(model)

    # sanity-check: an explicit error is used instead of `@assert`, because
    # assertions are meant for internal invariants and may be disabled.
    status = MOI.get(model, MOI.TerminationStatus())
    if !(status in (MOI.LOCALLY_SOLVED, MOI.OPTIMAL))
        error("myLayer: solver terminated with status $status")
    end

    x̄ = MOI.get(model, MOI.VariablePrimal(), x)
    return model, x̄
end

myLayer (generic function with 1 method)

In [61]:
# Solve the QP defined by the globals above; keep the solved model and its primal solution.
model, x̄ = myLayer(n,m,Q,q,G,h)

-----------------------------------------------------------------
           OSQP v0.6.2  -  Operator Splitting QP Solver
              (c) Bartolomeo Stellato,  Goran Banjac
        University of Oxford  -  Stanford University 2021
-----------------------------------------------------------------
problem:  variables n = 2, constraints m = 1
          nnz(P) + nnz(A) = 5
settings: linear system solver = qdldl,
          eps_abs = 1.0e-03, eps_rel = 1.0e-03,
          eps_prim_inf = 1.0e-04, eps_dual_inf = 1.0e-04,
          rho = 1.00e-01 (adaptive),
          sigma = 1.00e-06, alpha = 1.60, max_iter = 4000
          check_termination: on (interval 25),
          scaling: on, scaled_termination: off
          warm start: on, polish: off, time_limit: off

iter   objective    pri res    dua res    rho        time
   1  -2.3138e-01   1.79e-01   4.00e-01   1.00e-01   2.44e-05s
  50  -1.2500e-01   3.03e-13   5.59e-12   7.14e+00   4.26e-05s

status:               solved
number of iterations:

(Optimizer{MOIB.LazyBridgeOptimizer{MOIU.CachingOptimizer{OSQP.MOIOSQP.Optimizer,MOIU.UniversalFallback{MOIU.Model{Float64}}}}}, [-0.25, -0.75])

In [76]:
# Objective value of the QP evaluated at x̄ (adds a one-argument method to `loss`).
loss(x̄) = x̄'*Q*x̄/2 + q'*x̄
# NOTE(review): this call fails with a MethodError (see the output below): the
# zero-argument closure is invoked with a tracked copy of `h[1]`.
# NOTE(review): x̄ is also a plain value already returned by the solver, so the
# closure does not depend on `h[1]` — presumably a custom differentiation rule
# for `myLayer` is needed before this gradient is meaningful.
Tracker.gradient(() -> loss(x̄), h[1])

MethodError: MethodError: no method matching (::getfield(Main, Symbol("##37#38")))(::Tracker.TrackedReal{Float64})
Closest candidates are:
  #37() at In[76]:2

In [69]:
# NOTE(review): unfinished placeholder — the body is a copy of the `msin` rule and
# does not differentiate `myLayer` yet.
# - The parameters `dA`, `db`, `dc` are never used in the body.
# - `x` is not a parameter of this method, so `msin(x)` reads whatever global `x`
#   exists at call time.
# - `Y` (capital) is not the local `y`; it resolves to a global.
# - The pullback returns `asin(ȳ)`, which is unrelated to the QP solution.
# TODO: replace with a rule that takes `myLayer`'s own arguments and obtains the
# sensitivities from the differentiable optimizer.
function rrule(::typeof(myLayer), dA, db, dc)
    y = msin(x)
    function msin_backward(ȳ)
        return asin(ȳ)
    end
    function msin_pullback(ȳ)
        return NO_FIELDS, msin_backward(ȳ)
    end
    return Y, msin_pullback
end

rrule (generic function with 1 method)