In [1]:
using JuMP
import DiffOpt
import HiGHS
import ChainRulesCore
import Flux
import MLDatasets
import Statistics
import Base.Iterators: repeated
using LinearAlgebra

In [16]:
function matrix_relu(
    y::Matrix;
    model = Model(() -> DiffOpt.diff_optimizer(HiGHS.Optimizer))
)
    N, M = size(y)
    empty!(model)
    set_silent(model)
    @variable(model, x[1:N, 1:M] >= 0)
    @objective(model, Min, x[:]'x[:] -2y[:]'x[:])
    optimize!(model)
    return value.(x)
end

matrix_relu (generic function with 1 method)

In [17]:
function ChainRulesCore.rrule(::typeof(matrix_relu), y::Matrix{T}) where T
    model = Model(() -> DiffOpt.diff_optimizer(HiGHS.Optimizer))
    pv = matrix_relu(y, model = model)
    function pullback_matrix_relu(dl_dx)
        # some value from the backpropagation (e.g., loss) is denoted by `l`
        # so `dl_dy` is the derivative of `l` wrt `y`
        x = model[:x] ## load decision variable `x` into scope
        dl_dy = zeros(T, size(dl_dx))
        dl_dq = zeros(T, size(dl_dx))
        # set sensitivities
        MOI.set.(model, DiffOpt.BackwardInVariablePrimal(), x[:], dl_dx[:])
        # compute grad
        DiffOpt.backward(model)
        # return gradient wrt objective function parameters
        obj_exp = MOI.get(model, DiffOpt.BackwardOutObjective())
        # coeff of `x` in q'x = -2y'x
        dl_dq[:] .= JuMP.coefficient.(obj_exp, x[:])
        dq_dy = -2 ## dq/dy = -2
        dl_dy[:] .= dl_dq[:] * dq_dy
        return (ChainRulesCore.NoTangent(), dl_dy,)
    end
    return pv, pullback_matrix_relu
end

In [2]:
N = 30
imgs = MLDatasets.MNIST.traintensor(1:N)
labels = MLDatasets.MNIST.trainlabels(1:N);

In [3]:
train_X = float.(reshape(imgs, size(imgs, 1) * size(imgs, 2), N)) ## stack all the images
train_Y = Flux.onehotbatch(labels, 0:9);

test_imgs = MLDatasets.MNIST.testtensor(1:N)
test_X = float.(reshape(test_imgs, size(test_imgs, 1) * size(test_imgs, 2), N))
test_Y = Flux.onehotbatch(MLDatasets.MNIST.testlabels(1:N), 0:9);

In [229]:
m = Flux.Chain(
    Flux.Dense(784, 50, Flux.relu), #784 being image linear dimension (28 x 28)
    Flux.Dense(50, 10), # 10 beinf the number of outcomes (0 to 9)
    Flux.softmax,
)

Chain(
  Dense(784, 50, relu),                 [90m# 39_250 parameters[39m
  Dense(50, 10),                        [90m# 510 parameters[39m
  NNlib.softmax,
)[90m                   # Total: 4 arrays, [39m39_760 parameters, 155.562 KiB.

In [197]:
@info size(m(train_X))
@info size(train_Y)

┌ Info: (10, 1000)
└ @ Main In[197]:1
┌ Info: (10, 1000)
└ @ Main In[197]:2


In [195]:
epochs = 5
# epochs = 50 # ~1 minute (i7 8th gen with 16gb RAM)
# epochs = 100 # leads to 77.8% in about 2 minutes

dataset = repeated((train_X, train_Y), epochs);

In [61]:
custom_loss(x, y) = Flux.crossentropy(m(x), y)
opt = Flux.ADAM()
evalcb = () -> @show(custom_loss(train_X, train_Y))

#42 (generic function with 1 method)

In [62]:
@time Flux.train!(custom_loss, Flux.params(m), dataset, opt, cb = Flux.throttle(evalcb, 5));

custom_loss(train_X, train_Y) = 2.2152708f0
  0.232270 seconds (362.10 k allocations: 43.637 MiB, 78.62% compilation time)


In [63]:
accuracy(x, y) = Statistics.mean(Flux.onecold(m(x)) .== Flux.onecold(y));

In [64]:
accuracy(train_X, train_Y)

0.422

In [65]:
accuracy(test_X, test_Y)

0.41

## using svm at last

In [4]:
struct SVM{M<:AbstractMatrix, B}
    weight::M
    bias::B
    alpha::Number
    function SVM(W::M, b::B, α) where {M<:AbstractMatrix, B}
        new{M,B}(W, b, α)
    end
end

In [5]:
function SVM(in::Integer, out::Integer; alpha=0.05)
    W = Flux.glorot_uniform(out, in)
    b = Flux.glorot_uniform(out)
    
    return SVM(W, b, alpha)
end

SVM

In [6]:
function (svm::SVM)(x::Matrix; model = Model(() -> DiffOpt.diff_optimizer(HiGHS.Optimizer)))
    W, b, alpha = svm.weight, svm.bias, svm.alpha
    
    N, M = size(x)
    empty!(model)
    set_silent(model)
    
    @variable(model, y[1:10, 1:M]) # should be 10 x 1000
    
#     @variable(model, e[1:10, 1:M]) # slack variables  
#     @constraint(model, cons[i in 1:M], e[:, i] .== (y[:, i] - w*x[:, i] - b))

    # objective minimizing squared error and ridge penalty
    @objective(
        model,
        Min,
        dot((W*x.+b.-y), (W*x.+b.-y)) + alpha * dot(W, W),
    )
    
    optimize!(model)
    return value.(y)
end

In [276]:
# function svm(
#     x::Matrix;
#     model = Model(() -> DiffOpt.diff_optimizer(HiGHS.Optimizer)),
#     alpha = 0.01,
# )
#     N, M = size(x) # 50-1000
#     empty!(model)
#     set_silent(model)
    
#     @variable(model, w[1:10, 1:N]) # angular coefficient
#     @variable(model, b[1:10]) # linear coefficient
#     @variable(model, y[1:10, 1:M]) # should be 10 x 1000
    
# #     @variable(model, e[1:10, 1:M]) # slack variables  
# #     @constraint(model, cons[i in 1:M], e[:, i] .== (y[:, i] - w*x[:, i] - b))

#     # objective minimizing squared error and ridge penalty
#     @objective(
#         model,
#         Min,
#         dot((w*x.+b.-y), (w*x.+b.-y)) + alpha * dot(w, w),
#     )
    
#     optimize!(model)
#     return value.(y)
# end

svm (generic function with 1 method)

In [11]:
function ChainRulesCore.rrule(svm::typeof(SVM), x::Matrix{T}) where T
    model = Model(() -> DiffOpt.diff_optimizer(HiGHS.Optimizer))
    py = svm(x, model = model)
    
    function pullback_svm(dl_dy)
        # some value from the backpropagation (e.g., loss) is denoted by `l`
        # so `dl_dy` is the derivative of `l` wrt `y`
        y = model[:y]
        W, b = svm.weight, svm.bias
        N, M = size(x)
        
        MOI.set(
            model,
            DiffOpt.BackwardInVariablePrimal(),
            y,
            dl_dy
        )
        DiffOpt.backward(model)
        obj = MOI.get(
            model,
            DiffOpt.BackwardOutObjective(),
        )
        
        @info obj
        
        # dl_dx is needed by the previous dense layer
        return (ChainRulesCore.NoTangent(), dl_dx,)
    end
    
    return py, pullback_svm
end

In [12]:
m = Flux.Chain(
    Flux.Dense(784, 50, Flux.relu), #784 being image linear dimension (28 x 28)
#     Flux.Dense(50, 10), # 10 beinf the number of outcomes (0 to 9)
    SVM(50, 10),
)

Chain(
  Dense(784, 50, relu),                 [90m# 39_250 parameters[39m
  SVM{Matrix{Float32}, Vector{Float32}}(Float32[-0.20215495 0.16586693 … -0.08584728 0.15465824; 0.14348093 0.19293313 … 0.13694264 -0.05919899; … ; -0.09913504 0.30569953 … 0.24543075 0.030610574; 0.012736179 -0.2579372 … 0.02508468 0.24031003], Float32[-0.6149052, 0.040175453, 0.7128812, -0.30544832, 0.6230561, 0.14602259, 0.5525571, -0.5266073, -0.25275296, -0.053562045], 0.05),
)

In [13]:
@info size(m(train_X)) # should be same sizes
@info size(train_Y)

┌ Info: (10, 30)
└ @ Main In[13]:1
┌ Info: (10, 30)
└ @ Main In[13]:2


In [14]:
epochs = 5
dataset = repeated((train_X, train_Y), epochs)

custom_loss(x, y) = Flux.crossentropy(m(x), y)
opt = Flux.ADAM()
evalcb = () -> @show(custom_loss(train_X, train_Y))

@time Flux.train!(custom_loss, Flux.params(m), dataset, opt, cb = Flux.throttle(evalcb, 5));

LoadError: Compiling Tuple{typeof(MathOptInterface.add_variable), MathOptInterface.Utilities.CachingOptimizer{MathOptInterface.Bridges.LazyBridgeOptimizer{DiffOpt.Optimizer{MathOptInterface.Utilities.CachingOptimizer{MathOptInterface.Bridges.LazyBridgeOptimizer{HiGHS.Optimizer}, MathOptInterface.Utilities.UniversalFallback{MathOptInterface.Utilities.Model{Float64}}}}}, MathOptInterface.Utilities.UniversalFallback{MathOptInterface.Utilities.Model{Float64}}}}: try/catch is not supported.

In [167]:
accuracy(train_X, train_Y)