In [1]:
import SQLite
using Knet
import Blosc
import MsgPack



# Load data

In [2]:
db = SQLite.DB("games.sqlite")

SQLite.DB("games.sqlite")

In [3]:
positions = SQLite.query(db, "select positions.*, games.outcome from positions, games where positions.game_id=games.id order by id limit 10000");
positions[:, :state_tensor] = @. Array{Float32}(reshape(Blosc.decompress(UInt8, get(positions[:, :board_state])), 8, 4, 8));
positions[:, :moves_tensor] = @. Array{Float32}(reshape(Blosc.decompress(Float16, get(positions[:, :mcts_moves])), 8, 4, 4));

In [4]:
n = size(positions, 1)
x = cat(4, positions[1:n, :state_tensor]...);
y_moves = cat(4, positions[1:n, :moves_tensor]...);
y_outcome = cat(4, positions[1:n, :outcome]...);

# Model

In [10]:
function ConvLayer(n_filters, kernel_shape, input_shape; activation=identity, T=Float32)
    θ = [
        randn(T, kernel_shape[1], kernel_shape[2], input_shape[3], n_filters),
        randn(T, 1, 1, n_filters, 1)
    ]
    ϕ = (θ_, x) -> activation.( conv4(θ_[1], x, padding=div(kernel_shape[1], 2)) .+ θ_[2] )
    output_shape = size(ϕ(θ, zeros(T, input_shape..., 1)), 1, 2, 3)
    return Dict(
        :θ => θ,
        :ϕ => ϕ,
        :input_shape => input_shape,
        :output_shape => output_shape
    )
end

ConvLayer (generic function with 1 method)

In [118]:
function softmax(x, axes)
    exp_x = exp.(x .- maximum(x, axes))
    return exp_x ./ sum(exp_x, axes)
end

softmax (generic function with 1 method)

In [119]:
function SoftmaxLayer(axes, input_shape)
    θ = []
    ϕ = (θ_, x) -> softmax(x, axes)
    return Dict(
        :θ => θ,
        :ϕ => ϕ,
        :input_shape => input_shape,
        :output_shape => input_shape
    )    
end

SoftmaxLayer (generic function with 1 method)

In [23]:
function apply(θs, ϕs, x)
    for i in 1:length(θs)
        x = ϕs[i](θs[i], x)
    end
    return x
end

apply (generic function with 1 method)

In [121]:
model(layers) = [l[:θ] for l in layers], [l[:ϕ] for l in layers]

model (generic function with 1 method)

In [127]:
cl1 = ConvLayer(16, (3, 3), (8, 4, 8), activation=relu);
cl2 = ConvLayer(4, (3, 3), (8, 4, 16), activation=relu);
sm1 = SoftmaxLayer([1, 2, 3], (8, 4, 4));
θs, ϕs = model([cl1, cl2, sm1]);

In [131]:
loss(θs, ϕs, x, y) = mean(abs2, apply(θs, ϕs, x) .- y)

loss (generic function with 1 method)

In [132]:
loss(θs, ϕs, x, y_moves)

0.008126385f0

In [133]:
∇loss = grad(loss)

(::gradfun) (generic function with 1 method)

In [134]:
∇loss(θs, ϕs, x, y_moves)

3-element Array{Any,1}:
 Any[Float32[-0.000127309 0.00014844 0.000198423; 7.84476f-5 0.000181421 2.60013f-5; -1.55478f-6 -1.55466f-6 -8.49236f-7]

Float32[0.0 0.0 0.0; -1.73102f-12 -1.73102f-12 -1.73102f-12; -0.000246022 -0.000211334 -0.000211334]

Float32[0.0 0.0 0.0; 0.0 0.0 0.0; 0.0 0.0 0.0]

Float32[0.0 0.0 0.0; 0.0 0.0 0.0; 0.0 0.0 0.0]

Float32[-0.000246022 -0.000289058 -0.000340393; -0.000451545 -0.000321805 -0.000167851; -0.00012552 7.25054f-5 7.03342f-5]

Float32[1.23196f-6 1.23196f-6 -1.20404f-7; 1.46577f-6 1.46577f-6 -7.46328f-14; 1.46577f-6 1.46577f-6 -7.46328f-14]

Float32[0.0 0.0 0.0; 0.0 0.0 0.0; 0.0 0.0 0.0]

Float32[-0.000374563 -0.000141849 -0.000141849; -0.000374563 -0.000141849 -0.000141849; -0.000374563 -0.000141849 -0.000141849]

Float32[0.000133302 8.31798f-5 -0.000315019; 6.74224f-5 0.000134769 9.48618f-7; -1.11804f-6 -1.11796f-6 -1.21584f-6]

Float32[9.26293f-12 8.27723f-12 8.27723f-12; 0.000120424 0.000216 0.000216; 2.20555f-5 6.47465f-5 6.50661f-5]

Float32[0

In [48]:
?update!

search: [1mu[22m[1mp[22m[1md[22m[1ma[22m[1mt[22m[1me[22m[1m![22m



```
update!(weights, gradients, params)
update!(weights, gradients; lr=0.001, gclip=0)
```

Update the `weights` using their `gradients` and the optimization algorithm parameters specified by `params`.  The 2-arg version defaults to the [`Sgd`](@ref) algorithm with learning rate `lr` and gradient clip `gclip`.  `gclip==0` indicates no clipping. The `weights` and possibly `gradients` and `params` are modified in-place.

`weights` can be an individual numeric array or a collection of arrays represented by an iterator or dictionary.  In the individual case, `gradients` should be a similar numeric array of `size(weights)` and `params` should be a single object.  In the collection case, each individual weight array should have a corresponding params object. This way different weight arrays can have their own optimization state, different learning rates, or even different optimization algorithms running in parallel.  In the iterator case, `gradients` and `params` should be iterators of the same length as `weights` with corresponding elements.  In the dictionary case, `gradients` and `params` should be dictionaries with the same keys as `weights`.

Individual optimization parameters can be one of the following types. The keyword arguments for each type's constructor and their default values are listed as well.

  * [`Sgd`](@ref)`(;lr=0.001, gclip=0)`
  * [`Momentum`](@ref)`(;lr=0.001, gclip=0, gamma=0.9)`
  * [`Nesterov`](@ref)`(;lr=0.001, gclip=0, gamma=0.9)`
  * [`Rmsprop`](@ref)`(;lr=0.001, gclip=0, rho=0.9, eps=1e-6)`
  * [`Adagrad`](@ref)`(;lr=0.1, gclip=0, eps=1e-6)`
  * [`Adadelta`](@ref)`(;lr=0.01, gclip=0, rho=0.9, eps=1e-6)`
  * [`Adam`](@ref)`(;lr=0.001, gclip=0, beta1=0.9, beta2=0.999, eps=1e-8)`

# Example:

```
w = rand(d)                 # an individual weight array
g = lossgradient(w)         # gradient g has the same shape as w
update!(w, g)               # update w in-place with Sgd()
update!(w, g; lr=0.1)       # update w in-place with Sgd(lr=0.1)
update!(w, g, Sgd(lr=0.1))  # update w in-place with Sgd(lr=0.1)

w = (rand(d1), rand(d2))    # a tuple of weight arrays
g = lossgradient2(w)        # g will also be a tuple
p = (Adam(), Sgd())         # p has params for each w[i]
update!(w, g, p)            # update each w[i] in-place with g[i],p[i]

w = Any[rand(d1), rand(d2)] # any iterator can be used
g = lossgradient3(w)        # g will be similar to w
p = Any[Adam(), Sgd()]      # p should be an iterator of same length
update!(w, g, p)            # update each w[i] in-place with g[i],p[i]

w = Dict(:a => rand(d1), :b => rand(d2)) # dictionaries can be used
g = lossgradient4(w)
p = Dict(:a => Adam(), :b => Sgd())
update!(w, g, p)
```


In [87]:
opt = Sgd(lr=0.001)
for i in 1:100
    li = loss(θs, ϕs, x, y_moves)
    update!(θs, ∇loss(θs, ϕs, x, y_moves), [[opt, opt], [opt, opt]])
    println("Epoch ", i, ". Loss ", li)
end

Epoch 1. Loss 0.005944858
Epoch 2. Loss 0.0059297327
Epoch 3. Loss 0.005914651
Epoch 4. Loss 0.0058996016
Epoch 5. Loss 0.005884637
Epoch 6. Loss 0.0058696875
Epoch 7. Loss 0.005854796
Epoch 8. Loss 0.00583995
Epoch 9. Loss 0.00582515
Epoch 10. Loss 0.005810396
Epoch 11. Loss 0.005795692
Epoch 12. Loss 0.0057810247
Epoch 13. Loss 0.00576642
Epoch 14. Loss 0.005751835
Epoch 15. Loss 0.0057373317
Epoch 16. Loss 0.0057228506
Epoch 17. Loss 0.0057084295
Epoch 18. Loss 0.0056940494
Epoch 19. Loss 0.0056797173
Epoch 20. Loss 0.0056654247
Epoch 21. Loss 0.0056511727
Epoch 22. Loss 0.005636971
Epoch 23. Loss 0.0056228214
Epoch 24. Loss 0.0056086984
Epoch 25. Loss 0.005594645
Epoch 26. Loss 0.0055806153
Epoch 27. Loss 0.005566643
Epoch 28. Loss 0.005552715
Epoch 29. Loss 0.005538832
Epoch 30. Loss 0.0055249673
Epoch 31. Loss 0.0055112005
Epoch 32. Loss 0.0054974146
Epoch 33. Loss 0.0054837065
Epoch 34. Loss 0.0054700403
Epoch 35. Loss 0.0054564066
Epoch 36. Loss 0.0054428224
Epoch 37. Loss 0.00

In [155]:
loss(layer, fun, x) = sum(fun(layer[:params], x))

loss (generic function with 2 methods)

In [156]:
loss(cl1, cl1[:transfer_function], x)

123.69244f0

In [157]:
∇loss = grad(loss)

(::gradfun) (generic function with 1 method)

In [158]:
∇loss(cl1, cl1[:transfer_function], x)

Dict{Symbol,Any} with 1 entry:
  :params => Dict(:bias=>Float32[63.0],:weights=>Float32[11.0 18.0 11.0; 16.0 2…

In [49]:
function bloss(params, ϕs, x)
    for pf in zip(params, ϕs)
        params, ϕ = pf
        x = ϕ(params, x)
    end
    return sum(abs2(.x))
end

LoadError: [91msyntax: invalid identifier name "."[39m

In [77]:
bloss([1], [(p, x) -> x], [12.12])

12.12

In [75]:
update!([l[:params] for l in layers], ∇blossc(layers, x))
bloss([l[:params] for l in layers], [l[:transfer_function] for l in layers], x)

-5.2048396f11

In [51]:
∇bloss = grad(bloss)

(::gradfun) (generic function with 1 method)

In [52]:
∇bloss([l[:params] for l in layers], [l[:transfer_function] for l in layers], x)

2-element Array{Any,1}:
 Any[Float32[-28.6034 -40.2424 -37.4153; -20.0241 -28.6625 -30.064; -13.6265 -19.3087 -17.5113]

Float32[-27.2881 -36.007 -29.9811; -49.1222 -57.5985 -42.5337; -49.88 -64.661 -40.6141]

Float32[0.0 0.0 0.0; 0.0 0.0 0.0; 0.0 0.0 0.0]

Float32[0.0 0.0 0.0; 0.0 0.0 0.0; 0.0 0.0 0.0]

Float32[-37.4564 -46.5109 -31.2603; -31.4305 -46.4542 -34.2165; -22.2058 -28.1182 -28.0615]

Float32[-90.3349 -116.791 -95.7004; -97.5639 -126.746 -103.858; -82.6993 -106.119 -83.2307]

Float32[0.0 0.0 0.0; 0.0 0.0 0.0; 0.0 0.0 0.0]

Float32[-67.9433 -82.5746 -61.4839; -67.9433 -82.5746 -61.4839; -53.0788 -61.9474 -40.8567], Float32[-132.715]]
 Any[Float32[88.1307 107.896 83.9332; 99.9036 119.669 92.8819; 94.7341 113.877 88.0112]

Float32[88.1307 107.896 83.9332; 99.9036 119.669 92.8819; 94.7341 113.877 88.0112], Float32[96.0]

Float32[96.0]]                                                                                                                                                  

In [53]:
∇blossc(layers, x) = ∇bloss([l[:params] for l in layers], [l[:transfer_function] for l in layers], x)

∇blossc (generic function with 1 method)

In [54]:
g = ∇blossc(layers, x)

2-element Array{Any,1}:
 Any[Float32[-28.6034 -40.2424 -37.4153; -20.0241 -28.6625 -30.064; -13.6265 -19.3087 -17.5113]

Float32[-27.2881 -36.007 -29.9811; -49.1222 -57.5985 -42.5337; -49.88 -64.661 -40.6141]

Float32[0.0 0.0 0.0; 0.0 0.0 0.0; 0.0 0.0 0.0]

Float32[0.0 0.0 0.0; 0.0 0.0 0.0; 0.0 0.0 0.0]

Float32[-37.4564 -46.5109 -31.2603; -31.4305 -46.4542 -34.2165; -22.2058 -28.1182 -28.0615]

Float32[-90.3349 -116.791 -95.7004; -97.5639 -126.746 -103.858; -82.6993 -106.119 -83.2307]

Float32[0.0 0.0 0.0; 0.0 0.0 0.0; 0.0 0.0 0.0]

Float32[-67.9433 -82.5746 -61.4839; -67.9433 -82.5746 -61.4839; -53.0788 -61.9474 -40.8567], Float32[-132.715]]
 Any[Float32[88.1307 107.896 83.9332; 99.9036 119.669 92.8819; 94.7341 113.877 88.0112]

Float32[88.1307 107.896 83.9332; 99.9036 119.669 92.8819; 94.7341 113.877 88.0112], Float32[96.0]

Float32[96.0]]                                                                                                                                                  

In [43]:
update!([l[:params] for l in layers], g)

In [197]:
[l[:params] for l in layers] .+ g

LoadError: [91mMethodError: no method matching +(::Dict{Symbol,Array{Float32,4}}, ::Dict{Symbol,Any})[0m
Closest candidates are:
  +(::Any, ::Any, [91m::Any[39m, [91m::Any...[39m) at operators.jl:424
  +([91m::Type{AutoGrad.Grad{1}}[39m, ::Any, [91m::Any[39m, [91m::AutoGrad.Rec{##817}[39m) where ##817 at :0
  +([91m::Type{AutoGrad.Grad{1}}[39m, ::Any, [91m::Any[39m, [91m::AutoGrad.Rec{##835}[39m, [91m::AutoGrad.Rec{##836}[39m) where {##835, ##836} at :0
  ...[39m

## Functional model

In [5]:
function predict(w, x)
    n = 100
    for i in 1:n
        x = cat(3, x, relu.( conv4(w[i][1], x, padding=1) .+ w[i][2] ))
    end
    x = relu.( conv4(w[n+1][1], x, padding=1) .+ w[n+1][2] )
    x = reshape(x, 8*4*4, size(x, 4))
    return softmax(x, 1)
end

predict (generic function with 1 method)

In [6]:
nc(i) = 8 + 4(i - 1)
w = vcat(
    [(xavier(Float32, 3, 3, nc(i), 4), xavier(Float32, 1, 1, 4, 1)) for i in 1:100],
    [(xavier(Float32, 3, 3, nc(101), 4), xavier(Float32, 1, 1, 4, 1))]
);

In [7]:
function softmax(x, axes)
    exp_x = exp.(x .- maximum(x, axes))
    return exp_x ./ sum(exp_x, axes)
end

softmax (generic function with 1 method)

In [8]:
function kl_divergence(p, q)
    ppe = p .+ 0.0001f0
    return -sum(ppe .* log.(q ./ ppe), 1)
end

kl_divergence (generic function with 1 method)

In [9]:
loss(w, x, y) = mean(kl_divergence(reshape(y, 8*4*4, size(y, 4)), predict(w, x)))

loss (generic function with 1 method)

In [10]:
∇loss = grad(loss)

(::gradfun) (generic function with 1 method)

In [None]:
@time ∇loss(w, x, y_moves);

In [None]:
@time loss(w, x, y_moves)

In [11]:
opt = optimizers(w, Sgd, lr=0.001);

In [12]:
for epoch in 1:100
    for batch in minibatch(x, y_moves, 4, shuffle=true, partial=true)
        mini_x, mini_y = batch
        update!(w, ∇loss(w, mini_x, mini_y), opt)
        println("Batch loss $(loss(w, mini_x, mini_y))")
    end
    println("Epoch $epoch, loss $(loss(w, x, y_moves))")
end

Batch loss 3.5833597
Batch loss 8.052206
Batch loss 2.891875
Batch loss 4.122321
Batch loss 3.2124307
Batch loss 3.4684405
Batch loss 3.2266831
Batch loss 3.465651
Batch loss 3.8753717
Batch loss 3.4895737
Batch loss 2.9162836
Batch loss 3.3147407
Batch loss 2.9968133


LoadError: [91mInterruptException:[39m

In [14]:
reshape(y_moves, 8*4*4, :)

128×1598 Array{Float32,2}:
 0.0  0.0  0.0      0.0       0.0       …  0.0  0.0       0.0       0.0  0.0
 0.0  0.0  0.0      0.0       0.0          0.0  0.0       0.0       0.0  0.0
 0.0  0.0  0.0      0.0       0.0          0.0  0.0       0.0       0.0  0.0
 0.0  0.0  0.0      0.0       0.0          0.0  0.0       0.0       0.0  0.0
 0.0  0.0  0.14563  0.0       0.108337     0.0  0.265381  0.0       0.0  0.0
 0.0  0.0  0.0      0.0       0.0       …  0.0  0.0       0.0       0.0  0.0
 0.0  0.0  0.0      0.0       0.0          0.0  0.0       0.0       0.0  0.0
 0.0  0.0  0.0      0.0       0.0          0.0  0.0       0.0       0.0  0.0
 0.0  0.0  0.0      0.0       0.0          0.0  0.0       0.0       0.0  0.0
 0.0  0.0  0.0      0.0       0.0          0.0  0.0       0.0       0.0  0.0
 0.0  0.0  0.0      0.0       0.0       …  0.0  0.0       0.0       0.0  0.0
 0.0  0.0  0.0      0.0       0.0          0.0  0.0       0.0       0.0  0.0
 0.0  0.0  0.0      0.0       0.0          0.0  0

In [15]:
predict(w, x)

LoadError: [91mUndefVarError: softmax not defined[39m

In [None]:
loss(w, x, y_moves)

LoadError: [91mUndefVarError: softmax not defined[39m