In [1]:
using Distributed
using Dates
using SharedArrays
#using JET
addprocs(4)

4-element Vector{Int64}:
 2
 3
 4
 5

In [2]:
@everywhere include("AZP_env.jl")
@everywhere include("AZP_agt.jl")
@everywhere include("AZP_mcts_valMAX.jl")

In [3]:
# Replay buffer of self-play games used as the training data source.
mutable struct ReplayBuffer
    buffer::Vector{Agent}   # stored games; save_game! pushes to the back and pops from the front
    buffer_size::Int        # size threshold that save_game! compares against before evicting
    batch_size::Int         # NOTE(review): sample_batch! reads env.batch_size, not this field
    #count::Int
end

"""
    init_buffer(buffer_size::Int, batch_size::Int)

Return a `ReplayBuffer` that holds no games yet, configured with the given
`buffer_size` and `batch_size`.
"""
function init_buffer(buffer_size::Int, batch_size::Int)
    games = Agent[]
    return ReplayBuffer(games, buffer_size, batch_size)
end

"""
    save_game!(buffer::ReplayBuffer, agt::Agent)

Append `agt` to the replay buffer, evicting the oldest games first so that the
buffer never holds more than `buffer.buffer_size` games after the call (a
non-positive `buffer_size` still keeps the single most recent game).

Returns the underlying game vector (the result of `push!`).
"""
function save_game!(buffer::ReplayBuffer, agt::Agent)
    games = buffer.buffer
    # Evict *before* pushing and compare with `>=`: the previous `>` check let the
    # buffer grow to `buffer_size + 1` entries before anything was dropped.
    while !isempty(games) && length(games) >= buffer.buffer_size
        popfirst!(games)
    end
    return push!(games, agt)
end

save_game! (generic function with 1 method)

In [4]:
# Holds the trained networks plus the best score recorded for each action history.
mutable struct Storage
    storage::Dict{Int, Chain}            # trained models, keyed by the b_num index used in train_model!
    random_out::Chain                    # fallback model returned by latest_model while `storage` is empty
    scores::Dict{Vector{Int}, Float32}   # history prefix => score; sample_batch! keeps the maximum seen
end

"""
    init_storage(env)

Create a `Storage` with no trained models and no recorded scores.

The fallback model is a single `Dense` layer built from an all-zero `Float32`
weight matrix of size `env.output × env.input_dim`.
"""
function init_storage(env)
    fallback = Chain(Dense(zeros(Float32, env.output, env.input_dim)))
    return Storage(Dict{Int, Chain}(), fallback, Dict{Vector{Int}, Float32}())
end

"""
    latest_model(storage::Storage)

Return a model to use for self-play.

If no model has been stored yet, the fallback `storage.random_out` is returned.
Otherwise one of the stored models is picked uniformly at random by key
(despite the name, this is not necessarily the most recently trained one).
"""
function latest_model(storage::Storage)
    models = storage.storage
    isempty(models) && return storage.random_out
    return models[rand(keys(models))]
end

latest_model (generic function with 1 method)

In [5]:
"""
    WeightSample(hist::Vector{Int})

Sample a turn index from `1:length(hist)` with probability proportional to the
index itself, so later turns are drawn more often than early ones.
"""
function WeightSample(hist::Vector{Int})
    turns = collect(1:length(hist))
    probs = turns / sum(turns)
    return sample(turns, ProbabilityWeights(probs))
end

# CPU parallelization planned.
"""
    sample_batch!(env, buffer, scores) -> (images, targets)

Build one training batch from the replay buffer and update `scores` in place.

Games are drawn with replacement, weighted by history length, and one turn per
game is chosen with `WeightSample`. Columns `1:env.batch_size` of the returned
matrices hold `make_image` / `make_target` for those (game, turn) pairs.

When `scores` is non-empty on entry, `3 * env.batch_size` further columns are
appended: each is the image of a randomly chosen recorded history, and only the
last row of its target column is set (to that history's recorded score); the
other target rows stay zero.

Afterwards, for every sampled game, each prefix of its history is recorded in
`scores` with the maximum of the previously recorded value (if any) and the
last-row target value of that game's column.

# Arguments
- `env::Env`: supplies `batch_size`, `input_dim` and `output`.
- `buffer::ReplayBuffer`: source of games (`buffer.buffer`).
- `scores::Dict{Vector{Int}, Float32}`: history => best score; mutated.

# Returns
A tuple `(images, targets)` of plain arrays backing the shared arrays
(`Int` images of height `env.input_dim`, `Float32` targets of height `env.output`).
"""
function sample_batch!(env::Env, buffer::ReplayBuffer, scores::Dict{Vector{Int}, Float32})
    games = sample(buffer.buffer, weights([length(agt.history) for agt in buffer.buffer]), env.batch_size, replace=true)
    g_turn = [(g, WeightSample(g.history)) for g in games]

    # Extra "replay" columns are only produced once some scores have been recorded.
    replay = !isempty(scores)
    ncols = replay ? 4env.batch_size : env.batch_size
    imag = SharedArray(zeros(Int, env.input_dim, ncols))
    target = SharedArray(zeros(Float32, env.output, ncols))

    # Columns for the sampled (game, turn) pairs, filled in parallel on the workers.
    @sync @distributed for it in 1:env.batch_size
        g, turn = g_turn[it]
        imag[:,it] = make_image(env, g, turn)
        target[:,it] = make_target(env, g, scores, turn)
    end

    if replay
        # Columns for randomly re-drawn recorded histories; only the value row is set.
        @sync @distributed for it in 1:3env.batch_size
            hist = rand(keys(scores))
            imag[:,env.batch_size+it] = make_image(env, hist)
            target[end,env.batch_size+it] = scores[hist]
        end
    end

    # Record the best value seen so far for every prefix of each sampled game.
    tar_data = sdata(target)
    for it in 1:env.batch_size
        g, _ = g_turn[it]
        value = tar_data[end,it]
        for l in eachindex(g.history)
            his = g.history[1:l]
            scores[his] = max(get(scores, his, value), value)
        end
    end
    return sdata(imag), tar_data
end

sample_batch! (generic function with 1 method)

In [6]:
# Play this worker's share of self-play games and return them as a Vector{Agent}.
# The share is div(env.num_player, nworkers()); any remainder of that division is
# not played. Each game is produced by play_physics!(env, model, ratio, noise_r).
@everywhere function run_selfplay_worker(env::Env, model::Chain, ratio::Float32, noise_r::Float32)
    n_games = div(env.num_player, nworkers())
    return Agent[play_physics!(env, model, ratio, noise_r) for _ in 1:n_games]
end

"""
    run_selfplay_pal(env, buffer, storage, ratio, noise_r)

Run self-play in parallel: start `run_selfplay_worker` on every worker process
with the model chosen by `latest_model(storage)` (moved with `gpu`), then fetch
the results in worker order and store every returned game in `buffer` via
`save_game!`. Returns `nothing`.
"""
function run_selfplay_pal(env::Env, buffer::ReplayBuffer, storage::Storage, ratio::Float32, noise_r::Float32)
    model = gpu(latest_model(storage))
    # Launch all remote calls first so the workers run concurrently.
    pending = Future[remotecall(run_selfplay_worker, pid, env, model, ratio, noise_r) for pid in workers()]
    for fut in pending, game in fetch(fut)
        save_game!(buffer, game)
    end
    return nothing
end

run_selfplay_pal (generic function with 1 method)

In [7]:
"""
    loss(image, target, env, model)

Mean training loss over the first `env.batch_size` columns.

For each column the loss is the squared error between the last output row and
the last target row (value term) plus the cross-entropy between the remaining
target rows and the softmax of the remaining output rows (policy term). The
model output is moved to the CPU before the loss is evaluated.

NOTE(review): only columns `1:env.batch_size` are summed, so the additional
columns that `sample_batch!` appends when scores exist do not enter this sum —
confirm this is intended.
"""
function loss(image::CuArray{Int, 2}, target::Matrix{Float32}, env::Env, model::Chain)
    pred = cpu(model(image))
    n = env.batch_size
    per_sample = [((pred[end,j] - target[end,j])^2 - target[1:end-1,j]' * log.(softmax(pred[1:end-1,j]))) for j in 1:n]
    return sum(per_sample) / n
    # An explicit L2 term (env.C * sum(sqnorm, Flux.params(model))) is left out here;
    # train_model! puts WeightDecay(env.C) in the optimiser instead.
end

"""
    loss_check(image, target, env, model) -> (val, pol)

Diagnostic variant of `loss` that reports the two loss components separately,
each averaged over the first `env.batch_size` columns:

- `val`: squared error between the last output row and the last target row.
- `pol`: cross-entropy between the remaining target rows and the softmax of
  the remaining output rows.
"""
function loss_check(image::CuArray{Int, 2}, target::Matrix{Float32}, env::Env, model::Chain)
    pred = cpu(model(image))
    n = env.batch_size
    sq_errs = [((pred[end,j] - target[end,j])^2) for j in 1:n]
    cross_ents = [(-target[1:end-1,j]' * log.(softmax(pred[1:end-1,j]))) for j in 1:n]
    return sum(sq_errs) / n, sum(cross_ents) / n
    # No L2 term is included here (the optimiser in train_model! carries WeightDecay).
end

# Scaled tanh activations, defined on every process.
# tanh10 squashes its input into (-10, 10); tanh2 squashes into (-4, 4)
# (note: the scale of tanh2 is 4, not 2).
@everywhere tanh10(x) = 10.0f0 * tanh(x / 10)
@everywhere tanh2(x) = 4.0f0 * tanh(x / 4)

# GPU parallelization planned.
#
# Train (or continue training) one model per index in 1:env.batch_num and store
# each result in storage.storage[b_num].
#
# For every index, an existing stored model is reused if present; otherwise a new
# network is built. Each of the env.training_step iterations draws a fresh batch
# with sample_batch! (which also updates storage.scores), computes the loss and
# its gradient, and applies one optimiser update.
#
# Returns a Float32 matrix of size env.batch_num × env.training_step holding the
# loss value of every training iteration.
function train_model!(env::Env, buffer::ReplayBuffer, storage::Storage)
    #ll = zeros(Float32, env.batch_num)
    ll = zeros(Float32, env.batch_num, env.training_step)
    for b_num in 1:env.batch_num
        if(haskey(storage.storage, b_num))
            # Continue training the model already stored under this index.
            model = storage.storage[b_num] |> gpu
        else
            # No model yet: build a fresh network. The active definition is an input
            # Dense + BatchNorm, env.depth blocks that add their input back via
            # Parallel(+, ..., identity), and two heads concatenated with vcat: one with
            # env.act_ind outputs (tanh2) and one with a single output (tanh10).
            # The commented-out lines are earlier architecture variants.
            #model = Chain(Dense(env.input_dim, env.middle_dim), Tuple(Chain(Parallel(+, Chain(BatchNorm(env.middle_dim), Dense(env.middle_dim, env.middle_dim, relu)),Dense(env.middle_dim, env.middle_dim, relu)), identity) for i in 1:env.depth)..., Flux.flatten, Flux.Parallel(vcat, Chain(Dense(env.middle_dim, div(env.middle_dim,4), relu), Dense(div(env.middle_dim,4), env.act_ind, tanh2)), Chain(Dense(env.middle_dim, div(env.middle_dim,4), relu), Dense(div(env.middle_dim,4), 1, tanh10)))) |> gpu
            model = Chain(Dense(env.input_dim, env.middle_dim), BatchNorm(env.middle_dim), Tuple(Chain(Parallel(+, Chain(BatchNorm(env.middle_dim), Dense(env.middle_dim, env.middle_dim, relu),Dense(env.middle_dim, env.middle_dim, relu)), identity)) for i in 1:env.depth)..., Flux.flatten, Flux.Parallel(vcat, Chain(BatchNorm(env.middle_dim), Dense(env.middle_dim, env.middle_dim, relu),Dense(env.middle_dim, env.middle_dim, relu), Dense(env.middle_dim, env.act_ind, tanh2)), Chain(BatchNorm(env.middle_dim), Dense(env.middle_dim, div(env.middle_dim,2), relu), Dense(div(env.middle_dim,2), 1, tanh10)))) |> gpu
            #model = Chain(Dense(env.input_dim, env.middle_dim), Tuple(Chain(Parallel(+, Chain(BatchNorm(env.middle_dim), Dense(env.middle_dim, env.middle_dim, relu)),Dense(env.middle_dim, env.middle_dim, relu)), identity) for i in 1:env.depth)..., Flux.flatten, Flux.Parallel(vcat, Chain(Dense(env.middle_dim, env.middle_dim, relu), Dense(env.middle_dim, env.act_ind, tanh2)), Chain(Dense(env.middle_dim, env.middle_dim, relu), Dense(env.middle_dim, 1, tanh10)))) |> gpu
        end
        # Adam with a fixed 1f-5 step size, preceded by weight decay with coefficient env.C.
        opt = Flux.Optimiser(WeightDecay(env.C), Adam(1f-5))
        #ParameterSchedulers.Scheduler(env.scheduler, Momentum())
        for it in 1:env.training_step
            if(it%(env.checkpoint_interval)==0)
                # Replace the optimiser with a new instance every env.checkpoint_interval
                # iterations (presumably to discard accumulated Adam state — confirm).
                opt = Flux.Optimiser(WeightDecay(env.C), Adam(1f-5))
            end
            # Note: sample_batch! also mutates storage.scores.
            image_batch, target_batch = sample_batch!(env, buffer, storage.scores)
            val, grads = Flux.withgradient(Flux.params(model)) do
                loss(cu(image_batch),target_batch,env,model)
            end
            Flux.Optimise.update!(opt, Flux.params(model), grads)
            ll[b_num, it] = val
            if(it > env.training_step-6)
                # For the final six iterations, print the value and policy loss components.
                # `val` is rebound here, after the total loss has already been stored in ll.
                val, pol = loss_check(cu(image_batch),target_batch,env,model)
                println("val=$(val), pol=$(pol)")
            end
        end
        storage.storage[b_num] = model
    end
    return ll
end

train_model! (generic function with 1 method)

In [8]:
"""
    dict_copy(orig::Dict{Vector{Int}, Float32})

Return a new `Dict{String, Float32}` with the same values as `orig`, where each
vector key is replaced by its printed form (e.g. `[1, 2]` becomes `"[1, 2]"`).
`orig` is not modified.
"""
function dict_copy(orig::Dict{Vector{Int}, Float32})
    return Dict{String, Float32}("$(hist)" => score for (hist, score) in orig)
end

dict_copy (generic function with 1 method)

In [9]:
# Build the experiment configuration from a positional list of string-encoded settings.
# Per the values echoed by init_Env, the first ten entries are max_turn, num_player,
# middle_dim, depth, training_step, batch_size, batch_num, num_simulation, α and frac.
# The meaning of the remaining entries is not visible here (presumably defined in
# AZP_env.jl — confirm there).
env0 = init_Env(["8", "12", "128", "12", "1000", "12", "1", "600", "0.3", "0.25", "50", "4", "15.0", "0.4", "1.0", "0.7", "0.5", "120", "0.75", "0.000001"])

max_turn:  8
num_player:  12
middle_dim:  128
depth:  12
training_step:  1000
batch_size:  12
batch_num:  1
num_simulation:  600
α:  0.3
frac:  0.25


Env(8, 12, 2, 3, 1, 6, 48, 128, 7, 12, 1000, 1000, 12, 1, 0.0001f0, 0.9f0, 600, 0.3f0, 0.25f0, 50, 4, 15.0f0, 0.4f0, 1.0f0, 0.7f0, 0.5f0, ComplexF32[-2.0f0 + 0.0f0im 0.0f0 + 0.0f0im 0.0f0 + 0.0f0im -0.7f0 + 0.0f0im; 0.0f0 - 0.0f0im 1.0f0 + 0.0f0im -0.7f0 + 0.0f0im 0.0f0 + 0.0f0im; 0.0f0 - 0.0f0im -0.7f0 - 0.0f0im 1.0f0 + 0.0f0im 0.0f0 + 0.0f0im; -0.7f0 - 0.0f0im 0.0f0 - 0.0f0im 0.0f0 - 0.0f0im 0.0f0 + 0.0f0im], ComplexF32[0.0f0 + 0.0f0im -0.4f0 + 0.0f0im -0.4f0 + 0.0f0im 0.0f0 + 0.0f0im; -0.4f0 - 0.0f0im 0.0f0 + 0.0f0im 0.0f0 + 0.0f0im -0.4f0 + 0.0f0im; -0.4f0 - 0.0f0im 0.0f0 - 0.0f0im 0.0f0 + 0.0f0im -0.4f0 + 0.0f0im; 0.0f0 - 0.0f0im -0.4f0 - 0.0f0im -0.4f0 - 0.0f0im 0.0f0 + 0.0f0im], 0.00837758f0, 120, 0.75f0, 1.0f-6)

In [10]:
# Empty model/score storage; until a model is trained, latest_model returns storage0.random_out.
storage0 = init_storage(env0)

Storage(Dict{Int64, Chain}(), Chain(Dense(48 => 7)), Dict{Vector{Int64}, Float32}())

In [11]:
# Replay buffer with buffer_size = 1000 and batch_size taken from the environment.
replay_buffer0 = init_buffer(1000, env0.batch_size)

ReplayBuffer(Agent[], 1000, 12)

In [12]:
# Float32 value passed as the `ratio` argument of run_selfplay_pal.
ratio = Float32(5.0)

5.0f0

In [13]:
# Generate self-play games on the worker processes and store them in replay_buffer0
# (noise_r = 1.0f0).
run_selfplay_pal(env0, replay_buffer0, storage0, ratio, 1.0f0)

In [None]:
# Train on batches sampled from replay_buffer0; the result is the per-iteration loss matrix.
# Training also fills storage0.scores and storage0.storage.
train_model!(env0, replay_buffer0, storage0)

In [49]:
# Print the full history => score dictionary recorded so far.
@show storage0.scores

storage0.scores = Dict{Vector{Int64}, Float32}([5, 4, 6, 3, 2, 2, 2, 2] => 4.636553, [6, 3, 5, 2, 3, 1] => 7.867267, [5, 1, 5, 1] => 4.636553, [6, 5, 3, 5, 1, 2] => 7.867267, [6, 3, 5, 2, 3] => 7.867267, [6, 5, 3, 5, 1, 2, 1] => 7.867267, [6, 3] => 7.867267, [5, 4, 6] => 4.636553, [6, 5, 3] => 7.867267, [6, 5, 3, 5, 1, 2, 1, 2] => 7.867267, [6, 3, 5, 2] => 7.867267, [5, 4] => 4.636553, [6, 3, 6, 2] => 7.867267, [6, 3, 5, 2, 3, 1, 2, 1] => 7.867267, [6, 3, 5] => 7.867267, [5, 4, 6, 3] => 4.636553, [5, 4, 6, 3, 2, 2] => 4.636553, [2] => -0.8215542, [6, 3, 6, 2, 6] => 7.867267, [6, 3, 6, 2, 6, 4] => 7.867267, [5, 1, 5] => 4.636553, [6, 2] => 7.867267, [6, 5, 3, 5, 1] => 7.867267, [5] => 4.636553, [6, 3, 6] => 7.867267, [6, 5] => 7.867267, [5, 1] => 4.636553, [5, 4, 6, 3, 2, 2, 2] => 4.636553, [5, 4, 6, 3, 2] => 4.636553, [6, 3, 6, 2, 6, 4, 1] => 7.867267, [6, 3, 5, 2, 3, 1, 2] => 7.867267, [1] => 4.6365533, [6] => 7.867267, [6, 3, 6, 2, 6, 4, 1, 2] => 7.867267, [6, 5, 3, 5] => 7.867267, [

Dict{Vector{Int64}, Float32} with 38 entries:
  [5, 4, 6, 3, 2, 2, 2, 2] => 4.63655
  [6, 3, 5, 2, 3, 1]       => 7.86727
  [5, 1, 5, 1]             => 4.63655
  [6, 5, 3, 5, 1, 2]       => 7.86727
  [6, 3, 5, 2, 3]          => 7.86727
  [6, 5, 3, 5, 1, 2, 1]    => 7.86727
  [6, 3]                   => 7.86727
  [5, 4, 6]                => 4.63655
  [6, 5, 3]                => 7.86727
  [6, 5, 3, 5, 1, 2, 1, 2] => 7.86727
  [6, 3, 5, 2]             => 7.86727
  [5, 4]                   => 4.63655
  [6, 3, 6, 2]             => 7.86727
  [6, 3, 5, 2, 3, 1, 2, 1] => 7.86727
  [6, 3, 5]                => 7.86727
  [5, 4, 6, 3]             => 4.63655
  [5, 4, 6, 3, 2, 2]       => 4.63655
  [2]                      => -0.821554
  [6, 3, 6, 2, 6]          => 7.86727
  ⋮                        => ⋮

In [22]:
# findmax on a Dict returns (largest value, a key holding that value).
# NOTE(review): string_score is assigned only in the following cell, so that cell must
# have been run before this one.
findmax(string_score)

(4.636553f0, "[5, 4, 6, 3, 2, 2, 2]")

In [41]:
# List every history whose recorded score equals the overall maximum.
string_score = dict_copy(storage0.scores)
k = collect(keys(string_score))
# Compute the maximum once instead of re-running findmax inside the predicate for every
# key; `init` keeps the empty-dictionary case from throwing (nothing is printed then).
best_score = maximum(values(string_score); init=-Inf32)
inds = findall(s -> string_score[s] == best_score, k)
println("max score:")
for i in inds
    println("$(k[i]), $(string_score[k[i]])")
end

max score:
[5, 4, 6, 3, 2, 2, 2], 4.636553
[5, 4], 4.636553
[5, 4, 6, 3], 4.636553
[5, 4, 6, 3, 2, 2], 4.636553
[5, 4, 6, 3, 2], 4.636553
[5], 4.636553
[5, 4, 6], 4.636553
[5, 4, 6, 3, 2, 2, 2, 2], 4.636553


In [50]:
# Evaluate calc_score (defined outside this section) on one 8-action history.
calc_score([6, 5, 3, 5, 1, 2, 1, 2],env0)

-1.0543102f0

In [57]:
# Evaluate calc_score_his (defined outside this section) on the same history, this time
# also passing the recorded scores.
# NOTE(review): the final argument 8 equals the history length — presumably a turn index; confirm.
calc_score_his([6, 5, 3, 5, 1, 2, 1, 2],env0, storage0.scores, 8)

7.867267f0

In [51]:
# Reproduce the sampling step of sample_batch! for a single game: draw one game weighted
# by history length, then pick a turn in it with WeightSample.
games = sample(replay_buffer0.buffer, weights([length(agt.history) for agt in replay_buffer0.buffer]), 1, replace=true)
g_turn = [(g, WeightSample(g.history)) for g in games]

1-element Vector{Tuple{Agent, Int64}}:
 (Agent([5, 1, 5, 1, 5, 2, 2], Int64[], Vector{Float32}[[0.18, 0.125, 0.14333333, 0.14666666, 0.27833334, 0.12666667], [0.255, 0.18833333, 0.23333333, 0.15, 0.0, 0.17333333], [0.125, 0.125, 0.135, 0.125, 0.36333334, 0.12666667], [0.395, 0.15, 0.15333334, 0.15, 0.0, 0.15166667], [0.21666667, 0.13833334, 0.125, 0.12833333, 0.255, 0.13666667], [0.455, 0.545, 0.0, 0.0, 0.0, 0.0], [0.47, 0.53, 0.0, 0.0, 0.0, 0.0]]), 1)

In [52]:
# Build the network input and training target for the sampled (game, turn) pair.
# NOTE(review): make_target is called with three arguments here, while sample_batch!
# calls it with four (env, g, scores, turn) — confirm a three-argument method exists.
g_test, turn_test = g_turn[1]
imag = make_image(env0, g_test, turn_test)
target = make_target(env0, g_test, turn_test)

7-element Vector{Float32}:
  0.18
  0.125
  0.14333333
  0.14666666
  0.27833334
  0.12666667
 -3.7118094

In [55]:
# Display the turn index that was sampled for g_test.
turn_test

1

In [54]:
# Step through the score update of sample_batch! by hand for g_test: for every prefix of
# its history, print whether a score is already recorded, then store the maximum of the
# recorded score and target[end] (or target[end] itself for a new prefix).
for len in eachindex(g_test.history)
    prefix = g_test.history[1:len]
    candidate = target[end]
    known = haskey(storage0.scores, prefix)
    println(known)
    if !known
        println("new!")
        storage0.scores[prefix] = candidate
        continue
    end
    recorded = storage0.scores[prefix]
    best = max(recorded, candidate)
    println(recorded)
    println(candidate)
    println(best)
    storage0.scores[prefix] = best
end

true
4.636553
-3.7118094
4.636553
true
4.636553
-3.7118094
4.636553
true
4.636553
-3.7118094
4.636553
true
4.636553
-3.7118094
4.636553
true
4.636553
-3.7118094
4.636553
true
4.636553
-3.7118094
4.636553
true
4.636553
-3.7118094
4.636553


In [31]:
# Toy dictionary with several keys tied at the maximum value, used to check how
# findmax / argmax behave on a Dict.
test = Dict([1,2] => 1.0, [1,2,3] => 2.0, [1,2,3,4,5] => 2.0, [1,2,3,4,5,6] => 2.0, [1,5] => 1.0)

Dict{Vector{Int64}, Float64} with 5 entries:
  [1, 2, 3, 4, 5]    => 2.0
  [1, 2, 3, 4, 5, 6] => 2.0
  [1, 5]             => 1.0
  [1, 2, 3]          => 2.0
  [1, 2]             => 1.0

In [40]:
# First element of findmax on a Dict: the largest value.
findmax(test)[1]

2.0

In [36]:
# argmax on a Dict returns a key whose value is maximal; with ties, only one of the
# tied keys is returned.
argmax(test)

5-element Vector{Int64}:
 1
 2
 3
 4
 5