In [1]:
# Show how many threads this Julia session has; the self-play and batch-building
# loops further down are parallelised with Threads.@threads.
Threads.nthreads()

24

In [2]:
include("AZP_env.jl")
include("AZP_agt.jl")
include("AZP_mcts.jl")

check_RL (generic function with 1 method)

In [3]:
# Bounded FIFO store of finished self-play games.
#
# Fields
# - buffer:      stored games, oldest first.
# - buffer_size: maximum number of games kept; the oldest are evicted beyond this.
# - batch_size:  number of positions drawn per training batch.
# - lk:          lock guarding every mutation of `buffer`. Marking the field
#                `@atomic` would only make the *reference* atomic; `push!` and
#                `popfirst!` mutate the vector itself and need mutual exclusion.
mutable struct ReplayBuffer
    buffer::Vector{Agent}
    buffer_size::Int
    batch_size::Int
    lk::ReentrantLock
end

# Backward-compatible three-argument constructor: supplies a fresh lock.
function ReplayBuffer(buffer, buffer_size, batch_size)
    return ReplayBuffer(buffer, buffer_size, batch_size, ReentrantLock())
end

"""
    init_buffer(buffer_size::Int, batch_size::Int)

Return an empty `ReplayBuffer` holding at most `buffer_size` games and configured
to sample `batch_size` positions per batch.
"""
function init_buffer(buffer_size::Int, batch_size::Int)
    return ReplayBuffer(Agent[], buffer_size, batch_size)
end

"""
    save_game!(buffer::ReplayBuffer, agt::Agent)

Append `agt` to `buffer`, evicting the oldest games first so that the buffer never
holds more than `buffer_size` games. Safe to call from several threads at once.
Returns the underlying vector of games (the result of `push!`).
"""
function save_game!(buffer::ReplayBuffer, agt::Agent)
    return lock(buffer.lk) do
        games = buffer.buffer
        # `>=` (not `>`) so that the buffer is capped at exactly `buffer_size`.
        while !isempty(games) && length(games) >= buffer.buffer_size
            popfirst!(games)
        end
        push!(games, agt)
    end
end

save_game! (generic function with 1 method)

In [4]:
# Container for trained networks and cached game scores.
mutable struct Storage
    # Trained networks; train_model! stores one entry per index in 1:env.batch_num.
    storage::Dict{Int, Chain}
    # Fallback network returned by latest_model while `storage` is still empty.
    random_out::Chain
    # Cache filled by sample_batch!: game history => last row of that game's target column.
    scores::Dict{Vector{Int}, Float32}
end

"""
    init_storage(env)

Create an empty `Storage`. The fallback network is a single `Dense` layer built
from an all-zero `Float32` weight matrix of size `env.output` x `env.input_dim`.
"""
function init_storage(env)
    zero_weights = zeros(Float32, env.output, env.input_dim)
    fallback_net = Chain(Dense(zero_weights))
    return Storage(Dict{Int, Chain}(), fallback_net, Dict{Vector{Int}, Float32}())
end

"""
    latest_model(storage::Storage)

Return a network to play with: the fallback `random_out` network while nothing has
been trained yet, otherwise the network stored under a uniformly random key.
"""
function latest_model(storage::Storage)
    nets = storage.storage
    isempty(nets) && return storage.random_out
    chosen_key = rand(keys(nets))
    return nets[chosen_key]
end

latest_model (generic function with 1 method)

In [5]:
"""
    sample_batch!(env::Env, buffer::ReplayBuffer, scores::Dict{Vector{Int}, Float32})

Draw `buffer.batch_size` training positions from the replay buffer.

Games are sampled with replacement, weighted by history length, and one turn is
picked uniformly inside each sampled game. Returns `(imag, target)`:
an `env.input_dim` x `batch_size` `Int` matrix of inputs from `make_image`, and an
`env.output` x `batch_size` `Float32` matrix of targets from `make_target`.

As a side effect, the last row of every target column is cached in `scores` under
the game's history, unless that history is already present.

Throws `ArgumentError` when the buffer contains no game with a non-empty history.
"""
function sample_batch!(env::Env, buffer::ReplayBuffer, scores::Dict{Vector{Int}, Float32})
    history_lengths = [length(agt.history) for agt in buffer.buffer]
    # Without at least one positive weight there is nothing valid to sample.
    any(>(0), history_lengths) || throw(
        ArgumentError("replay buffer holds no game with a non-empty history to sample from"),
    )

    games = sample(buffer.buffer, weights(history_lengths), buffer.batch_size, replace=true)
    g_turn = [(g, rand(1:length(g.history))) for g in games]
    imag = zeros(Int, env.input_dim, buffer.batch_size)
    target = zeros(Float32, env.output, buffer.batch_size)
    println("ready!")

    # Each iteration writes only its own column, so the loop needs no locking.
    Threads.@threads for it in 1:buffer.batch_size
        g, turn = g_turn[it]
        imag[:, it] = make_image(env, g, turn)
        target[:, it] = make_target(env, g, scores, turn)
    end

    # `scores` is updated only here, after the threaded loop has finished, so the
    # dictionary is never written while worker threads may be reading it.
    for (it, (g, _)) in enumerate(g_turn)
        get!(scores, g.history, target[end, it])
    end

    return imag, target
end

sample_batch! (generic function with 1 method)

In [6]:
"""
    run_selfplay(env::Env, buffer::ReplayBuffer, storage::Storage, ratio::Float32)

Play `env.num_player` self-play games in parallel with the network returned by
`latest_model(storage)` and store every finished game in `buffer`.

`ratio` is forwarded unchanged to `play_physics!`.
"""
function run_selfplay(env::Env, buffer::ReplayBuffer, storage::Storage, ratio::Float32)
    model = latest_model(storage) |> gpu
    synchronize()

    # Each task writes only to its own slot, so the parallel phase shares no
    # mutable state. (`@atomic` applies to field accesses, not to function calls,
    # so it cannot be used to guard `save_game!`.)
    games = Vector{Agent}(undef, env.num_player)
    Threads.@threads for it in 1:env.num_player
        games[it] = play_physics!(env, model, ratio)
    end

    # Store the games sequentially once all workers are done.
    for game in games
        save_game!(buffer, game)
    end
    return nothing
end

run_selfplay (generic function with 1 method)

In [7]:
# Sum of squared magnitudes of the elements of `x`.
function sqnorm(x)
    return sum(abs2, x)
end

"""
    loss(image::CuArray{Int, 2}, target::Matrix{Float32}, env::Env, model::Chain)

Mean per-sample training loss of `model` on one batch.

For every column the loss is the squared error between the last output row and the
last target row, plus the cross-entropy between the remaining target rows and the
softmax of the remaining output rows (with `1f-8` added inside the logarithm for
numerical safety).

The batch size is taken from `target` itself, so the result is correct even when
the batch does not have exactly `env.batch_size` columns. `env` is kept in the
signature for compatibility with existing callers.
"""
function loss(image::CuArray{Int, 2}, target::Matrix{Float32}, env::Env, model::Chain)
    y1 = cpu(model(image))
    n_samples = size(target, 2)
    return sum([((y1[end,i]-target[end,i])^2 - target[1:end-1,i]' * log.(softmax(y1[1:end-1,i]).+1f-8)) for i in 1:n_samples])/n_samples
end

loss (generic function with 1 method)

In [8]:
# tanh scaled by a Float32 factor of 10, so the output lies in (-10, 10).
tanh10(x) = 10.0f0 * tanh(x)
# tanh scaled by a Float32 factor of 2, so the output lies in (-2, 2).
tanh2(x) = 2.0f0 * tanh(x)

tanh2 (generic function with 1 method)

In [10]:
# Build an untrained policy/value network on the CPU: an input Dense layer,
# `env.depth` two-branch blocks whose outputs are summed, then a policy head with
# `env.act_ind` outputs stacked (vcat) on top of a single tanh10-scaled value output.
function _new_policy_value_net(env::Env)
    return Chain(
        Dense(env.input_dim, env.middle_dim),
        Tuple(
            Chain(
                Parallel(
                    +,
                    Chain(BatchNorm(env.middle_dim), Dense(env.middle_dim, env.middle_dim, relu)),
                    Dense(env.middle_dim, env.middle_dim, relu),
                ),
                identity,
            ) for i in 1:env.depth
        )...,
        Flux.flatten,
        Flux.Parallel(
            vcat,
            Chain(Dense(env.middle_dim, env.middle_dim, relu), Dense(env.middle_dim, env.act_ind)),
            Chain(Dense(env.middle_dim, env.middle_dim, relu), Dense(env.middle_dim, 1, tanh10)),
        ),
    )
end

"""
    train_model!(env::Env, buffer::ReplayBuffer, storage::Storage; num_batches=5)

Train one network per index in `1:env.batch_num` and store each of them in
`storage.storage`.

For every index the existing network is reused if present, otherwise a new one is
built. `num_batches` batches are sampled from `buffer` once, and the network is
then trained for `env.training_step` passes over those batches with
`WeightDecay(env.C)` followed by `Adam(1f-5)`.
"""
function train_model!(env::Env, buffer::ReplayBuffer, storage::Storage; num_batches::Int=5)
    for b_num in 1:env.batch_num
        model = if haskey(storage.storage, b_num)
            storage.storage[b_num] |> gpu
        else
            _new_policy_value_net(env) |> gpu
        end
        opt = Flux.Optimiser(WeightDecay(env.C), Adam(1f-5))
        #ParameterSchedulers.Scheduler(env.scheduler, Momentum())
        ps = Flux.params(model)

        # Sample the batches once and upload the inputs to the GPU once, instead of
        # repeating the host-to-device copy on every training step. Each tuple is
        # splatted into `loss`, so it must hold exactly the four arguments `loss`
        # accepts.
        data = map(1:num_batches) do _
            image_batch, target_batch = sample_batch!(env, buffer, storage.scores)
            (cu(image_batch), target_batch, env, model)
        end

        for it in 1:env.training_step
            Flux.train!(loss, ps, data, opt)
        end
        storage.storage[b_num] = model
    end
    return nothing
end

train_model! (generic function with 1 method)

In [11]:
# Build the environment from positional string settings. Per the values echoed
# below, the first ten entries are: max_turn, num_player, middle_dim, depth,
# training_step, batch_size, batch_num, num_simulation, α, frac.
# NOTE(review): the meaning of the remaining entries is defined by init_Env in one
# of the included files and is not visible here.
env0 = init_Env(["10", "240", "128", "12", "1200", "48", "1", "600", "0.3", "0.25", "50", "4", "15.0", "0.4", "1.0", "0.7", "0.5", "120", "1.25", "0.00001"])

max_turn:  10
num_player:  240
middle_dim:  128
depth:  12
training_step:  1200
batch_size:  48
batch_num:  1
num_simulation:  600
α:  0.3
frac:  0.25


Env(10, 240, 2, 3, 1, 6, 60, 128, 7, 12, 1200, 400, 48, 1, 0.0001f0, 0.9f0, 600, 0.3f0, 0.25f0, 50, 4, 15.0f0, 0.4f0, 1.0f0, 0.7f0, 0.5f0, ComplexF32[-2.0f0 + 0.0f0im 0.0f0 + 0.0f0im 0.0f0 + 0.0f0im -0.7f0 + 0.0f0im; 0.0f0 - 0.0f0im 1.0f0 + 0.0f0im -0.7f0 + 0.0f0im 0.0f0 + 0.0f0im; 0.0f0 - 0.0f0im -0.7f0 - 0.0f0im 1.0f0 + 0.0f0im 0.0f0 + 0.0f0im; -0.7f0 - 0.0f0im 0.0f0 - 0.0f0im 0.0f0 - 0.0f0im 0.0f0 + 0.0f0im], ComplexF32[0.0f0 + 0.0f0im -0.4f0 + 0.0f0im -0.4f0 + 0.0f0im 0.0f0 + 0.0f0im; -0.4f0 - 0.0f0im 0.0f0 + 0.0f0im 0.0f0 + 0.0f0im -0.4f0 + 0.0f0im; -0.4f0 - 0.0f0im 0.0f0 - 0.0f0im 0.0f0 + 0.0f0im -0.4f0 + 0.0f0im; 0.0f0 - 0.0f0im -0.4f0 - 0.0f0im -0.4f0 - 0.0f0im 0.0f0 + 0.0f0im], 0.00837758f0, 120, 1.25f0, 1.0f-5)

In [12]:
# Empty model store: no trained networks yet, only the zero-weight fallback network.
storage0 = init_storage(env0)

Storage(Dict{Int64, Chain}(), Chain(Dense(60 => 7)), Dict{Vector{Int64}, Float32}())

In [13]:
# Replay buffer with capacity setting 1000 that samples env0.batch_size positions per batch.
replay_buffer = init_buffer(1000, env0.batch_size)

ReplayBuffer(Agent[], 1000, 48, ReentrantLock(nothing, 0x00000000, 0x00, Base.GenericCondition{Base.Threads.SpinLock}(Base.IntrusiveLinkedList{Task}(nothing, nothing), Base.Threads.SpinLock(0)), (0, 140053156873088, 0)))

In [14]:
# `ratio` is handed through run_selfplay to play_physics!.
# NOTE(review): its meaning is defined in the included AZP_* files, not here.
ratio = 8.0f0
# Time one full round of self-play; the first call also includes compilation.
@time run_selfplay(env0, replay_buffer, storage0, ratio)

 93.646277 seconds (189.96 M allocations: 12.094 GiB, 3.15% gc time, 169.57% compilation time: <1% of which was recompilation)


In [16]:
# Scratch cell: sets up, at global scope, the same model / optimiser / batch
# containers that train_model! creates internally, for interactive debugging.
if haskey(storage0.storage, 1)
    model = gpu(storage0.storage[1])
else
    println("generate")
    model = gpu(
        Chain(
            Dense(env0.input_dim, env0.middle_dim),
            Tuple(
                Chain(
                    Parallel(
                        +,
                        Chain(
                            BatchNorm(env0.middle_dim),
                            Dense(env0.middle_dim, env0.middle_dim, relu),
                        ),
                        Dense(env0.middle_dim, env0.middle_dim, relu),
                    ),
                    identity,
                ) for i in 1:env0.depth
            )...,
            Flux.flatten,
            Flux.Parallel(
                vcat,
                Chain(
                    Dense(env0.middle_dim, env0.middle_dim, relu),
                    Dense(env0.middle_dim, env0.act_ind),
                ),
                Chain(
                    Dense(env0.middle_dim, env0.middle_dim, relu),
                    Dense(env0.middle_dim, 1, tanh10),
                ),
            ),
        ),
    )
end
# Weight decay followed by Adam, as in train_model!.
opt = Flux.Optimiser(WeightDecay(env0.C), Adam(1f-5))
#ParameterSchedulers.Scheduler(env.scheduler, Momentum())
# Containers for sampled input / target batches, and the number of batches.
iv_batch = []
tv_batch = []
bn = 5

generate


5

In [17]:

# Draw a single training batch by hand to inspect it; this also caches scores in
# storage0.scores as a side effect of sample_batch!.
image_batch, target_batch = sample_batch!(env0, replay_buffer, storage0.scores)
# Disabled: appending the sampled batch to the scratch containers defined earlier.
#push!(iv_batch, image_batch)
#push!(tv_batch, target_batch)

: 

: 

In [25]:
# Time one full training round on the games currently held in replay_buffer.
@time train_model!(env0, replay_buffer, storage0)

: 

: 

In [None]:
# Mean of `loss` over all trained networks, evaluated on one freshly sampled batch.
# Sampling goes through sample_batch!, so it may add entries to `storage.scores`.
function _mean_network_loss(env::Env, buffer::ReplayBuffer, storage::Storage)
    image, target = sample_batch!(env, buffer, storage.scores)
    gpu_image = cu(image)
    total = sum(
        loss(gpu_image, target, env, storage.storage[bb] |> gpu) for bb in 1:env.batch_num
    )
    return total / env.batch_num
end

"""
    AlphaZero_ForPhysics(env::Env, storage::Storage;
                         iterations=12, buffer_size=1000, ratio=10.0f0)

Run the full self-play / training loop.

Each iteration creates a fresh replay buffer, fills it with self-play games,
trains every network on it, and records the mean loss of the trained networks on
a newly sampled batch. The first four iterations are timed with `@time`. Every
third iteration, three demonstration games per network are played and printed
together with their score.

Returns `(ld, model)`, where `ld` is the vector of per-iteration mean losses and
`model` is the result of `latest_model(storage)`.
"""
function AlphaZero_ForPhysics(
    env::Env,
    storage::Storage;
    iterations::Int=12,
    buffer_size::Int=1000,
    ratio::Float32=10.0f0,
)
    # Per-iteration loss history; this was previously used without being defined.
    ld = Float32[]

    for it in 1:iterations
        println("=============")
        println("it=$(it);")

        replay_buffer = init_buffer(buffer_size, env.batch_size)
        # train_model! takes (env, buffer, storage); it has no `ratio` argument.
        if it < 5
            @time run_selfplay(env, replay_buffer, storage, ratio)
            @time train_model!(env, replay_buffer, storage)
        else
            run_selfplay(env, replay_buffer, storage, ratio)
            train_model!(env, replay_buffer, storage)
        end
        #@report_call run_selfplay(env, replay_buffer, storage)
        #ll = @report_call train_model!(env, replay_buffer, storage)

        # train_model! returns nothing, so the loss is measured explicitly here.
        ll = _mean_network_loss(env, replay_buffer, storage)
        println("loss_average: $(ll)")
        push!(ld, ll)
        println("store data")
        println(length(storage.scores))
        if it % 3 == 0
            for bb in 1:env.batch_num
                model0 = storage.storage[bb] |> gpu
                println("------------")
                println("head = $(bb);")
                for tes in 1:3
                    # NOTE(review): run_selfplay calls play_physics! with a third
                    # `ratio` argument; confirm a two-argument method exists.
                    game = play_physics!(env, model0)
                    score = calc_score(game.history, env)
                    println("$(game.history), score:$(score)")
                end
            end
        end
    end

    return ld, latest_model(storage)
end