LoadError: syntax: ">" is not a unary operator

In [None]:
using Flux
using Zygote
using PyCall
using Pipe
using DataStructures
using StatsBase
using Printf

In [None]:
# Import the Python `gym` package through PyCall and instantiate the
# "CartPole-v0" environment that the rest of this notebook interacts with.
gym = pyimport("gym")
env = gym.make("CartPole-v0")

In [None]:
# Number of entries in one observation (length of the observation-space lower bound).
const STATE_SPACE = length(env.observation_space.low)
# Number of discrete actions; hard-coded rather than read from the environment.
const ACTION_SPACE = 2
# Action labels passed to `env.step` and used as the one-hot label set.
const ACTIONS = 0:(ACTION_SPACE-1)
# Exploration rate before decay; `train_epoch!` scales it by `1 - epoch/EPOCHS`.
const EPSILON_START = 0.10
# Number of training epochs run by the top-level training loop.
const EPOCHS = 100
# Environment steps per epoch; also the step cap used by `test`.
const STEPS_PER_EPOCH = 5000
# NOTE(review): not referenced anywhere in the visible code — confirm whether it is still needed.
const TRAIN_STEPS_PER_EPOCH = 2500
# NOTE(review): presumably the intended capacity of the replay buffer `memory` — confirm.
const BUFFER_SIZE = 10000
# Number of transitions sampled from the replay buffer per gradient step.
const BATCH_SIZE = 16
# Discount factor applied to the next-state value when building the TD target.
const DISCOUNT = 0.99


In [None]:
# Replay buffer of (state, action, reward, next_state, death) tuples. Once full,
# pushing a new transition overwrites the oldest one. The capacity comes from
# BUFFER_SIZE instead of a duplicated literal so the two cannot drift apart.
memory = CircularBuffer{Any}(BUFFER_SIZE)

In [None]:
# Q-network: takes an observation concatenated with a one-hot encoded action and
# outputs a single value. Two tanh hidden layers, each followed by LayerNorm.
model = let input_width = STATE_SPACE + ACTION_SPACE, hidden_width = 60
    Chain(
        Dense(input_width, hidden_width, tanh),
        LayerNorm(hidden_width),
        Dense(hidden_width, hidden_width, tanh),
        LayerNorm(hidden_width),
        Dense(hidden_width, 1),
    )
end

# One-hot encode a single action label (scalar) against ACTIONS.
_encode_action(action) = Flux.onehot(action, ACTIONS)
# One-hot encode a collection of action labels, one column per action.
_encode_action(actions::AbstractArray) = Flux.onehotbatch(actions, ACTIONS)

"""
    forward(model, state, action)

Evaluate `model` on `state` concatenated (via `vcat`) with the one-hot encoding
of `action`.

`action` may be a single label from `ACTIONS` or an array of labels. The encoder
is selected by dispatch on whether `action` is an array, not by its length, so a
batch containing exactly one action is still encoded as a batch.

Returns whatever `model` returns for the concatenated input.
"""
function forward(model, state, action)
    return model(vcat(state, _encode_action(action)))
end

"""
    act(model, state, epsilon)

Choose an action for `state` with an epsilon-greedy policy.

With probability `epsilon` a uniformly random element of `ACTIONS` is returned;
otherwise the action whose `forward(model, state, a)` output is largest. When
`epsilon` is zero no random number is drawn and the choice is always greedy.
"""
function act(model, state, epsilon)
    # Short-circuit keeps `rand()` from being called when exploration is disabled.
    explore = epsilon != 0 && rand() <= epsilon
    explore && return rand(ACTIONS)

    q_values = [forward(model, state, a) for a in ACTIONS]
    return ACTIONS[argmax(q_values)]
end

In [None]:
# Optimiser used by `grad_descend!` (read there as a global): ADAM constructed with 1e-3.
optim = ADAM(1e-3)

In [None]:
"""
    grad_descend!(model, parameters, memory)

Perform one gradient step on `model` using a random minibatch from `memory`.

# Arguments
- `model`: the network evaluated through `forward`.
- `parameters`: the trainable parameters to differentiate with respect to and
  update (the caller passes `Flux.params(model)`).
- `memory`: collection of `(state, action, reward, next_state, death)` tuples;
  must hold at least `BATCH_SIZE` entries.

# Returns
The mean-squared TD error of the sampled batch, evaluated before the update.

The update is applied with the global optimiser `optim`.
"""
function grad_descend!(model, parameters, memory)
    # Draw BATCH_SIZE distinct transitions from the replay buffer.
    batch = sample(memory, BATCH_SIZE; replace=false)

    # Split the batch into one collection per tuple field.
    state, action, reward, next_state, death = [getindex.(batch, i) for i in 1:5]
    state = reduce(hcat, state)            # one observation per column
    next_state = reduce(hcat, next_state)

    # TD target: reward plus the discounted best next-state value. Terminal
    # transitions contribute no future value. This is computed outside the
    # gradient call, so the target is treated as a constant.
    best_next = [
        maximum(first(forward(model, s, a)) for a in ACTIONS) for
        s in eachcol(next_state)
    ]
    Q_next = (.!death) .* best_next
    target = reward .+ DISCOUNT .* Q_next

    local batch_loss
    grads = gradient(parameters) do
        # `forward` yields a 1×B matrix while `target` is a length-B vector.
        # Flatten the prediction so the loss compares element i with element i
        # instead of broadcasting the two against each other.
        prediction = vec(forward(model, state, action))
        batch_loss = Flux.mse(prediction, target)
        return batch_loss
    end

    Flux.update!(optim, parameters, grads)
    return batch_loss
end

In [None]:
"""
    train_epoch!(memory, model, env, epoch)

Run `STEPS_PER_EPOCH` environment steps with an epsilon-greedy policy, storing
every transition in `memory` and taking one gradient step per environment step
once `memory` holds more than `BATCH_SIZE` transitions.

The exploration rate is `EPSILON_START * (1 - epoch / EPOCHS)`.

# Returns
A tuple `(mean_reward, mean_loss)`:
- `mean_reward`: mean total reward over the episodes that finished during this
  epoch, or `NaN` if none finished.
- `mean_loss`: loss averaged over the gradient steps actually taken (`0.0` if
  none were taken).
"""
function train_epoch!(memory, model, env, epoch)
    state = env.reset()
    episode_returns = Float64[]
    episode_return = 0.0
    epsilon = EPSILON_START * (1 - epoch / EPOCHS)

    parameters = Flux.params(model)
    total_loss = 0.0
    train_steps = 0

    for _ in 1:STEPS_PER_EPOCH
        action = act(model, state, epsilon)
        next_state, reward, death, _ = env.step(action)
        push!(memory, (state, action, reward, next_state, death))
        episode_return += reward

        if death
            # Episode finished: record its return and start a new one.
            push!(episode_returns, episode_return)
            episode_return = 0.0
            state = env.reset()
        else
            state = next_state
        end

        if length(memory) > BATCH_SIZE
            total_loss += grad_descend!(model, parameters, memory)
            train_steps += 1
        end
    end

    # Guard against an epoch in which no episode terminated.
    mean_reward = isempty(episode_returns) ? NaN : mean(episode_returns)
    # Average only over steps that trained, so warm-up steps do not dilute the loss.
    mean_loss = total_loss / max(train_steps, 1)
    return mean_reward, mean_loss
end

In [None]:
# Train for EPOCHS epochs and report the mean loss and mean episode reward of each.
for e in 1:EPOCHS
    reward, loss = train_epoch!(memory, model, env, e)
    # Arguments follow the order of the labels in the format string.
    @printf("Epoch %d, Loss %f, Reward %f\n", e, loss, reward)
end

3-element BitArray{1}:
 0
 1
 1

In [11]:
"""
    test(model, env)

Run one greedy (epsilon = 0) episode in `env`, rendering every step.

# Returns
A tuple `(total_reward, steps)`. The episode stops when the environment reports
termination; if that has not happened after `STEPS_PER_EPOCH` steps, the reward
accumulated so far is returned together with `STEPS_PER_EPOCH`.
"""
function test(model, env)
    total_reward = 0.0
    state = env.reset()
    for i in 1:STEPS_PER_EPOCH
        action = act(model, state, 0)
        state, reward, death, _ = env.step(action)
        total_reward += reward

        env.render()
        death && return total_reward, i
    end
    # Step cap reached without termination: still return a well-formed result
    # instead of falling through and returning `nothing`.
    return total_reward, STEPS_PER_EPOCH
end

test (generic function with 1 method)

In [15]:
# Run one rendered greedy episode with the current model.
test(model, env)

(10.0, 10)

In [38]:
# Reset the environment and take action 0 five times, keeping the last
# observation, reward and termination flag in the globals `s`, `r` and `d`.
env.reset()
s = 0
r = 0
d = 0
for _ in 1:5
    # Declare the targets global so the loop updates the variables above even
    # when this code runs outside a notebook/REPL soft scope.
    global s, r, d
    s, r, d = env.step(0)
end


In [42]:
# Evaluate the model's output for action 0 in the observation `s` obtained above.
forward(model, s, 0)

1-element Array{Float32,1}:
 9.298442