# Deep Q Learning Mountain Car

In [15]:
using ReinforcementLearningBase, ReinforcementLearningEnvironments
using Flux
using Flux: params
using Plots; gr(); 
import StatsBase.sample

# Mountain Car environment instance; the agent created later in the notebook wraps it.
env = MountainCarEnv();

First, we define the agent's "brain": a struct that stores the neural network model together with all the parameters needed for learning:

In [16]:
"""
    Brain

Learning state of an agent: the Q-network, the replay memory it is trained from,
and the hyperparameters that control that training.
"""
mutable struct Brain
    # Discount factor applied to the bootstrapped value of the next state.
    β::Float64
    # Number of stored transitions sampled for each training step.
    batch_size::Int
    # Capacity of the replay memory.
    memory_size::Int
    # Training only starts once the memory holds more than this many transitions.
    min_memory_size::Int
    # Stored `(s, a, r, s′, terminal)` transitions, oldest first.
    memory::Vector{Tuple}
    # Flux model mapping a state to one value per action.
    net::Chain
    # Value passed to the `ADAM` optimiser constructor.
    η::Float64
end

"""
    Brain(env; β = 0.99, η = 0.001, batch_size = 64, memory_size = 50_000,
          min_memory_size = 1000)

Build a `Brain` for `env` with an empty replay memory and a three-layer dense network.

The network input width is `length(env.state)` and its output width is
`length(env.action_space)`.

# Keywords
- `β`: discount factor stored in the brain.
- `η`: value later passed to the optimiser constructor.
- `batch_size`: number of transitions sampled per training step.
- `memory_size`: capacity of the replay memory.
- `min_memory_size`: number of stored transitions that must be exceeded before training.
"""
function Brain(env; β = 0.99, η = 0.001, batch_size = 64, memory_size = 50_000,
               min_memory_size = 1000)
    model = Chain(
        Dense(length(env.state), 128, relu),
        Dense(128, 52, relu),
        Dense(52, length(env.action_space), identity),
    )
    # `Tuple[]` already has the element type of the `memory` field, so no conversion
    # from an untyped `[]` is needed.
    return Brain(β, batch_size, memory_size, min_memory_size, Tuple[], model, η)
end

Brain

and the loss function:

In [17]:
"""
    loss(x, y)

Mean squared error between the network output for `x` and the targets `y`.

Note: the network is looked up through the global variable `agent`, so a global
with that name must exist when this function is called.
"""
function loss(x, y)
    prediction = agent.brain.net(x)
    return Flux.mse(prediction, y)
end

loss (generic function with 1 method)

Then the agent itself:

In [18]:
# An agent bundles the environment it acts in, its exploration schedule, its
# learning state (`Brain`), and per-episode bookkeeping.
mutable struct Agent
    # Environment the agent acts in.
    env::AbstractEnv
    # Probability of taking a random action while training.
    ϵ::Float64
    # Factor ϵ is multiplied by after every finished episode.
    ϵ_decay::Float64
    # Lower bound that ϵ is never decayed below.
    ϵ_min::Float64
    # Q-network, replay memory and learning hyperparameters.
    brain::Brain
    # First component of the most recently observed state; reset to -Inf after each episode.
    position::Float64
    # Sum of rewards collected in the current episode; reset to 0.0 after each episode.
    reward::Float64
end

"""
    Agent(env::AbstractEnv, ϵ = 1.0, ϵ_decay = 0.9975, ϵ_min = 0.005)

Create an agent for `env` with a freshly built `Brain(env)`. The tracked position
starts at `-Inf` and the accumulated episode reward at `0.0`.
"""
function Agent(env::AbstractEnv, ϵ = 1.0, ϵ_decay = 0.9975, ϵ_min = 0.005)
    brain = Brain(env)
    return Agent(env, ϵ, ϵ_decay, ϵ_min, brain, -Inf, 0.0)
end

Agent

Finally, functions that control the learning process:

In [19]:
"""
    replay!(agent::Agent)

Run one training step of `agent.brain.net` on a minibatch drawn from the replay memory.

`agent.brain.batch_size` transitions are sampled without replacement, so the memory
must hold at least that many entries. For each sampled `(s, a, r, s′, terminal)` the
regression target is the network's current output for `s`, with the entry of the
taken action `a` replaced by `r` (terminal step) or by
`r + β * maximum(net(s′))` (non-terminal step).

The loss is evaluated on the network of the `agent` passed in, not on whatever
object happens to be bound to a global named `agent`.
"""
function replay!(agent::Agent)
    brain = agent.brain
    net = brain.net
    x = zeros(Float32, length(agent.env.state), brain.batch_size)
    y = zeros(Float32, length(agent.env.action_space), brain.batch_size)
    batch = sample(brain.memory, brain.batch_size, replace = false)
    for (i, (s, a, r, s′, terminal)) in enumerate(batch)
        # Terminal transitions have no successor value to bootstrap from.
        target = terminal ? r : r + brain.β * maximum(net(s′))
        Q = net(s)
        Q[a] = target
        x[:, i] .= s
        y[:, i] .= Q
    end
    # Close over this agent's network so the gradients always belong to the
    # parameters that are being updated.
    batch_loss(inputs, targets) = Flux.mse(net(inputs), targets)
    # NOTE(review): a fresh ADAM is built on every call; if optimiser state is meant
    # to persist across training steps it has to be stored outside this function.
    return Flux.train!(batch_loss, params(net), [(x, y)], ADAM(brain.η))
end

replay! (generic function with 1 method)

In [20]:
"""
    remember!(brain::Brain, step::Tuple)

Append `step` to the replay memory. When the memory is exactly at
`brain.memory_size`, the oldest entry is dropped first. Returns the memory vector.
"""
function remember!(brain::Brain, step::Tuple)
    buffer = brain.memory
    if length(buffer) == brain.memory_size
        popfirst!(buffer)
    end
    return push!(buffer, step)
end

remember! (generic function with 1 method)

In [21]:
"""
    policy(agent::Agent, state::AbstractVector{<:Real})

Return the index of the largest network output for `state`, i.e. the greedy action.

Any real-valued vector is accepted (not only `Vector{Float64}`), so states with a
different element type or vector views do not fail to dispatch.
"""
policy(agent::Agent, state::AbstractVector{<:Real}) = argmax(agent.brain.net(state))

policy (generic function with 1 method)

In [22]:
"""
    step!(agent::Agent, train::Bool)

Advance the environment by one action and return whether the episode has terminated.

When `train` is true the action is random with probability `agent.ϵ`; otherwise it
comes from `policy`. The observed transition is always stored in the replay memory,
and when `train` is true and the memory holds more than
`agent.brain.min_memory_size` entries, one `replay!` step is run.
"""
function step!(agent::Agent, train::Bool)
    env = agent.env
    brain = agent.brain
    # Snapshot the state before acting so the stored transition is not aliased
    # to whatever the environment does with its own state afterwards.
    s = deepcopy(env.state)
    if rand() < agent.ϵ && train
        a = rand(env.action_space)
    else
        a = policy(agent, s)
    end
    env(a)
    r = deepcopy(reward(env))
    s′ = deepcopy(state(env))
    terminal = deepcopy(is_terminated(env))
    agent.position = s′[1]
    agent.reward += r
    remember!(brain, (s, a, r, s′, terminal))
    if train && length(brain.memory) > brain.min_memory_size
        replay!(agent)
    end
    return terminal
end

step! (generic function with 1 method)

In [23]:
"""
    run!(agent::Agent, episodes::Int; train = true, plotting = true, summary = true)

Step `agent` through its environment until `episodes` episodes have terminated.

After each terminated episode the environment is reset, the episode counts as a
success if the last tracked position is greater than `0.5`, the per-episode
bookkeeping on `agent` is cleared, and `agent.ϵ` is multiplied by `agent.ϵ_decay`
(never dropping below `agent.ϵ_min`).

# Keywords
- `train`: forwarded to `step!`.
- `plotting`: call `plot(agent.env)` before every step.
- `summary`: print the reward, ϵ and running success rate after every episode.

# Returns
A tuple `(rewards, success_rates)` of `Vector{Float64}`, each with one entry per
finished episode: the total episode reward and the fraction of successful episodes
so far.
"""
function run!(
    agent::Agent,
    episodes::Int;
    train::Bool = true,
    plotting::Bool = true,
    summary::Bool = true,
)
    # Typed accumulators instead of `[]`, which would be a Vector{Any}.
    rewards = Float64[]
    success_rates = Float64[]
    # Episodes and successes are counts, so keep them as integers.
    ep = 1
    successes = 0
    while ep ≤ episodes
        plotting && (plot(agent.env); sleep(0.0001))
        if step!(agent, train)
            reset!(agent.env)
            agent.position > 0.5 && (successes += 1)
            success_rate = successes / ep
            push!(rewards, agent.reward)
            push!(success_rates, success_rate)
            if summary
                println("episode $(ep) ends! Reward: $(agent.reward)")
                println("ϵ: $(agent.ϵ), success rate: $(success_rate)")
            end
            ep += 1
            agent.reward = 0.0
            agent.position = -Inf
            # Decay exploration once per episode, bounded below by ϵ_min.
            agent.ϵ = max(agent.ϵ_min, agent.ϵ * agent.ϵ_decay)
        end
    end
    return rewards, success_rates
end

run! (generic function with 1 method)

## Experiment

In [24]:
# Create the agent with the default exploration settings.
# Note: the top-level `loss` function defined earlier refers to this global by name.
agent = Agent(env);

Before Learning:

In [25]:
#rewards,_ = run!(agent,10; train = false, plotting = true);

After Learning:

In [26]:
# Train for 1000 episodes without plotting; `summary` keeps its default, so a
# two-line report is printed after every episode.
rewards, success_rates = run!(agent,1000; plotting = false);
#rewards,_ = run!(agent,10; train = false, plotting = true);

episode 1 ends! Reward: -199.0
ϵ: 1.0, success rate: 0.0
episode 2 ends! Reward: -199.0
ϵ: 0.9975, success rate: 0.0
episode 3 ends! Reward: -199.0
ϵ: 0.9950062500000001, success rate: 0.0
episode 4 ends! Reward: -199.0
ϵ: 0.9925187343750002, success rate: 0.0
episode 5 ends! Reward: -199.0
ϵ: 0.9900374375390627, success rate: 0.0
episode 6 ends! Reward: -199.0
ϵ: 0.9875623439452151, success rate: 0.0
episode 7 ends! Reward: -199.0
ϵ: 0.9850934380853521, success rate: 0.0
episode 8 ends! Reward: -199.0
ϵ: 0.9826307044901388, success rate: 0.0
episode 9 ends! Reward: -199.0
ϵ: 0.9801741277289134, success rate: 0.0
episode 10 ends! Reward: -199.0
ϵ: 0.9777236924095912, success rate: 0.0
episode 11 ends! Reward: -199.0
ϵ: 0.9752793831785673, success rate: 0.0
episode 12 ends! Reward: -199.0
ϵ: 0.972841184720621, success rate: 0.0
episode 13 ends! Reward: -199.0
ϵ: 0.9704090817588195, success rate: 0.0
episode 14 ends! Reward: -199.0
ϵ: 0.9679830590544225, success rate: 0.0
episode 15 ends

ϵ: 0.755518821885128, success rate: 0.0
episode 114 ends! Reward: -199.0
ϵ: 0.7536300248304152, success rate: 0.0
episode 115 ends! Reward: -199.0
ϵ: 0.7517459497683392, success rate: 0.0
episode 116 ends! Reward: -199.0
ϵ: 0.7498665848939184, success rate: 0.0
episode 117 ends! Reward: -199.0
ϵ: 0.7479919184316837, success rate: 0.0
episode 118 ends! Reward: -199.0
ϵ: 0.7461219386356045, success rate: 0.0
episode 119 ends! Reward: -199.0
ϵ: 0.7442566337890155, success rate: 0.0
episode 120 ends! Reward: -199.0
ϵ: 0.742395992204543, success rate: 0.0
episode 121 ends! Reward: -199.0
ϵ: 0.7405400022240317, success rate: 0.0
episode 122 ends! Reward: -199.0
ϵ: 0.7386886522184717, success rate: 0.0
episode 123 ends! Reward: -199.0
ϵ: 0.7368419305879256, success rate: 0.0
episode 124 ends! Reward: -199.0
ϵ: 0.7349998257614558, success rate: 0.0
episode 125 ends! Reward: -199.0
ϵ: 0.7331623261970522, success rate: 0.0
episode 126 ends! Reward: -199.0
ϵ: 0.7313294203815597, success rate: 0.0

episode 225 ends! Reward: -199.0
ϵ: 0.5708086902226919, success rate: 0.0
episode 226 ends! Reward: -199.0
ϵ: 0.5693816684971351, success rate: 0.0
episode 227 ends! Reward: -199.0
ϵ: 0.5679582143258923, success rate: 0.0
episode 228 ends! Reward: -199.0
ϵ: 0.5665383187900777, success rate: 0.0
episode 229 ends! Reward: -199.0
ϵ: 0.5651219729931025, success rate: 0.0
episode 230 ends! Reward: -199.0
ϵ: 0.5637091680606198, success rate: 0.0
episode 231 ends! Reward: -199.0
ϵ: 0.5622998951404683, success rate: 0.0
episode 232 ends! Reward: -199.0
ϵ: 0.5608941454026172, success rate: 0.0
episode 233 ends! Reward: -199.0
ϵ: 0.5594919100391107, success rate: 0.0
episode 234 ends! Reward: -199.0
ϵ: 0.558093180264013, success rate: 0.0
episode 235 ends! Reward: -199.0
ϵ: 0.556697947313353, success rate: 0.0
episode 236 ends! Reward: -199.0
ϵ: 0.5553062024450697, success rate: 0.0
episode 237 ends! Reward: -199.0
ϵ: 0.5539179369389571, success rate: 0.0
episode 238 ends! Reward: -199.0
ϵ: 0.55

episode 336 ends! Reward: -199.0
ϵ: 0.4323375530414447, success rate: 0.0
episode 337 ends! Reward: -199.0
ϵ: 0.4312567091588411, success rate: 0.0
episode 338 ends! Reward: -199.0
ϵ: 0.43017856738594407, success rate: 0.0
episode 339 ends! Reward: -199.0
ϵ: 0.4291031209674792, success rate: 0.0
episode 340 ends! Reward: -199.0
ϵ: 0.42803036316506055, success rate: 0.0
episode 341 ends! Reward: -199.0
ϵ: 0.4269602872571479, success rate: 0.0
episode 342 ends! Reward: -199.0
ϵ: 0.42589288653900503, success rate: 0.0
episode 343 ends! Reward: -199.0
ϵ: 0.4248281543226575, success rate: 0.0
episode 344 ends! Reward: -199.0
ϵ: 0.4237660839368509, success rate: 0.0
episode 345 ends! Reward: -199.0
ϵ: 0.4227066687270088, success rate: 0.0
episode 346 ends! Reward: -199.0
ϵ: 0.4216499020551913, success rate: 0.0
episode 347 ends! Reward: -199.0
ϵ: 0.4205957773000534, success rate: 0.0
episode 348 ends! Reward: -199.0
ϵ: 0.41954428785680326, success rate: 0.0
episode 349 ends! Reward: -199.0
ϵ

ϵ: 0.3282784994873863, success rate: 0.0
episode 447 ends! Reward: -199.0
ϵ: 0.32745780323866785, success rate: 0.0
episode 448 ends! Reward: -199.0
ϵ: 0.3266391587305712, success rate: 0.0
episode 449 ends! Reward: -199.0
ϵ: 0.3258225608337448, success rate: 0.0
episode 450 ends! Reward: -199.0
ϵ: 0.32500800443166045, success rate: 0.0
episode 451 ends! Reward: -199.0
ϵ: 0.3241954844205813, success rate: 0.0
episode 452 ends! Reward: -199.0
ϵ: 0.3233849957095299, success rate: 0.0
episode 453 ends! Reward: -199.0
ϵ: 0.3225765332202561, success rate: 0.0
episode 454 ends! Reward: -199.0
ϵ: 0.32177009188720546, success rate: 0.0
episode 455 ends! Reward: -199.0
ϵ: 0.32096566665748744, success rate: 0.0
episode 456 ends! Reward: -199.0
ϵ: 0.3201632524908437, success rate: 0.0
episode 457 ends! Reward: -199.0
ϵ: 0.3193628443596166, success rate: 0.0
episode 458 ends! Reward: -199.0
ϵ: 0.3185644372487176, success rate: 0.0
episode 459 ends! Reward: -199.0
ϵ: 0.3177680261555958, success rat

ϵ: 0.2492653540446881, success rate: 0.0
episode 557 ends! Reward: -199.0
ϵ: 0.2486421906595764, success rate: 0.0
episode 558 ends! Reward: -199.0
ϵ: 0.24802058518292747, success rate: 0.0
episode 559 ends! Reward: -199.0
ϵ: 0.24740053371997017, success rate: 0.0
episode 560 ends! Reward: -199.0
ϵ: 0.24678203238567026, success rate: 0.0
episode 561 ends! Reward: -199.0
ϵ: 0.2461650773047061, success rate: 0.0
episode 562 ends! Reward: -199.0
ϵ: 0.24554966461144434, success rate: 0.0
episode 563 ends! Reward: -199.0
ϵ: 0.24493579044991573, success rate: 0.0
episode 564 ends! Reward: -199.0
ϵ: 0.24432345097379096, success rate: 0.0
episode 565 ends! Reward: -199.0
ϵ: 0.2437126423463565, success rate: 0.0
episode 566 ends! Reward: -199.0
ϵ: 0.24310336074049063, success rate: 0.0
episode 567 ends! Reward: -199.0
ϵ: 0.24249560233863943, success rate: 0.0
episode 568 ends! Reward: -199.0
ϵ: 0.24188936333279284, success rate: 0.0
episode 569 ends! Reward: -199.0
ϵ: 0.24128463992446086, succe

episode 654 ends! Reward: -199.0
ϵ: 0.19504128352692915, success rate: 0.05198776758409786
episode 655 ends! Reward: -199.0
ϵ: 0.19455368031811185, success rate: 0.051908396946564885
episode 656 ends! Reward: -199.0
ϵ: 0.19406729611731657, success rate: 0.051829268292682924
episode 657 ends! Reward: -199.0
ϵ: 0.19358212787702328, success rate: 0.0517503805175038
episode 658 ends! Reward: -199.0
ϵ: 0.19309817255733072, success rate: 0.05167173252279635
episode 659 ends! Reward: -199.0
ϵ: 0.1926154271259374, success rate: 0.051593323216995446
episode 660 ends! Reward: -199.0
ϵ: 0.19213388855812258, success rate: 0.051515151515151514
episode 661 ends! Reward: -199.0
ϵ: 0.19165355383672728, success rate: 0.05143721633888049
episode 662 ends! Reward: -199.0
ϵ: 0.19117441995213547, success rate: 0.0513595166163142
episode 663 ends! Reward: -199.0
ϵ: 0.19069648390225513, success rate: 0.05128205128205128
episode 664 ends! Reward: -199.0
ϵ: 0.1902197426924995, success rate: 0.05120481927710843

episode 745 ends! Reward: -126.0
ϵ: 0.1553105089703522, success rate: 0.11946308724832215
episode 746 ends! Reward: -130.0
ϵ: 0.15492223269792632, success rate: 0.12064343163538874
episode 747 ends! Reward: -123.0
ϵ: 0.15453492711618153, success rate: 0.12182061579651941
episode 748 ends! Reward: -148.0
ϵ: 0.15414858979839108, success rate: 0.12299465240641712
episode 749 ends! Reward: -153.0
ϵ: 0.1537632183238951, success rate: 0.12416555407209613
episode 750 ends! Reward: -137.0
ϵ: 0.1533788102780854, success rate: 0.12533333333333332
episode 751 ends! Reward: -122.0
ϵ: 0.1529953632523902, success rate: 0.12649800266311584
episode 752 ends! Reward: -120.0
ϵ: 0.15261287484425923, success rate: 0.1276595744680851
episode 753 ends! Reward: -125.0
ϵ: 0.15223134265714858, success rate: 0.12881806108897742
episode 754 ends! Reward: -128.0
ϵ: 0.1518507643005057, success rate: 0.129973474801061
episode 755 ends! Reward: -132.0
ϵ: 0.15147113738975446, success rate: 0.13112582781456952
episode

episode 836 ends! Reward: -199.0
ϵ: 0.12367306941609345, success rate: 0.20095693779904306
episode 837 ends! Reward: -199.0
ϵ: 0.12336388674255322, success rate: 0.2007168458781362
episode 838 ends! Reward: -199.0
ϵ: 0.12305547702569684, success rate: 0.20047732696897375
episode 839 ends! Reward: -199.0
ϵ: 0.1227478383331326, success rate: 0.20023837902264602
episode 840 ends! Reward: -199.0
ϵ: 0.12244096873729977, success rate: 0.2
episode 841 ends! Reward: -194.0
ϵ: 0.12213486631545653, success rate: 0.20095124851367419
episode 842 ends! Reward: -199.0
ϵ: 0.1218295291496679, success rate: 0.2007125890736342
episode 843 ends! Reward: -199.0
ϵ: 0.12152495532679373, success rate: 0.20047449584816132
episode 844 ends! Reward: -199.0
ϵ: 0.12122114293847676, success rate: 0.20023696682464456
episode 845 ends! Reward: -196.0
ϵ: 0.12091809008113057, success rate: 0.20118343195266272
episode 846 ends! Reward: -168.0
ϵ: 0.12061579485592774, success rate: 0.20212765957446807
episode 847 ends! R

episode 927 ends! Reward: -167.0
ϵ: 0.09848031662633713, success rate: 0.23408845738942827
episode 928 ends! Reward: -145.0
ϵ: 0.09823411583477129, success rate: 0.2349137931034483
episode 929 ends! Reward: -89.0
ϵ: 0.09798853054518437, success rate: 0.2357373519913886
episode 930 ends! Reward: -156.0
ϵ: 0.09774355921882141, success rate: 0.23655913978494625
episode 931 ends! Reward: -145.0
ϵ: 0.09749920032077436, success rate: 0.23737916219119226
episode 932 ends! Reward: -164.0
ϵ: 0.09725545231997243, success rate: 0.23819742489270387
episode 933 ends! Reward: -154.0
ϵ: 0.0970123136891725, success rate: 0.2390139335476956
episode 934 ends! Reward: -157.0
ϵ: 0.09676978290494957, success rate: 0.2398286937901499
episode 935 ends! Reward: -155.0
ϵ: 0.0965278584476872, success rate: 0.24064171122994651
episode 936 ends! Reward: -148.0
ϵ: 0.09628653880156798, success rate: 0.24145299145299146
episode 937 ends! Reward: -92.0
ϵ: 0.09604582245456407, success rate: 0.24226254002134473
episode

<b> Plot 1: </b> Success rate as a function of time.

In [None]:
# Running success rate, one point per finished episode.
plot(success_rates, xlabel = "Time", ylabel = "Success rate", legend = false)

<b> Plot 2: </b> Reward as a function of time.

In [None]:
# Total reward of each finished episode, one point per episode.
plot(rewards, xlabel = "Time", ylabel = "Reward", legend = false)