https://github.com/JuliaPOMDP/POMDPs.jl

In [None]:
using POMDPs, QuickPOMDPs, POMDPSimulators, QMDP

In [None]:
# Problem data handed to `DiscreteExplicitPOMDP(S,A,O,T,Z,R,γ)` further down:
# the collections S, A and O plus the scalar γ.
S = [:left, :right]           # S, A, and O may contain any objects
A = [:left, :right, :listen]  # including user-defined types
O = [:left, :right]
γ = 0.95  # last constructor argument — presumably the discount factor; confirm in QuickPOMDPs docs

"""
    T(s, a, sp)

Return the probability (always a `Float64`) of moving from `s` to `sp` under action `a`.

- `a == :listen`: `1.0` when `sp == s`, otherwise `0.0`.
- any other action (a door is opened): `0.5` for every `sp` (reset).
"""
function T(s, a, sp)
    if a == :listen
        # Return a Float64 rather than the Bool `s == sp` so that both branches
        # yield the same type and the function stays type-stable.
        return s == sp ? 1.0 : 0.0
    else # a door is opened
        return 0.5 # reset
    end
end

"""
    Z(a, sp, o)

Return the probability associated with `o` given action `a` and `sp`.

- any action other than `:listen`: `0.5`.
- `:listen`: `0.85` when `o == sp`, otherwise `0.15`.
"""
function Z(a, sp, o)
    # Only listening yields a value that depends on `o` and `sp`.
    a == :listen || return 0.5
    return o == sp ? 0.85 : 0.15
end

"""
    R(s, a)

Return the reward for taking action `a` in state `s`.

- `:listen`: `-1.0`.
- `a == s` (the tiger was found): `-100.0`.
- otherwise (the tiger was escaped): `10.0`.
"""
function R(s, a)
    a == :listen && return -1.0
    # A door was opened: the outcome depends on whether it matches the state.
    return s == a ? -100.0 : 10.0
end

# Assemble the problem from the collections, the three functions and γ.
m = DiscreteExplicitPOMDP(S, A, O, T, Z, R, γ)

In [None]:
# Build a QMDP solver with its default settings and run it on `m`;
# the result of `solve` is kept in `policy` for the simulation cell below.
solver = QMDPSolver()
policy = solve(solver, m)

In [None]:
# Step through `m` under `policy` for at most 10 steps, printing each step and
# accumulating the plain (undiscounted) sum of the rewards in `rsum`.
rsum = 0.0
for (s, b, a, o, r) in stepthrough(m, policy, "s,b,a,o,r"; max_steps=10)
    # The comprehension uses its own index `si`, so it no longer reuses the
    # name of the loop variable `s`; the printed text is unchanged.
    println("s: $s, b: $([pdf(b, si) for si in S]), a: $a, o: $o")
    global rsum += r
end
println("Undiscounted reward was $rsum.")

In [None]:
using DeepQLearning
using POMDPs
using Flux
using POMDPModels
using POMDPSimulators

In [None]:
# Load an MDP model from POMDPModels, or define your own.
# In a notebook/REPL the trailing `;` suppresses display of the returned value.
mdp = SimpleGridWorld();

In [None]:
# Define the Q network (see the Flux.jl documentation).
# The gridworld state is represented by a 2-dimensional vector, hence the input
# width of 2; the output layer has one unit per action.
# `length(actions(mdp))` is used instead of `n_actions(mdp)`, which newer
# POMDPs.jl releases deprecate and then remove.
model = Chain(Dense(2, 32), Dense(32, length(actions(mdp))))

In [None]:
# Configure the deep Q-learning solver around `model`; one option per line.
solver = DeepQLearningSolver(;
    qnetwork=model,
    max_steps=100000,
    learning_rate=0.005,
    log_freq=5000,
    recurrence=false,
    double_q=true,
    dueling=true,
    prioritized_replay=true,
)

In [None]:
# Run the DeepQLearning solver on `mdp`; the result of `solve` is stored in `policy`.
policy = solve(solver, mdp)

In [None]:
# Run a single rollout of `policy` on `mdp`, limited to 30 steps, and print
# the value returned by `simulate`.
sim = RolloutSimulator(max_steps=30)
r_tot = simulate(sim, mdp, policy)
println("Total discounted reward for 1 simulation: $r_tot")