We start with the code for the MDP class from Assignment 5.

In [1]:
"""
    MDP(c, P)

Finite Markov decision process described by a per-stage cost matrix `c` and one
transition matrix per action.

# Arguments
- `c`: `n x m` matrix; `c[x, u]` is the cost of taking action `u` in state `x`.
- `P`: collection of `m` matrices of size `n x n`; `P[u][x, y]` is the probability of
  moving from state `x` to state `y` under action `u`. Each matrix must either be row
  stochastic or identically zero (a stopping action).

# Fields
- `stateSize`, `actionSize`: `n` and `m`.
- `bellmanUpdate!`: function `(v, g, vOld; discount=1)` that overwrites `v` with the
  one-step cost-minimising values computed from `vOld` and `g` with the minimising
  action indices (ties go to the lowest action index).
- `c`, `P`: the cost matrix and transition matrices (stored as dense `Float64` arrays).

# Throws
- `ErrorException` if the number of transition matrices differs from the number of
  columns of `c`, if the sizes are inconsistent, or if a transition matrix is neither
  row stochastic nor all zero.
"""
mutable struct MDP
    stateSize  :: Int
    actionSize :: Int
    bellmanUpdate! :: Function
    c :: Array{Float64, 2}
    P :: Array{ Array{Float64, 2}, 1}

    function MDP(c, P)
        (n, m) = size(c)

        if length(P) != m
            error("Number of transition matrices does not match the number of actions")
        end

        # Stack the per-action matrices so that one matrix-vector product yields the
        # expected next-stage value for every (state, action) pair.
        P_concatenated = vcat(P...)
        if size(P_concatenated) != (n*m, n)
            error("Size of transition and reward matrices are inconsistent")
        end

        is_square(Pi) = size(Pi) == (n, n)
        # Row sums computed as a matrix-vector product, compared against all ones.
        is_row_stochastic(Pi) =
            isapprox(Pi * ones(n), ones(n); atol=100*eps(Float64))
        # A stopping action has no outgoing transitions: the matrix is identically zero.
        is_stopping_action(Pi) = Pi == zero(Pi)

        for Pi in P
            if !is_square(Pi)
                error("Transition matrix is not a square matrix: " *
                      "size $(size(Pi)), expected ($n, $n)")
            elseif !(is_row_stochastic(Pi) || is_stopping_action(Pi))
                error("Transition matrix is not row stochastic")
            end
        end

        # One Bellman backup: Q[x, u] = c[x, u] + discount * sum_y P[u][x, y] * vOld[y],
        # followed by a minimisation over u for every state x.
        function update!(v, g, vOld; discount=1)
            Q = c + discount * reshape(P_concatenated * vOld, n, m)

            for x = 1:n
                g[x], v[x] = 1, Q[x, 1]
                for u = 2:m
                    # Strict inequality keeps the lowest-index action on ties.
                    if Q[x, u] < v[x]
                        g[x], v[x] = u, Q[x, u]
                    end
                end
            end
        end

        new(n, m, update!, c, P)
    end
end

Next, we define a function that evaluates a given policy by solving the linear system (I - discount * P_g) v = c_g for its value vector.

In [2]:
"""
    evaluate(m::MDP, g; discount=1.0)

Return the value vector of the stationary policy `g` for the model `m`.

`g[x]` is the action index used in state `x`. The per-state cost vector and the
transition matrix induced by `g` are assembled row by row from `m.c` and `m.P`, and the
linear system `(I - discount * P_g) v = c_g` is solved for `v`.
"""
function evaluate(m::MDP, g;
                  discount=1.0)
    nstates = size(m.c, 1)
    policy_cost = zeros(nstates)
    policy_P = zeros(nstates, nstates)
    identity_matrix = zeros(nstates, nstates)

    for s = 1:nstates
        action = g[s]
        policy_cost[s] = m.c[s, action]
        # Row s of the induced chain is row s of the chosen action's matrix.
        policy_P[s, :] = m.P[action][s, :]
        identity_matrix[s, s] = 1.0
    end

    return (identity_matrix - discount * policy_P) \ policy_cost
end

evaluate (generic function with 1 method)

## Value Iteration

## Policy Iteration

In [35]:
"""
    policyIteration(model::MDP, initial_g; discount=0.9)

Run policy iteration on `model`, starting from the policy `initial_g`.

Each pass applies `model.bellmanUpdate!` to the value of the current policy to obtain an
improved policy, then re-evaluates that policy with `evaluate`. The loop stops once the
policy is unchanged by a pass. The previous and new policies are printed side by side on
every pass, and the iteration count (which starts at 1, before the first pass) is logged
at the end.

Returns the tuple `(v, g)` of the final value vector and policy. `initial_g` itself is
not modified.
"""
function policyIteration(model::MDP, initial_g; discount=0.9)
    policy = copy(initial_g)
    value = evaluate(model, policy; discount=discount)

    # All-zero "previous" iterates, so the first comparison fails for any policy made of
    # valid (>= 1) action indices.
    last_policy = zero(initial_g)
    last_value = zero(value)

    passes = 1

    while policy != last_policy
        passes += 1
        last_policy .= policy
        last_value .= value

        # Improvement step: overwrites `policy` (and `value`, which is then replaced by
        # the exact evaluation below).
        model.bellmanUpdate!(value, policy, last_value; discount=discount)
        value = evaluate(model, policy; discount=discount)
        print([last_policy policy])
    end

    info(@sprintf("Policy Iteration converged in %d iterations", passes))
    return (value, policy)

end

policyIteration (generic function with 1 method)

## System Described in the Question

In [4]:
# Problem data. `rate[u]` and `serviceCost[u]` are indexed by action u.
const rate        = [0.2 0.5 0.8]
const arrivalRate = 0.6

const serviceCost = [0 2 6]
const holdingCost = 2

const R = 1
const M = 8
const A = length(rate)

# One (M+1) x (M+1) transition matrix per action, and an (M+1) x A cost matrix.
# NOTE(review): presumably state x stands for x-1 items in the system -- confirm
# against the problem statement.
P = [ spzeros(Float64, M+1, M+1) for u = 1:A]
c = zeros(Float64, M+1, A)

# Cost matrix: the first state pays only the service cost; every other state also pays
# the holding cost for (x-1) and is credited R * rate[u].
for u = 1:A
    c[1, u] = serviceCost[u]
    for x = 2:(M+1)
        c[x, u] = (x-1) * holdingCost + serviceCost[u] - R*rate[u]
    end
end

# Transition matrices.
for u = 1:A
    # First state: stay put or move up by one.
    P[u][1, 1] = (1 - arrivalRate)
    P[u][1, 2] = arrivalRate

    # Interior states: move down, stay, or move up by one.
    for x = 2:M
        P[u][x, x-1] = (1 - arrivalRate) * rate[u]
        P[u][x, x]   = (1 - arrivalRate) * (1 - rate[u]) + arrivalRate * rate[u]
        P[u][x, x+1] = arrivalRate * ( 1 - rate[u])
    end

    # Last state: stay put or move down by one.
    P[u][M+1, M  ] = rate[u]
    P[u][M+1, M+1] = (1 - rate[u])
end

model = MDP(c, P)

MDP(9, 3, update!, [0.0 2.0 6.0; 1.8 3.5 7.2; … ; 13.8 15.5 19.2; 15.8 17.5 21.2], Array{Float64,2}[[0.4 0.6 … 0.0 0.0; 0.08 0.44 … 0.0 0.0; … ; 0.0 0.0 … 0.44 0.48; 0.0 0.0 … 0.2 0.8], [0.4 0.6 … 0.0 0.0; 0.2 0.5 … 0.0 0.0; … ; 0.0 0.0 … 0.5 0.3; 0.0 0.0 … 0.5 0.5], [0.4 0.6 … 0.0 0.0; 0.32 0.56 … 0.0 0.0; … ; 0.0 0.0 … 0.56 0.12; 0.0 0.0 … 0.8 0.2]])

In [5]:
# Starting points: an all-zero value vector and the policy choosing action 1 everywhere.
v0 = zeros(M + 1)
g0 = ones(Int, M + 1)

9-element Array{Int64,1}:
 1
 1
 1
 1
 1
 1
 1
 1
 1

In [36]:
# Run policy iteration from the all-ones policy with discount factor 0.9.
vP, gP = policyIteration(model, g0; discount=0.9)

[1 1; 1 3; 1 3; 1 3; 1 2; 1 2; 1 2; 1 1; 1 1][1 1; 3 2; 3 3; 3 3; 2 3; 2 3; 2 2; 1 2; 1 1][1 1; 2 2; 3 3; 3 3; 3 3; 3 3; 2 2; 2 2; 1 1]

[1m[36mINFO: [39m[22m[36mPolicy Iteration converged in 4 iterations
[39m

([47.2656, 56.0185, 69.6383, 85.2527, 102.125, 119.457, 135.541, 146.463, 150.584], [1, 2, 3, 3, 3, 3, 2, 2, 1])

In [37]:
# Re-evaluate the returned policy independently as a consistency check.
vN = evaluate(model, gP; discount=0.9)

9-element Array{Float64,1}:
  47.2656
  56.0185
  69.6383
  85.2527
 102.125 
 119.457 
 135.541 
 146.463 
 150.584 

In [38]:
# Values from policy iteration (left) next to the independent evaluation (right).
hcat(vP, vN)

9×2 Array{Float64,2}:
  47.2656   47.2656
  56.0185   56.0185
  69.6383   69.6383
  85.2527   85.2527
 102.125   102.125 
 119.457   119.457 
 135.541   135.541 
 146.463   146.463 
 150.584   150.584 

LoadError: [91mUndefVarError: gN not defined[39m