The parameters given in the problem statement are as follows:

In [1]:
# Problem data.
T = 6      # horizon: the recursion runs backwards over stages t = T, T-1, ..., 1
α = 1      # multiplies s in the value of action A (QA)
β = 1      # multiplies p = ξ(m, n) in the value of action B (QB)
s = 0.6    # enters QA as α*s

# Weight p that the recursion places on the (m+1, n) successor state;
# (1 - p) goes to the (m, n+1) successor.
# NOTE(review): presumably the posterior mean success probability after
# m successes and n failures under a uniform prior — confirm against the
# problem statement.
function ξ(m, n)
    return (m + 1) / (m + n + 1)
end

# Integer labels for the two actions.
A, B = 0, 1

(0, 1)

The reachable set of states at time $t$ is given by $\{ (m,n) : m + n < t \}$. Since we are solving the system only for a small horizon, we store the value function at time $t$ as a $(t+1) \times (t+1)$ matrix whose indices run from $0$ to $t$, and only fill in the values corresponding to the reachable set.

In [2]:
# Initialize data structures
using OffsetArrays

# One table per stage t, indexed directly by the state (m, n) with
# m, n ∈ 0:t. Entries are Nullable so that states which are never
# filled in remain distinguishable from computed values.
V  = [OffsetArray{Nullable{Float64}}(0:t, 0:t) for t in 1:(T + 1)]  # value function
QA = [OffsetArray{Nullable{Float64}}(0:t, 0:t) for t in 1:T]        # value of action A
QB = [OffsetArray{Nullable{Float64}}(0:t, 0:t) for t in 1:T]        # value of action B
g  = [OffsetArray{Nullable{Int}}(0:t, 0:t) for t in 1:T]            # selected action

# chooseA[t] / chooseB[t] collect the states (m, n) at which the
# corresponding action is selected at stage t.
chooseA = [Any[] for _ in 1:T]
chooseB = [Any[] for _ in 1:T]

# Start with every entry null. V has one more stage (T + 1) than the
# other tables, so it is cleared in its own loop.
for t in 1:(T + 1), m in 0:t, n in 0:t
    V[t][m, n] = nothing
end

for t in 1:T, m in 0:t, n in 0:t
    QA[t][m, n] = nothing
    QB[t][m, n] = nothing
    g[t][m, n] = nothing
end

Initialize the value function at the terminal time $T+1$ to zero on the reachable states:

In [3]:
# Terminal condition: the value is zero at every state with m + n <= T,
# i.e. on the states reachable at stage T + 1.
terminal = V[T+1]
for m in 0:T
    for n in 0:(T - m)
        terminal[m, n] = 0.0
    end
end

Now we write the dynamic programming recursion

In [4]:
# Backward induction over stages t = T, T-1, ..., 1.
#
# For every state (m, n) with m + n <= t - 1 this fills in
#   QA[t][m, n] = α*s + V[t+1][m, n]
#   QB[t][m, n] = β*p + p*V[t+1][m+1, n] + (1-p)*V[t+1][m, n+1],  p = ξ(m, n)
#   V[t][m, n]  = max(QA, QB), with ties resolved in favour of action A
#   g[t][m, n]  = label (A or B) of the selected action
# and appends (m, n) to chooseA[t] or chooseB[t] accordingly.
for t in T:-1:1
    Vnext = V[t+1]
    # m = t would leave an empty n-range, so m only needs to run up to t - 1.
    for m in 0:t-1, n in 0:(t-1-m)
        p = ξ(m, n)
        qa = α*s + get(Vnext[m, n])
        qb = β*p + p*get(Vnext[m+1, n]) + (1 - p)*get(Vnext[m, n+1])
        QA[t][m, n] = qa
        QB[t][m, n] = qb

        # Record the action using the named labels A and B rather than
        # repeating their numeric values here.
        if qa >= qb
            (V[t][m, n], g[t][m, n]) = (qa, A)
            push!(chooseA[t], (m, n))
        else
            (V[t][m, n], g[t][m, n]) = (qb, B)
            push!(chooseB[t], (m, n))
        end
    end
end


In [5]:
# Report, stage by stage, the states at which action A was selected.
print("The indices where it is optimal to choose action A\n")
for t in 1:T
    states = chooseA[t]
    @printf("t = %d: %s \n", t, states)
end

The indices where it is optimal to choose action A
t = 1: Any[] 
t = 2: Any[] 
t = 3: Any[(0, 2)] 
t = 4: Any[(0, 1), (0, 2), (0, 3), (1, 2)] 
t = 5: Any[(0, 1), (0, 2), (0, 3), (0, 4), (1, 2), (1, 3)] 
t = 6: Any[(0, 1), (0, 2), (0, 3), (0, 4), (0, 5), (1, 2), (1, 3), (1, 4), (2, 2), (2, 3)] 


In [6]:
# Report, stage by stage, the states at which action B was selected.
print("The indices where it is optimal to choose action B\n")
for t in 1:T
    states = chooseB[t]
    @printf("t = %d: %s \n", t, states)
end

The indices where it is optimal to choose action B
t = 1: Any[(0, 0)] 
t = 2: Any[(0, 0), (0, 1), (1, 0)] 
t = 3: Any[(0, 0), (0, 1), (1, 0), (1, 1), (2, 0)] 
t = 4: Any[(0, 0), (1, 0), (1, 1), (2, 0), (2, 1), (3, 0)] 
t = 5: Any[(0, 0), (1, 0), (1, 1), (2, 0), (2, 1), (2, 2), (3, 0), (3, 1), (4, 0)] 
t = 6: Any[(0, 0), (1, 0), (1, 1), (2, 0), (2, 1), (3, 0), (3, 1), (3, 2), (4, 0), (4, 1), (5, 0)] 
