In [1]:
import numpy as np
import gym
import random

In [22]:
# Solving policy evaluation using dynamic programming

states = [0,1,2,3,4,5,6,7,8]
actions = [0,1,2,3,4,5]

P = np.zeros((9, 6, 9))  
R = np.zeros((9, 6, 9))  

P[0,4,0] = 1.0
P[1,4,1] = 1.0
P[3,4,3] = 1.0
P[4,4,4] = 1.0
P[5,4,5] = 1.0
P[6,4,6] = 1.0
P[7,4,7] = 1.0
P[8,4,8] = 1.0
P[2,5,2] = 1.0


R[0,4,0] = 5
R[0,5,0] = -10
R[1,4,1] = 5
R[1,5,1] = -10
R[8,4,8] = 5
R[8,5,8] = -10
R[2,5,2] = 50
R[3,4,3] = 5
R[3,5,3] = -10
R[4,4,4] = 5
R[4,5,4] = -10
R[5,4,5] = 5
R[5,5,5] = -10
R[6,4,6] = 5
R[6,5,6] = -10
R[7,4,7] = 5
R[7,5,7] = -10


gamma = 0.9

policy = [0 for s in range(9)]
V = np.zeros(9)

print("Initial policy", policy)



iterations = 0
delta = 0
theta = 0.000001
while delta < theta:
    iterations += 1
    for s in range(9):
        V[s] = sum([P[s,policy[s],s1] * (R[s,policy[s],s1] + gamma*V[s1]) for s1 in range(9)])

    for s in range(9):
        q_best = V[s]
        for a in range(6):
            q_sa = sum([P[s, a, s1] * (R[s, a, s1] + gamma * V[s1]) for s1 in range(9)])
            if q_sa > q_best:
                print("State", s, ": q_selected_action", q_sa, "q_best_action", q_best)
                policy[s] = a
                q_best = q_sa
                delta = max(delta, np.abs(v - V[s]))
    print("Iterations:", iterations)

print("Optimal policy")
print(policy)
print(V)

Initial policy [0, 0, 0, 0, 0, 0, 0, 0, 0]
State 0 : q_selected_action 5.0 q_best_action 0.0
State 1 : q_selected_action 5.0 q_best_action 0.0
State 2 : q_selected_action 50.0 q_best_action 0.0
State 3 : q_selected_action 5.0 q_best_action 0.0
State 4 : q_selected_action 5.0 q_best_action 0.0
State 5 : q_selected_action 5.0 q_best_action 0.0
State 6 : q_selected_action 5.0 q_best_action 0.0
State 7 : q_selected_action 5.0 q_best_action 0.0
State 8 : q_selected_action 5.0 q_best_action 0.0
Iterations: 1
State 0 : q_selected_action 9.5 q_best_action 5.0
State 1 : q_selected_action 9.5 q_best_action 5.0
State 2 : q_selected_action 95.0 q_best_action 50.0
State 3 : q_selected_action 9.5 q_best_action 5.0
State 4 : q_selected_action 9.5 q_best_action 5.0
State 5 : q_selected_action 9.5 q_best_action 5.0
State 6 : q_selected_action 9.5 q_best_action 5.0
State 7 : q_selected_action 9.5 q_best_action 5.0
State 8 : q_selected_action 9.5 q_best_action 5.0
Iterations: 2
Optimal policy
[4, 4, 5, 4

In [23]:
# Solving policy iteration using MDP

states = [0,1,2,3,4,5,6,7,8]
actions = [0,1,2,3,4,5]

P = np.zeros((9, 6, 9))  
R = np.zeros((9, 6, 9))  

P[0,4,0] = 1.0
P[1,4,1] = 1.0
P[3,4,3] = 1.0
P[4,4,4] = 1.0
P[5,4,5] = 1.0
P[6,4,6] = 1.0
P[7,4,7] = 1.0
P[8,4,8] = 1.0
P[2,5,2] = 1.0


R[0,4,0] = 5
R[0,5,0] = -10
R[1,4,1] = 5
R[1,5,1] = -10
R[8,4,8] = 5
R[8,5,8] = -10
R[2,5,2] = 50
R[3,4,3] = 5
R[3,5,3] = -10
R[4,4,4] = 5
R[4,5,4] = -10
R[5,4,5] = 5
R[5,5,5] = -10
R[6,4,6] = 5
R[6,5,6] = -10
R[7,4,7] = 5
R[7,5,7] = -10


gamma = 0.9

policy = [0 for s in range(9)]
V = np.zeros(9)

print("Initial policy", policy)


is_value_changed = True
iterations = 0
while is_value_changed:
    is_value_changed = False
    iterations += 1
    for s in range(9):
        V[s] = sum([P[s,policy[s],s1] * (R[s,policy[s],s1] + gamma*V[s1]) for s1 in range(9)])

    for s in range(9):
        q_best = V[s]
        for a in range(6):
            q_sa = sum([P[s, a, s1] * (R[s, a, s1] + gamma * V[s1]) for s1 in range(9)])
            if q_sa > q_best:
                print("State", s, ": q_selected_action", q_sa, "q_best_action", q_best)
                policy[s] = a
                q_best = q_sa
                is_value_changed = True

    print("Iterations:", iterations)

print("Optimal policy")
print(policy)
print(V)

Initial policy [0, 0, 0, 0, 0, 0, 0, 0, 0]
State 0 : q_selected_action 5.0 q_best_action 0.0
State 1 : q_selected_action 5.0 q_best_action 0.0
State 2 : q_selected_action 50.0 q_best_action 0.0
State 3 : q_selected_action 5.0 q_best_action 0.0
State 4 : q_selected_action 5.0 q_best_action 0.0
State 5 : q_selected_action 5.0 q_best_action 0.0
State 6 : q_selected_action 5.0 q_best_action 0.0
State 7 : q_selected_action 5.0 q_best_action 0.0
State 8 : q_selected_action 5.0 q_best_action 0.0
Iterations: 1
State 0 : q_selected_action 9.5 q_best_action 5.0
State 1 : q_selected_action 9.5 q_best_action 5.0
State 2 : q_selected_action 95.0 q_best_action 50.0
State 3 : q_selected_action 9.5 q_best_action 5.0
State 4 : q_selected_action 9.5 q_best_action 5.0
State 5 : q_selected_action 9.5 q_best_action 5.0
State 6 : q_selected_action 9.5 q_best_action 5.0
State 7 : q_selected_action 9.5 q_best_action 5.0
State 8 : q_selected_action 9.5 q_best_action 5.0
Iterations: 2
State 0 : q_selected_actio