In [1]:
import numpy as np

moves = np.array(range(1, 7))
actions = ['N', 'E', 'S', 'W']
discount = 0.9
reward = -0.02
V = np.zeros(len(moves))

Policy_rand = {
    1: {'N': 1, 'E': 0, 'S': 5, 'W': 1},
    2: {'N': 1, 'E': 3, 'S': 2, 'W': 2},
    3: {'N': 2, 'E': 4, 'S': 3, 'W': 3},
    4: {'N': 3, 'E': 4, 'S': 6, 'W': 4},
    5: {'N': 1, 'E': 5, 'S': 5, 'W': 5},
    6: {'N': 4, 'E': 6, 'S': 6, 'W': 6}
}
def new_policy(move):
    tra = Policy_rand[move]
    move_values = [V[tra[act] - 1] for act in actions if tra[act] != move]
    return reward + discount * np.mean(move_values)
    
def new_optimal(move):
    if move == 6:
        return 0 
    tra = Policy_rand[move]
    if 6 in tra.values():
        return reward + discount * V[5]
    else:
        return reward + discount * max([V[tra[act] - 1] for act in actions if tra[act] != move])

for _ in range(1000):
    V_new = np.array([new_policy(s) for s in moves])
    if np.max(np.abs(V - V_new)) < 1e-4:
        break
    V = V_new

print("Values displayed by 'Random Policy':", V)

V = np.zeros(len(moves))

for _ in range(1000):
    V_new = np.array([new_optimal(s) for s in moves])
    if np.max(np.abs(V - V_new)) < 1e-4:
        break
    V = V_new

print("Values dispayed by 'Optimal Policy':", V)

Values displayed by 'Random Policy': [-0.19907232 -0.19907232 -0.19907232 -0.19907232 -0.19907232 -0.19907232]
Values dispayed by 'Optimal Policy': [-0.02  -0.038 -0.038 -0.02  -0.038  0.   ]


In [2]:
import numpy as np

moves = 6
discount = 0.9
reward = -0.02
V = np.zeros(moves)
iter = 100

Policy_rand = np.array([
    [1/3, 1/3, 0, 0, 1/3, 0], 
    [1/2, 0, 1/2, 0, 0, 0],
    [0, 1/2, 0, 1/2, 0, 0], 
    [0, 0, 1/3, 1/3, 0, 1/3], 
    [1/3, 0, 0, 0, 1/3, 1/3], 
    [0, 0, 0, 1, 0, 0]         
])

for _ in range(iter):
    V_p = np.copy(V)
    for s in range(moves):
        V[s] = reward + discount * np.sum(Policy_rand[s] * V_p)
    if np.max(np.abs(V - V_p)) < 0.01:
        break

print("Final moves under 'Random Policy':", V)


Final moves under 'Random Policy': [-0.11390656 -0.11390656 -0.11390656 -0.11390656 -0.11390656 -0.11390656]


In [3]:
import numpy as np

moves = 6
discount = 0.9
rewards = -0.02
V = np.zeros(moves)
iter = 100

Policy_rand = np.array([
    [1/3, 1/3, 0, 0, 1/3, 0],
    [1/2, 0, 1/2, 0, 0, 0], 
    [0, 1/2, 0, 1/2, 0, 0],
    [0, 0, 1/2, 0, 0, 1/2],
    [1/3, 0, 0, 0, 1/3, 1/3],
    [0, 0, 0, 0, 0, 1]        
])

Policy_opt = np.array([
    [0, 1, 0, 0, 0, 0],
    [0, 0, 1, 0, 0, 0],
    [0, 0, 0, 1, 0, 0], 
    [0, 0, 0, 0, 0, 1], 
    [0, 0, 0, 0, 0, 1],
    [0, 0, 0, 0, 0, 1] 
])

for policy_matrix, policy_name in [(Policy_rand, "Random"), (Policy_opt, "Optimal")]:
    V = np.zeros(moves)
    for _ in range(iter):
        V_new = rewards + discount * np.dot(policy_matrix, V)
        if np.max(np.abs(V - V_new)) < 0.0001:
            print(f"{policy_name}")
            break
        V = V_new
    print(f"Final values under the {policy_name} policy:", V)

Random
Final values under the Random policy: [-0.19907232 -0.19907232 -0.19907232 -0.19907232 -0.19907232 -0.19907232]
Optimal
Final values under the Optimal policy: [-0.19907232 -0.19907232 -0.19907232 -0.19907232 -0.19907232 -0.19907232]


In [4]:
# Question 1

In [5]:
from sympy import symbols, Eq, solve
V1, V2, V3, V4, V5 = symbols('V1 V2 V3 V4 V5')

moves1 = Eq(V1, -0.02 + 0.9*(0.7*V2 + 0.1*V3 + 0.1*V4 + 0.1*V5))
moves2 = Eq(V2, -0.02 + 0.9*(0.7 + 0.1*V1 + 0.2*V2))
moves3 = Eq(V3, -0.02 + 0.9*(0.1*V1 + 0.2*V2 + 0.7*V3))
moves4 = Eq(V4, -0.02 + 0.9*(0.1*V1 + 0.1*V4 + 0.1*V4 + 0.7*(-1)))
moves5 = Eq(V5, -0.02 + 0.9*(0.7*V1 + 0.3*V5))

answer = solve((moves1, moves2, moves3, moves4, moves5), (V1, V2, V3, V4, V5))
print(answer)

{V1: 0.492963376738754, V2: 0.798008175495717, V3: 0.454076149988424, V4: -0.738577190357942, V5: 0.398036886774541}


In [6]:
from sympy import symbols, Eq, solve
V1, V2, V3, V4, V5 = symbols('V1 V2 V3 V4 V5')

moves1 = Eq(V1, -0.02 + 0.9*(0.7*V2 + 0.1*V5 + 0.1*V4 + 0.1*V3))
moves2 = Eq(V2, -0.02 + 0.9*(0.7 + 0.2*V2 + 0.1*V1))
moves3 = Eq(V3, -0.02 + 0.9*(0.7*V1 + 0.3*V3))
moves4 = Eq(V4, -0.02 + 0.9*(0.7*V1 + 0.1*V4 + 0.1*V4 + 0.1*(-1)))
moves5 = Eq(V5, -0.02 + 0.9*(0.7*V1 + 0.3*V5))

answer = solve((moves1, moves2, moves3, moves4, moves5), (V1, V2, V3, V4, V5))
print(answer)

{V1: 0.611091928198094, V2: 0.810973504314425, V3: 0.499983444883286, V4: 0.335351115566828, V5: 0.499983444883286}


In [7]:
# Question 2

In [2]:
def question_two(values, policy):
    zero_values = [0.0] * len(values)
    for i in range(len(values)):
        if i == 0:
            zero_values[i] = -0.02 + 0.9 * (
                0.7 * values[1] + 0.1 * values[4] + 0.1 * values[3] + 0.1 * values[2]
            )
        elif i == 1:
            zero_values[i] = -0.02 + 0.9 * (
                0.7 + 0.2 * values[1] + 0.1 * values[0]
            )
        elif i == 2:
            zero_values[i] = -0.02 + 0.9 * (
                0.7 * values[0] + 0.3 * values[2]
            )
        elif i == 3:
            zero_values[i] = -0.02 + 0.9 * (
                0.7 * values[0] + 0.1 * values[3] + 0.1 * values[3] + 0.1 * (-1)
            )
        elif i == 4:
            zero_values[i] = -0.02 + 0.9 * (
                0.7 * values[0] + 0.3 * values[4]
            )
    return zero_values

def main():
    num_actions = 6
    initial_values = [0.0] * num_actions
    random_policy = [0.25, 0.55, 0.62, 0.12, 0.0]

    iterations = 1000
    for i in range(iterations):
        initial_values = question_two(initial_values, random_policy)

    print("Values for Random Policy:", initial_values)

if __name__ == "__main__":
    main()

Values for Random Policy: [0.6110919281980935, 0.8109735043144248, 0.49998344488328617, 0.33535111556682784, 0.49998344488328617, 0.0]


In [9]:
# When it comes to the optimal policy, the value converges since the Bellman equation makes sure that every iteration improves in a sense by value due to optimal policy.

In [None]:
# The random policy vs. the Optimal policy is presented in two values above, they show a direct correlation that while being conditoned you can ultimately they similar values for both policies.