

> Import libraries to use



In [16]:
import numpy as np

>  # Introduction to numpy (Skip if you already are familiar)

>> Creating a 1D array

In [26]:
a = np.array([1,2,3,4])
print(a)

[1 2 3 4]


>> Creating a 2D array


In [27]:
a = np.array([[1,2],[3,4]])
print(a)

[[1 2]
 [3 4]]


>> Creating an array full of zeros


In [28]:
a = np.zeros(shape=(10))
print(a)
a = np.zeros(shape=(5,2))
print(a)

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[[0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]]


>> Infinity in numpy

In [29]:
print(np.inf)

inf


>> Max and Argmax

In [30]:
a = np.array([2,1,4,3])
print(np.max(a))
print(np.argmax(a))

4
2


>> From list to Numpy

In [31]:
l = [1,2,3,4]
print(l)
print(np.asarray(l))

[1, 2, 3, 4]
[1 2 3 4]


>> Random in numpy

In [32]:
# Array of Random integers ranging from 1 to 10 (with any size you want)
a = np.random.randint(low=1, high=10, size=(5,2))
print(a)

# Array of random elements of a list with any size you want
a = np.random.choice([0,1,2], size=(2,))

[[7 5]
 [6 7]
 [4 9]
 [7 5]
 [2 8]]


>> Shapes in numpy

In [33]:
a = np.random.randint(low=1, high=5, size=(4,2))
print(a.shape)
print(a)

# Reshape a to a vector of shape = (8,1)
a = a.reshape((8,1))
print(a.shape)
print(a)

(4, 2)
[[2 1]
 [1 3]
 [1 4]
 [3 4]]
(8, 1)
[[2]
 [1]
 [1]
 [3]
 [1]
 [4]
 [3]
 [4]]


# Pre-defined utilities

In [10]:

int_to_char = {
    0 : 'u',
    1 : 'r',
    2 : 'd',
    3 : 'l'
}

policy_one_step_look_ahead = {
    0 : [-1,0],
    1 : [0,1],
    2 : [1,0],
    3 : [0,-1]
}

def policy_int_to_char(pi,n):

    pi_char = ['']

    for i in range(n):
        for j in range(n):

            if i == 0 and j == 0 or i == n-1 and j == n-1:

                continue

            pi_char.append(int_to_char[pi[i,j]])

    pi_char.append('')

    return np.asarray(pi_char).reshape(n,n)

# 1- Policy evaluation

In [47]:
def policy_evaluation(n, v, pi, threshold, Gamma):
  #to avoid infinite loop
    max_iterations=1000
    for iteration in range(max_iterations):
        delta = 0
        for i in range(n):
            for j in range(n):
                v_old = v[i, j]
                action = pi[i, j]
                reward, new_state = -1, (i, j)  # Assuming each step has a negative reward
                if action == 0 and i > 0:
                    new_state = (i - 1, j)
                elif action == 1 and j < n - 1:
                    new_state = (i, j + 1)
                elif action == 2 and i < n - 1:
                    new_state = (i + 1, j)
                elif action == 3 and j > 0:
                    new_state = (i, j - 1)
                v[i, j] = reward + Gamma * v[new_state]
                delta = max(delta, abs(v_old - v[i, j]))
        if delta < threshold:
            break
    return v



# 2- Policy improvement

In [12]:
def policy_improvement(n, pi, v, Gamma):
    policy_stable = True
    for i in range(n):
        for j in range(n):
            old_action = pi[i, j]
            # Find the best action by comparing the value function
            best_action, best_value = None, float('-inf')
            for action in range(4):  # Explore actions
                reward, new_state = -1, (i, j)
                if action == 0 and i > 0:
                    new_state = (i - 1, j)
                elif action == 1 and j < n - 1:
                    new_state = (i, j + 1)
                elif action == 2 and i < n - 1:
                    new_state = (i + 1, j)
                elif action == 3 and j > 0:
                    new_state = (i, j - 1)
                value = reward + Gamma * v[new_state]
                if value > best_value:
                    best_value = value
                    best_action = action
            pi[i, j] = best_action
            if old_action != best_action:
                policy_stable = False
    return pi, policy_stable

# 3- Policy Initialization

In [45]:
def policy_initialization(n):
    # Initialize a policy randomly
    # Actions: 0 = up, 1 = right, 2 = down, 3 = left
    return np.random.choice(4, (n, n))

# 4- Policy Iteration algorithm

In [42]:
def policy_iteration(n, Gamma, threshhold):
    pi = policy_initialization(n)
    v = np.zeros((n, n))
    while True:
        v = policy_evaluation(n, v, pi, threshhold, Gamma)
        pi, pi_stable = policy_improvement(n, pi, v, Gamma)
        if pi_stable:
            break
    return pi, v

# Main Code to Test

In [48]:
n = 4

Gamma = [0.8,0.9,1]

threshhold = 1e-4

for _gamma in Gamma:

    pi , v = policy_iteration(n=n,Gamma=_gamma,threshhold=threshhold)

    pi_char = policy_int_to_char(n=n,pi=pi)

    print()
    print("Gamma = ",_gamma)

    print()

    print(pi_char)

    print()
    print()

    print(v)



Gamma =  0.8

[['' 'u' 'u' 'u']
 ['u' 'u' 'u' 'u']
 ['l' 'l' 'u' 'u']
 ['u' 'u' 'u' '']]


[[-4.99995433 -4.99995433 -4.99995433 -4.99995433]
 [-4.99996346 -4.99996346 -4.99996346 -4.99996346]
 [-4.99995433 -4.99996346 -4.99997077 -4.99997077]
 [-4.99996346 -4.99997077 -4.99997662 -4.99997662]]

Gamma =  0.9

[['' 'u' 'u' 'u']
 ['u' 'u' 'u' 'u']
 ['u' 'u' 'u' 'u']
 ['u' 'u' 'u' '']]


[[-9.99938296 -9.99938296 -9.99938296 -9.99938296]
 [-9.99944467 -9.99944467 -9.99944467 -9.99944467]
 [-9.9995002  -9.9995002  -9.9995002  -9.9995002 ]
 [-9.99955018 -9.99955018 -9.99955018 -9.99955018]]

Gamma =  1

[['' 'u' 'u' 'u']
 ['u' 'u' 'u' 'u']
 ['r' 'r' 'r' 'r']
 ['u' 'u' 'u' '']]


[[-6000. -6000. -6000. -6000.]
 [-6001. -6001. -6001. -6001.]
 [-6000. -6000. -6000. -6000.]
 [-6001. -6001. -6001. -6001.]]
