In [1]:
#Implementation of a Q-learning algorithm in a 4x4 grid world

In [2]:
#This is the 4x4 grid world with S as the start and T as the exit
#Each move is rewarded by the cell it lands on: -1 for every cell except T, which gives +10
#S . . T
#. . . .
#. . . .
#. . . .

In [14]:
import random
import numpy as np

In [103]:
# --- Environment -----------------------------------------------------------
GRID_SIZE = 4
actions = ["up", "down", "left", "right"]
terminal_state = (0, 3)

# Reward received for entering each cell: -1 everywhere, overridden at the goal.
reward_grid = np.full((GRID_SIZE, GRID_SIZE), -1.0)
reward_grid[terminal_state] = 10  # Goal position has a reward of +10

# Action-value table indexed as Q[row, col, action_index], initialised to zero.
Q = np.zeros(reward_grid.shape + (len(actions),))

# --- Q-learning hyperparameters ---------------------------------------------
epsilon = 0.2      # probability of taking a random (exploratory) action
alpha = 0.1        # step size applied to each temporal-difference update
gamma = 0.9        # discount applied to the best next-state value
episodes = 100000  # number of start-to-goal runs used for training

In [104]:
def next_state(current_state, action, grid_size=None):
    """Return the cell reached by taking ``action`` from ``current_state``.

    Moves that would leave the grid are clipped to the border, so walking
    into a wall leaves the agent in the same cell.

    Args:
        current_state: ``(row, col)`` tuple of the current cell.
        action: one of ``"up"``, ``"down"``, ``"left"``, ``"right"``.
        grid_size: side length of the square grid. When omitted, the
            module-level ``GRID_SIZE`` is used.

    Returns:
        ``(row, col)`` tuple of the resulting cell.

    Raises:
        ValueError: if ``action`` is not one of the four known moves.
    """
    if grid_size is None:
        grid_size = GRID_SIZE
    last = grid_size - 1  # largest valid row/column index
    row, col = current_state

    if action == "up":
        return (max(0, row - 1), col)      # row decreases
    if action == "down":
        return (min(last, row + 1), col)   # row increases
    if action == "left":
        return (row, max(0, col - 1))      # column decreases
    if action == "right":
        return (row, min(last, col + 1))   # column increases

    # Fail loudly instead of hitting an unbound local on an unexpected action.
    raise ValueError("unknown action: %r" % (action,))

In [105]:
# Tabular Q-learning with an epsilon-greedy behaviour policy.
# A single seeded generator drives all randomness so that
# "Restart Kernel -> Run All" reproduces the same Q-table.
SEED = 0
rng = np.random.default_rng(SEED)

# Start from a blank table (in place) so re-running this cell retrains from
# scratch instead of continuing from whatever a previous run left in Q.
Q.fill(0.0)

for episode in range(episodes):

    # Every episode starts in the top-left corner and runs until the goal.
    current_state = (0, 0)

    while current_state != terminal_state:
        x, y = current_state

        # Explore with probability epsilon, otherwise act greedily w.r.t. Q.
        if rng.random() < epsilon:
            action_index = int(rng.integers(len(actions)))
        else:
            action_index = int(np.argmax(Q[x, y]))
        action = actions[action_index]

        next_s = next_state(current_state, action)
        x_next, y_next = next_s

        # The reward is determined by the cell the agent lands on.
        reward = reward_grid[x_next, y_next]

        # Move Q(s, a) a fraction alpha toward the bootstrapped target
        # reward + gamma * max_a' Q(s', a').
        td_target = reward + gamma * np.max(Q[x_next, y_next])
        Q[x, y, action_index] += alpha * (td_target - Q[x, y, action_index])

        current_state = next_s


In [106]:
# Greedy policy read off the Q-table: each cell shows the upper-cased first
# letter of its highest-valued action, and the terminal cell is marked "T".
action_letters = [name[0].upper() for name in actions]
policy = np.full((GRID_SIZE, GRID_SIZE), "", dtype="<U1")
for i, j in np.ndindex(GRID_SIZE, GRID_SIZE):
    state = (i, j)
    policy[i, j] = "T" if state == terminal_state else action_letters[np.argmax(Q[i, j])]

In [107]:
# Show the policy grid: U/D/L/R are the first letters of the actions, T is the terminal cell.
policy

array([['R', 'R', 'R', 'T'],
       ['U', 'U', 'U', 'U'],
       ['U', 'U', 'U', 'U'],
       ['U', 'U', 'U', 'U']], dtype='<U1')