**Developing a simple car racing project with Reinforcement learning (QL)**

**Written By:** Zahra Mousavi Ziabari

✨ **Step 1:** Importing libraries

In [None]:
import numpy as np
import random
import time
import os

✨ **Step 2:** Preparing the game environment

In [None]:
# Board dimensions (rows x columns).
GRID_HEIGHT, GRID_WIDTH = 6, 6

# The car starts in the bottom-left corner and must reach the top-right one.
START_POSITION = (GRID_HEIGHT - 1, 0)
GOAL_POSITION = (0, GRID_WIDTH - 1)

# Action codes in Q-table column order: Up, Down, Left, Right.
ACTIONS = list('UDLR')
# (row, column) offset applied by each action; row 0 is the top row of the grid.
ACTION_TO_DELTA = dict(zip(ACTIONS, [(-1, 0), (1, 0), (0, -1), (0, 1)]))
# Arrow glyph for each action, index-aligned with ACTIONS.
SYMBOLS = ['‚Üë', '‚Üì', '‚Üê', '‚Üí']

💬 Printing the grid environment

🚗 = Agent (car)

🏁 = Goal (finish line)

⬜ = Road

In [None]:
def print_grid(agent_pos):
    """Clear the console and draw the grid with the car at `agent_pos`.

    One line is printed per grid row, followed by a blank line. The car glyph
    takes precedence over the goal glyph when both share a cell.
    """
    clear_command = 'cls' if os.name == 'nt' else 'clear'
    os.system(clear_command)  # Clear console

    def glyph(cell):
        # Car is checked first so it is still drawn when it sits on the goal.
        if cell == agent_pos:
            return 'üöó '  # Car
        if cell == GOAL_POSITION:
            return 'üèÅ '  # Goal
        return '‚¨ú '

    for r in range(GRID_HEIGHT):
        print(''.join(glyph((r, c)) for c in range(GRID_WIDTH)))
    print()

# Put the car on the start cell and draw the initial board.
state = START_POSITION
print_grid(state)

‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú üèÅ 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 
üöó ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 



✨ **Step 3:** Initializing and printing the Q-table

In [None]:
# Initialize Q-table: one value per (row, column, action), all starting at zero
Q = np.zeros((GRID_HEIGHT, GRID_WIDTH, len(ACTIONS)))

def print_q_values():
    """Print, for every cell, the greedy action's arrow and its Q-value.

    Reads the module-level `Q` table. Values are rounded to one decimal and
    each grid row is printed on its own line, cells separated by " | ".
    """
    print("\nüìä Q-Values (Rounded, Best Action Per Cell):")
    for r in range(GRID_HEIGHT):
        cells = []
        for c in range(GRID_WIDTH):
            cell_q = Q[r, c]
            greedy = np.argmax(cell_q)  # first index wins when values tie
            cells.append(f"[{r},{c}] {SYMBOLS[greedy]} {np.round(cell_q[greedy], 1)} | ")
        print("".join(cells))

# Show the freshly initialized (all-zero) table.
print_q_values()


üìä Q-Values (Rounded, Best Action Per Cell):
[0,0] ‚Üë 0.0 | [0,1] ‚Üë 0.0 | [0,2] ‚Üë 0.0 | [0,3] ‚Üë 0.0 | [0,4] ‚Üë 0.0 | [0,5] ‚Üë 0.0 | 
[1,0] ‚Üë 0.0 | [1,1] ‚Üë 0.0 | [1,2] ‚Üë 0.0 | [1,3] ‚Üë 0.0 | [1,4] ‚Üë 0.0 | [1,5] ‚Üë 0.0 | 
[2,0] ‚Üë 0.0 | [2,1] ‚Üë 0.0 | [2,2] ‚Üë 0.0 | [2,3] ‚Üë 0.0 | [2,4] ‚Üë 0.0 | [2,5] ‚Üë 0.0 | 
[3,0] ‚Üë 0.0 | [3,1] ‚Üë 0.0 | [3,2] ‚Üë 0.0 | [3,3] ‚Üë 0.0 | [3,4] ‚Üë 0.0 | [3,5] ‚Üë 0.0 | 
[4,0] ‚Üë 0.0 | [4,1] ‚Üë 0.0 | [4,2] ‚Üë 0.0 | [4,3] ‚Üë 0.0 | [4,4] ‚Üë 0.0 | [4,5] ‚Üë 0.0 | 
[5,0] ‚Üë 0.0 | [5,1] ‚Üë 0.0 | [5,2] ‚Üë 0.0 | [5,3] ‚Üë 0.0 | [5,4] ‚Üë 0.0 | [5,5] ‚Üë 0.0 | 


In [None]:
def print_full_q_table():
    """Print all four action values of every cell, one cell per line.

    Reads the module-level `Q` table; each value is rounded to one decimal and
    right-aligned in a 5-character column after its arrow.
    """
    print("\nüìò Compact Q-Table (‚Üë ‚Üì ‚Üê ‚Üí per cell):\n")
    n_actions = len(ACTIONS)
    for r in range(GRID_HEIGHT):
        for c in range(GRID_WIDTH):
            q = [round(Q[r, c, k], 1) for k in range(n_actions)]
            print(f"({r},{c}): ‚Üë{q[0]:>5} ‚Üì{q[1]:>5} ‚Üê{q[2]:>5} ‚Üí{q[3]:>5}")

In [None]:
def print_policy():
    """Print the greedy action arrow of every cell, one grid row per line.

    Reads the module-level `Q` table; the goal cell shows the finish flag
    instead of an arrow.
    """
    print("\nüìò Learned Policy (Best Action at Each Cell):")
    for r in range(GRID_HEIGHT):
        cells = [
            "üèÅ " if (r, c) == GOAL_POSITION else SYMBOLS[np.argmax(Q[r, c])] + " "
            for c in range(GRID_WIDTH)
        ]
        print("".join(cells))

✨ **Step 4:** Defining Q-learning parameters and functions

In [None]:
# Q-learning parameters
alpha = 0.1      # learning rate: scales each temporal-difference correction
gamma = 0.9      # discount factor applied to the best next-state value
epsilon = 0.1    # probability of taking a random action instead of the greedy one
episodes = 2000  # number of training episodes

# Q-learning functions
def get_reward(pos):
    """Return 100 when `pos` is the goal cell, otherwise the per-step cost of -1."""
    if pos == GOAL_POSITION:
        return 100
    return -1

def valid_pos(pos):
    """Tell whether `pos` (row, column) lies inside the grid."""
    if not 0 <= pos[0] < GRID_HEIGHT:
        return False
    return 0 <= pos[1] < GRID_WIDTH

def choose_action(state):
    """Pick an action index for `state` with an epsilon-greedy rule.

    With probability `epsilon` a uniformly random action index is returned;
    otherwise the index of the largest Q-value stored for the cell (the first
    such index when several tie).
    """
    explore = random.uniform(0, 1) < epsilon
    if explore:
        return random.choice(range(len(ACTIONS)))  # Explore
    row, col = state[0], state[1]
    return np.argmax(Q[row, col])  # Exploit

✨ **Step 5:** Training with Q-learning

💬 Run for only one episode

In [None]:
Q = np.zeros((GRID_HEIGHT, GRID_WIDTH, len(ACTIONS)))  # start from an all-zero table
state = START_POSITION
done = False
# One episode only: act until the car reaches the goal, logging every step.
while not done:
    action_idx = choose_action(state)
    action = ACTIONS[action_idx]
    print(f"State: {state}, Action: {action}")
    d_row, d_col = ACTION_TO_DELTA[action]
    new_state = (state[0] + d_row, state[1] + d_col)
    if not valid_pos(new_state):
        new_state = state  # an off-grid move leaves the car where it was
    reward = get_reward(new_state)
    best_next = Q[new_state].max()

    # Q-learning update: nudge Q(s, a) towards reward + gamma * max_a' Q(s', a').
    q_index = (state[0], state[1], action_idx)
    Q[q_index] += alpha * (reward + gamma * best_next - Q[q_index])
    state = new_state
    done = state == GOAL_POSITION
    if done:
        print(f"State: {state}, üéØ Reached the goal")
print_q_values()

State: (5, 0), Action: U
State: (4, 0), Action: U
State: (3, 0), Action: U
State: (2, 0), Action: U
State: (1, 0), Action: U
State: (0, 0), Action: U
State: (0, 0), Action: D
State: (1, 0), Action: D
State: (2, 0), Action: D
State: (3, 0), Action: D
State: (4, 0), Action: D
State: (5, 0), Action: D
State: (5, 0), Action: L
State: (5, 0), Action: R
State: (5, 1), Action: U
State: (4, 1), Action: U
State: (3, 1), Action: U
State: (2, 1), Action: U
State: (1, 1), Action: U
State: (0, 1), Action: U
State: (0, 1), Action: D
State: (1, 1), Action: D
State: (2, 1), Action: D
State: (3, 1), Action: D
State: (4, 1), Action: D
State: (5, 1), Action: D
State: (5, 1), Action: L
State: (5, 0), Action: U
State: (4, 0), Action: L
State: (4, 0), Action: R
State: (4, 1), Action: L
State: (4, 0), Action: U
State: (3, 0), Action: L
State: (3, 0), Action: R
State: (3, 1), Action: L
State: (3, 0), Action: U
State: (2, 0), Action: L
State: (2, 0), Action: R
State: (2, 1), Action: L
State: (2, 0), Action: U


💬 Run for the total number of episodes

In [None]:
# Q-Learning Training
Q = np.zeros((GRID_HEIGHT, GRID_WIDTH, len(ACTIONS)))  # start from an all-zero table
for ep in range(episodes):
    # Every episode restarts from the bottom-left cell.
    state = (GRID_HEIGHT - 1, 0)
    done = False
    while not done:
        action_idx = choose_action(state)
        d_row, d_col = ACTION_TO_DELTA[ACTIONS[action_idx]]
        candidate = (state[0] + d_row, state[1] + d_col)
        # An off-grid move leaves the car where it was.
        new_state = candidate if valid_pos(candidate) else state

        reward = get_reward(new_state)
        best_next = Q[new_state].max()

        # Q-learning update: nudge Q(s, a) towards reward + gamma * max_a' Q(s', a').
        q_index = (state[0], state[1], action_idx)
        Q[q_index] += alpha * (reward + gamma * best_next - Q[q_index])
        state = new_state
        done = state == GOAL_POSITION

# Show learned policy and Q-values
print_full_q_table()
print_q_values()
print_policy()


üìò Compact Q-Table (‚Üë ‚Üì ‚Üê ‚Üí per cell):

(0,0): ‚Üë -0.8 ‚Üì -0.8 ‚Üê -0.8 ‚Üí -0.7
(0,1): ‚Üë -0.6 ‚Üì -0.6 ‚Üê -0.6 ‚Üí -0.6
(0,2): ‚Üë -0.5 ‚Üì 28.6 ‚Üê -0.4 ‚Üí -0.4
(0,3): ‚Üë  7.5 ‚Üì -0.3 ‚Üê -0.3 ‚Üí 88.2
(0,4): ‚Üë  8.6 ‚Üì 27.1 ‚Üê 12.5 ‚Üí100.0
(0,5): ‚Üë  0.0 ‚Üì  0.0 ‚Üê  0.0 ‚Üí  0.0
(1,0): ‚Üë -0.9 ‚Üì -0.8 ‚Üê -0.9 ‚Üí -0.9
(1,1): ‚Üë -0.7 ‚Üì -0.7 ‚Üê -0.7 ‚Üí 17.7
(1,2): ‚Üë  5.4 ‚Üì 28.6 ‚Üê  1.5 ‚Üí 70.2
(1,3): ‚Üë 75.0 ‚Üì 62.0 ‚Üê 61.9 ‚Üí 79.1
(1,4): ‚Üë 87.9 ‚Üì 69.6 ‚Üê 69.7 ‚Üí 89.0
(1,5): ‚Üë100.0 ‚Üì 78.6 ‚Üê 79.0 ‚Üí 89.0
(2,0): ‚Üë -1.1 ‚Üì -1.2 ‚Üê -1.1 ‚Üí 14.3
(2,1): ‚Üë -0.9 ‚Üì  3.4 ‚Üê  0.6 ‚Üí 54.8
(2,2): ‚Üë 61.7 ‚Üì 48.3 ‚Üê 47.6 ‚Üí 62.2
(2,3): ‚Üë 70.2 ‚Üì 54.7 ‚Üê 54.8 ‚Üí 69.5
(2,4): ‚Üë 79.1 ‚Üì  5.2 ‚Üê 25.3 ‚Üí 21.0
(2,5): ‚Üë 88.9 ‚Üì -0.1 ‚Üê 27.8 ‚Üí  2.7
(3,0): ‚Üë -1.5 ‚Üì  1.8 ‚Üê  1.5 ‚Üí 42.3
(3,1): ‚Üë  3.8 ‚Üì 14.6 ‚Üê  9.8 ‚Üí 48.5
(3,2): ‚Üë 55.0 ‚Üì 42.2 ‚Üê 42.4 ‚Üí 54.8
(3,3): ‚Üë 62.2 ‚Üì  8.3 ‚Üê 20.1 ‚Üí  8.8
(3,

💥 **Step 6:** Testing the final results

In [None]:
# Test episodes (some may fail due to exploration)
print("\nüß™ Test Episodes (with Œµ-greedy policy):")
for test in range(3):
    print(f"\nTest #{test + 1}:")
    state, steps = START_POSITION, 0
    path = [state]  # every position occupied, including repeats after a blocked move
    time.sleep(1)
    # Each test is capped at 30 moves.
    while steps < 30 and state != GOAL_POSITION:
        print_grid(state)
        action_idx = choose_action(state)  # still epsilon-greedy, so random moves can occur
        d_row, d_col = ACTION_TO_DELTA[ACTIONS[action_idx]]
        new_state = (state[0] + d_row, state[1] + d_col)
        state = new_state if valid_pos(new_state) else state
        path.append(state)
        steps += 1
    outcome = "üéâ Reached the goal!" if state == GOAL_POSITION else "üò¢ Failed to reach the goal."
    print(outcome)



üß™ Test Episodes (with Œµ-greedy policy):

Test #1:
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú üèÅ 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 
üöó ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 

‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú üèÅ 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 
üöó ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 

‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú üèÅ 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 
‚¨ú üöó ‚¨ú ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 

‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú üèÅ 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú üöó ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 

‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú üèÅ 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú üöó ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 

‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú üèÅ 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú üöó ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 

‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú üèÅ 
‚

In [None]:
# Final deterministic test (pure greedy path)
print_policy()
print("\nüéûÔ∏è Final Agent Learned Policy Animation:")
state, steps = START_POSITION, 0
time.sleep(1)

# Follow the arg-max action from the start cell, giving up after 20 moves.
while steps < 20 and state != GOAL_POSITION:
    print_grid(state)
    action_idx = Q[state].argmax()  # Always greedy
    action = ACTIONS[action_idx]
    d_row, d_col = ACTION_TO_DELTA[action]
    new_state = (state[0] + d_row, state[1] + d_col)

    if valid_pos(new_state):
        state = new_state
    time.sleep(0.5)
    steps += 1

print_grid(state)
print("üéâ Reached the goal!" if state == GOAL_POSITION else "üò¢ Failed to reach the goal.")



üìò Learned Policy (Best Action at Each Cell):
‚Üí ‚Üë ‚Üì ‚Üí ‚Üí üèÅ 
‚Üì ‚Üí ‚Üí ‚Üí ‚Üí ‚Üë 
‚Üí ‚Üí ‚Üí ‚Üë ‚Üë ‚Üë 
‚Üí ‚Üí ‚Üë ‚Üë ‚Üë ‚Üë 
‚Üí ‚Üí ‚Üë ‚Üë ‚Üë ‚Üë 
‚Üë ‚Üë ‚Üë ‚Üë ‚Üê ‚Üë 

üéûÔ∏è Final Agent Learned Policy Animation:
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú üèÅ 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 
üöó ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 

‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú üèÅ 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 
üöó ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 

‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú üèÅ 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 
‚¨ú üöó ‚¨ú ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 

‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú üèÅ 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú üöó ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 

‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú üèÅ 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú üöó ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚¨ú ‚¨

✨ **Step 7:** Variations in the environment

⚡ The agent gets a penalty if it bumps into an obstacle, and it cannot pass through obstacles.

The grid visualization will show:

🚗 Car

🏁 Goal

❌ Obstacles

⬜ Empty road

In [None]:
# Obstacles
OBSTACLES = {(1, 2), (2, 2), (3, 4)}  # You can add more

# Print grid
def print_grid(agent_pos):
    """Clear the console and draw the grid with the car, goal and obstacles.

    Glyph priority per cell: car, then goal, then obstacle, then plain road.
    A blank line is printed after the last row.
    """
    os.system('cls' if os.name == 'nt' else 'clear')
    for i in range(GRID_HEIGHT):
        pieces = []
        for j in range(GRID_WIDTH):
            here = (i, j)
            if here == agent_pos:
                piece = 'üöó '
            elif here == GOAL_POSITION:
                piece = 'üèÅ '
            else:
                piece = '‚ùå ' if here in OBSTACLES else '‚¨ú '
            pieces.append(piece)
        print(''.join(pieces))
    print()

# Redraw the board from the start cell, now with the obstacles shown.
state = START_POSITION
print_grid(state)

‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú üèÅ 
‚¨ú ‚¨ú ‚ùå ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚ùå ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚ùå ‚¨ú 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 
üöó ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 



💬 Redefining some of the functions and parameters

In [None]:
# Q-learning parameters
epsilon = 0.5    # exploration probability, raised from the 0.1 used in Step 4
episodes = 2000  # number of training episodes

# Reward function
def get_reward(pos):
    """Return the reward for `pos`: 100 on the goal, -10 on an obstacle cell, -1 anywhere else."""
    if pos == GOAL_POSITION:
        return 100
    return -10 if pos in OBSTACLES else -1

# Valid position check
def valid_pos(pos):
    """Tell whether `pos` (row, column) is inside the grid and not an obstacle cell."""
    row, col = pos
    if not (0 <= row < GRID_HEIGHT):
        return False
    if not (0 <= col < GRID_WIDTH):
        return False
    return pos not in OBSTACLES

💬 Training with Q-learning

In [None]:
# Initialize Q-table
Q = np.zeros((GRID_HEIGHT, GRID_WIDTH, len(ACTIONS)))

# Q-learning training
for ep in range(episodes):
    state = START_POSITION
    done = False
    while not done:
        action_idx = choose_action(state)
        delta = ACTION_TO_DELTA[ACTIONS[action_idx]]
        new_state = (state[0] + delta[0], state[1] + delta[1])

        # Score the cell the car *tried* to enter, before any bounce-back.
        # Scoring after the bounce can never see an obstacle cell, so the
        # -10 obstacle penalty in get_reward would never be applied.
        reward = get_reward(new_state)

        if not valid_pos(new_state):
            new_state = state  # blocked by the grid edge or an obstacle: stay in place

        best_next = np.max(Q[new_state[0], new_state[1]])
        # Q-learning update: nudge Q(s, a) towards reward + gamma * max_a' Q(s', a').
        Q[state[0], state[1], action_idx] += alpha * (reward + gamma * best_next - Q[state[0], state[1], action_idx])
        state = new_state

        if state == GOAL_POSITION:
            done = True

💬 Testing the learned policy

In [None]:
# Test episodes (some may fail due to exploration)
print("\nüß™ Test Episodes (with Œµ-greedy policy):")
for test in range(3):
    print(f"\nTest #{test + 1}:")
    state, steps = START_POSITION, 0
    path = [state]  # every position occupied, including repeats after a blocked move
    time.sleep(1)
    # Each test is capped at 30 moves.
    while steps < 30 and state != GOAL_POSITION:
        print_grid(state)
        action_idx = choose_action(state)  # epsilon-greedy again
        d_row, d_col = ACTION_TO_DELTA[ACTIONS[action_idx]]
        new_state = (state[0] + d_row, state[1] + d_col)
        if valid_pos(new_state):
            state = new_state
        elif new_state in OBSTACLES:
            print("üí• Hit obstacle!")
        else:
            print("üí• Bounced off wall")
        time.sleep(0.3)
        path.append(state)
        steps += 1
    if state != GOAL_POSITION:
        print("üò¢ Failed to reach the goal.")
    else:
        print("üéâ Reached the goal!")


üß™ Test Episodes (with Œµ-greedy policy):

Test #1:
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú üèÅ 
‚¨ú ‚¨ú ‚ùå ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚ùå ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚ùå ‚¨ú 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 
üöó ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 

üí• Bounced off wall
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú üèÅ 
‚¨ú ‚¨ú ‚ùå ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚ùå ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚ùå ‚¨ú 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 
üöó ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 

‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú üèÅ 
‚¨ú ‚¨ú ‚ùå ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚ùå ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚ùå ‚¨ú 
üöó ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 

‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú üèÅ 
‚¨ú ‚¨ú ‚ùå ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚ùå ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚ùå ‚¨ú 
‚¨ú üöó ‚¨ú ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 

‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú üèÅ 
‚¨ú ‚¨ú ‚ùå ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚ùå ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚ùå ‚¨ú 
‚¨ú ‚¨ú üöó ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 

‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú üèÅ 
‚¨ú ‚¨ú ‚ùå ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚ùå ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú üöó ‚¨ú ‚ùå ‚¨ú 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 

‚¨ú ‚

In [None]:
# Final deterministic test (pure greedy path)
print_policy()
print("\nüéûÔ∏è Final Agent Learned Policy Animation:")
state, steps = START_POSITION, 0
time.sleep(1)

# Follow the arg-max action from the start cell, giving up after 30 moves.
while steps < 30 and state != GOAL_POSITION:
    print_grid(state)
    action_idx = Q[state].argmax()
    d_row, d_col = ACTION_TO_DELTA[ACTIONS[action_idx]]
    new_state = (state[0] + d_row, state[1] + d_col)

    # Blocked moves leave the car in place.
    state = new_state if valid_pos(new_state) else state

    time.sleep(0.5)
    steps += 1

print_grid(state)
print("üéâ Reached the goal!" if state == GOAL_POSITION else "üò¢ Failed to reach the goal.")


üìò Learned Policy (Best Action at Each Cell):
‚Üí ‚Üí ‚Üí ‚Üí ‚Üí üèÅ 
‚Üí ‚Üì ‚Üë ‚Üí ‚Üí ‚Üë 
‚Üí ‚Üì ‚Üë ‚Üí ‚Üí ‚Üë 
‚Üí ‚Üí ‚Üí ‚Üë ‚Üë ‚Üë 
‚Üí ‚Üí ‚Üí ‚Üë ‚Üí ‚Üë 
‚Üë ‚Üí ‚Üë ‚Üë ‚Üí ‚Üë 

üéûÔ∏è Final Agent Learned Policy Animation:
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú üèÅ 
‚¨ú ‚¨ú ‚ùå ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚ùå ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚ùå ‚¨ú 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 
üöó ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 

‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú üèÅ 
‚¨ú ‚¨ú ‚ùå ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚ùå ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚ùå ‚¨ú 
üöó ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 

‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú üèÅ 
‚¨ú ‚¨ú ‚ùå ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚ùå ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚ùå ‚¨ú 
‚¨ú üöó ‚¨ú ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 

‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú üèÅ 
‚¨ú ‚¨ú ‚ùå ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚ùå ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚ùå ‚¨ú 
‚¨ú ‚¨ú üöó ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 

‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú üèÅ 
‚¨ú ‚¨ú ‚ùå ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚ùå ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚ùå ‚¨ú 
‚¨ú ‚¨ú ‚¨ú üöó ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚¨ú ‚¨

🔨 What happens if you add more obstacles?

🔨 What if the obstacle reward is -50 instead of -10?

🔨 What if epsilon is 0 (no exploration)?



⚡ **Next:**

Precision in path planning

Exploration vs. optimality

How the agent learns to avoid walls and dead ends

In [None]:
# Define valid path
PATH = {
    (5, 0), (4, 0), (3, 0), (3, 1), (3, 2), (2, 2),
    (1, 2), (1, 3), (0, 3), (0, 4), (0, 5)
}
# Every grid cell that is not on the path becomes an obstacle.
OBSTACLES = {
    (i, j)
    for i in range(GRID_HEIGHT)
    for j in range(GRID_WIDTH)
    if (i, j) not in PATH
}


# Learning params
alpha, gamma = 0.1, 0.9  # learning rate, discount factor
epsilon = 0.1            # exploration probability
episodes = 1500          # number of training episodes

# Q-table
Q = np.zeros((GRID_HEIGHT, GRID_WIDTH, len(ACTIONS)))

def reward(pos):
    """Return the reward for `pos`: 100 on the goal, -10 on an obstacle cell, -1 anywhere else."""
    if pos == GOAL_POSITION:
        return 100
    if pos in OBSTACLES:
        return -10
    return -1

def valid(pos):
    """Tell whether `pos` is a drivable cell: a member of PATH or the goal cell."""
    if pos in PATH:
        return True
    return pos == GOAL_POSITION

def choose_action(state):
    """Return an action index for `state` using the epsilon-greedy rule.

    A uniform draw below `epsilon` selects a random index; otherwise the
    arg-max of the cell's Q-values is returned.
    """
    roll = random.uniform(0, 1)
    if not roll < epsilon:
        return np.argmax(Q[state[0], state[1]])
    return random.choice(range(len(ACTIONS)))

def print_grid(agent_pos):
    """Clear the console and draw the narrow-path grid with the car at `agent_pos`.

    Glyph priority per cell: car, goal, obstacle (dark square), open road.
    A blank line is printed after the last row.
    """
    os.system('cls' if os.name == 'nt' else 'clear')

    def symbol_at(pos):
        if pos == agent_pos:
            return 'üöó '
        if pos == GOAL_POSITION:
            return 'üèÅ '
        return '‚¨õ ' if pos in OBSTACLES else '‚¨ú '

    for i in range(GRID_HEIGHT):
        print(''.join(symbol_at((i, j)) for j in range(GRID_WIDTH)))
    print()

# Training
for _ in range(episodes):
    state = START_POSITION
    done = False
    while not done:
        action_idx = choose_action(state)
        delta = ACTION_TO_DELTA[ACTIONS[action_idx]]
        next_state = (state[0] + delta[0], state[1] + delta[1])

        # Score the cell the car *tried* to enter, before any bounce-back.
        # Scoring after the bounce can never see an obstacle cell, so the
        # -10 obstacle penalty in reward() would never be applied.
        r = reward(next_state)

        if not (0 <= next_state[0] < GRID_HEIGHT and 0 <= next_state[1] < GRID_WIDTH) or next_state in OBSTACLES:
            next_state = state  # Bump into wall

        # Q-learning update: nudge Q(s, a) towards r + gamma * max_a' Q(s', a').
        Q[state[0], state[1], action_idx] += alpha * (r + gamma * np.max(Q[next_state[0], next_state[1]]) - Q[state[0], state[1], action_idx])
        state = next_state

        if state == GOAL_POSITION:
            done = True

# Test run
print("\nüèÅ Learned Policy on Narrow Path:")
state, steps = START_POSITION, 0
time.sleep(1)

# Follow the arg-max action from the start cell, giving up after 30 moves.
while steps < 30 and state != GOAL_POSITION:
    print_grid(state)
    action_idx = Q[state].argmax()
    d_row, d_col = ACTION_TO_DELTA[ACTIONS[action_idx]]
    next_state = (state[0] + d_row, state[1] + d_col)

    if not valid(next_state):
        print("üí• Bounced off wall")
        time.sleep(0.3)
    else:
        state = next_state

    time.sleep(0.5)
    steps += 1

print_grid(state)
if state != GOAL_POSITION:
    print("‚ùå Didn't reach the goal.")
else:
    print("üéâ Reached the goal!")



üèÅ Learned Policy on Narrow Path:
‚¨õ ‚¨õ ‚¨õ ‚¨ú ‚¨ú üèÅ 
‚¨õ ‚¨õ ‚¨ú ‚¨ú ‚¨õ ‚¨õ 
‚¨õ ‚¨õ ‚¨ú ‚¨õ ‚¨õ ‚¨õ 
‚¨ú ‚¨ú ‚¨ú ‚¨õ ‚¨õ ‚¨õ 
‚¨ú ‚¨õ ‚¨õ ‚¨õ ‚¨õ ‚¨õ 
üöó ‚¨õ ‚¨õ ‚¨õ ‚¨õ ‚¨õ 

‚¨õ ‚¨õ ‚¨õ ‚¨ú ‚¨ú üèÅ 
‚¨õ ‚¨õ ‚¨ú ‚¨ú ‚¨õ ‚¨õ 
‚¨õ ‚¨õ ‚¨ú ‚¨õ ‚¨õ ‚¨õ 
‚¨ú ‚¨ú ‚¨ú ‚¨õ ‚¨õ ‚¨õ 
üöó ‚¨õ ‚¨õ ‚¨õ ‚¨õ ‚¨õ 
‚¨ú ‚¨õ ‚¨õ ‚¨õ ‚¨õ ‚¨õ 

‚¨õ ‚¨õ ‚¨õ ‚¨ú ‚¨ú üèÅ 
‚¨õ ‚¨õ ‚¨ú ‚¨ú ‚¨õ ‚¨õ 
‚¨õ ‚¨õ ‚¨ú ‚¨õ ‚¨õ ‚¨õ 
üöó ‚¨ú ‚¨ú ‚¨õ ‚¨õ ‚¨õ 
‚¨ú ‚¨õ ‚¨õ ‚¨õ ‚¨õ ‚¨õ 
‚¨ú ‚¨õ ‚¨õ ‚¨õ ‚¨õ ‚¨õ 

‚¨õ ‚¨õ ‚¨õ ‚¨ú ‚¨ú üèÅ 
‚¨õ ‚¨õ ‚¨ú ‚¨ú ‚¨õ ‚¨õ 
‚¨õ ‚¨õ ‚¨ú ‚¨õ ‚¨õ ‚¨õ 
‚¨ú üöó ‚¨ú ‚¨õ ‚¨õ ‚¨õ 
‚¨ú ‚¨õ ‚¨õ ‚¨õ ‚¨õ ‚¨õ 
‚¨ú ‚¨õ ‚¨õ ‚¨õ ‚¨õ ‚¨õ 

‚¨õ ‚¨õ ‚¨õ ‚¨ú ‚¨ú üèÅ 
‚¨õ ‚¨õ ‚¨ú ‚¨ú ‚¨õ ‚¨õ 
‚¨õ ‚¨õ ‚¨ú ‚¨õ ‚¨õ ‚¨õ 
‚¨ú ‚¨ú üöó ‚¨õ ‚¨õ ‚¨õ 
‚¨ú ‚¨õ ‚¨õ ‚¨õ ‚¨õ ‚¨õ 
‚¨ú ‚¨õ ‚¨õ ‚¨õ ‚¨õ ‚¨õ 

‚¨õ ‚¨õ ‚¨õ ‚¨ú ‚¨ú üèÅ 
‚¨õ ‚¨õ ‚¨ú ‚¨ú ‚¨õ ‚¨õ 
‚¨õ ‚¨õ üöó ‚¨õ ‚¨õ ‚¨õ 
‚¨ú ‚¨ú ‚¨ú ‚¨õ ‚¨õ ‚¨õ 
‚¨ú ‚¨õ ‚¨õ ‚¨õ ‚¨õ ‚¨õ 
‚¨ú ‚¨õ ‚¨õ ‚¨õ ‚¨õ ‚¨õ 

‚¨õ ‚¨õ ‚¨õ ‚¨ú ‚¨ú üèÅ 
‚¨õ ‚¨õ üöó ‚¨ú ‚¨

⚡ **Next:**

❌ Obstacles: Impassable — the agent bounces off them.

☠️ Hazards: The agent can enter them but receives a large negative reward (e.g., -50).

⬜ Regular road cells give a small negative reward (e.g., -1).

In [None]:
# Impassable cells, and cells that can be entered at a large negative reward.
OBSTACLES = {(1, 1), (2, 3), (4, 4)}
HAZARDS = {(2, 1), (3, 2), (4, 2)}

# Learning parameters for this variant.
alpha = 0.1     # learning rate
gamma = 0.9     # discount factor
epsilon = 0.5   # exploration probability
episodes = 100  # number of training episodes

# Fresh all-zero Q-table for this variant.
Q = np.zeros((GRID_HEIGHT, GRID_WIDTH, len(ACTIONS)))

def reward(pos):
    """Return the reward for `pos`.

    100 on the goal, -50 on a hazard, -10 on an obstacle cell, -1 otherwise;
    the checks are made in that order.
    """
    if pos == GOAL_POSITION:
        return 100
    if pos in HAZARDS:
        return -50
    return -10 if pos in OBSTACLES else -1

def valid_pos(pos):
    """Check whether the car may move onto `pos` and describe what it met there.

    Args:
        pos: `(row, column)` tuple of the cell the car tries to enter.

    Returns:
        A `(passable, message)` tuple. `passable` is False when `pos` is off
        the grid or an obstacle cell and True otherwise. `message` describes
        an out-of-bounds bump, an obstacle bump or a hazard, and is the empty
        string for an ordinary road cell.
    """
    x, y = pos
    # Every check uses the `pos` argument. The previous version tested a
    # global `new_state` in the obstacle/hazard branches, which only worked
    # when the caller's variable happened to have that name and value.
    if not (0 <= x < GRID_HEIGHT and 0 <= y < GRID_WIDTH):
        return False, "üí• Bumped (out of bounds)"
    if pos in OBSTACLES:
        return False, "üí• Bumped (obstacle)"
    if pos in HAZARDS:
        return True, "‚ò†Ô∏è Hazard encountered!"
    return True, ""

def choose_action(state):
    """Return an epsilon-greedy action index for `state`.

    With probability `epsilon` a random index is drawn; otherwise the arg-max
    of the cell's Q-values is used.
    """
    row, col = state[0], state[1]
    return (
        random.randint(0, len(ACTIONS) - 1)
        if random.random() < epsilon
        else np.argmax(Q[row, col])
    )

def print_grid(agent_pos):
    """Clear the console and draw the grid with car, goal, obstacles and hazards.

    Glyph priority per cell: car, goal, obstacle, hazard, open road.
    A blank line is printed after the last row.
    """
    os.system('cls' if os.name == 'nt' else 'clear')
    for i in range(GRID_HEIGHT):
        cells = []
        for j in range(GRID_WIDTH):
            pos = (i, j)
            if pos == agent_pos:
                cells.append('üöó ')
                continue
            if pos == GOAL_POSITION:
                cells.append('üèÅ ')
                continue
            if pos in OBSTACLES:
                cells.append('‚¨õ ')
                continue
            cells.append('‚ò†Ô∏è ' if pos in HAZARDS else '‚¨ú ')
        print(''.join(cells))
    print()

# Training
for episode in range(1, episodes + 1):
    state = START_POSITION
    done = False
    steps = 0

    # Each episode is capped at 100 steps.
    while not done and steps < 100:
        action_idx = choose_action(state)
        delta = ACTION_TO_DELTA[ACTIONS[action_idx]]
        new_state = (state[0] + delta[0], state[1] + delta[1])

        # Score the cell the car *tried* to enter, before any bounce-back.
        # Scoring after the bounce can never see an obstacle cell, so the
        # -10 obstacle penalty in reward() would never be applied.
        r = reward(new_state)

        # Off-grid moves and obstacles are impassable; hazards can be entered.
        if not valid_pos(new_state)[0]:
            new_state = state

        # Q-learning update: nudge Q(s, a) towards r + gamma * max_a' Q(s', a').
        Q[state[0], state[1], action_idx] += alpha * (
            r + gamma * np.max(Q[new_state[0], new_state[1]]) - Q[state[0], state[1], action_idx]
        )
        state = new_state
        steps += 1
        if state == GOAL_POSITION:
            done = True

    # Progress report every 50 episodes.
    if episode % 50 == 0:
        print(f"üéØ Episode {episode} completed in {steps} steps.\n")
        time.sleep(1)

# Test episodes (some may fail due to exploration)
print("\nüß™ Test Episodes (with Œµ-greedy policy):")
for test in range(3):
    print(f"\nTest #{test + 1}:")
    state, steps = START_POSITION, 0
    path = [state]  # every position occupied, including repeats after a blocked move
    time.sleep(1)
    # Each test is capped at 30 moves.
    while steps < 30 and state != GOAL_POSITION:
        print_grid(state)
        action_idx = choose_action(state)  # epsilon-greedy again
        d_row, d_col = ACTION_TO_DELTA[ACTIONS[action_idx]]
        new_state = (state[0] + d_row, state[1] + d_col)
        passable, note = valid_pos(new_state)
        if passable:
            state = new_state
        print(note)  # empty string (a blank line) for an ordinary move
        time.sleep(0.3)
        path.append(state)
        steps += 1
    if state != GOAL_POSITION:
        print("üò¢ Failed to reach the goal.")
    else:
        print("üéâ Reached the goal!")


üéØ Episode 50 completed in 25 steps.

üéØ Episode 100 completed in 23 steps.


üß™ Test Episodes (with Œµ-greedy policy):

Test #1:
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú üèÅ 
‚¨ú ‚¨õ ‚¨ú ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚ò†Ô∏è ‚¨ú ‚¨õ ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚ò†Ô∏è ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚ò†Ô∏è ‚¨ú ‚¨õ ‚¨ú 
üöó ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 


‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú üèÅ 
‚¨ú ‚¨õ ‚¨ú ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚ò†Ô∏è ‚¨ú ‚¨õ ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚ò†Ô∏è ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚ò†Ô∏è ‚¨ú ‚¨õ ‚¨ú 
‚¨ú üöó ‚¨ú ‚¨ú ‚¨ú ‚¨ú 


‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú üèÅ 
‚¨ú ‚¨õ ‚¨ú ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚ò†Ô∏è ‚¨ú ‚¨õ ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚ò†Ô∏è ‚¨ú ‚¨ú ‚¨ú 
‚¨ú üöó ‚ò†Ô∏è ‚¨ú ‚¨õ ‚¨ú 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 


‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú üèÅ 
‚¨ú ‚¨õ ‚¨ú ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚ò†Ô∏è ‚¨ú ‚¨õ ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚ò†Ô∏è ‚¨ú ‚¨ú ‚¨ú 
üöó ‚¨ú ‚ò†Ô∏è ‚¨ú ‚¨õ ‚¨ú 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 


‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú üèÅ 
‚¨ú ‚¨õ ‚¨ú ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚ò†Ô∏è ‚¨ú ‚¨õ ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚ò†Ô∏è ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚ò†Ô∏è ‚¨ú ‚¨õ ‚¨ú 
üöó ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 

üí• Bumped (out of bounds)
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú ü

⚡ **Next:**

A main goal: reach the finish line (🏁)

A subgoal (🎯): the agent must reach a checkpoint first

🔨 The agent will learn in two stages:

1. Reach the subgoal (checkpoint 🎯)

2. From the checkpoint, reach the final goal 🏁

In [None]:
# Grid config
GRID_HEIGHT, GRID_WIDTH = 6, 6
START = (5, 0)
SUBGOAL = (2, 2)     # checkpoint the car must reach first
FINAL_GOAL = (0, 5)  # finish line

OBSTACLES = {(1, 1), (2, 3), (4, 4)}  # impassable cells
HAZARDS = {(3, 2), (4, 2)}            # cells that can be entered at a large negative reward

ACTIONS = ['U', 'D', 'L', 'R']
ACTION_TO_DELTA = {
    'U': (-1, 0),
    'D': (1, 0),
    'L': (0, -1),
    'R': (0, 1),
}

# Learning parameters
alpha, gamma = 0.1, 0.9  # learning rate, discount factor
epsilon = 0.1            # exploration probability
episodes = 1000          # number of training episodes

# Separate Q-tables for each stage
_table_shape = (GRID_HEIGHT, GRID_WIDTH, len(ACTIONS))
Q1 = np.zeros(_table_shape)  # Start -> Subgoal
Q2 = np.zeros(_table_shape)  # Subgoal -> Final

def reward(pos, phase):
    """Return the reward for `pos` during the given `phase`.

    Obstacles (-10) and hazards (-50) are checked first. Then the subgoal pays
    50 in phase 1 and the final goal pays 100 in phase 2. Everything else,
    including a goal cell visited in the other phase, costs -1.
    """
    if pos in OBSTACLES:
        return -10
    if pos in HAZARDS:
        return -50
    reached_subgoal = phase == 1 and pos == SUBGOAL
    reached_final = phase == 2 and pos == FINAL_GOAL
    if reached_subgoal:
        return 50
    if reached_final:
        return 100
    return -1

def is_valid(pos):
    """Tell whether `pos` (row, column) is inside the grid and not an obstacle cell."""
    row, col = pos
    inside = 0 <= row < GRID_HEIGHT and 0 <= col < GRID_WIDTH
    return inside and pos not in OBSTACLES

def choose_action(Q, state):
    """Return an epsilon-greedy action index for `state` using the table `Q`.

    With probability `epsilon` a random index is drawn; otherwise the arg-max
    of `Q`'s values for the cell is returned.
    """
    explore = random.random() < epsilon
    if explore:
        return random.randint(0, len(ACTIONS) - 1)
    row, col = state[0], state[1]
    return np.argmax(Q[row, col])

def print_grid(agent_pos):
    """Clear the console and draw the grid for the two-stage task.

    Glyph priority per cell: car, final goal, subgoal, obstacle, hazard,
    open road. A blank line is printed after the last row.
    """
    os.system('cls' if os.name == 'nt' else 'clear')
    for i in range(GRID_HEIGHT):
        row = []
        for j in range(GRID_WIDTH):
            pos = (i, j)
            glyph = (
                'üöó ' if pos == agent_pos else
                'üèÅ ' if pos == FINAL_GOAL else
                'üéØ ' if pos == SUBGOAL else
                '‚¨õ ' if pos in OBSTACLES else
                '‚ò†Ô∏è ' if pos in HAZARDS else
                '‚¨ú '
            )
            row.append(glyph)
        print(''.join(row))
    print()

# Training phase
for episode in range(1, episodes + 1):
    state = START
    phase = 1  # First go to subgoal
    done = False

    while not done:
        # Phase 1 learns in Q1 (start -> subgoal), phase 2 in Q2 (subgoal -> final goal).
        Q = Q1 if phase == 1 else Q2
        action_idx = choose_action(Q, state)
        delta = ACTION_TO_DELTA[ACTIONS[action_idx]]
        next_state = (state[0] + delta[0], state[1] + delta[1])

        # Score the cell the car *tried* to enter, before any bounce-back.
        # Scoring after the bounce can never see an obstacle cell, so the
        # -10 obstacle penalty in reward() would never be applied.
        r = reward(next_state, phase)

        # Off-grid moves and obstacles are impassable: the car stays in place.
        # Hazards can be entered and are penalized through the reward above.
        if not is_valid(next_state):
            next_state = state

        # Q-learning update: nudge Q(s, a) towards r + gamma * max_a' Q(s', a').
        Q[state[0], state[1], action_idx] += alpha * (
            r + gamma * np.max(Q[next_state[0], next_state[1]]) - Q[state[0], state[1], action_idx]
        )
        state = next_state

        if phase == 1 and state == SUBGOAL:
            phase = 2  # Switch to final phase
        if phase == 2 and state == FINAL_GOAL:
            done = True

# Final demonstration
print("\nüèÅ Demonstrating learned policy (with subgoal üéØ)")
state, phase, steps = START, 1, 0
time.sleep(1)

# Greedy roll-out, capped at 50 moves; the active Q-table follows the current phase.
while steps < 50:
    print_grid(state)
    Q = Q1 if phase == 1 else Q2
    action_idx = Q[state].argmax()
    d_row, d_col = ACTION_TO_DELTA[ACTIONS[action_idx]]
    next_state = (state[0] + d_row, state[1] + d_col)

    inside = 0 <= next_state[0] < GRID_HEIGHT and 0 <= next_state[1] < GRID_WIDTH
    if not inside:
        print("üí• Bumped into wall (out of bounds)")
        next_state = state
    elif next_state in OBSTACLES:
        print("üí• Bumped into wall (obstacle)")
        next_state = state
    elif next_state in HAZARDS:
        # Hazards do not block movement; the car still enters the cell.
        print("‚ò†Ô∏è Hazard encountered!")

    state = next_state

    if phase == 1 and state == SUBGOAL:
        print("üéØ Subgoal reached!")
        time.sleep(1)
        phase = 2
    elif phase == 2 and state == FINAL_GOAL:
        print("üéâ Final goal reached!")
        break

    time.sleep(0.5)
    steps += 1

print_grid(state)



üèÅ Demonstrating learned policy (with subgoal üéØ)
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú üèÅ 
‚¨ú ‚¨õ ‚¨ú ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú üéØ ‚¨õ ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚ò†Ô∏è ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚ò†Ô∏è ‚¨ú ‚¨õ ‚¨ú 
üöó ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 

‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú üèÅ 
‚¨ú ‚¨õ ‚¨ú ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú üéØ ‚¨õ ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚ò†Ô∏è ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚ò†Ô∏è ‚¨ú ‚¨õ ‚¨ú 
‚¨ú üöó ‚¨ú ‚¨ú ‚¨ú ‚¨ú 

‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú üèÅ 
‚¨ú ‚¨õ ‚¨ú ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú üéØ ‚¨õ ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚ò†Ô∏è ‚¨ú ‚¨ú ‚¨ú 
‚¨ú üöó ‚ò†Ô∏è ‚¨ú ‚¨õ ‚¨ú 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 

‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú üèÅ 
‚¨ú ‚¨õ ‚¨ú ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú üéØ ‚¨õ ‚¨ú ‚¨ú 
‚¨ú üöó ‚ò†Ô∏è ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚ò†Ô∏è ‚¨ú ‚¨õ ‚¨ú 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 

‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú üèÅ 
‚¨ú ‚¨õ ‚¨ú ‚¨ú ‚¨ú ‚¨ú 
‚¨ú üöó üéØ ‚¨õ ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚ò†Ô∏è ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚ò†Ô∏è ‚¨ú ‚¨õ ‚¨ú 
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú 

üéØ Subgoal reached!
‚¨ú ‚¨ú ‚¨ú ‚¨ú ‚¨ú üèÅ 
‚¨ú ‚¨õ ‚¨ú ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú üöó ‚¨õ ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚ò†Ô∏è ‚¨ú ‚¨ú ‚¨ú 
‚¨ú ‚¨ú ‚ò†Ô∏è ‚¨ú