In [1]:
import numpy as np
import random

In [2]:
# Define the maze environment
# 0 = Free space, 1 = Obstacle, 2 = Goal
maze = np.array([
    [0, 1, 0, 0, 0],
    [0, 1, 0, 1, 0],
    [0, 0, 0, 1, 0],
    [0, 1, 1, 0, 0],
    [0, 0, 0, 1, 2]
])

# Define possible actions
# 0 = Up, 1 = Right, 2 = Down, 3 = Left
actions = [0, 1, 2, 3]
action_dict = {
    0: (-1, 0),  # Move up
    1: (0, 1),   # Move right
    2: (1, 0),   # Move down
    3: (0, -1)   # Move left
}

# Initialize Q-table
q_table = np.zeros((maze.shape[0], maze.shape[1], len(actions)))

# Parameters for Q-learning
alpha = 0.1    # Learning rate
gamma = 0.9    # Discount factor
epsilon = 0.9  # Exploration rate
max_episodes = 1000
max_steps = 100

# Rewards
goal_reward = 100
step_penalty = -1
obstacle_penalty = -100

# Helper function to get next state based on action
def get_next_state(state, action):
    row, col = state
    next_row = row + action_dict[action][0]
    next_col = col + action_dict[action][1]
    
    if next_row < 0 or next_row >= maze.shape[0] or next_col < 0 or next_col >= maze.shape[1]:
        return state  # Stay in current state if out of bounds
    
    if maze[next_row, next_col] == 1:  # Hit an obstacle
        return state  # Stay in current state
    
    return (next_row, next_col)

# Q-learning algorithm
for episode in range(max_episodes):
    state = (0, 0)  # Start at the top-left corner of the maze
    
    for step in range(max_steps):
        if random.uniform(0, 1) < epsilon:  # Explore
            action = random.choice(actions)
        else:  # Exploit
            action = np.argmax(q_table[state[0], state[1]])
        
        next_state = get_next_state(state, action)
        
        # Assign rewards
        if maze[next_state[0], next_state[1]] == 2:  # Reached the goal
            reward = goal_reward
        elif maze[next_state[0], next_state[1]] == 1:  # Hit an obstacle
            reward = obstacle_penalty
        else:
            reward = step_penalty
        
        # Update Q-value
        old_q_value = q_table[state[0], state[1], action]
        next_max_q_value = np.max(q_table[next_state[0], next_state[1]])
        new_q_value = (1 - alpha) * old_q_value + alpha * (reward + gamma * next_max_q_value)
        q_table[state[0], state[1], action] = new_q_value
        
        # Move to the next state
        state = next_state
        
        # If goal is reached, end episode
        if maze[state[0], state[1]] == 2:
            break
    
    # Decrease epsilon (less exploration as training progresses)
    epsilon = max(0.1, epsilon * 0.99)

# Display the learned Q-values
print("Learned Q-table:")
print(q_table)

Learned Q-table:
[[[ 19.69098728  20.37702599  24.51916557  18.73078274]
  [  0.           0.           0.           0.        ]
  [ 46.83469901  54.9539      41.73744849  46.83741117]
  [ 53.63835328  62.171       53.50315538  47.27520752]
  [ 61.46774097  60.41518201  70.19        53.04703048]]

 [[ 20.01242545  22.59857335  28.35462841  23.52365841]
  [  0.           0.           0.           0.        ]
  [ 48.45851     39.28397197  36.45136525  41.23659233]
  [  0.           0.           0.           0.        ]
  [ 60.56674201  69.46341478  79.1         66.41916146]]

 [[ 21.63708951  32.61625379  22.44452467  26.71825138]
  [ 31.58169662  37.3513931   30.50578997  24.23499563]
  [ 42.612659    35.80148063  35.97213     30.79104278]
  [  0.           0.           0.           0.        ]
  [ 69.26646854  78.65384991  89.          78.68310416]]

 [[ 27.93503274  -2.38197102  -4.4060382   -1.60070262]
  [  0.           0.           0.           0.        ]
  [  0.           0.     

In [3]:
# Display the optimal policy (best actions for each state)
optimal_policy = np.zeros(maze.shape, dtype=str)
for i in range(maze.shape[0]):
    for j in range(maze.shape[1]):
        if maze[i, j] == 1:
            optimal_policy[i, j] = 'X'  # Obstacle
        elif maze[i, j] == 2:
            optimal_policy[i, j] = 'G'  # Goal
        else:
            action = np.argmax(q_table[i, j])
            if action == 0:
                optimal_policy[i, j] = '↑'
            elif action == 1:
                optimal_policy[i, j] = '→'
            elif action == 2:
                optimal_policy[i, j] = '↓'
            elif action == 3:
                optimal_policy[i, j] = '←'

print("\nOptimal Policy:")
print(optimal_policy)



Optimal Policy:
[['↓' 'X' '→' '→' '↓']
 ['↓' 'X' '↑' 'X' '↓']
 ['→' '→' '↑' 'X' '↓']
 ['↑' 'X' 'X' '→' '↓']
 ['↑' '↑' '↑' 'X' 'G']]
