In [3]:
import numpy as np
import random

# Maze grid, indexed as maze[row][col].
# -1 cells are rejected by is_valid_position (walls); the single 1 sits at
# the goal coordinates; every other cell is 0.
maze = [
    [0, -1, 0, 0, 1],
    [0, -1, 0, -1, -1],
    [0, 0, 0, 0, 0],
    [-1, -1, 0, -1, 0],
    [0, 0, 0, -1, 0],
]

# Episode endpoints as (row, col) pairs.
start = (0, 0)
goal = (0, 4)

# Action index -> (row delta, col delta).
# Enumeration order fixes the numbering: 0 = up, 1 = down, 2 = left, 3 = right.
action_dict = dict(enumerate((
    (-1, 0),  # up
    (1, 0),   # down
    (0, -1),  # left
    (0, 1),   # right
)))

# Q-learning hyperparameters.
alpha = 0.1      # learning rate
gamma = 0.9      # discount factor
epsilon = 0.1    # initial exploration rate
episodes = 1000  # number of training episodes

# One Q-value per (row, col, action), all starting at zero.
q_table = np.zeros((len(maze), len(maze[0]), len(action_dict)))

def is_valid_position(position):
    """Return True if `position` is inside the maze and not a -1 cell.

    `position` is a (row, col) pair checked against the module-level `maze`.
    """
    r, c = position
    # Bounds are checked first so the grid lookup below is never attempted
    # for an off-grid coordinate.
    if not (0 <= r < len(maze)):
        return False
    if not (0 <= c < len(maze[0])):
        return False
    return maze[r][c] != -1

def choose_action(state):
    """Select an action index for `state` with an epsilon-greedy rule.

    A uniform draw is compared against the module-level `epsilon`: below it,
    a random index from 0 to 3 (inclusive) is returned; otherwise the index
    of the largest Q-value stored in `q_table` for the (row, col) in `state`.
    """
    explore = random.uniform(0, 1) < epsilon
    if explore:
        # Explore: any of the four action indices.
        return random.randint(0, 3)

    # Exploit: best-known action for this cell.
    r, c = state
    return np.argmax(q_table[r, c])

# Q-learning: run `episodes` training episodes. Each episode starts at
# `start` and ends as soon as the agent's state equals `goal`.
for episode in range(episodes):
    state = start
    while state != goal:
        row, col = state
        # choose_action returns either random.randint(0, 3) or an argmax over
        # the 4-entry action axis, so the result is always a key of
        # action_dict and no fallback is needed before the lookup.
        action = choose_action(state)

        d_row, d_col = action_dict[action]
        next_state = (row + d_row, col + d_col)

        if not is_valid_position(next_state):
            reward = -1          # Penalty for hitting a wall or leaving the grid
            next_state = state   # Stay in the same position
        elif next_state == goal:
            reward = 1           # Reward for reaching the goal
        else:
            reward = -0.1        # Small penalty for each move

        # Q-value update: move Q(state, action) towards
        # reward + gamma * max_a Q(next_state, a) by a fraction alpha.
        next_row, next_col = next_state
        best_next_value = np.max(q_table[next_row, next_col])
        td_target = reward + gamma * best_next_value
        q_table[row, col, action] += alpha * (td_target - q_table[row, col, action])

        # Update state
        state = next_state

    # Decrease exploration rate after every episode, never going below 0.01
    epsilon = max(0.01, epsilon * 0.99)

# Print the trained Q-table
print("Trained Q-Table:")
print(q_table)

# Find the path using the trained Q-table: starting from `start`, repeatedly
# take the action with the highest Q-value until `goal` is reached.
state = start
path = [state]
visited = {state}  # cells already on the path, used to detect cycles
while state != goal:
    row, col = state
    action = np.argmax(q_table[row, col])  # Choose the best action based on Q-values
    move = action_dict[action]
    next_state = (row + move[0], col + move[1])
    if not is_valid_position(next_state):
        # The greedy move is blocked; the Q-table is not updated here, so the
        # same move would be picked forever. Stop with the partial path.
        break
    if next_state in visited:
        # The greedy policy is deterministic, so stepping onto a cell that is
        # already on the path means it would cycle without ever reaching the
        # goal. Stop instead of looping forever and growing `path` unbounded.
        break
    state = next_state
    path.append(state)
    visited.add(state)

# Print the path taken by the agent (partial if the rollout stopped early)
print("Path taken by the agent:",path)

Trained Q-Table:
[[[-0.66994717 -0.0434062  -0.72079098 -0.57431084]
  [ 0.          0.          0.          0.        ]
  [-0.20192651  0.08350814 -0.15459721  0.8       ]
  [-0.100707   -0.1000955   0.19705742  1.        ]
  [ 0.          0.          0.          0.        ]]

 [[-0.2974059   0.062882   -0.66028685 -0.66828771]
  [ 0.          0.          0.          0.        ]
  [ 0.62        0.0611404  -0.24705398 -0.32423864]
  [ 0.          0.          0.          0.        ]
  [ 0.          0.          0.          0.        ]]

 [[-0.21033193 -0.60577805 -0.61746673  0.18098   ]
  [-0.5036096  -0.58932906 -0.12019543  0.3122    ]
  [ 0.458      -0.13006347 -0.05289352 -0.13212132]
  [-0.19346697 -0.199       0.03773287 -0.13924527]
  [-0.27800766 -0.13918221 -0.1285528  -0.1909    ]]

 [[ 0.          0.          0.          0.        ]
  [ 0.          0.          0.          0.        ]
  [-0.13001813 -0.13021056 -0.49297517 -0.19770223]
  [ 0.          0.          0.          0