In [1]:
import numpy as np

# Define the environment
# Let's consider a simple 3x3 grid world
# S : Start
# G : Goal
# x : Obstacle
# 0 : Empty cell
# The agent can move in four directions: up, down, left, right
# The goal is to reach the 'G' cell while avoiding obstacles 'x'

env = np.array([
    ['S', '0', 'x'],
    ['0', 'x', '0'],
    ['0', '0', 'G']
])

# Define parameters
num_episodes = 1000
max_steps_per_episode = np.prod(env.shape)
learning_rate = 0.1
discount_rate = 0.99
epsilon = 0.1

# Initialize Q-table with zeros
num_states = np.prod(env.shape)
num_actions = 4  # up, down, left, right
q_table = np.zeros((num_states, num_actions))

# Helper function to convert 2D coordinates to a single index
def state_to_index(state):
    return state[0] * env.shape[1] + state[1]

# Helper function to select an action using epsilon-greedy policy
def choose_action(state):
    if np.random.uniform(0, 1) < epsilon:
        return np.random.choice(num_actions)
    else:
        return np.argmax(q_table[state_to_index(state), :])

# Q-learning algorithm
for episode in range(num_episodes):
    state = (0, 0)  # Start state
    done = False
    total_reward = 0

    for step in range(max_steps_per_episode):
        action = choose_action(state)
        next_state = None
        reward = None

        # Take action
        if action == 0:  # up
            next_state = (state[0] - 1, state[1])
        elif action == 1:  # down
            next_state = (state[0] + 1, state[1])
        elif action == 2:  # left
            next_state = (state[0], state[1] - 1)
        elif action == 3:  # right
            next_state = (state[0], state[1] + 1)

        # Check if the next state is valid
        if 0 <= next_state[0] < env.shape[0] and 0 <= next_state[1] < env.shape[1] and env[next_state] != 'x':
            reward = 0
        else:
            next_state = state  # Agent hits wall, stay in the same state
            reward = -1

        # Update Q-value
        q_table[state_to_index(state), action] += learning_rate * (
            reward + discount_rate * np.max(q_table[state_to_index(next_state), :]) - q_table[state_to_index(state), action])

        total_reward += reward
        state = next_state

        if env[state] == 'G':
            done = True
            break

    if episode % 100 == 0:
        print(f"Episode: {episode}, Total Reward: {total_reward}")

print("Training finished.")

# After training, you can use the learned Q-table to navigate the environment
# For example, you can choose the best action in each state using the Q-values
# and follow that policy to navigate from the start to the goal.


Episode: 0, Total Reward: -2
Episode: 100, Total Reward: -1
Episode: 200, Total Reward: -2
Episode: 300, Total Reward: 0
Episode: 400, Total Reward: 0
Episode: 500, Total Reward: 0
Episode: 600, Total Reward: 0
Episode: 700, Total Reward: 0
Episode: 800, Total Reward: -1
Episode: 900, Total Reward: -1
Training finished.
