In [7]:
from pathlib import Path
from typing import NamedTuple, Optional

import gymnasium as gym
import numpy as np

from gym_env import SumoRobotEnv

In [None]:
class Params(NamedTuple):
    """Read-only bundle of experiment settings for this notebook.

    Every field is required and must be supplied when the tuple is built.
    ``action_size`` and ``state_size`` accept ``None`` as a placeholder because
    the instantiation in this notebook passes ``None`` for both.

    NOTE(review): in the cells visible here only ``seed`` and
    ``savefig_folder`` are read back from the instance; the training cell uses
    separate module-level hyperparameters. The grid-related fields
    (``map_size``, ``is_slippery``, ``proba_frozen``) look inherited from a
    tile-world example -- confirm they are still needed.
    """

    total_episodes: int  # Total episodes
    learning_rate: float  # Learning rate
    gamma: float  # Discounting rate
    epsilon: float  # Exploration probability
    map_size: int  # Number of tiles of one side of the squared environment
    seed: int  # Define a seed so that we get reproducible results
    # If true the player will move in the intended direction with probability
    # 1/3, else it will move in either perpendicular direction with equal
    # probability of 1/3 in both directions.
    is_slippery: bool
    n_runs: int  # Number of runs
    action_size: Optional[int]  # Number of possible actions (None = not set yet)
    state_size: Optional[int]  # Number of possible states (None = not set yet)
    proba_frozen: float  # Probability that a tile is frozen
    savefig_folder: Path  # Root folder where plots are saved


# Concrete settings for this notebook. `action_size` and `state_size` are left
# as None placeholders here.
params = Params(
    total_episodes=2000,
    learning_rate=0.8,
    gamma=0.95,
    epsilon=0.1,
    map_size=5,
    seed=123,
    is_slippery=False,
    n_runs=20,
    action_size=None,
    state_size=None,
    proba_frozen=0.9,
    savefig_folder=Path("../../_static/img/tutorials/"),
)

# Seeded NumPy generator built from the configured seed.
# NOTE(review): the training cell draws from `np.random.uniform` and
# `env.action_space.sample()`, not from `rng`, so this seed does not make the
# visible training loop reproducible -- confirm intent.
rng = np.random.default_rng(params.seed)

# Create the figure folder if it doesn't exist (path is relative to the
# kernel's working directory; parents are created as needed).
params.savefig_folder.mkdir(parents=True, exist_ok=True)


In [8]:
# Build the sumo environment with on-screen rendering and its training flag on.
env = SumoRobotEnv(for_training=True, render_mode="human")

# Size the Q-table from the first axis of each space's shape and zero-fill it.
# NOTE(review): `shape[0]` is the length of a space's first axis, not a count
# of distinct states or actions. The error recorded under the training cell
# reports index arrays of shapes (10,) and (2,), which suggests observations
# and actions are vectors; a tabular Q-table would then need a discrete
# state/action encoding -- confirm against SumoRobotEnv's space definitions.
num_states, num_actions = env.observation_space.shape[0], env.action_space.shape[0]
Q = np.zeros(shape=(num_states, num_actions))

In [9]:
# Hyperparameters read by the training cell (module-level names).
# NOTE(review): `params` also carries learning_rate / gamma / epsilon values,
# but the training loop reads the names below instead -- confirm which set is
# meant to be authoritative.

# Q-learning update: blending weight for new estimates and discount on the
# bootstrapped next-state value.
learning_rate, discount_factor = 0.1, 0.99

# Training budget: number of episodes and the per-episode step cap.
num_episodes, max_steps_per_episode = 1000, 100

# Epsilon-greedy schedule: upper/lower bounds and exponential decay constant.
max_exploration_rate, min_exploration_rate = 1.0, 0.01
exploration_decay_rate = 0.01

# Current exploration probability; the training cell rebinds it after every
# episode, so re-run this cell to restart the schedule.
exploration_rate = max_exploration_rate

In [10]:
# Q-learning training loop: `num_episodes` episodes, each capped at
# `max_steps_per_episode` steps, with an epsilon-greedy behaviour policy.
#
# NOTE(review): the output recorded for this cell is
#   IndexError: shape mismatch: indexing arrays could not be broadcast
#   together with shapes (10,) (2,)
# which is what indexing `Q` with an array-valued `state` and an array-valued
# `action` produces -- presumably raised at the `Q[state, action]` update
# below. The loop cannot train until states and actions are mapped to scalar
# row/column indices; that mapping depends on SumoRobotEnv's spaces, which are
# not visible here.
for episode in range(num_episodes):
    # The return value of reset() is used directly as the state (no unpacking).
    state = env.reset()
    done = False
    # Overwritten immediately by the `for t` loop below.
    t = 0

    for t in range(max_steps_per_episode):
        # Choose an action using epsilon-greedy policy: a uniform draw above
        # `exploration_rate` exploits the table, otherwise a random action is
        # sampled. The draw uses NumPy's global generator, not the seeded
        # `rng` created earlier in the notebook.
        exploration_rate_threshold = np.random.uniform(0, 1)
        if exploration_rate_threshold > exploration_rate:
            action = np.argmax(Q[state, :])
        else:
            action = env.action_space.sample()

        # Perform the chosen action and observe the next state and reward.
        # step() is unpacked into exactly four values here; the last is ignored.
        next_state, reward, done, _ = env.step(action)

        # Update the Q-table using the Q-learning formula:
        # Q <- (1 - lr) * Q + lr * (reward + discount * max_a' Q[next_state, a'])
        Q[state, action] = (1 - learning_rate) * Q[state, action] + learning_rate * (
                reward + discount_factor * np.max(Q[next_state, :]))

        state = next_state

        if done:
            break

    # Decay the exploration rate exponentially in the episode index, from
    # max_exploration_rate towards min_exploration_rate. This rebinds the
    # module-level `exploration_rate`, so re-running this cell without
    # re-running the hyperparameter cell starts from the decayed value.
    exploration_rate = min_exploration_rate + \
                       (max_exploration_rate - min_exploration_rate) * np.exp(-exploration_decay_rate * episode)


Vec2d(-3.0, 3.0)


IndexError: shape mismatch: indexing arrays could not be broadcast together with shapes (10,) (2,) 

: 

In [None]:
# Test the agent's learned policy
# Test the agent's learned policy: run greedy episodes and report the mean
# undiscounted return per episode.
total_reward = 0
num_test_episodes = 10

for _ in range(num_test_episodes):
    state = env.reset()
    done = False

    # Bound each evaluation episode by the same step cap used in training.
    # An unbounded `while not done` never returns if the greedy policy fails
    # to reach a terminal state.
    for _step in range(max_steps_per_episode):
        # Greedy action from the Q-table.
        # NOTE(review): like the training cell, this indexes `Q` with the raw
        # `state`; if observations are vectors this needs the same scalar
        # state/action encoding -- confirm against SumoRobotEnv.
        action = np.argmax(Q[state, :])
        next_state, reward, done, _ = env.step(action)
        total_reward += reward
        state = next_state

        if done:
            break

avg_reward = total_reward / num_test_episodes
print("Average reward:", avg_reward)
