In [8]:
import numpy as np
import gymnasium as gym
from gymnasium.envs.toy_text.frozen_lake import generate_random_map

In [9]:
env = gym.make('FrozenLake-v1', desc=generate_random_map(size=5), render_mode="human", is_slippery=False)
observation, info = env.reset()

In [10]:
# Initialize Q-table with zeros
Q = np.zeros([env.observation_space.n, env.action_space.n])

# Hyperparameters
learning_rate = 0.1
discount_factor = 0.99
epsilon = 0.1
num_episodes = 1000

In [11]:
# Training SARSA
for episode in range(num_episodes):
    state_tuple = env.reset()  # State is a tuple
    state = state_tuple[0]  # Extract the integer state value

    done = False

    while not done:
        # Choose action using epsilon-greedy policy
        if np.random.rand() < epsilon:
            action = env.action_space.sample()  # Exploration
        else:
            action = np.argmax(Q[state, :])  # Exploitation

        # Take action and observe the next state, reward, done flag, and info
        step_result = env.step(action)

        next_state = step_result[0]  # Extract the next state tuple
        reward = step_result[1]  # Extract the reward
        done = step_result[2]  # Extract the done flag

        # Update Q-value using SARSA formula
        next_action = np.argmax(Q[next_state, :])
        Q[state, action] = Q[state, action] + learning_rate * (reward + discount_factor * Q[next_state, next_action] - Q[state, action])

        state = next_state

KeyboardInterrupt: 

In [None]:
# Save the trained Q-table to a file
np.save('frozenlake_qtable.npy', Q)

In [12]:
env.close()