# OpenAI Gym - Frozen Lake

In [None]:
import numpy as np
import gym
from IPython.display import clear_output
import time

In [None]:
# ---------------------------
# PARAMETERS
# ---------------------------
# SARSA update coefficients
ALPHA = 0.1  # learning rate applied to each temporal-difference correction
GAMMA = 0.9  # discount factor applied to the next state-action value

# Epsilon-greedy exploration schedule
EPS = 1.0       # starting exploration rate (epsilon)
DECAY = 0.99    # epsilon is multiplied by this after every episode
MIN_EPS = 0.01  # lower bound that epsilon is not allowed to drop below

# Length of the training run
EPISODES = 10_000  # number of training episodes

## Setup Gym Env

In [None]:
# FrozenLake rules that matter for this notebook:
#   * reaching the Goal gives a reward of 1; everything else gives 0
#   * landing on a hole terminates the game
#
# For details: https://gymnasium.farama.org/environments/toy_text/frozen_lake/
env = gym.make(
    'FrozenLake-v1',
    new_step_api=True,   # use the new step API (step() is unpacked into 5 values below)
    render_mode='ansi',  # text rendering, meant for display on a text-based console
    is_slippery=False,   # deterministic moves
)

In [None]:
# Problem size, read from the environment's observation and action spaces.
n_states = env.observation_space.n
n_actions = env.action_space.n
print("No of states:", n_states)
print("No of actions:", n_actions)

# Q-table: one row per state, one column per action, every entry starting at zero.
Q_table = np.zeros(shape=(n_states, n_actions))

# Cumulative number of goals reached over the EPISODES training episodes.
# A failed episode contributes 0; a successful one contributes 1.
total_goal = 0

No of states: 16
No of actions: 4


In [None]:
# Optional sanity check -- uncomment to inspect the spaces:
# print(env.observation_space)  # state values are 0 to 15
# print(env.action_space)       # action values are 0 to 3

# Training

**Use SARSA to train the agent and get its Q-values. During training, find the total number of goals achieved out of
the 10,000 episodes. Print the total goals and the cumulative success
percentage.**

In [None]:
# ---------------------------
# PART 1: TRAINING THE AGENT (SARSA)
# ---------------------------
def epsilon_greedy(q_values, state, epsilon):
    """Choose an action for `state` with an epsilon-greedy rule.

    With probability `epsilon` a random action is drawn from
    `env.action_space`; otherwise the action with the largest value in
    `q_values[state, :]` is returned (ties resolve to the lowest index,
    as `np.argmax` does).
    """
    if np.random.uniform(0, 1) < epsilon:
        return env.action_space.sample()
    return np.argmax(q_values[state, :])


# Start every execution of this cell from a clean slate. Without this,
# re-running the cell would keep training an already-trained table, begin
# with an exploration rate that has already decayed, and add new goals on
# top of the previous run's count (pushing the success rate past 100%).
Q_table = np.zeros((env.observation_space.n, env.action_space.n))
total_goal = 0
epsilon = EPS  # decayed locally so the EPS constant keeps its configured value

for episode in range(EPISODES):
    state = env.reset()
    terminated = False
    truncated = False

    # SARSA needs the first action chosen before the first step
    action = epsilon_greedy(Q_table, state, epsilon)

    while not (terminated or truncated):
        # Take action and observe result
        next_state, reward, terminated, truncated, _ = env.step(action)

        # On-policy: the action used in the update target is the one that
        # will actually be executed on the next iteration
        next_action = epsilon_greedy(Q_table, next_state, epsilon)

        # SARSA update: Q(s,a) += ALPHA * (r + GAMMA * Q(s',a') - Q(s,a)).
        # Rows of states that end an episode are never written (the loop
        # exits before acting from them), so they stay at zero and add
        # nothing to the target.
        td_target = reward + GAMMA * Q_table[next_state, next_action]
        Q_table[state, action] += ALPHA * (td_target - Q_table[state, action])

        state, action = next_state, next_action  # Move to next state and action

        if reward == 1:
            total_goal += 1  # Count successful attempts

    # Decay exploration rate once per episode, never below MIN_EPS
    epsilon = max(MIN_EPS, epsilon * DECAY)

print("Done")

  if not isinstance(terminated, (bool, np.bool8)):


Done


In [None]:
# Training summary: goals reached and the percentage of episodes they represent
success_rate = 100 * total_goal / EPISODES
print(f"Total Goals: {total_goal}, Cumulative Success Rate: {success_rate:.1f}%")

Total Goals: 9764, Cumulative Success Rate: 97.6%


# Code to Render Display & Display Optimal Policy

**With the obtained Q-values, show the optimal
policy through rendering the display output (i.e., it should show agent
moving the 6 steps to reach the Goal from the Start tile with the help of
the obtained Q-values).**

In [None]:
# ---------------------------
# PART 2: DISPLAY OPTIMAL POLICY
# ---------------------------
def _redraw(heading):
    """Clear the cell output, print `heading`, then print each item returned by env.render()."""
    clear_output(wait=True)
    print(heading)
    for frame in env.render():
        print(frame)


print("\nOptimal Policy Demonstration:")
time.sleep(2)

for episode in range(1, 2):  # one demonstration run, numbered from 1
    state, _ = env.reset(return_info=True)
    terminated = truncated = False

    while not terminated and not truncated:
        # Show the board as it is before the next move
        _redraw(f"Episode {episode}")
        time.sleep(0.5)

        # Act greedily with respect to the learned Q-table
        action = np.argmax(Q_table[state, :])
        state, reward, terminated, truncated, _ = env.step(action)

    # Show the board once more after the final move
    _redraw(f"Q-Table Optimal Step (SARSA)")
    time.sleep(1)

print("\nOptimal policy execution completed!")

Q-Table Optimal Step (SARSA)
  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m


Optimal policy execution completed!
