In [1]:
import gym
import numpy as np
import random

# Reproducibility: a single seed shared by every random source this notebook uses,
# so that Restart Kernel -> Run All yields the same exploration sequence each time.
SEED = 42
random.seed(SEED)  # drives the epsilon-greedy draw made with random.uniform

# Initialize the FrozenLake environment (is_slippery=False -> deterministic moves)
env = gym.make("FrozenLake-v1", is_slippery=False)
env.action_space.seed(SEED)  # drives env.action_space.sample() used for exploration
env.reset(seed=SEED)  # seeds the environment's own RNG; unseeded reset() calls keep using it

# Q-learning parameters
alpha = 0.1  # Learning rate
gamma = 0.99  # Discount factor
epsilon = 1.0  # Exploration rate (initial value; decayed once per training episode)
epsilon_min = 0.1  # Minimum exploration rate
epsilon_decay = 0.995  # Decay rate for exploration probability

# Initialize the Q-table: one row per state, one column per action, all zeros
q_table = np.zeros((env.observation_space.n, env.action_space.n))

# Training parameters
num_episodes = 1000
max_steps_per_episode = 100

# Q-learning algorithm: tabular updates driven by an epsilon-greedy behaviour policy.
# Reads env, q_table and the hyperparameters defined earlier in the cell; updates
# q_table in place and decays epsilon once per episode.
for episode in range(num_episodes):
    state, _ = env.reset()
    done = False
    step = 0
    total_reward = 0

    while not done and step < max_steps_per_episode:
        # Exploration-exploitation tradeoff
        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()  # Explore
        else:
            action = np.argmax(q_table[state, :])  # Exploit

        # Take the action and observe the outcome. step() returns
        # (observation, reward, terminated, truncated, info); the episode is over
        # when either flag is set, so neither may be discarded.
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated

        # Update the Q-table. Only bootstrap from the next state when the
        # transition did not end in a terminal state: a terminal state has no
        # future return, whereas a time-limit truncation still does.
        old_value = q_table[state, action]
        next_max = 0.0 if terminated else np.max(q_table[next_state, :])
        new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
        q_table[state, action] = new_value

        state = next_state
        step += 1
        total_reward += reward

    # Decay the exploration rate, clamped so it never drops below epsilon_min
    # (a bare "if epsilon > epsilon_min: epsilon *= decay" undershoots the floor
    # on the last multiplication).
    epsilon = max(epsilon_min, epsilon * epsilon_decay)

    # Progress report every 100 episodes
    if (episode + 1) % 100 == 0:
        print(f'Episode {episode + 1}/{num_episodes} - Total reward: {total_reward} - Epsilon: {epsilon}')
        print(f'Q-table snapshot:\n{q_table}')

# Evaluate the agent: run the greedy policy (no exploration, no learning) and
# report the mean undiscounted episode reward.
num_eval_episodes = 100
total_rewards = 0

for episode in range(num_eval_episodes):
    state, _ = env.reset()
    done = False
    step = 0
    episode_reward = 0

    while not done and step < max_steps_per_episode:
        action = np.argmax(q_table[state, :])  # Always exploit during evaluation
        # step() returns (observation, reward, terminated, truncated, info); stop on
        # either flag so a truncated environment is never stepped again, even if
        # max_steps_per_episode is later raised.
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        episode_reward += reward
        state = next_state
        step += 1

    total_rewards += episode_reward

average_reward = total_rewards / num_eval_episodes
print(f'Average reward over {num_eval_episodes} evaluation episodes: {average_reward}')
env.close()  # release the environment's resources once evaluation is finished


  if not isinstance(terminated, (bool, np.bool8)):


Episode 100/1000 - Total reward: 0.0 - Epsilon: 0.6057704364907278
Q-table snapshot:
[[9.27511024e-03 2.01158702e-02 3.40658858e-03 9.68163730e-03]
 [8.14773730e-03 0.00000000e+00 1.79446270e-05 9.36267024e-04]
 [1.16384216e-04 0.00000000e+00 5.46316752e-07 1.67596645e-07]
 [5.51835103e-06 0.00000000e+00 0.00000000e+00 5.46316752e-07]
 [9.56851160e-03 2.78119256e-02 0.00000000e+00 5.18742410e-03]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 3.37056390e-03 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [1.36302423e-02 0.00000000e+00 4.71395236e-02 1.27754813e-02]
 [3.33173744e-04 1.02032678e-01 2.56337725e-02 0.00000000e+00]
 [4.81818842e-03 2.12789433e-01 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 4.33204200e-03 2.71489961e-01 1.38131403e-02]
 [2.36530939e-02 2.58686039e-01 6