In [9]:
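# Frozen Lake environment (4x4 map, slippery ice).
#   States : one integer per grid cell, row * 4 + col, so 16 states in total.
#   Actions: 0 = left, 1 = down, 2 = right, 3 = up.
#   Rewards: +1 for reaching the goal, 0 for every other step (including falling in a hole).
#   Ending : `terminated` when the agent reaches the goal or falls in a hole;
#            `truncated` when the environment's step limit is hit.
#   Slippery: the agent only sometimes moves in the chosen direction; otherwise it
#            slides to one of the two perpendicular directions.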

import numpy as np
import gymnasium as gym
import random

# Fixed seed so that "Restart Kernel -> Run All" reproduces the same training run.
SEED = 42

env = gym.make('FrozenLake-v1', map_name="4x4", is_slippery=True)

# Seed every source of randomness the training loop draws from.
random.seed(SEED)            # epsilon-greedy coin flip (random.uniform)
env.action_space.seed(SEED)  # exploratory actions from env.action_space.sample()
env.reset(seed=SEED)         # environment RNG; later reset() calls without a seed keep using it

action_size = env.action_space.n
state_size = env.observation_space.n

# Tabular action-value estimates: one row per state, one column per action, all starting at 0.
q_table = np.zeros((state_size, action_size))

# Set hyperparameters
total_episodes = 10000       # Total episodes
learning_rate = 0.5          # Learning rate
gamma = 0.99                 # Discounting rate
# Note: there is no max_steps setting; the episode length cap is left to the
# environment's `truncated` signal.

# Exploration parameters
max_epsilon = 1.0            # Exploration probability at start
min_epsilon = 0.01           # Minimum exploration probability
decay_rate = 0.001           # Exponential decay rate for exploration prob
epsilon = max_epsilon        # Current exploration rate; starts fully exploratory

# The idea behind decaying epsilon is to shift gradually from exploration to exploitation as training progresses.

# Q-learning algorithm
# Runs `total_episodes` episodes, updating `q_table` in place after every step and
# re-computing `epsilon` after every episode.
for episode in range(total_episodes):
    state, _ = env.reset()
    done = False

    while not done:
        # Epsilon-greedy action selection: exploit the current estimates when the
        # draw exceeds epsilon, otherwise explore with a random action.
        exp_exp_tradeoff = random.uniform(0, 1)
        if exp_exp_tradeoff > epsilon:
            action = np.argmax(q_table[state, :])
        else:
            action = env.action_space.sample()

        # Take the action and observe the outcome
        new_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated  # Either signal ends the episode loop

        # TD target. A terminated transition has no future return, so it must not
        # bootstrap from the next state. A truncated (time-limit) transition is not a
        # true terminal state, so it still bootstraps.
        # With the all-zero initial table the terminal rows are never updated, so this
        # matches the unmasked formula numerically; the mask keeps the update correct
        # if the table is ever initialised to non-zero values.
        if terminated:
            td_target = reward
        else:
            td_target = reward + gamma * np.max(q_table[new_state, :])

        # Move the estimate a fraction `learning_rate` of the way toward the target.
        q_table[state, action] = q_table[state, action] + learning_rate * (td_target - q_table[state, action])

        state = new_state

    # Decay epsilon exponentially in the episode index, from max_epsilon toward min_epsilon
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)

# Training is finished; release the environment before `env` is rebound elsewhere.
env.close()
    

In [10]:
# Use the Q-table to Play Game
# Replays the greedy policy (argmax over each state's Q-values, no exploration) in a
# visible window and reports the per-episode and mean score.
N_EVAL_EPISODES = 5  # Number of episodes to watch

env = gym.make('FrozenLake-v1', map_name="4x4", is_slippery=True, render_mode="human") # Show the guy doing his thing now
rewards = []

try:
    for episode in range(N_EVAL_EPISODES):
        state, _ = env.reset()
        total_rewards = 0
        print("****************************************************")
        print("EPISODE ", episode)

        done = False
        while not done:
            # No explicit env.render() call: with render_mode="human" the environment
            # draws itself on every reset() and step().
            action = np.argmax(q_table[state, :])
            new_state, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated

            total_rewards += reward
            state = new_state

        rewards.append(total_rewards)
        print("Score", total_rewards)
finally:
    # Always close the render window, even if playback fails or is interrupted.
    env.close()

print("Score over time:", sum(rewards) / len(rewards))

****************************************************
EPISODE  0
Score 1.0
****************************************************
EPISODE  1
Score 0.0
****************************************************
EPISODE  2
Score 1.0
****************************************************
EPISODE  3
Score 1.0
****************************************************
EPISODE  4
Score 1.0
Score over time: 0.8


: 