In [28]:
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import animation
import os
from IPython.display import clear_output
import random
import matplotlib.pyplot as plt
import pickle

In [29]:
def run(episodes, is_training=True, render=False):
    env = gym.make('FrozenLake-v1', map_name="8x8", is_slippery=True, render_mode='human' if render else None)

    if(is_training):
        q = np.zeros((env.observation_space.n, env.action_space.n))
    else:
        with open('frozen_lake8x8.pkl', 'rb') as f:
            q = pickle.load(f)

    learning_rate_a = 0.1
    discount_factor_g = 0.99
    epsilon = 1
    epsilon_decay_rate = 0.0005
    rng = np.random.default_rng()

    rewards_per_episode = np.zeros(episodes)

    for i in range(episodes):
        print("Episode", i, "/", episodes)
        state = env.reset()[0]
        terminated = False
        truncated = False
        episode_reward = 0

        while(not terminated and not truncated):
            if is_training and rng.random() < epsilon:
                action = env.action_space.sample()
            else:
                action = np.argmax(q[state,:])

            new_state, reward, terminated, truncated, _ = env.step(action)

            if is_training:
                q[state,action] = q[state,action] + learning_rate_a * (
                    reward + discount_factor_g * np.max(q[new_state,:]) - q[state,action]
                )

            state = new_state
            episode_reward += reward

        epsilon = max(epsilon - epsilon_decay_rate, 0.01)
        rewards_per_episode[i] = episode_reward
        print(f"Episode {i}/{episodes}, Epsilon: {epsilon:.4f}, Reward: {episode_reward:.4f}")

        if i % 1000 == 0:
            print(f"Episode {i}/{episodes}, Epsilon: {epsilon:.4f}, Average Reward: {np.mean(rewards_per_episode[max(0, i-1000):i]):.4f}")

    env.close()

    window_size = 1000
    sum_rewards = np.zeros(episodes)
    for t in range(episodes):
        sum_rewards[t] = np.sum(rewards_per_episode[max(0, t-window_size):(t+1)])
    
    print("Final average reward:", np.mean(sum_rewards[-1000:]))
    plt.plot(sum_rewards)
    plt.savefig('frozen_lake8x8.png')

    if is_training:
        with open("frozen_lake8x8.pkl", "wb") as f:
            pickle.dump(q, f)

In [31]:
if __name__ == "__main__":
    run(100, is_training=True, render=True)

Episode 0 / 100
Episode 0/100, Epsilon: 0.9995, Average Reward: nan
Episode 1 / 100
Episode 2 / 100


KeyboardInterrupt: 

: 