In [23]:

import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import animation
import os
from IPython.display import clear_output
import random
import matplotlib.pyplot as plt
import pickle
import time

In [24]:
def save_frames_as_gif(frames, episode, algorithm_type, path='./Algorithm_Animations', filename='gym_animation.gif'):
    """Write a sequence of image frames to an animated GIF.

    Args:
        frames: Non-empty sequence of image arrays accepted by ``plt.imshow``;
            the first frame's ``shape[0]`` / ``shape[1]`` set the figure
            height / width.
        episode: Episode identifier shown in the figure title.
        algorithm_type: Algorithm label shown in the figure title.
        path: Directory the GIF is written to; created if it does not exist.
        filename: File name of the GIF inside ``path``.

    Raises:
        IndexError: If ``frames`` is an empty sequence.
    """
    height, width = frames[0].shape[0], frames[0].shape[1]
    # At 72 dpi a figure of (pixels / 72) inches maps one frame pixel to one
    # figure pixel. Change the divisor/dpi pair to scale the output.
    fig = plt.figure(figsize=(width / 72.0, height / 72.0), dpi=72)
    try:
        patch = plt.imshow(frames[0])
        plt.axis('off')
        plt.title(f"Run from episode {episode} {algorithm_type}")

        def animate(i):
            # Swap the displayed image instead of redrawing the whole axes.
            patch.set_data(frames[i])

        anim = animation.FuncAnimation(fig, animate, frames=len(frames), interval=50)

        # Join with a real path separator: plain concatenation produced
        # "./Algorithm_Animationsgym_animation.gif" for the default arguments.
        if path:
            os.makedirs(path, exist_ok=True)
        output_file = os.path.join(path, filename)
        # The 'imagemagick' writer needs the ImageMagick binary on the system.
        anim.save(output_file, writer='imagemagick', fps=30)
    finally:
        # Release the figure so repeated calls do not accumulate open figures.
        plt.close(fig)

In [33]:
def _rolling_sum(values, window):
    """Return trailing sums: out[t] = sum(values[max(0, t - window + 1) : t + 1])."""
    totals = np.cumsum(values, dtype=float)
    if window < len(totals):
        # Subtract everything older than `window` entries from each prefix sum.
        totals[window:] -= totals[:-window].copy()
    return totals


def run(episodes, is_training=True, render=False):
    """Train or evaluate a tabular Q-learning agent on ``CliffWalking-v0``.

    When training, a zero-initialised Q-table is learned with an
    epsilon-greedy policy (epsilon decays linearly per episode down to 0.01)
    and is pickled to ``CliffWalking.pkl`` at the end. When not training, the
    Q-table is loaded from that file and followed greedily without updates.
    In both modes a plot of the rolling reward sum is saved to
    ``CliffWalking.png``.

    Args:
        episodes: Number of episodes to run.
        is_training: If True, explore and update the Q-table; otherwise load
            the saved table and act greedily.
        render: If True, create the environment with ``render_mode="human"``.

    Returns:
        numpy.ndarray of length ``episodes`` where entry ``t`` is the sum of
        episode rewards over the last (up to) 1000 episodes ending at ``t``.
    """
    env = gym.make('CliffWalking-v0', render_mode="human" if render else None)

    learning_rate_a = 0.1        # alpha: step size of each Q update
    discount_factor_g = 0.9      # gamma: weight of the next state's best value
    epsilon = 1                  # start fully exploratory
    epsilon_decay_rate = 0.0005
    min_epsilon = 0.01
    rng = np.random.default_rng()
    RENDER_AT_EPISODE = 10
    window_size = 1000
    rewards_per_episode = np.zeros(episodes)

    # try/finally so the environment is closed even if the run is interrupted
    # (e.g. KeyboardInterrupt from stopping the notebook cell) or loading fails.
    try:
        if is_training:
            q = np.zeros((env.observation_space.n, env.action_space.n))
        else:
            # NOTE(security): pickle.load can execute arbitrary code; only load
            # a CliffWalking.pkl that this notebook produced itself.
            with open('CliffWalking.pkl', 'rb') as f:
                q = pickle.load(f)

        for i in range(episodes):
            if i % 10_000 == 0:
                print(f"Currently on episode {i}")
            state = env.reset()[0]
            terminated = False
            truncated = False
            episode_reward = 0

            while not (terminated or truncated):
                if is_training and rng.random() < epsilon:
                    action = env.action_space.sample()  # explore: random action
                else:
                    action = int(np.argmax(q[state, :]))  # exploit: best known action

                new_state, reward, terminated, truncated, _ = env.step(action)

                if is_training:
                    # Q-learning target: immediate reward plus the discounted
                    # value of the best action in the next state. A terminal
                    # state has no future, so nothing is bootstrapped from it.
                    future_value = 0.0 if terminated else np.max(q[new_state, :])
                    td_target = reward + discount_factor_g * future_value
                    q[state, action] += learning_rate_a * (td_target - q[state, action])

                state = new_state
                episode_reward += reward

            if i % RENDER_AT_EPISODE == 0:
                clear_output(wait=True)
                # Only render when a render mode was requested at creation.
                if render:
                    env.render()

            epsilon = max(epsilon - epsilon_decay_rate, min_epsilon)
            rewards_per_episode[i] = episode_reward
            print(f"Episode {i}/{episodes}, Epsilon: {epsilon:.4f}, Reward: {episode_reward:.4f}")
    finally:
        env.close()

    sum_rewards = _rolling_sum(rewards_per_episode, window_size)

    if episodes > 0:
        # Mean per-episode reward over the final window (not a mean of sums).
        print("Final average reward:", np.mean(rewards_per_episode[-window_size:]))
    plt.plot(sum_rewards)
    plt.xlabel('Episodes')
    plt.ylabel('Sum of rewards')
    plt.title('Sum of rewards over time')
    plt.savefig('CliffWalking.png')

    if is_training:
        with open("CliffWalking.pkl", "wb") as f:
            pickle.dump(q, f)

    return sum_rewards

In [34]:
# Entry point: train for 10,000 episodes with no render window.
if __name__ == "__main__":
    run(episodes=10_000, render=False, is_training=True)

Episode 290/10000, Epsilon: 0.8545, Reward: -3590.0000
Episode 291/10000, Epsilon: 0.8540, Reward: -263.0000


KeyboardInterrupt: 

: 