In [9]:
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import animation
import os
from IPython.display import clear_output
import random
import matplotlib.pyplot as plt
import pickle
import time

In [10]:
rng = np.random.default_rng()

In [11]:
class RewardGraphic():
    def __init__(self, window_size=1000):
        self.window_size = window_size

    def Graphic(self, rewards_per_episode, episodes):
        sum_rewards = np.zeros(episodes)
        for t in range(episodes):
            sum_rewards[t] = np.sum(rewards_per_episode[max(0, t-self.window_size):(t+1)])

        print("Final average reward:", np.mean(sum_rewards[-1000:]))
        plt.plot(sum_rewards)
        plt.xlabel('Episodes')
        plt.ylabel('Sum of rewards')
        plt.title('Sum of rewards over time')
        plt.savefig('CliffWalking.png')

In [12]:
class Game():
    def __init__(self, Agent, explore=True):
        self.agent = Agent
        self.explore = explore
        self.is_explore()

    def is_explore(self):
        if self.explore:
            self.agent.q = np.zeros((self.agent.env.observation_space.n, self.agent.env.action_space.n))
        else:
            with open('CliffWalking.pkl', 'rb') as f:
                self.agent.q = pickle.load(f)
    
    def save_Pickle(self):
        if self.explore:
            with open("CliffWalking.pkl", "wb") as f:
                pickle.dump(self.agent.q, f)
    
    def run(self, episodes=100):
        RENDER_AT_EPISODE = 10
        rewards_per_episode = np.zeros(episodes)
        reward_graphic = RewardGraphic()
        
        for episode in range(episodes):
            state = self.agent.reset()
            self.agent.episode_reward = 0
            
            while True:
                action = self.agent.choose_action(state, self.explore)
                new_state, reward, terminated, truncated, _ = self.agent.env.step(action)
                
                done = terminated or truncated 
                if self.explore:
                    self.agent.learn(state, action, reward, new_state, done)

                state = new_state
                self.agent.episode_reward += reward
                
                if done:
                    break
            
            if episode % RENDER_AT_EPISODE == 0:
                clear_output(wait=True)
                self.agent.env.render()
            
            self.agent.epsilon = max(self.agent.epsilon - self.agent.epsilon_decay_rate, 0.01)
            rewards_per_episode[episode] = self.agent.episode_reward
            print(f"Episode {episode}/{episodes}, Epsilon: {self.agent.epsilon:.4f}, Reward: {self.agent.episode_reward:.4f}")
        
        reward_graphic.Graphic(rewards_per_episode, episodes)
        self.save_Pickle()
        self.agent.env.close()

In [13]:
class Agent():
    def __init__(self, epsilon=1, epsilon_decay_rate=0.0001, alpha=0.5, gamma=0.99, render=False):
        self.epsilon = epsilon
        self.epsilon_decay_rate = epsilon_decay_rate
        self.alpha = alpha
        self.gamma = gamma
        self.render = render
        self.episode_reward = 0
        self.env = gym.make('CliffWalking-v0', render_mode="human" if render else None)
        self.q = None
    
    def reset(self):
        return self.env.reset()[0] 
    
    def choose_action(self, state, explore):
        if explore and np.random.uniform(0,1) < self.epsilon:
            return self.env.action_space.sample()
        else:
            return np.argmax(self.q[state, :])
    
    def learn(self, state, action, reward, next_state, done):
        #self.q[state, action] = self.q[state, action] + self.alpha * (reward - self.q[state, action])
        self.q[state,action] = self.q[state,action] + self.alpha * (reward + self.gamma * np.max(self.q[next_state,:]) - self.q[state,action])

In [15]:
agent = Agent(render=True)
game = Game(agent, explore=False)
game.run(1000)

Episode 10/1000, Epsilon: 0.9989, Reward: -13.0000
Episode 11/1000, Epsilon: 0.9988, Reward: -13.0000
Episode 12/1000, Epsilon: 0.9987, Reward: -13.0000
Episode 13/1000, Epsilon: 0.9986, Reward: -13.0000
Episode 14/1000, Epsilon: 0.9985, Reward: -13.0000


KeyboardInterrupt: 