In [41]:
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import animation
import os
from IPython.display import clear_output
import random
import matplotlib.pyplot as plt
import pickle
import time
import pandas as pd

In [42]:
rng = np.random.default_rng()

In [43]:
class RewardGraphic():
    def __init__(self, window_size=1000):
        self.window_size = window_size

    def Graphic(self, rewards_per_episode, episodes):
        sum_rewards = np.zeros(episodes)
        for t in range(episodes):
            sum_rewards[t] = np.sum(rewards_per_episode[max(0, t-self.window_size):(t+1)])

        print("Final average reward:", np.mean(sum_rewards[-1000:]))
        plt.plot(sum_rewards)
        plt.xlabel('Episodes')
        plt.ylabel('Sum of rewards')
        plt.title('Sum of rewards over time')
        plt.savefig('CliffWalking.png')

In [44]:
class Game:
    def __init__(self, agent, explore=True):
        self.agent = agent
        self.explore = explore
        self.is_explore()

    def is_explore(self):
        if self.explore:
            self.agent.q = np.zeros((self.agent.observation_space_size, self.agent.action_space_size))
            self.agent.actions = np.zeros(self.agent.action_space_size)
        else:
            with open('highway.pkl', 'rb') as f:
                self.agent.q = pickle.load(f)
                self.agent.actions = np.zeros(self.agent.action_space_size)

    def save_pickle(self):
        if self.explore:
            with open("highway.pkl", "wb") as f:
                pickle.dump(self.agent.q, f)

    def run(self, episodes=100):
        RENDER_AT_EPISODE = 1
        rewards_per_episode = np.zeros(episodes)
        
        for episode in range(episodes):
            state = self.agent.reset()
            self.agent.episode_reward = 0
            
            while True:
                action = self.agent.choose_action(state, self.explore)
                next_state, reward, terminated, truncated, _ = self.agent.env.step(action)
                next_state = self.agent.discretize_state(next_state)
                
                done = terminated or truncated 
                if self.explore:
                    self.agent.learn(state, action, reward, next_state, done)
                
                self.agent.actions[action] += 1
                state = next_state
                self.agent.episode_reward += reward
                
                if done:
                    break
            
            if episode % RENDER_AT_EPISODE == 0:
                clear_output(wait=True)
                self.agent.env.render()
            
            self.agent.epsilon = max(self.agent.epsilon - self.agent.epsilon_decay_rate, 0.01)
            rewards_per_episode[episode] = self.agent.episode_reward
            print(f"Episode {episode}/{episodes}, Epsilon: {self.agent.epsilon:.4f}, Reward: {self.agent.episode_reward:.4f}")
        
        plt.plot(rewards_per_episode)
        plt.xlabel('Episode')
        plt.ylabel('Reward')
        plt.title('Reward per Episode')
        plt.show()
        
        self.save_pickle()
        self.agent.env.close()

In [45]:
class Agent:
    def __init__(self, epsilon=1, epsilon_decay_rate=0.0001, alpha=0.5, gamma=0.99, render=False):
        self.epsilon = epsilon
        self.epsilon_decay_rate = epsilon_decay_rate
        self.alpha = alpha
        self.gamma = gamma
        self.render = render
        self.episode_reward = 0
        self.env = gym.make('highway-v0', render_mode='human' if render else None)
        self.observation_space_size = 10  # Discretización del espacio de observación
        self.action_space_size = self.env.action_space.n
        self.q = np.zeros((self.observation_space_size, self.action_space_size))
        self.actions = np.zeros(self.action_space_size)

    def discretize_state(self, state):
        state_disc = tuple((state * self.observation_space_size).astype(int))
        return hash(state_disc) % self.observation_space_size

    def reset(self):
        return self.discretize_state(self.env.reset()[0])

    def is_explore(self, explore, state):
        if explore:
            return np.argmax(self.q[state, :])
        else:
            max_q = np.max(self.q[state, :])
            actions = np.where(self.q[state, :] == max_q)[0]
            print(f"Estado: {state}, Acción: {np.random.choice(actions)}")
            action = np.random.choice(actions)
            return action

    def choose_action(self, state, explore):
        if explore and np.random.uniform(0, 1) < self.epsilon:
            return self.env.action_space.sample()
        else:
            return self.is_explore(explore, state)

    def learn(self, state, action, reward, next_state, done):
        self.q[state, action] = self.q[state, action] + self.alpha * (
            reward + self.gamma * np.max(self.q[next_state, :]) - self.q[state, action]
        )

In [46]:
agent = Agent()
game = Game(agent, explore=True)
game.run(100)

TypeError: unhashable type: 'numpy.ndarray'

0: Move up

1: Move right

2: Move down

3: Move left

In [121]:
q_df = pd.DataFrame(agent.q)
print("Q-table as DataFrame:")
q_df

Q-table as DataFrame:


Unnamed: 0,0,1,2,3
0,-1.699615e-08,-1.699725e-08,-1.422748e-08,-1.073257e-08
1,-1.699615e-08,-1.699726e-08,-1.422748e-08,-1.073257e-08
2,-1.699616e-08,-1.699727e-08,-1.422748e-08,-1.073257e-08
3,-1.699615e-08,-1.699725e-08,-1.422755e-08,-1.073259e-08
4,-1.699623e-08,-1.69974e-08,-1.422748e-08,-1.07326e-08
5,-1.699639e-08,-1.699733e-08,-1.422747e-08,-1.073268e-08
6,-1.699668e-08,-1.699724e-08,-1.422748e-08,-1.073257e-08
7,-1.699622e-08,-1.699733e-08,-1.422755e-08,-1.073257e-08
8,-1.699668e-08,-1.700085e-08,-1.422796e-08,-1.073263e-08
9,-1.701855e-08,-1.700332e-08,-1.423064e-08,-1.073607e-08
