# Create a dummy agent to play in the Environment

In this unit we are going to create an Agent class that can act on the environment. It is a dummy agent that only takes random actions and doesn't learn anything. We are focusing only on the class structure, which follows the pattern of the stable_baselines3 library.

Create a class with the following functions:
- `def __init__(self, env)`
- `def act(self, observation) -> action`
- `def learn(self, observation, action, reward, next_observation, done)`
- `def predict(self, observation) -> action`

In [23]:
import numpy as np
from io import BytesIO
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

class MazeGameEnv():
    """A tiny 3x3 emoji grid world with a gym-like ``reset``/``step`` API.

    The player ('😊') starts at ``(0, 0)`` and has to reach the goal ('😍') at
    ``(2, 2)``. Positions are ``(row, column)`` tuples. Actions are integers:
    0 = up, 1 = down, 2 = left, 3 = right. Every step yields a reward of -1,
    except the step that reaches the goal, which yields +1. An episode is
    truncated once ``max_steps`` steps have been taken.

    When ``render_mode`` is ``"human"`` or ``"rgb_array"``, every ``reset``
    and ``step`` renders the board with matplotlib and appends the resulting
    image to ``board_history``.
    """

    # Pristine layout; copied on construction and on every reset so that a
    # new episode never inherits cells modified by the previous one.
    _INITIAL_BOARD = (
        ('😊', '😺', ' '),
        (' ', ' ', ' '),
        ('😺', ' ', '😍'),
    )

    def __init__(self, render_mode=None):
        """Create the environment.

        Args:
            render_mode: ``None`` (no rendering), ``"human"`` or
                ``"rgb_array"``.
        """
        self.render_mode = render_mode
        self.max_steps = 100
        self.step_count = 0
        self.board = self._fresh_board()
        self.player_pos = (0, 0)
        self.goal_pos = (2, 2)
        self.board_history = []
        self.action_space = 4  # Up, Down, Left, Right
        self.observation_space = (3, 3)

    @classmethod
    def _fresh_board(cls):
        """Return a new mutable copy of the initial board layout."""
        return [list(row) for row in cls._INITIAL_BOARD]

    def reset(self):
        """Start a new episode and return the initial observation.

        The whole board is rebuilt from the initial layout. Only restoring
        the start and goal cells would leave a stale player marker behind
        after a truncated episode and would never bring back cells that
        ``step`` blanked out.

        Returns:
            The player's starting position, ``(0, 0)``.
        """
        self.step_count = 0
        self.player_pos = (0, 0)
        self.board = self._fresh_board()
        self.board_history = []

        self.render()

        observation = self.player_pos
        return observation

    def step(self, action):
        """Apply ``action`` and advance the episode by one step.

        A move that would leave the grid, or an unknown action value, leaves
        the player where it is (the step is still counted).

        Args:
            action: 0 = up, 1 = down, 2 = left, 3 = right.

        Returns:
            A tuple ``(observation, reward, terminated, truncated)`` where
            ``observation`` is the new ``(row, column)`` player position.
        """
        self.step_count += 1
        rows, cols = self.observation_space
        x, y = self.player_pos
        if action == 0 and x > 0:  # Up
            x -= 1
        elif action == 1 and x < rows - 1:  # Down
            x += 1
        elif action == 2 and y > 0:  # Left
            y -= 1
        elif action == 3 and y < cols - 1:  # Right
            y += 1

        # Blank the cell being left, then draw the player in the new cell.
        self.board[self.player_pos[0]][self.player_pos[1]] = ' '
        self.player_pos = (x, y)
        self.board[x][y] = '😊'

        terminated = self.player_pos == self.goal_pos
        truncated = self.step_count >= self.max_steps
        reward = 1 if terminated else -1

        self.render()

        observation = self.player_pos
        return observation, reward, terminated, truncated

    def render(self):
        """Render one frame if a rendering mode is enabled; otherwise no-op."""
        if self.render_mode == "human" or self.render_mode == "rgb_array":
            self._render_frame()

    def _render_frame(self):
        """Draw the board, store the image in ``board_history``, return it."""
        fig, ax = plt.subplots()
        ax.set_xticks(np.arange(0, 4, 1))
        ax.set_yticks(np.arange(0, 4, 1))
        ax.grid(True, color='black')

        # Show player and goal together once the goal has been reached.
        if self.player_pos == self.goal_pos:
            self.board[self.goal_pos[0]][self.goal_pos[1]] = '😊😍'

        for i in range(3):
            for j in range(3):
                ax.text(j + 0.5, 2.5 - i, self.board[i][j], ha='center',
                        va='center', fontsize=50, color='blue')

        # Round-trip through an in-memory PNG to obtain the frame as an array.
        with BytesIO() as buf:
            fig.savefig(buf, format='png')
            buf.seek(0)
            img = mpimg.imread(buf)
        self.board_history.append(img)

        # In "human" mode the figure is left open so that it gets displayed.
        if self.render_mode != "human":
            plt.close(fig)
        return img

    def close(self):
        """Close the current matplotlib figure."""
        plt.close()

In [24]:
import random

class DummyAgent():
    """Agent that picks uniformly random actions and never learns.

    It exists to demonstrate the agent interface (``act`` / ``learn`` /
    ``predict``) rather than to solve the environment.
    """

    def __init__(self, env):
        """Remember how many discrete actions the environment offers.

        Args:
            env: Any object exposing an integer ``action_space`` attribute
                (the number of available actions).
        """
        self.action_space = env.action_space

    def act(self, observation):
        """Return a random action in ``[0, action_space)``.

        The observation is ignored.
        """
        return random.randrange(self.action_space)

    def learn(self, observation, action, reward, next_observation, done):
        """Do nothing: a dummy agent does not learn from experience."""
        pass

    def predict(self, observation):
        """Return the action for ``observation``.

        There is no learned policy to consult, so this falls back to the same
        random choice as ``act`` instead of returning ``None``.
        """
        return self.act(observation)

## Run one episode until the agent reaches the goal (or the step limit is hit)

In [25]:
# Environment that records a frame on every reset/step, plus a random agent.
env = MazeGameEnv(render_mode="rgb_array")
agent = DummyAgent(env)

# Play a single episode: keep stepping until the goal is reached (terminated)
# or the step limit is hit (truncated), accumulating the reward on the way.
total_reward = 0
terminated = truncated = False
observation = env.reset()
while not (terminated or truncated):
    action = agent.act(observation)
    observation, reward, terminated, truncated = env.step(action)
    total_reward += reward

print(f"Total reward: {total_reward}")

Total reward: -7


In [26]:
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.animation import FuncAnimation
from matplotlib import rc
# Set matplotlib's 'animation.html' option to 'jshtml' so that animations are
# shown in notebook output as interactive JavaScript/HTML players.
rc('animation', html='jshtml')

def animate_board_history(board_history):
    """Build an animation that plays back a sequence of rendered frames.

    Args:
        board_history: Non-empty iterable of images (anything accepted by
            ``Axes.imshow``), one per frame, in playback order.

    Returns:
        A ``FuncAnimation`` showing each frame for 500 ms.

    Raises:
        ValueError: If ``board_history`` contains no frames, e.g. because
            the environment was run with rendering disabled.
    """
    # Snapshot the frames so that any iterable works and later changes to the
    # caller's list cannot affect the (lazily rendered) animation.
    frames = list(board_history)
    # Validate before creating a figure so that nothing is leaked on error.
    if not frames:
        raise ValueError(
            "board_history is empty; run the environment with a render_mode "
            "that records frames before animating"
        )

    fig, ax = plt.subplots()
    img_plot = ax.imshow(frames[0])  # Initial plot

    def update(frame):
        # Swap the image data in place; blitting needs the changed artists.
        img_plot.set_data(frames[frame])
        return img_plot,

    ani = FuncAnimation(fig, update, frames=len(frames), interval=500, blit=True)
    # Close the figure so the notebook does not also show a static copy.
    plt.close(fig)
    return ani

# Play back the frames stored in env.board_history; the returned animation is
# the last expression of the cell, so the notebook displays it.
animate_board_history(env.board_history)

# Evaluate the agent over 100 episodes

In [27]:
# render_mode=None: the environment draws no frames on reset/step.
env = MazeGameEnv(render_mode=None)
# The agent only reads env.action_space when it is constructed.
agent = DummyAgent(env)

# Evaluate an agent by averaging its episode rewards
def evaluate_agent(agent, env, num_episodes=100):
    """Run ``agent`` in ``env`` for several episodes and average the reward.

    Prints the reward of every episode as it finishes.

    Args:
        agent: Object with an ``act(observation)`` method returning an action.
        env: Object with ``reset() -> observation`` and
            ``step(action) -> (observation, reward, terminated, truncated)``.
        num_episodes: Number of episodes to run; must be at least 1.

    Returns:
        The mean total reward per episode.

    Raises:
        ValueError: If ``num_episodes`` is less than 1 (an average over zero
            episodes is undefined).
    """
    if num_episodes < 1:
        raise ValueError(f"num_episodes must be at least 1, got {num_episodes}")

    total_reward = 0
    for i in range(num_episodes):
        observation = env.reset()
        terminated, truncated = False, False
        episode_reward = 0
        while not terminated and not truncated:
            action = agent.act(observation)
            # Use the observation returned by step() itself, so this works
            # with any environment following the reset/step protocol.
            observation, reward, terminated, truncated = env.step(action)
            episode_reward += reward
        print(f"Episode {i+1} reward: {episode_reward}")
        total_reward += episode_reward
    return total_reward / num_episodes

# Fresh environment with the default render_mode (None).
env = MazeGameEnv()
# evaluate_agent runs its default of 100 episodes, which is the count quoted
# in the message below.
average_reward = evaluate_agent(agent, env)
print(f"Average reward over 100 episodes: {average_reward}")

Episode 1 reward: -9
Episode 2 reward: -16
Episode 3 reward: -43
Episode 4 reward: -12
Episode 5 reward: -22
Episode 6 reward: -24
Episode 7 reward: -10
Episode 8 reward: -15
Episode 9 reward: -67
Episode 10 reward: -13
Episode 11 reward: -11
Episode 12 reward: -18
Episode 13 reward: -2
Episode 14 reward: -34
Episode 15 reward: -24
Episode 16 reward: -13
Episode 17 reward: -10
Episode 18 reward: -59
Episode 19 reward: -68
Episode 20 reward: -10
Episode 21 reward: -34
Episode 22 reward: -47
Episode 23 reward: -20
Episode 24 reward: -3
Episode 25 reward: -34
Episode 26 reward: -27
Episode 27 reward: -11
Episode 28 reward: -15
Episode 29 reward: -22
Episode 30 reward: -43
Episode 31 reward: -76
Episode 32 reward: -9
Episode 33 reward: -36
Episode 34 reward: -10
Episode 35 reward: -23
Episode 36 reward: -27
Episode 37 reward: -23
Episode 38 reward: -30
Episode 39 reward: -19
Episode 40 reward: -3
Episode 41 reward: -4
Episode 42 reward: -6
Episode 43 reward: -3
Episode 44 reward: -7
Episod