## Make the Environment compatible with the gymnasium library

Create a class that inherits from `gym.Env`, `class MazeGameEnv(gym.Env)`, with the following methods:
- `def __init__(self, render_mode=None)`
- `def reset(self, seed=None, options=None) -> observation, info`
- `def step(self, action) -> observation, reward, terminated, truncated, info`
- `def render(self)`
- `def close(self)`

It must have the following attributes:
- `self.action_space`
- `self.observation_space`

Both must be space objects from `gymnasium.spaces` (imported with `from gymnasium import spaces`).

For more details, see the [Gymnasium library guide: Make your own custom environment](https://www.gymlibrary.dev/content/environment_creation/).

In [14]:
!pip install gymnasium -q

In [15]:
import numpy as np
from io import BytesIO
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import gymnasium as gym
from gymnasium import spaces

class MazeGameEnv(gym.Env):
    """A 3x3 emoji grid world in which the player walks from (0, 0) to (2, 2).

    Positions are ``(row, column)`` tuples. Actions are ``0`` = up (row - 1),
    ``1`` = down (row + 1), ``2`` = left (column - 1) and ``3`` = right
    (column + 1); a move that would leave the grid keeps the player in place.
    Every step yields a reward of ``-1`` except the step that reaches the
    goal, which yields ``+1`` and terminates the episode. An episode is
    truncated once ``max_steps`` steps have been taken.

    The observation is the player position as a length-2 integer array.

    When ``render_mode`` is ``"human"`` or ``"rgb_array"``, every ``reset``
    and ``step`` draws the board with matplotlib and appends the resulting
    image to ``board_history``.
    """

    metadata = {"render_modes": ["human", "rgb_array"]}

    # Pristine layout used to (re)build ``self.board`` at the start of each
    # episode. Kept immutable so an episode can never corrupt it.
    _INITIAL_BOARD = (
        ('😊', '😺', ' '),
        (' ', ' ', ' '),
        ('😺', ' ', '😍'),
    )

    def __init__(self, render_mode=None):
        """Create the environment.

        Args:
            render_mode: ``"human"`` or ``"rgb_array"`` to record a frame on
                every ``reset``/``step``; any other value (including ``None``)
                disables rendering.
        """
        self.render_mode = render_mode
        self.max_steps = 100
        self.step_count = 0
        self.board = self._fresh_board()
        self.player_pos = (0, 0)
        self.goal_pos = (2, 2)
        self.board_history = []

        self.action_space = spaces.Discrete(4) # up, down, left, right
        self.observation_space = spaces.Box(low=0, high=2, shape=(2,), dtype=int)

    def _fresh_board(self):
        """Return a new mutable copy of the initial board layout."""
        return [list(row) for row in self._INITIAL_BOARD]

    def reset(self, seed=None, options=None):
        """Start a new episode and return ``(observation, info)``.

        The board is rebuilt from the initial layout, so cells that the player
        overwrote during the previous episode are restored.
        """
        super().reset(seed=seed)
        self.step_count = 0
        self.player_pos = (0, 0)
        self.board = self._fresh_board()
        self.board_history = []
        observation = self._get_observation()

        self.render()

        info = {}
        return observation, info

    def step(self, action):
        """Apply ``action`` and return ``(observation, reward, terminated, truncated, info)``.

        The cell the player leaves is blanked and the cell the player enters
        is overwritten with the player symbol, so a cat that is stepped on
        disappears for the remainder of the episode.
        """
        self.step_count += 1
        max_row = len(self.board) - 1
        max_col = len(self.board[0]) - 1

        row, col = self.player_pos
        if action == 0 and row > 0:  # Up
            row -= 1
        elif action == 1 and row < max_row:  # Down
            row += 1
        elif action == 2 and col > 0:  # Left
            col -= 1
        elif action == 3 and col < max_col:  # Right
            col += 1

        old_row, old_col = self.player_pos
        self.board[old_row][old_col] = ' '
        self.player_pos = (row, col)
        self.board[row][col] = '😊'

        terminated = self.player_pos == self.goal_pos
        reward = 1 if terminated else -1
        truncated = self.step_count >= self.max_steps
        observation = self._get_observation()

        self.render()

        info = {}
        return observation, reward, terminated, truncated, info

    def _get_observation(self):
        """Return the player position as an array with the observation space's dtype."""
        return np.array(self.player_pos, dtype=self.observation_space.dtype)

    def render(self):
        """Draw the current board if rendering is enabled.

        Returns:
            The image produced by ``_render_frame`` when ``render_mode`` is
            ``"human"`` or ``"rgb_array"``; ``None`` otherwise.
        """
        if self.render_mode in ("human", "rgb_array"):
            return self._render_frame()
        return None

    def _render_frame(self):
        """Draw the board, append the image to ``board_history`` and return it.

        The figure is saved to an in-memory PNG and read back as an array.
        Rendering never modifies ``self.board``: the combined '😊😍' symbol
        shown when the player stands on the goal is only used for drawing.
        """
        n_rows = len(self.board)
        n_cols = len(self.board[0])
        on_goal = self.player_pos == self.goal_pos

        fig, ax = plt.subplots()
        try:
            ax.set_xticks(np.arange(0, n_cols + 1, 1))
            ax.set_yticks(np.arange(0, n_rows + 1, 1))
            ax.grid(True, color='black')

            for i, row in enumerate(self.board):
                for j, symbol in enumerate(row):
                    if on_goal and (i, j) == self.goal_pos:
                        symbol = '😊😍'
                    # Row 0 is drawn at the top, so the y coordinate is flipped.
                    ax.text(j + 0.5, n_rows - 0.5 - i, symbol,
                            ha='center', va='center', fontsize=50, color='blue')

            with BytesIO() as buf:
                fig.savefig(buf, format='png')
                buf.seek(0)
                img = mpimg.imread(buf)
        except Exception:
            # Do not leave a half-drawn figure registered with pyplot.
            plt.close(fig)
            raise

        self.board_history.append(img)
        # In "human" mode the figure is deliberately left open (as in the
        # original implementation) so the active matplotlib backend can
        # display it; in every other mode it is closed right away.
        if self.render_mode != "human":
            plt.close(fig)
        return img

    def close(self):
        """Close the current matplotlib figure, if any."""
        plt.close()

## Train using stable_baselines3 library

In [16]:
!pip install stable-baselines3 -q

In [17]:
from stable_baselines3 import DQN
from stable_baselines3.common.env_checker import check_env
import numpy as np

# Validate the custom environment against the expected Env interface before training.
env = MazeGameEnv()
check_env(env)

# Fixed seed so that repeated runs of this cell are repeatable.
SEED = 0

# Train the agent
agent = DQN("MlpPolicy", env,
            learning_rate=0.001,
            exploration_initial_eps=0.9,
            # Same value as the initial epsilon, so the exploration rate stays
            # at 0.9 for the whole run (no decay).
            # NOTE(review): confirm that a constant epsilon is intended.
            exploration_final_eps=0.9,
            train_freq=1,
            gamma = 0.9,
            learning_starts=32,
            target_update_interval=1,
            seed=SEED,
            verbose=1)
agent.learn(total_timesteps=2000, log_interval=20)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 23.6     |
|    ep_rew_mean      | -21.8    |
|    exploration_rate | 0.9      |
| time/               |          |
|    episodes         | 20       |
|    fps              | 86       |
|    time_elapsed     | 5        |
|    total_timesteps  | 473      |
| train/              |          |
|    learning_rate    | 0.001    |
|    loss             | 0.0286   |
|    n_updates        | 440      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 19.3     |
|    ep_rew_mean      | -17.4    |
|    exploration_rate | 0.9      |
| time/               |          |
|    episodes         | 40       |
|    fps              | 94       |
|    time_elapsed     | 8        |
|    total_timesteps  | 772      |
| train/              |        

<stable_baselines3.dqn.dqn.DQN at 0x7dfc7f2738b0>

## Visualize learned policy

In [18]:
# Print the trained policy in a 3 by 3 dataframe with an up, down, left or right arrow icon

import pandas as pd

# Create a dictionary to map action indices to arrow symbols
action_mapping = {
    0: "↑",  # Up
    1: "↓",  # Down
    2: "←",  # Left
    3: "→"   # Right
}

GRID_SIZE = 3  # the maze is a 3x3 grid


def greedy_action_symbol(row, col):
    """Return the arrow for the agent's deterministic action at cell (row, col)."""
    # predict() expects an ndarray observation, so build one instead of
    # passing the raw (row, col) tuple.
    observation = np.array([row, col])
    action = int(agent.predict(observation, deterministic=True)[0])
    return action_mapping[action]


# Create the policy DataFrame: one arrow per grid cell, rows/columns matching
# the (row, column) layout of the board. Building it from plain lists avoids
# the fixed-width (1 character) string array that np.empty(dtype=str) creates.
policy_data = [
    [greedy_action_symbol(r, c) for c in range(GRID_SIZE)]
    for r in range(GRID_SIZE)
]

policy_df = pd.DataFrame(policy_data)
policy_df

Unnamed: 0,0,1,2
0,↓,↓,↓
1,→,→,↓
2,→,→,↓


## Evaluate using stable_baselines3 library

In [19]:
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.evaluation import evaluate_policy

# Evaluate on a fresh, non-rendering environment wrapped in Monitor.
env = MazeGameEnv(render_mode=None)
eval_env = Monitor(env)

# Ten deterministic episodes; evaluate_policy returns (mean, std) of the episode reward.
evaluation = evaluate_policy(
    agent,
    eval_env,
    deterministic=True,
    n_eval_episodes=10,
)
mean_reward, std_reward = evaluation
print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")

mean_reward=-2.00 +/- 0.0


  and should_run_async(code)


## Play one episode

In [20]:
# Roll out one episode with the greedy policy, recording a frame per step.
env = MazeGameEnv(render_mode="rgb_array")
observation, info = env.reset()
terminated = False
truncated = False
total_reward = 0
# Stop on either end condition: reaching the goal (terminated) or hitting the
# step limit (truncated). Checking only `terminated` would loop forever if the
# policy never reaches the goal.
while not (terminated or truncated):
    action = int(agent.predict(observation, deterministic=True)[0])  # Use predict for evaluation
    observation, reward, terminated, truncated, info = env.step(action)
    total_reward += reward

print(f"Total reward: {total_reward}")

Total reward: -2


## Render the episode

In [21]:
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.animation import FuncAnimation
from matplotlib import rc
# Render animations as interactive JavaScript/HTML when displayed in the notebook.
rc('animation', html='jshtml')

def animate_board_history(board_history, interval=500):
    """Build an animation that plays the recorded board images in order.

    Args:
        board_history: Non-empty sequence of images accepted by ``imshow``,
            one per frame.
        interval: Delay between frames in milliseconds.

    Returns:
        The ``FuncAnimation`` object.

    Raises:
        ValueError: If ``board_history`` is empty (for example because the
            environment was run with rendering disabled).
    """
    if len(board_history) == 0:
        raise ValueError(
            "board_history is empty: run an episode with a render mode enabled first"
        )

    fig, ax = plt.subplots()
    img_plot = ax.imshow(board_history[0])  # Initial plot

    def update(frame):
        # Swap the image data in place instead of redrawing the whole axes.
        img_plot.set_data(board_history[frame])
        return img_plot,

    ani = FuncAnimation(fig, update, frames=len(board_history), interval=interval, blit=True)
    # Close the static figure so only the animation itself is displayed.
    plt.close(fig)
    return ani

animate_board_history(env.board_history)