# Create an Agent to play in the Environment

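Create a class with the following methods:
- `def __init__(self, env)`: store the environment and initialise the Q-table
- `def act(self, observation)`: choose an action during training (explores with some probability)
- `def learn(self, observation, action, reward, next_observation, done)`: update the Q-table from one transition
- `def predict(self, observation)`: return the greedy (best-known) action, used for evaluation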

## Environment from previous unit

In [13]:
import numpy as np
from io import BytesIO
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

class MazeGameEnv():
    """3x3 grid world where a player walks from the top-left cell to the goal.

    Observations are the player's ``(row, col)`` tuple. Actions are integers:
    0 = up, 1 = down, 2 = left, 3 = right. A move that would leave the grid
    (or an unknown action) keeps the player in place but still counts as a step.
    Every step is rewarded with -1, except the step that reaches the goal,
    which is rewarded with +1. An episode is truncated once ``max_steps``
    steps have been taken.
    """

    # Glyphs drawn on the board.
    PLAYER = '😊'
    PLAYER_ON_GOAL = '😊😍'

    # Static scenery, i.e. the board without the player. It is never mutated,
    # so a cell the player walks away from can be restored to its original
    # content instead of being blanked.
    LAYOUT = (
        (' ', '😺', ' '),
        (' ', ' ', ' '),
        ('😺', ' ', '😍'),
    )

    def __init__(self, render_mode=None):
        """Create the environment.

        Args:
            render_mode: ``"human"`` or ``"rgb_array"`` to record a frame on
                every reset/step; any other value (default ``None``) disables
                rendering.
        """
        self.render_mode = render_mode
        self.max_steps = 100
        self.step_count = 0
        self.player_pos = (0, 0)
        self.goal_pos = (2, 2)
        self.board = self._fresh_board()
        self.board_history = []
        self.action_space = 4  # Up, Down, Left, Right
        self.observation_space = (3, 3)

    def _fresh_board(self):
        """Return a new mutable board: the static layout plus the player marker."""
        board = [list(row) for row in self.LAYOUT]
        row, col = self.player_pos
        board[row][col] = self.PLAYER
        return board

    def reset(self):
        """Start a new episode and return the initial observation ``(0, 0)``."""
        self.step_count = 0
        self.player_pos = (0, 0)
        # Rebuild the whole board: a truncated episode can end with the player
        # anywhere, so patching only the start and goal cells is not enough.
        self.board = self._fresh_board()
        self.board_history = []

        self.render()

        observation = self.player_pos
        return observation

    def step(self, action):
        """Apply one action.

        Args:
            action: 0 = up, 1 = down, 2 = left, 3 = right.

        Returns:
            ``(observation, reward, terminated, truncated)`` where
            ``observation`` is the new ``(row, col)``, ``terminated`` is True
            when the goal is reached and ``truncated`` is True once
            ``max_steps`` steps have been taken.
        """
        self.step_count += 1
        n_rows, n_cols = self.observation_space
        x, y = self.player_pos
        if action == 0 and x > 0:  # Up
            x -= 1
        elif action == 1 and x < n_rows - 1:  # Down
            x += 1
        elif action == 2 and y > 0:  # Left
            y -= 1
        elif action == 3 and y < n_cols - 1:  # Right
            y += 1

        # Put the scenery back on the cell being vacated before drawing the
        # player on the new one (which may be the same cell after a wall bump).
        old_x, old_y = self.player_pos
        self.board[old_x][old_y] = self.LAYOUT[old_x][old_y]
        self.player_pos = (x, y)
        self.board[x][y] = self.PLAYER

        terminated = self.player_pos == self.goal_pos
        truncated = self.step_count >= self.max_steps
        reward = 1 if terminated else -1

        self.render()

        observation = self.player_pos
        return observation, reward, terminated, truncated

    def render(self):
        """Record a frame when a rendering mode is enabled; otherwise do nothing."""
        if self.render_mode == "human" or self.render_mode == "rgb_array":
            self._render_frame()

    def _render_frame(self):
        """Draw the board, append the image to ``board_history`` and return it."""
        fig, ax = plt.subplots()
        ax.set_xticks(np.arange(0, 4, 1))
        ax.set_yticks(np.arange(0, 4, 1))
        ax.grid(True, color='black')

        # Show both glyphs once the player is standing on the goal.
        if self.player_pos == self.goal_pos:
            self.board[self.goal_pos[0]][self.goal_pos[1]] = self.PLAYER_ON_GOAL

        # Row 0 is drawn at the top, hence the inverted y coordinate.
        for i in range(3):
            for j in range(3):
                ax.text(j + 0.5, 2.5 - i, self.board[i][j], ha='center', va='center', fontsize=50, color='blue')

        # Round-trip through an in-memory PNG to obtain the frame as an array.
        with BytesIO() as buf:
            fig.savefig(buf, format='png')
            buf.seek(0)
            img = mpimg.imread(buf)
        self.board_history.append(img)
        # In "human" mode the figure is left open; otherwise it is closed here.
        if self.render_mode != "human":
            plt.close(fig)
        return img

    def close(self):
        """Close the current matplotlib figure."""
        plt.close()

## Agent to implement

In [14]:
import random
class QLearningAgent():
    """Tabular Q-learning agent with epsilon-greedy exploration for a grid world.

    Observations are ``(row, col)`` tuples; they are flattened row-major into
    a row index of ``q_table``, whose columns are the actions.
    """

    def __init__(self, env, learning_rate=0.1, discount_factor=0.9, exploration_rate=0.9):
        """Create the agent with an all-zero Q-table.

        Args:
            env: Environment the agent plays in. If it exposes
                ``observation_space`` as ``(n_rows, n_cols)`` and
                ``action_space`` as an integer action count, the Q-table is
                sized from them; otherwise a 3x3 grid with 4 actions is assumed.
            learning_rate: Step size of each Q-value update.
            discount_factor: Weight of the best next-state value in the target.
            exploration_rate: Probability that ``act`` picks a random action.
        """
        self.env = env
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.exploration_rate = exploration_rate
        n_rows, n_cols = getattr(env, "observation_space", (3, 3))
        self._n_cols = n_cols
        self._n_actions = getattr(env, "action_space", 4)
        # One row per state (row * n_cols + col), one column per action.
        self.q_table = np.zeros((n_rows * n_cols, self._n_actions))

    def _state_index(self, observation):
        """Flatten a ``(row, col)`` observation into its Q-table row index."""
        return observation[0] * self._n_cols + observation[1]

    def act(self, observation):
        """Pick an action for training: random with probability
        ``exploration_rate``, otherwise the greedy one."""
        if random.uniform(0, 1) < self.exploration_rate:
            return random.randrange(self._n_actions)  # Explore
        return self.predict(observation)  # Exploit

    def learn(self, observation, action, reward, next_observation, done):
        """Apply one Q-learning update for a single transition.

        Args:
            observation: State the action was taken in.
            action: Index of the action taken.
            reward: Reward received for the transition.
            next_observation: State reached after the action.
            done: True when ``next_observation`` is terminal, in which case
                the target is the reward alone (no bootstrapping).
        """
        state = self._state_index(observation)
        if done:
            target = reward
        else:
            next_state = self._state_index(next_observation)
            target = reward + self.discount_factor * np.max(self.q_table[next_state])
        self.q_table[state, action] += self.learning_rate * (target - self.q_table[state, action])

    def predict(self, observation):
        """Return the greedy action for ``observation`` (ties go to the lowest index)."""
        return int(np.argmax(self.q_table[self._state_index(observation)]))

## Agent training

In [15]:
from tqdm import tqdm

import random

# Hyperparameters
learning_rate = 0.1
discount_factor = 0.9
exploration_rate = 0.9
num_episodes = 100
SEED = 42

# The agent explores through the `random` module; seeding it makes the learned
# Q-table (and every output derived from it) reproducible on a fresh re-run.
random.seed(SEED)

# Initialize environment and agent
env = MazeGameEnv()
agent = QLearningAgent(env, learning_rate, discount_factor, exploration_rate)

# Training loop
for episode in tqdm(range(num_episodes)):
    observation = env.reset()  # Reset environment at the start of each episode
    terminated, truncated = False, False
    while not (terminated or truncated):
        action = agent.act(observation)
        next_observation, reward, terminated, truncated = env.step(action)
        # Only `terminated` is passed as `done`: a step-limit truncation is not
        # a terminal state, so the update still bootstraps from the next state.
        agent.learn(observation, action, reward, next_observation, terminated)
        observation = next_observation

print("Training finished.")

100%|██████████| 100/100 [00:00<00:00, 2446.47it/s]

Training finished.





## QTable trained values

In [16]:
# Print pretty qtable with states, actions and the max value of each row in a different color

import pandas as pd

# Requires the trained `agent` from the training cell.
# Row labels "(row, col)", generated row by row so that they line up with the
# rows of agent.q_table.
states = [f"({i}, {j})" for i in range(3) for j in range(3)]

# Q-values as a table: one row per state, one column per action.
df = pd.DataFrame(agent.q_table, columns=['Up', 'Down', 'Left', 'Right'], index=states)

# Best Q-value of each state.
max_values = df.max(axis=1)


def _highlight_row_max(row):
    """Return one CSS string per cell, marking the row's maximum in yellow."""
    best = row.max()
    return ['background-color: yellow' if val == best else '' for val in row]


# Highlight the best action(s) of every state.
styled_df = df.style.apply(_highlight_row_max, axis=1)

# Last expression of the cell: the notebook displays the styled table.
styled_df

Unnamed: 0,Up,Down,Left,Right
"(0, 0)",-2.809135,-2.016077,-2.806693,-2.003234
"(0, 1)",-2.000074,-1.110584,-2.802371,-1.11033
"(0, 2)",-1.113211,-0.111554,-1.987817,-1.124938
"(1, 0)",-2.751776,-1.184193,-2.00512,-1.108001
"(1, 1)",-1.955445,-0.153516,-1.993606,-0.108783
"(1, 2)",-1.106806,0.998003,-1.102972,-0.132737
"(2, 0)",-1.983833,-1.194479,-1.163243,-0.153509
"(2, 1)",-1.045924,-0.228355,-1.164824,0.986697
"(2, 2)",0.0,0.0,0.0,0.0


## Trained Policy

In [17]:
# Print the trained policy in a 3 by 3 dataframe with an up, down, left or right arrow icon

import pandas as pd

# Arrow glyph for each action index (0 = up, 1 = down, 2 = left, 3 = right).
action_mapping = {0: "↑", 1: "↓", 2: "←", 3: "→"}

# Greedy action of every cell, laid out like the 3x3 board.
policy_data = np.array(
    [[action_mapping[agent.predict((r, c))] for c in range(3)] for r in range(3)],
    dtype=str,
)

# Last expression of the cell: the notebook displays the policy grid.
policy_df = pd.DataFrame(policy_data)
policy_df

Unnamed: 0,0,1,2
0,→,→,↓
1,→,→,↓
2,→,→,↑


## Agent Evaluation

In [18]:
# Evaluate the trained agent
def evaluate_agent(agent, env, num_episodes=100):
    """Run the agent's greedy policy and return the mean reward per episode.

    Args:
        agent: Object with a ``predict(observation)`` method returning an action.
        env: Object with ``reset()`` returning an observation and
            ``step(action)`` returning
            ``(observation, reward, terminated, truncated)``.
        num_episodes: Number of episodes to play.

    Returns:
        Sum of all rewards collected, divided by ``num_episodes``.
    """
    reward_sum = 0
    for _ in tqdm(range(num_episodes)):
        obs = env.reset()
        finished = False
        while not finished:
            # predict() rather than act(): evaluation uses no exploration.
            obs, step_reward, terminated, truncated = env.step(agent.predict(obs))
            reward_sum += step_reward
            finished = terminated or truncated
    return reward_sum / num_episodes

# Number of greedy evaluation episodes; used both for the run and for the
# summary line, so the reported count cannot drift from the actual one.
num_eval_episodes = 100

# Fresh environment without rendering.
env = MazeGameEnv()
average_reward = evaluate_agent(agent, env, num_episodes=num_eval_episodes)
print(f"Average reward over {num_eval_episodes} episodes: {average_reward}")

100%|██████████| 100/100 [00:00<00:00, 26843.55it/s]

Average reward over 100 episodes: -2.0





## Sample episode

In [19]:
# With render_mode="rgb_array" every reset/step appends a frame to
# env.board_history.
env = MazeGameEnv(render_mode="rgb_array")
observation = env.reset()
total_reward = 0

# Play one greedy episode until the goal is reached or the step limit hits.
while True:
    action = agent.predict(observation)  # greedy action, no exploration
    observation, reward, terminated, truncated = env.step(action)
    total_reward += reward
    if terminated or truncated:
        break

print(f"Total reward: {total_reward}")

Total reward: -2


## Animation function from previous unit

In [20]:
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.animation import FuncAnimation
from matplotlib import rc
rc('animation', html='jshtml')

def animate_board_history(board_history):
    """Build an animation that plays the given frames in order.

    Args:
        board_history: Non-empty, indexable sequence of images that
            ``ax.imshow`` can display; the first frame is shown initially.

    Returns:
        A ``FuncAnimation`` with one step per frame, 500 ms apart.
    """
    fig, ax = plt.subplots()
    frame_artist = ax.imshow(board_history[0])  # start on the first frame

    def _show_frame(index):
        # Swap the image data in place; blitting needs the changed artists back.
        frame_artist.set_data(board_history[index])
        return (frame_artist,)

    n_frames = len(board_history)
    animation = FuncAnimation(fig, _show_frame, frames=n_frames, interval=500, blit=True)
    # Presumably closed so that only the animation, not a static figure, is shown.
    plt.close(fig)
    return animation

# Replay the frames recorded on `env`; as the cell's last expression, the
# returned animation is what the notebook displays.
animate_board_history(env.board_history)