# Q Learning

In this unit we will create an agent that learns by playing: it uses the Q-learning algorithm to build a table (the Q-table) that stores a value for each state-action pair.

In [None]:
import copy
from io import BytesIO
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from matplotlib.animation import FuncAnimation
from matplotlib import rc

# Ensure animations can be displayed inline: configure matplotlib's 'animation'
# settings group so animations use the 'jshtml' HTML representation.
rc('animation', html='jshtml')

class MazeGame:
    """A small grid world rendered with emoji.

    The player ('😊') starts at ``(0, 0)`` and has to reach the goal ('😍') at
    ``(2, 2)``. ``board_values`` holds the number associated with each cell
    (20 for the '😺' cells, 100 for the goal cell, -1 everywhere else).
    Positions are ``(row, column)`` tuples; the animation draws row 0 at the
    top. Every successful move appends a snapshot of the board to
    ``board_history`` so the game can be replayed with ``render_animation``.
    """

    # (row, column) offset applied to the player position for each direction.
    _MOVES = {
        'up': (-1, 0),
        'down': (1, 0),
        'left': (0, -1),
        'right': (0, 1),
    }

    def __init__(self):
        """Create a game in its initial state."""
        self.reset()

    def reset(self):
        """Put the game back into its initial state.

        Rebuilds the board, the per-cell values, the player/goal positions and
        restarts ``board_history`` with a single snapshot of the fresh board.
        Use this instead of re-invoking ``__init__`` on an existing instance.
        """
        self.board = [
            ['😊', '😺', ' '],
            [' ', ' ', ' '],
            ['😺', ' ', '😍']
        ]
        self.board_values = [
            [-1,  20, -1],
            [-1, -1, -1],
            [ 20, -1, 100]
        ]
        self.player_pos = (0, 0)  # Starting position
        self.goal_pos = (2, 2)  # Goal position
        # Board snapshots, one per step, starting with the initial state.
        self.board_history = [copy.deepcopy(self.board)]

    def render_animation(self):
        """
        Creates an animation that shows the progression of the game.

        Returns:
          A ``FuncAnimation`` with one frame per entry in ``board_history``.
        """
        # Derive the grid size from the board instead of hard-coding it.
        n_rows = len(self.board)
        n_cols = len(self.board[0]) if self.board else 0

        fig, ax = plt.subplots()
        ax.set_xticks(np.arange(0, n_cols + 1, 1))
        ax.set_yticks(np.arange(0, n_rows + 1, 1))
        ax.grid(True, color='black')

        # Set limits and reverse y-axis to have (0,0) in top-left
        ax.set_xlim(0, n_cols)
        ax.set_ylim(0, n_rows)
        ax.invert_yaxis()

        # One (initially empty) text object per cell, centred in the cell.
        text_objects = [
            [
                ax.text(j + 0.5, i + 0.5, '', ha='center', va='center', fontsize=50)
                for j in range(n_cols)
            ]
            for i in range(n_rows)
        ]

        # Function to update the board for each frame of the animation
        def update(frame):
            board = self.board_history[frame]
            for i in range(n_rows):
                for j in range(n_cols):
                    text_objects[i][j].set_text(board[i][j])
            # blit=True needs the flat list of artists that were changed.
            return [item for sublist in text_objects for item in sublist]

        # Create the animation
        ani = FuncAnimation(fig, update, frames=len(self.board_history), interval=500, blit=True)
        # Close the figure so only the animation (not a static plot) is shown.
        plt.close(fig)
        return ani

    def move(self, direction):
        """
        Moves the player one cell in the specified direction.

        An unknown direction, or a move that would leave the board, is ignored:
        the board, the player position and the history stay unchanged.

        Args:
          direction: A string indicating the direction to move ('up', 'down', 'left', 'right').
        """
        delta = self._MOVES.get(direction)
        if delta is None:
            return

        x, y = self.player_pos
        new_x, new_y = x + delta[0], y + delta[1]
        inside = 0 <= new_x < len(self.board) and 0 <= new_y < len(self.board[new_x])
        if not inside:
            return

        # Clear the old cell and draw the player in the new one.
        self.board[x][y] = ' '
        self.board[new_x][new_y] = '😊'
        self.player_pos = (new_x, new_y)

        # Check if player reached the goal
        if self.player_pos == self.goal_pos:
            self.board[new_x][new_y] = '😊😍'

        # Append the updated board state to the history
        self.board_history.append(copy.deepcopy(self.board))



In [None]:
import random
from collections import defaultdict

class QLearningAgent:
    """Tabular Q-learning agent for a grid game.

    The game object must expose ``player_pos``, ``goal_pos``, ``board_values``
    (a list of rows of numeric rewards) and ``move(direction)``.

    The state is the player's ``(row, column)`` position. Within an episode the
    agent never steps onto a cell it has already visited, so an episode ends
    either at the goal or when no unvisited neighbouring cell is left.

    Randomness comes from two generators: ``np.random`` (explore-or-exploit
    draw) and ``random`` (choice of the random action); seed both for
    reproducible runs.
    """

    # (row, column) offset of each action.
    _ACTION_DELTAS = {
        'up': (-1, 0),
        'down': (1, 0),
        'left': (0, -1),
        'right': (0, 1),
    }

    def __init__(self, maze_game, learning_rate=0.01, discount_factor=1.0, exploration_rate=1.0, exploration_decay=0.99):
        """
        Args:
          maze_game: The game instance the agent plays on.
          learning_rate: Step size of the Q-learning update.
          discount_factor: Weight of the bootstrapped future value.
          exploration_rate: Initial probability of taking a random valid action.
          exploration_decay: Factor the exploration rate is multiplied by after each training episode.
        """
        self.maze_game = maze_game
        self.actions = ['up', 'down', 'left', 'right']  # Possible actions
        # Q-table: state -> array with one value per action, created on first access.
        self.q_table = defaultdict(lambda: np.zeros(len(self.actions)))
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.exploration_rate = exploration_rate
        self.exploration_decay = exploration_decay
        self.visited = set()  # Keep track of visited states in each episode

    def get_state(self):
        """Return the current state, i.e. the player's position on the board."""
        return self.maze_game.player_pos

    def _grid_shape(self):
        """Return ``(rows, columns)`` of the game's ``board_values`` grid."""
        values = self.maze_game.board_values
        return len(values), (len(values[0]) if values else 0)

    def is_valid_move(self, state, action):
        """
        Check if the move is valid: the action is known, the target cell is
        inside the grid and it has not been visited in the current episode.

        Unknown actions are reported as invalid (``False``) instead of failing.
        """
        delta = self._ACTION_DELTAS.get(action)
        if delta is None:
            return False

        x, y = state
        new_x, new_y = x + delta[0], y + delta[1]

        # Check if the new position is out of bounds
        n_rows, n_cols = self._grid_shape()
        if not (0 <= new_x < n_rows and 0 <= new_y < n_cols):
            return False

        # Check if the new position has already been visited
        return (new_x, new_y) not in self.visited

    def choose_action(self, state, explore=True):
        """
        Choose a valid action using ε-greedy strategy, and avoid invalid or revisited moves.

        Returns:
          The chosen action name, or ``None`` if no valid action is available.
        """
        valid_actions = [action for action in self.actions if self.is_valid_move(state, action)]
        if not valid_actions:
            return None

        if explore and np.random.rand() < self.exploration_rate:
            # Choose a random valid action
            return random.choice(valid_actions)

        # Choose the best action according to Q-table, only from valid actions
        # (ties resolve to the earliest action in self.actions).
        q_values = self.q_table[state]
        return max(valid_actions, key=lambda action: q_values[self.actions.index(action)])

    def learn(self, state, action, reward, next_state):
        """
        Q-learning update rule.

        The bootstrapped future value only considers actions that can actually
        be taken from ``next_state`` (see ``is_valid_move``); actions that are
        never available keep their initial Q-value of 0 and would otherwise
        inflate the target. It is 0 when ``next_state`` is the goal or a dead
        end. ``self.visited`` is expected to already contain ``next_state``.

        Raises:
          ValueError: If ``action`` is not one of ``self.actions``.
        """
        action_index = self.actions.index(action)

        if next_state == self.maze_game.goal_pos:
            # Terminal state: nothing more to collect.
            best_future_q = 0.0
        else:
            next_q = self.q_table[next_state]
            reachable = [
                next_q[index]
                for index, candidate in enumerate(self.actions)
                if self.is_valid_move(next_state, candidate)
            ]
            best_future_q = max(reachable) if reachable else 0.0

        current_q = self.q_table[state][action_index]

        # Update the Q-value for the current state-action pair
        new_q = (1 - self.learning_rate) * current_q + self.learning_rate * (reward + self.discount_factor * best_future_q)
        self.q_table[state][action_index] = new_q

    def update_exploration_rate(self):
        """Reduce the exploration rate (called after each training episode)."""
        self.exploration_rate *= self.exploration_decay

    def _reset_episode(self):
        """Reset the game and the visited-cell set for a new episode."""
        # Prefer a dedicated reset() when the game provides one; otherwise
        # re-run __init__ on the existing instance.
        reset = getattr(self.maze_game, 'reset', None)
        if callable(reset):
            reset()
        else:
            self.maze_game.__init__()
        self.visited = set()

    def _run_episode(self, explore, learn, trace=False, verbose=True):
        """Play one episode and return ``(steps, total_reward, reached_goal)``.

        Args:
          explore: Passed to ``choose_action``; False means purely greedy.
          learn: Whether to update the Q-table after every step.
          trace: Whether to print each move before it is executed.
          verbose: Whether to print a message when the agent gets stuck.
        """
        self._reset_episode()
        state = self.get_state()
        self.visited.add(state)  # Mark the starting position as visited
        total_reward = 0
        steps = 0

        while state != self.maze_game.goal_pos:
            action = self.choose_action(state, explore=explore)
            if action is None:
                if verbose:
                    print("No valid moves left! Stuck.")
                return steps, total_reward, False

            if trace:
                print(f"Step {steps + 1}: Moving {action} from {state}")

            # Execute action in the environment
            self.maze_game.move(action)
            next_state = self.get_state()

            # Add the new state to the visited set (before learning, so the
            # update sees the moves that are really left from next_state).
            self.visited.add(next_state)

            # Calculate reward based on the board values
            x, y = next_state
            reward = self.maze_game.board_values[x][y]

            if learn:
                self.learn(state, action, reward, next_state)

            total_reward += reward
            steps += 1
            state = next_state

        return steps, total_reward, True

    def play(self, episodes=1000, verbose=True):
        """Train the agent for ``episodes`` episodes.

        Args:
          episodes: Number of training episodes.
          verbose: If True (default), print one line per episode.
        """
        for episode in range(episodes):
            steps, total_reward, reached_goal = self._run_episode(
                explore=True, learn=True, verbose=verbose
            )
            if reached_goal and verbose:
                print(f"Episode {episode + 1}: Reached goal in {steps} steps with total reward {total_reward}")

            # Decay the exploration rate
            self.update_exploration_rate()

    def play_one_episode(self):
        """
        Play one episode using the learned Q-table, without exploration, and avoid revisiting cells.

        The Q-table is not updated. Each move and the final result are printed.
        """
        steps, total_reward, reached_goal = self._run_episode(
            explore=False, learn=False, trace=True
        )
        if reached_goal:
            print(f"Reached the goal in {steps} steps with total reward: {total_reward}")


In [None]:
# Seed both random number generators the agent draws from (np.random for the
# explore/exploit decision, random for picking the random action) so that
# re-running this cell reproduces the same training run.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Create the maze game
maze_game = MazeGame()

# Create the Q-learning agent.
# NOTE: discount_factor=0.0 removes the future-value term from the update, so
# the Q-values only track the immediate reward of each move.
# NOTE: exploration_decay=0.0 multiplies the exploration rate by 0 after the
# first episode, so only episode 1 explores; all later episodes are greedy.
agent = QLearningAgent(maze_game, learning_rate=0.1, discount_factor=0.0, exploration_rate=1.0, exploration_decay=0.0)

# Train the agent (you can adjust the number of episodes)
agent.play(episodes=1000)


Episode 1: Reached goal in 4 steps with total reward 118
No valid moves left! Stuck.
Episode 3: Reached goal in 6 steps with total reward 137
Episode 4: Reached goal in 4 steps with total reward 118
Episode 5: Reached goal in 4 steps with total reward 118
Episode 6: Reached goal in 4 steps with total reward 118
Episode 7: Reached goal in 4 steps with total reward 118
Episode 8: Reached goal in 4 steps with total reward 118
Episode 9: Reached goal in 6 steps with total reward 137
Episode 10: Reached goal in 4 steps with total reward 118
Episode 11: Reached goal in 4 steps with total reward 118
Episode 12: Reached goal in 4 steps with total reward 118
Episode 13: Reached goal in 4 steps with total reward 118
Episode 14: Reached goal in 4 steps with total reward 118
Episode 15: Reached goal in 6 steps with total reward 137
Episode 16: Reached goal in 4 steps with total reward 118
Episode 17: Reached goal in 4 steps with total reward 118
Episode 18: Reached goal in 4 steps with total rewar

In [None]:
# Now play one episode using the learned Q-table without exploration: the game is
# reset first, then the agent always takes the best-valued move among those that
# stay on the board and do not revisit a cell, printing each step.
agent.play_one_episode()

Step 1: Moving right from (0, 0)
Step 2: Moving down from (0, 1)
Step 3: Moving left from (1, 1)
Step 4: Moving down from (1, 0)
Step 5: Moving right from (2, 0)
Step 6: Moving right from (2, 1)
Reached the goal in 6 steps with total reward: 137


In [None]:
# Replay the most recent game as an animation; `agent` plays on this same
# `maze_game` object, so its board history holds the last episode's moves.
maze_game.render_animation()