# LAB10

Use reinforcement learning to devise a tic-tac-toe player.

### Deadlines:

* Submission: [Dies Natalis Solis Invicti](https://en.wikipedia.org/wiki/Sol_Invictus)
* Reviews: [Befana](https://en.wikipedia.org/wiki/Befana)

Notes:

* Reviews will be assigned  on Monday, December 4
* You need to commit in order to be selected as a reviewer (ie. better to commit an empty work than not to commit)

In [13]:
import numpy as np

from itertools import combinations

In [14]:
class TicTacToeAgent:
    """
    A Tic-Tac-Toe agent that learns to play the game using Q-learning.
    
    Attributes:
    - q_table: The Q-table that maps states to actions.
    
    Methods:
    - take_action: Takes an action based on the current state and epsilon value.
    - update_q_table: Updates the Q-table based on the current state, action, and reward.
    """
    def __init__(self):
        self.q_table = np.zeros((3**9, 9))
        
    def take_action(self, state, epsilon=0.001):
        """
        Takes an action based on the current state and epsilon value.

        Parameters:
        - state: The current state of the game.
        - epsilon: The exploration rate.

        Returns:
        - The chosen action.
        """
        if np.random.uniform(0, 1) < epsilon:
            return np.random.choice(9)
        else:
            return np.argmax(self.q_table[state.get_board()])
    
    def update_q_table(self, state, action, reward):
        """
        Updates the Q-table based on the current state, action, and reward.

        Parameters:
        - state: The current state of the game.
        - action: The chosen action.
        - reward: The reward received.

        Returns:
        - None
        """
        self.q_table[state, action] += reward

In [15]:
class TicTacToeState:
    """
    Represents the state of a Tic-Tac-Toe game.

    Attributes:
    - x (set): Set of positions where player X has made a move.
    - o (set): Set of positions where player O has made a move.
    - _magic (list): Magic square used to map positions to indices on the board.

    Methods:
    - print_board(): Prints the current state of the board.
    - is_terminal(player): Checks if the current game state is terminal for the given player.
    - get_reward(): Calculates the reward for the current game state.
    - make_move(player, position): Makes a move for the given player at the given position.
    """

    def __init__(self, x, o):
        self.x = x
        self.o = o
        self._magic = [2, 7, 6, 9, 5, 1, 4, 3, 8]
    
    def print_board(self):
        """
        Prints the current state of the board.

        The board is represented by a 3x3 grid. Each cell can contain either 'X', 'O', or '.'.
        'X' represents a move made by player X, 'O' represents a move made by player O, and '.' represents an empty cell.

        Example:
            ```
            X..
            .O.
            ..X
            ```

        Returns:
        None
        """
        for r in range(3):
            for c in range(3):
                i = r * 3 + c
                if self._magic[i] in self.x:
                    print('X', end='')
                elif self._magic[i] in self.o:
                    print('O', end='')
                else:
                    print('.', end='')
            print()
        print()
    
    def is_terminal(self, player=None):
        """
        Checks if the current game state is terminal for the given player.
        
        Args:
            player (str): The player to check for terminal state. Can be 'x', 'o', or None.
            
        Returns:
            bool: True if the game state is terminal for the given player, False otherwise.
        """
        assert(player in ['x', 'o', None]), 'Invalid player'
        
        if player is None:
            return self.is_terminal('x') or self.is_terminal('o') or len(self.x) + len(self.o) == 9
        
        return any(sum(c) == 15 for c in combinations(self.x if player == 'x' else self.o, 3))

    def get_reward(self):
        """
        Calculates the reward for the current game state.

        Returns:
        - 1 if the current player (self.x) has won the game
        - -1 if the opponent player (self.o) has won the game
        - 0 if the game is still ongoing
        """
        if self.is_terminal(self.x):
            return 1
        elif self.is_terminal(self.o):
            return -1
        else:
            return 0
    
    def make_move(self, player, position):
        """
        Makes a move for the given player at the given position.

        Parameters:
        - player (str): The player to make the move ('x' or 'o').
        - position (int): The position to make the move at.

        Returns:
        - None
        """
        assert(player in ['x', 'o']), 'Invalid player'
        assert(position in range(1, 10)), 'Invalid position'
        
        if player == 'x':
            self.x.add(position)
        else:
            self.o.add(position)
    
    def get_board(self):
        """
        Returns the current state of the board as a 3x3 numpy array.

        Returns:
        - A 3x3 numpy array representing the current state of the board.
        """
        board = np.zeros((3, 3))
        for r in range(3):
            for c in range(3):
                i = r * 3 + c
                if self._magic[i] in self.x:
                    board[r, c] = 1
                elif self._magic[i] in self.o:
                    board[r, c] = -1
        return board
    

In [16]:
agent = TicTacToeAgent()

for _ in range(10_000):
    state = TicTacToeState(set(), set())
    
    while not state.is_terminal():
        for player in ['x', 'o']:
            action = agent.take_action(state)
            state.make_move(player, action)
            reward = state.get_reward()
            agent.update_q_table(state, action, reward)
            if state.is_terminal():
                break

IndexError: arrays used as indices must be of integer (or boolean) type