Copyright **`(c)`** 2023 Giovanni Squillero `<giovanni.squillero@polito.it>`  
[`https://github.com/squillero/computational-intelligence`](https://github.com/squillero/computational-intelligence)  
Free for personal or classroom use; see [`LICENSE.md`](https://github.com/squillero/computational-intelligence/blob/master/LICENSE.md) for details.  

# LAB10

Use reinforcement learning to devise a tic-tac-toe player.

### Deadlines:

* Submission: [Dies Natalis Solis Invicti](https://en.wikipedia.org/wiki/Sol_Invictus)
* Reviews: [Befana](https://en.wikipedia.org/wiki/Befana)

Notes:

* Reviews will be assigned on Monday, December 4
* You need to commit in order to be selected as a reviewer (i.e., it is better to commit an empty work than not to commit at all)

In [13]:
from itertools import permutations
from IPython.display import clear_output
import numpy as np
from termcolor import colored
from tqdm import tqdm

In [14]:
from abc import ABC, abstractmethod


class Player(ABC):
    """Abstract interface that every tic-tac-toe player must implement."""

    def __init__(self) -> None:
        """You can change this for your player if you need to handle state/have memory"""

    @abstractmethod
    def decide_move(
        self, game: "TicTacToe", move_symbol: int
    ) -> tuple[int, int]:
        """
        Choose the next move.

        game: the TicTacToe game. You can use it to override the current game with yours, but everything is evaluated by the main game
        move_symbol: the value (1 or -1) that the game will write on the board for this player
        return values: this method shall return a tuple of X,Y positions; the game
            indexes its board as board[x][y]
        """

    @abstractmethod
    def back_prop(self, reward: float) -> None:
        """
        Receive the outcome of a finished game.

        reward: TicTacToe.train passes 1 to the winner, 0 to the loser and
            0.5 to both players on a tie, hence a float rather than an int
        """

In [15]:
import json
from collections import defaultdict
from copy import deepcopy


class RLayer(Player):
    """Tabular reinforcement-learning player with epsilon-greedy move selection.

    ``policy`` maps the hash of a board (as returned by ``game.hash()`` after
    one of this player's candidate moves) to its estimated value. Unknown
    boards are worth 0.0.
    """

    def __init__(self, path: str = None, epsilon: float = 0.3):
        """
        path: optional JSON file holding a previously saved policy
        epsilon: probability of playing a random move instead of the best known one
        """
        super().__init__()
        # Hashes of the boards reached by this player's greedy moves in the current game.
        self.game_state = []
        self.path = path
        self.epsilon = epsilon
        self.lr = 0.2
        self.gamma_decay = 1.0
        # Always a defaultdict: a policy loaded from JSON is a plain dict, and
        # looking up a board that was never seen would raise KeyError.
        self.policy = defaultdict(float)
        if path:
            with open(path, "r", encoding="utf-8") as f:
                self.policy.update(json.load(f))

    def decide_move(
        self, game: "TicTacToe", move_symbol: int
    ) -> tuple[int, int]:
        """Return a random legal move with probability epsilon, else the best-valued one."""
        all_moves = game.possible_moves()
        if np.random.random() < self.epsilon:
            # exploration phase
            # NOTE(review): exploratory moves are not appended to game_state, so
            # back_prop never updates them - confirm this is intended.
            return all_moves[np.random.randint(len(all_moves))]

        # exploitation phase: try every legal move on a copy of the game and
        # keep the one whose resulting board has the highest stored value
        best_move = None
        best_state = None
        best_value = float("-inf")
        for move in all_moves:
            next_game = deepcopy(game)
            next_game.make_move(move, move_symbol)
            state = next_game.hash()
            value = self.policy[state]
            if value > best_value:
                best_move, best_state, best_value = move, state, value
        self.game_state.append(best_state)
        return best_move

    def back_prop(self, reward: float) -> None:
        """Move the value of every recorded board towards ``reward``, then forget the game."""
        for state in reversed(self.game_state):
            # state is an hash of next_board
            self.policy[state] += self.lr * (
                self.gamma_decay * reward - self.policy[state]
            )
        self.game_state = []

    def save_policy(self, path: str = None):
        """Write the policy as JSON to ``path`` (default: the path given at construction).

        Does nothing when no path is available.
        """
        target = path or self.path
        if not target:
            return
        with open(target, "w", encoding="utf-8") as f:
            json.dump(self.policy, f)

In [16]:
class TicTacToe:
    """A 3x3 tic-tac-toe board together with the game loop used to play and train."""

    def __init__(self, board=None):
        """
        Board legend:
        cell = 0 -> no move on this cell
        cell = 1 -> player 1 made 'x' (1) as move
        cell = -1 -> player 2 made 'o' (-1) as move
        """
        # Must be an instance attribute: run() reads self.train_mode to decide
        # whether to draw the board.
        self.train_mode = False
        if isinstance(board, np.ndarray):
            self.board = board
        else:  # all cells initialized at 0
            self.board = np.zeros(shape=(3, 3))

    def hash(self) -> str:
        """Return the string form of the board; RLayer uses it as policy key."""
        return str(self.board)

    def check_win(self) -> bool:
        """Checks if someone won the game."""
        # A line is won when its three cells are all 1 or all -1, i.e. |sum| == 3.
        line_sums = [
            *self.board.sum(axis=0),  # columns
            *self.board.sum(axis=1),  # rows
            np.trace(self.board),  # main diagonal
            np.trace(np.fliplr(self.board)),  # anti-diagonal
        ]
        return bool((np.abs(line_sums) == 3).any())

    def check_tie(self) -> bool:
        """Return True when no free cell is left (run() gives a win precedence)."""
        return not self.possible_moves()

    def make_move(self, move: tuple[int, int], value: int) -> bool:
        """Take a move in format x y and makes it.

        Returns True if the move was applied, False (after printing
        "invalid move") if it is out of the board or on an occupied cell.
        """
        x, y = move
        # check is a valid move
        if not (0 <= x <= 2 and 0 <= y <= 2) or self.board[x][y] != 0:
            print("invalid move")
            return False
        self.board[x][y] = value
        return True

    def possible_moves(self):
        """Return all the possible available moves to make."""
        return [
            (i, j)
            for i in range(3)
            for j in range(3)
            if self.board[i][j] == 0
        ]

    @staticmethod
    def _render_cell(cell: int) -> str:
        """Return the three-character (possibly colored) text for one board cell."""
        if cell == 0:
            return "   "
        if cell == 1:
            return colored(" X ", "red")
        return colored(" O ", "green")

    def print(self):
        """Clear the cell output and draw the current board."""
        l_len = 19
        clear_output()
        # Outer double quotes: reusing the same quote character inside an
        # f-string is a SyntaxError before Python 3.12.
        print(f"{colored('Player 1', 'green')} make your move:\n")
        print("   ", "  (0)   (1)   (2)")
        for i in range(3):
            print("   ", "-" * l_len)
            print(f"({i})", "| ", end="")
            cells = (self._render_cell(e) for e in self.board[i].astype(int))
            print(" | ".join(cells), end=" ")
            print("|")
        print("   ", "-" * l_len)

    def run(
        self, player1: "Player", player2: "Player"
    ) -> "tuple[int, list[Player]]":
        """Play one game on a fresh board.

        The two players are shuffled to decide who starts. Returns
        ``(index, players)`` where ``players`` is the shuffled list and
        ``index`` is the position in it of the winner.
        Returns a int with the winning player index. If tie, the index value will be -1.
        """
        someone_won, is_tie = False, False
        self.board = np.zeros(shape=(3, 3))
        players = [player1, player2]
        np.random.shuffle(players)
        pl_index = -1
        # The sign is flipped before every move, so an RLayer seated first
        # plays 1 and any other first player plays -1.
        value_to_assign = -1 if isinstance(players[0], RLayer) else 1
        if not self.train_mode:
            self.print()
        while not someone_won and not is_tie:
            pl_index = (pl_index + 1) % 2
            value_to_assign *= -1
            # Ask the same player again until the move is legal; an illegal
            # move must not silently forfeit the turn.
            while True:
                move = players[pl_index].decide_move(self, value_to_assign)
                if self.make_move(move, value_to_assign):
                    break
            if not self.train_mode:
                self.print()
            someone_won = self.check_win()
            is_tie = self.check_tie()

        if is_tie and not someone_won:
            pl_index = -1
        return (pl_index, players)

    def train(self, player1: "Player", player2: "Player", epochs: int):
        """Play ``epochs`` silent games and feed the outcome to both players.

        The winner receives reward 1 and the loser 0; on a tie both receive
        0.5. ``train_mode`` is True while training and False afterwards.
        """
        self.train_mode = True
        try:
            # The context manager closes the progress bar when training ends.
            with tqdm(total=epochs, desc="Epoch") as bar:
                for _ in range(epochs):
                    winner_idx, players = self.run(player1, player2)
                    if winner_idx >= 0:
                        players[winner_idx].back_prop(1)
                        players[(winner_idx + 1) % 2].back_prop(0)
                    else:
                        players[0].back_prop(0.5)
                        players[1].back_prop(0.5)
                    bar.update(1)
        finally:
            # Re-enable board printing even if training is interrupted.
            self.train_mode = False

In [17]:
class RandomPlayer(Player):
    """Baseline opponent that plays a randomly chosen free cell and never learns."""

    def __init__(self) -> None:
        super().__init__()

    def decide_move(
        self, game: "TicTacToe", move_symbol: int
    ) -> tuple[int, int]:
        """Return one of ``game.possible_moves()`` picked at random."""
        all_moves = game.possible_moves()
        return all_moves[np.random.randint(len(all_moves))]

    def back_prop(self, reward: float) -> None:
        """Ignore the reward: this player has nothing to learn."""
        return None

In [18]:
class HumanPlayer(Player):
    """Player whose moves are typed in by a human through ``input``."""

    def __init__(self) -> None:
        super().__init__()

    def decide_move(
        self, game: "TicTacToe", move_symbol: int
    ) -> tuple[int, int]:
        """Prompt until the user enters the coordinates of a free cell.

        Input that is not two whitespace-separated integers, or that does not
        name one of ``game.possible_moves()``, prints "invalid move" and asks
        again instead of crashing or returning an unplayable move.
        """
        while True:
            move = input("Insert x and y coordinates in format: 'x y' :")
            try:
                # split() without argument tolerates repeated/leading spaces
                x, y = (int(token) for token in move.split())
            except ValueError:
                # wrong number of tokens or non-integer token
                print("invalid move")
                continue
            if (x, y) in game.possible_moves():
                return (x, y)
            print("invalid move")

    def back_prop(self, reward: float) -> None:
        """Ignore the reward: a human player keeps no policy."""
        return None

In [19]:
# A random baseline opponent and an RL agent starting from an empty policy
# (no policy file given, default epsilon).
rp, rl = RandomPlayer(), RLayer()

In [34]:
# Train the RL agent against the random baseline; train() shows a tqdm
# progress bar and plays every game without drawing the board.
EPOCHS = 100_000
game = TicTacToe()
game.train_mode = True
game.train(player1=rp, player2=rl, epochs=EPOCHS)

Epoch: 100%|██████████| 100000/100000 [01:11<00:00, 1408.33it/s]


In [25]:
# Analysis: number of board hashes stored in the policy, compared with 3**9
# (every way of filling nine cells with three values, legal or not).
print(
    "Discovered states: "
    f"{len(rl.policy)}/{3**9} "
    f"({len(rl.policy) / 3**9:.2%})"
)

Discovered states: 4489/19683 (22.81%)


In [27]:
# Evaluate the trained agent against the random player.
# NOTE(review): rl still plays a random move with probability rl.epsilon
# during these games - consider lowering epsilon for evaluation.
NUM_GAME = 1000
win_rl = 0
ties = 0
for _ in range(NUM_GAME):
    win_idx, players = game.run(rp, rl)
    # No back_prop is called here, so drop the states recorded during this
    # game; otherwise they pile up and the next training game would apply
    # its reward to all of them.
    rl.game_state = []
    if win_idx == -1:
        ties += 1
    elif players[win_idx] is rl:
        win_rl += 1
print()
print(f"% of wins of RL: {win_rl/NUM_GAME:.2%}")
print(f"% of ties: {ties/NUM_GAME:.2%}")
print(f"(% of wins other player: {1 - ((ties + win_rl)/NUM_GAME):.2%})")

[32mPlayer 1[0m make your move:

      (0)   (1)   (2)
    -------------------
(0) | [31m X [0m | [31m X [0m | [31m X [0m |
    -------------------
(1) |     |     |     |
    -------------------
(2) | [32m O [0m |     | [32m O [0m |
    -------------------

% of wins of RL: 59.40%
% of ties: 11.20%
(% of wins other player: 29.40%)


In [33]:
# Interactive game: the trained agent against a human typing moves.
hp = HumanPlayer()
# run() only draws the board when train_mode is False
game.train_mode = False
game.run(player1=rl, player2=hp)

[32mPlayer 1[0m make your move:

      (0)   (1)   (2)
    -------------------
(0) | [31m X [0m |     | [31m X [0m |
    -------------------
(1) | [32m O [0m | [32m O [0m | [32m O [0m |
    -------------------
(2) |     |     |     |
    -------------------


(0, [<__main__.HumanPlayer at 0x10747b1a0>, <__main__.RLayer at 0x1072a5c70>])