Copyright **`(c)`** 2023 Giovanni Squillero `<giovanni.squillero@polito.it>`  
[`https://github.com/squillero/computational-intelligence`](https://github.com/squillero/computational-intelligence)  
Free for personal or classroom use; see [`LICENSE.md`](https://github.com/squillero/computational-intelligence/blob/master/LICENSE.md) for details.  

# LAB10

Use reinforcement learning to devise a tic-tac-toe player.

### Deadlines:

* Submission: [Dies Natalis Solis Invicti](https://en.wikipedia.org/wiki/Sol_Invictus)
* Reviews: [Befana](https://en.wikipedia.org/wiki/Befana)

Notes:

* Reviews will be assigned on Monday, December 4

In [1]:
from IPython.display import clear_output
import numpy as np
from termcolor import colored
from tqdm import tqdm
from collections import defaultdict
from copy import deepcopy

# Players

In [2]:
from abc import ABC, abstractmethod


class Player(ABC):
    """Common interface that every tic-tac-toe player has to implement."""

    def __init__(self) -> None:
        """Do nothing by default; override it if your player needs to keep state or memory."""

    @abstractmethod
    def decide_move(
        self, game: "TicTacToe", move_symbol: int
    ) -> tuple[int, int]:
        """
        Choose the next move.

        game: the TicTacToe game being played. A player may copy it to try moves out, but the result is always evaluated on the main game.
        move_symbol: the value that will be written on the board for this player.
        return value: a tuple with the X,Y position of the chosen cell.
        """

    @abstractmethod
    def set_train_mode(self, mode: bool):
        """Turn the training behaviour of the player on or off."""

    @abstractmethod
    def back_prop(self, reward: int):
        """Receive the reward assigned to this player when a training game is over."""

In [3]:
import json

# Default probability with which an RLayer in train mode plays a random move.
EPSILON = 0.3
# Learning rate: step size of every policy-value update in RLayer.back_prop.
LR = 0.2
# Factor applied to the reward while it is propagated backwards in RLayer.back_prop.
GAMMA_DECAY = 0.9


class RLayer(Player):
    """
    Reinforcement-learning player backed by a table of board values.

    The policy maps the hash of a board (see TicTacToe.hash) to a learnt value;
    a board that was never updated is worth 0.
    """

    def __init__(
        self, path: str = None, epsilon: float = EPSILON, name: str = ""
    ):
        """
        path: optional JSON file with a previously saved policy to load.
        epsilon: probability of playing a random move while in train mode.
        name: label used by save_policy to build the default file name.
        """
        super().__init__()
        # hashes of the boards produced by this player's moves in the current game
        self.game_state = []
        self.path = path
        self.name = name
        self.epsilon = epsilon
        self.lr = LR
        self.gamma_decay = GAMMA_DECAY
        self.train_mode = False  # if set to true allows move exploration.
        self.policy = defaultdict(float)
        if path:
            # the context manager closes the file even when the JSON is malformed
            with open(path, "r") as f:
                self.policy.update(json.load(f))

    def set_train_mode(self, mode: bool):
        """Enable (True) or disable (False) exploration and move recording."""
        self.train_mode = mode

    @staticmethod
    def _state_after(game: "TicTacToe", move: tuple[int, int], move_symbol: int) -> str:
        """Return the hash of the board obtained by playing `move` on a copy of `game`."""
        board_next_move = deepcopy(game)
        board_next_move.make_move(move, move_symbol)
        return board_next_move.hash()

    def decide_move(
        self, game: "TicTacToe", move_symbol: int
    ) -> tuple[int, int]:
        """
        The following function is used by an agent to determine its next move in the game.
        During training only:
            with a probability epsilon, it randomly selects a move from all possible moves,
            otherwise selects the move with the highest predicted policy value based on the agent's learned policy.
        When several moves share the highest value, the first one listed by game.possible_moves() is kept.
        In train mode the hash of the board reached by the chosen move is recorded for back_prop.
        The function returns the chosen move.
        """
        all_moves = game.possible_moves()
        if self.train_mode and np.random.random() < self.epsilon:
            # exploration phase
            move = all_moves[np.random.choice(len(all_moves))]
            self.game_state.append(self._state_after(game, move, move_symbol))
            return move

        # selects the best move in the policy.
        best_move = None
        best_state = None
        best_value = float("-inf")
        for move in all_moves:
            state = self._state_after(game, move, move_symbol)
            # .get() reads the default 0.0 without storing a new entry, so merely
            # evaluating a board does not make the policy (and its JSON file) grow
            value = self.policy.get(state, 0.0)
            if value > best_value:
                best_move = move
                best_state = state
                best_value = value
        if self.train_mode:
            self.game_state.append(best_state)
        return best_move

    def back_prop(self, reward: int):
        """
        The back_prop function is responsible for updating the policy values in the agent's memory during the training process.
        The recorded boards are visited from the last to the first: each value is moved by a fraction `lr`
        towards `gamma_decay * reward`, and the updated value becomes the reward used for the previous board.
        The list of recorded boards is emptied at the end.
        """
        for state in reversed(self.game_state):
            # state is the hash of the board reached after one of our moves
            self.policy[state] += self.lr * (
                self.gamma_decay * reward - self.policy[state]
            )
            reward = self.policy[state]
        self.game_state = []

    def save_policy(self):
        """
        It saves the policy learnt when training is done.
        If no path was given, the file is "policy.json", or "policy_<name>.json" when the player has a name.
        """
        if not self.path:
            self.path = f"policy_{self.name}.json" if self.name else "policy.json"
        # the context manager guarantees the JSON is flushed and the file closed
        with open(self.path, "w") as f:
            json.dump(self.policy, f)

In [4]:
class RandomPlayer(Player):
    """
    RandomPlayer just chooses a random move among the possible ones in a given board configuration
    """

    def set_train_mode(self, mode: bool):
        """A random player does not learn: the train mode is ignored."""

    def decide_move(
        self, game: "TicTacToe", move_symbol: int
    ) -> tuple[int, int]:
        """Return one of game.possible_moves(), picked uniformly at random."""
        all_moves = game.possible_moves()
        return all_moves[np.random.choice(len(all_moves))]

    def back_prop(self, reward: int):
        """A random player does not learn: the reward is ignored."""

In [5]:
class HumanPlayer(Player):
    """
    HumanPlayer allows a user to play against our trained RLayer player
    """

    def set_train_mode(self, mode: bool):
        """A human player has no train mode: the flag is ignored."""

    def decide_move(
        self, game: "TicTacToe", move_symbol: int
    ) -> tuple[int, int]:
        """
        Ask the user for a move until a legal one is typed.

        The answer must contain two integers separated by whitespace and must point
        to a cell listed by game.possible_moves(); otherwise the question is repeated.
        Raises ValueError if the board has no free cell left.
        """
        legal_moves = game.possible_moves()
        if not legal_moves:
            # without this guard the prompt below could never be satisfied
            raise ValueError("no moves available on this board")
        # printing with the assigned color, for clarity.
        color = "green" if move_symbol == -1 else "red"
        print_string = "Human player"
        print(f"\n{colored(print_string,color)} it's your turn")
        while True:
            answer = input("Insert x and y coordinates in format: 'x y' :")
            try:
                # split() without arguments tolerates repeated or surrounding blanks;
                # a wrong number of tokens or a non-integer both raise ValueError
                x, y = (int(token) for token in answer.split())
            except ValueError:
                print("Invalid input: type two integers separated by a space.")
                continue
            if (x, y) in legal_moves:
                return (x, y)
            print("Invalid move: pick a free cell with coordinates between 0 and 2.")

    def back_prop(self, reward: int):
        """A human player does not learn from rewards: the value is ignored."""

# TicTacToe

In [6]:
class TicTacToe:
    """3x3 tic-tac-toe board, with the logic to play a game and to train players."""

    def __init__(self, board=None):
        """
        Board legend:
        cell = 0 -> no move on this cell
        cell = 1 -> player 1 made 'x' (1) as move
        cell = -1 -> player 2 made 'o' (-1) as move

        board: optional numpy array to start from (stored as is, not copied);
               any other value gives an empty board.
        """
        self.train_mode = False  # when True, run() does not print the board
        if isinstance(board, np.ndarray):
            self.board = board
        else:  # all cells initialized with 0
            self.board = np.zeros(shape=(3, 3))

    def hash(self) -> str:
        """Return the string form of the board; RL players use it as the key of a state."""
        return str(self.board)

    def check_win(self) -> bool:
        """Checks if someone won the game, i.e. a column, row or diagonal sums to +3 or -3."""
        line_sums = (
            *self.board.sum(axis=0),  # columns
            *self.board.sum(axis=1),  # rows
            np.trace(self.board),  # main diagonal
            np.trace(np.fliplr(self.board)),  # anti-diagonal
        )
        return any(abs(total) == 3 for total in line_sums)

    def check_tie(self) -> bool:
        """Checks if there is a tie, i.e. no free cell is left (a win is not taken into account here)."""
        return not self.possible_moves()

    def make_move(self, move: tuple[int, int], value: int) -> bool:
        """
        Take a move in format x y and makes it.

        An out-of-board or already occupied cell is rejected: "invalid move" is printed
        and the board is left untouched.
        Returns True if the move was applied, False if it was rejected.
        """
        x, y = move
        # the bounds test comes first so that the board is never indexed out of range
        if not (0 <= x <= 2 and 0 <= y <= 2) or self.board[x][y] != 0:
            print("invalid move")
            return False
        self.board[x][y] = value
        return True

    def possible_moves(self):
        """Return all the possible available moves to make, as (row, column) tuples in row-major order."""
        return [
            (i, j)
            for i in range(3)
            for j in range(3)
            if self.board[i][j] == 0
        ]

    def print(self):
        """Pretty printing of the board (the previous cell output is cleared first)."""
        separator = "-" * 19
        symbols = {0: "   ", 1: colored(" X ", "red")}
        nought = colored(" O ", "green")  # any value other than 0 and 1
        clear_output()
        print("   ", "  (0)   (1)   (2)")
        for i in range(3):
            print("   ", separator)
            cells = " | ".join(
                symbols.get(cell, nought)
                for cell in self.board[i].astype(int).tolist()
            )
            print(f"({i}) | {cells} |")
        print("   ", separator)

    def run(
        self, player1: "Player", player2: "Player"
    ) -> "tuple[int, list[Player]]":
        """
        This function just plays a game between player1 and player2.
        The starting player is chosen at random and plays with value 1, the other one with -1.
        A player whose move is rejected by make_move is asked again, so a turn is never lost.
        It returns the winner index 0/1 (or -1 if it's a tie) together with the list [player1, player2].
        """
        someone_won, is_tie = False, False
        # always reset the board state.
        self.board = np.zeros(shape=(3, 3))
        players = [player1, player2]
        # random index: it is advanced before every move, so the *other* player starts.
        pl_index = np.random.choice(len(players))
        if not self.train_mode:
            self.print()
        # flipped before every move: the starting player will have 1 as value to assign.
        value_to_assign = -1
        while not someone_won and not is_tie:
            pl_index = (pl_index + 1) % 2
            value_to_assign *= -1
            current = players[pl_index]
            move = current.decide_move(self, value_to_assign)
            while not self.make_move(move, value_to_assign):
                # illegal move: the same player has to choose again
                move = current.decide_move(self, value_to_assign)
            if not self.train_mode:
                self.print()
            someone_won = self.check_win()
            is_tie = self.check_tie()

        # a win on the very last free cell is a win, not a tie
        if is_tie and not someone_won:
            return (-1, players)
        return (int(pl_index), players)

    def train(self, player1: "Player", trainers: "list[Player]", epochs: int):
        """
        The train function iterates through a specified number of epochs, randomly selecting trainers for each iteration.
        After a game is played between the primary player (player1) and the selected trainer, back_prop is called
        on both of them for reward updates (win 1, tie 0.5, loss -1).
        Train mode is switched on for the game and for every participant, and is switched off again for all of them
        when training ends, even if it is interrupted by an exception.
        The function also saves the learned policy for RLayer player (Reinforcement Learning player)
        """
        WIN_REWARD = 1
        TIES_REWARD = 0.5
        LOSE_REWARD = -1
        participants = [player1, *trainers]  # we may have more than one trainer.
        try:
            self.train_mode = True
            for participant in participants:
                participant.set_train_mode(True)
            # iterating over tqdm closes the progress bar when the loop ends
            for _ in tqdm(range(epochs), desc="Epoch"):
                trainer = trainers[np.random.choice(len(trainers))]
                winner_idx, players = self.run(player1, trainer)
                if winner_idx >= 0:
                    players[winner_idx].back_prop(WIN_REWARD)
                    players[(winner_idx + 1) % 2].back_prop(LOSE_REWARD)
                else:
                    for participant in players:
                        participant.back_prop(TIES_REWARD)
        finally:
            # nobody must keep exploring (or recording moves) once training is over
            self.train_mode = False
            for participant in participants:
                participant.set_train_mode(False)
        if isinstance(player1, RLayer):
            player1.save_policy()

# Training and Evaluation

## Function for evaluation

In [7]:
def player_evaluation(player_to_eval, player_to_fight, num_game):
    """
    Play 'num_game' games between a trained player (player_to_eval) and
    another player (player_to_fight), then print the percentage of games
    won by the first one, the percentage of ties and the percentage of
    games won by the opponent."""
    game = TicTacToe()
    wins = ties = 0
    for _ in range(num_game):
        win_idx, players = game.run(player_to_eval, player_to_fight)
        if win_idx == -1:
            ties += 1
        elif players[win_idx] is player_to_eval:
            wins += 1
    print()
    win_rate = wins / num_game
    print(f"% of wins of RL: {win_rate:.2%}")
    tie_rate = ties / num_game
    print(f"% of ties: {tie_rate:.2%}")
    # every game that is neither a win nor a tie was won by the opponent
    print(f"(% of wins other player: {1 - ((ties + wins)/num_game):.2%})")

In [8]:
# Number of games played by every call to player_evaluation in this notebook.
NUM_GAMES_EVALUATION = 2000
# Number of games (epochs) played by every training run in this notebook.
NUM_EPOCHS_TRAINING = 400_000

## Training RL player vs. RandomPlayer

In [26]:
# Untrained RL player (no policy file is loaded) and the random opponent it will face.
rl_base = RLayer(name="rl_base")
random_player = RandomPlayer()

In [27]:
game = TicTacToe()
# Train rl_base against the random player only; train() saves the learnt policy of an RLayer at the end.
game.train(rl_base, [random_player], NUM_EPOCHS_TRAINING)

Epoch: 100%|██████████| 400000/400000 [07:28<00:00, 891.74it/s]


In [28]:
# Evaluate the trained player against a random player: the percentages of wins, ties and losses are printed.

player_evaluation(rl_base, random_player, NUM_GAMES_EVALUATION)

      (0)   (1)   (2)
    -------------------
(0) |     | [32m O [0m | [31m X [0m |
    -------------------
(1) |     | [31m X [0m |     |
    -------------------
(2) | [31m X [0m |     | [32m O [0m |
    -------------------

% of wins of RL: 92.90%
% of ties: 5.70%
(% of wins other player: 1.40%)


## Training RL player vs other RL

In [12]:
# Two untrained RL players: the one that is evaluated afterwards and its sparring partner.
rl_RL_trained = RLayer(name="rl_RL_trained")
rl_trainer = RLayer()

In [13]:
game = TicTacToe()
# Both RL players learn here: train() calls back_prop on the two players of every game.
game.train(rl_RL_trained, [rl_trainer], NUM_EPOCHS_TRAINING)

Epoch: 100%|██████████| 400000/400000 [14:24<00:00, 462.80it/s]


In [14]:
# Evaluate the player trained against another RL player, using the random player as opponent.
player_evaluation(rl_RL_trained, random_player, NUM_GAMES_EVALUATION)

      (0)   (1)   (2)
    -------------------
(0) | [31m X [0m | [32m O [0m | [31m X [0m |
    -------------------
(1) | [32m O [0m | [31m X [0m | [32m O [0m |
    -------------------
(2) |     |     | [31m X [0m |
    -------------------

% of wins of RL: 89.40%
% of ties: 9.45%
(% of wins other player: 1.15%)


## Training RL player vs other RL player and RandomPlayer together


In [15]:
board = TicTacToe()
the_ROCK = RLayer(name="the_ROCK")
# Trainers: the RL player created for the RL-vs-RL section above, and the random player.
trainer_1 = rl_RL_trained
trainer_2 = random_player

In [16]:
# For every epoch train() picks one of the two trainers at random.
board.train(the_ROCK, [trainer_1, trainer_2], NUM_EPOCHS_TRAINING)

Epoch: 100%|██████████| 400000/400000 [10:52<00:00, 613.48it/s]


In [17]:
# Evaluate the player trained with both trainers, using the random player as opponent.
player_evaluation(the_ROCK, random_player, NUM_GAMES_EVALUATION)

      (0)   (1)   (2)
    -------------------
(0) | [31m X [0m | [32m O [0m |     |
    -------------------
(1) | [32m O [0m | [31m X [0m |     |
    -------------------
(2) |     |     | [31m X [0m |
    -------------------

% of wins of RL: 93.00%
% of ties: 6.50%
(% of wins other player: 0.50%)


## Human Player match

#### *Run cells below if you want to play a match against a trained player*

In [21]:
game = TicTacToe()
human_player = HumanPlayer()
# choose your favorite opponent (all of them are created in the cells above):
#   rl_base     rl_RL_trained     random_player    the_ROCK
# NOTE: the player trained against the random player is named `rl_base`;
# the former `rl_baseline` is not defined anywhere and raised a NameError.
opponent = rl_base

In [24]:
def print_winner(winner: int, players: tuple) -> None:
    """
    Print the outcome of a match from the point of view of the human player.

    winner: index of the winning player inside `players`, or -1 for a tie.
    players: the players of the match, in the order returned by TicTacToe.run.
    """
    if winner == -1:
        print("\n🟡 Tie")
        return
    # the human won only if the winning slot (0 or 1) holds a HumanPlayer
    human_won = winner in (0, 1) and isinstance(players[winner], HumanPlayer)
    if human_won:
        print("\n🟢 You won")
    else:
        print(f"\n🔴 You lost. {type(players[winner]).__name__} won,")

In [25]:
# Play one interactive game: run() returns the winner index (-1 for a tie) and the list of the two players.
winner, players = game.run(human_player, opponent)
print_winner(winner, players)

      (0)   (1)   (2)
    -------------------
(0) | [32m O [0m | [32m O [0m | [31m X [0m |
    -------------------
(1) | [31m X [0m | [31m X [0m | [32m O [0m |
    -------------------
(2) | [32m O [0m | [31m X [0m | [31m X [0m |
    -------------------

🟡 Tie
