Copyright **`(c)`** 2023 Giovanni Squillero `<giovanni.squillero@polito.it>`  
[`https://github.com/squillero/computational-intelligence`](https://github.com/squillero/computational-intelligence)  
Free for personal or classroom use; see [`LICENSE.md`](https://github.com/squillero/computational-intelligence/blob/master/LICENSE.md) for details.  

# LAB10

Use reinforcement learning to devise a tic-tac-toe player.

### Deadlines:

* Submission: [Dies Natalis Solis Invicti](https://en.wikipedia.org/wiki/Sol_Invictus)
* Reviews: [Befana](https://en.wikipedia.org/wiki/Befana)

Notes:

* Reviews will be assigned  on Monday, December 4
* You need to commit in order to be selected as a reviewer (ie. better to commit an empty work than not to commit)

In [1]:
from IPython.display import clear_output
import numpy as np
from termcolor import colored
from tqdm import tqdm

In [25]:
from abc import ABC, abstractmethod


class Player(ABC):
    def __init__(self) -> None:
        """You can change this for your player if you need to handle state/have memory"""
        pass

    @abstractmethod
    def decide_move(
        self, game: "TicTacToe", move_symbol: int
    ) -> tuple[int, int]:
        """
        game: the TicTacToe game. You can use it to override the current game with yours, but everything is evaluated by the main game
        return values: this method shall return a tuple of X,Y positions
        """
        pass

    @abstractmethod
    def back_prop(self, reward: int):
        pass

In [26]:
import json
from collections import defaultdict
from copy import deepcopy

WIN_REWARD = 1
LOSE_REWARD = -1
TIE_REWARD = 0.2

EPSILON = 0.3
LR = 0.2
GAMMA_DECAY = 0.9


class RLayer(Player):
    def __init__(self, path: str = None, epsilon: float = EPSILON):
        self.game_state = []
        self.path = path
        self.epsilon = epsilon
        self.lr = LR
        self.gamma_decay = GAMMA_DECAY
        self.train_mode = False
        self.policy = defaultdict(float)
        if path:
            f = open(path, "r")
            policy = dict(json.load(f))
            for k, v in policy.items():
                self.policy[k] = v

    def set_train_mode(self, mode: bool):
        self.train_mode = mode

    def decide_move(
        self, game: "TicTacToe", move_symbol: int
    ) -> tuple[int, int]:
        all_moves = game.possible_moves()
        if self.train_mode and np.random.random() < self.epsilon:
            # exploration phase
            index = np.random.randint(len(all_moves))
            return all_moves[index]
        else:
            best_move = None
            best_hash = None
            best_value = float("-inf")
            for move in all_moves:
                board_next_move = deepcopy(game)
                board_next_move.make_move(move, move_symbol)
                hash = board_next_move.hash()
                if self.policy[hash] > best_value:
                    best_move = move
                    best_hash = hash
                    best_value = self.policy[hash]
            self.game_state.append(best_hash)
            return best_move

    def back_prop(self, reward: int):
        for state in reversed(self.game_state):
            # state is an hash of next_board
            self.policy[state] += self.lr * (
                self.gamma_decay * reward - self.policy[state]
            )
            reward = self.policy[state]
        self.game_state = []

    def save_policy(self):
        if not self.path:
            self.path = "policy_lore.json"
        f = open(self.path, "w")
        json.dump(self.policy, f)

In [27]:
class TicTacToe:
    def __init__(self, board=None):
        """
        Board legend:
        cell = 0 -> no move on this cell
        cell = 1 -> player 1 made 'x' (1) as move
        cell = -1 -> player 2 made 'o' (-1) as move
        """
        self.train_mode = False
        if isinstance(board, np.ndarray):
            self.board = board
        else:  # all cells initalized at 0
            self.board = np.zeros(shape=(3, 3))

    def hash(self) -> str:
        return str(self.board)

    def check_win(self):
        """Checks if someone won the game."""

        # Check win by columns
        if (abs(self.board.sum(axis=0)) == 3).any():
            return True
        # Check win by rows
        if (abs(self.board.sum(axis=1)) == 3).any():
            return True

        # Check win by diagonals
        sum_diag_princ = 0
        sum_diag_back = 0
        for i in range(3):
            sum_diag_back += self.board[i][i]
            sum_diag_princ += self.board[2 - i][i]
        if abs(sum_diag_back) == 3 or abs(sum_diag_princ) == 3:
            return True

        return False

    def check_tie(self) -> bool:
        if not self.possible_moves():
            return True
        return False

    def make_move(self, move: tuple[int, int], value: int):
        """Take a move in format x y and makes it."""
        x, y = move
        # check is a valid move
        if not (0 <= x <= 2 and 0 <= y <= 2):
            print("invalid move")
        elif self.board[x][y] != 0:
            print("invalid move")
        else:
            self.board[x][y] = value

    def possible_moves(self):
        """Return all the possible available moves to make."""
        moves = []
        for i in range(3):
            for j in range(3):
                if self.board[i][j] == 0:
                    moves.append((i, j))
        return moves

    def print(self):
        l_len = 19
        clear_output()
        print(f"{colored('Player 1', 'green')} make your move:\n")
        print("   ", "  (0)   (1)   (2)")
        for i in range(3):
            print("   ", "-" * l_len)
            print(f"({i})", "| ", end="")
            print(
                " | ".join(
                    map(
                        lambda e: "   "
                        if e == 0
                        else colored(" X ", "red")
                        if e == 1
                        else colored(" O ", "green"),
                        self.board[i].astype(int),
                    )
                ),
                end=" ",
            )
            print("|")
        print("   ", "-" * l_len)

    def run(
        self, player1: Player, player2: Player
    ) -> tuple[int, (Player, Player)]:
        """Returns a int with the winning player index. If tie, the index value will be -1."""
        board_state = []
        someone_won, is_tie = False, False
        self.board = np.zeros(shape=(3, 3))
        players = [player1, player2]
        np.random.shuffle(players)
        pl_index = -1
        value_to_assign = -1 if isinstance(players[0], RLayer) else 1
        if not self.train_mode:
            self.print()
        while not someone_won and not is_tie:
            pl_index += 1
            pl_index %= 2
            value_to_assign *= -1
            move = players[pl_index].decide_move(self, value_to_assign)
            self.make_move(move, value_to_assign)
            if not self.train_mode:
                self.print()
            someone_won = self.check_win()
            is_tie = self.check_tie()

        if is_tie and not someone_won:
            pl_index = -1
        return (pl_index, players)

    def train(self, player1: Player, players: list, epochs: int):
        self.train_mode = True
        bar = tqdm(total=epochs, desc="Epoch")
        if isinstance(player1, RLayer):
            player1.set_train_mode = True
        for _ in range(epochs):
            index = np.random.randint(len(players))
            player2 = players[index]
            winner_idx, players = self.run(player1, player2)
            if winner_idx >= 0:
                players[winner_idx].back_prop(WIN_REWARD)
                players[(winner_idx + 1) % 2].back_prop(LOSE_REWARD)
            else:
                players[0].back_prop(TIE_REWARD)
                players[1].back_prop(TIE_REWARD)
            bar.update(1)
        if isinstance(player1, RLayer):
            player1.save_policy()
            player1.set_train_mode = False
        self.train_mode = False

In [38]:
class RandomPlayer(Player):
    def __init__(self) -> None:
        self.pol = defaultdict(float)
        super().__init__()

    def decide_move(self, game: "TicTacToe", move_symbol: int) -> str:
        all_moves = game.possible_moves()
        index = np.random.choice(len(all_moves))
        move = all_moves[index]
        board = deepcopy(game)
        board.make_move(move, move_symbol)
        self.pol[board.hash] += 1
        return all_moves[index]
        # return super().make_move(game)

    def back_prop(self, reward: int):
        return super().back_prop(reward)

In [29]:
class HumanPlayer(Player):
    def __init__(self) -> None:
        super().__init__()

    def decide_move(
        self, game: "TicTacToe", move_symbol: int
    ) -> tuple[int, int]:
        move = input("Insert x and y coordinates in format: 'x y' :")
        x, y = move.split(" ")
        return (int(x), int(y))

    def back_prop(self, reward: int):
        return super().back_prop(reward)

# Nuovo training

In [8]:
rp = RandomPlayer()
rl1_new_train = RLayer()
rl2_new_train = RLayer()

In [9]:
EPOCHS = 1_000_000
game = TicTacToe()
game.train(rl1_new_train, [rp], EPOCHS)

Epoch: 100%|██████████| 1000000/1000000 [43:59<00:00, 378.92it/s]  


In [13]:
print(
    f"Discovered states rl2: {len(rl1_new_train.policy)}/{3**9} ({len(rl1_new_train.policy)/3**9:.2%})"
)
ll = len([t for t in rl1_new_train.policy.items() if t[1] > 0])
print(
    f"\tDifferent from 0: {ll}/{len(rl1_new_train.policy)} ({ll/len(rl1_new_train.policy):.2%})"
)

Discovered states rl2: 916/19683 (4.65%)
	Different from 0: 8/916 (0.87%)


In [14]:
NUM_GAME = 1000
win = 0
ties = 0
rl1_new_train = RLayer(path="policy.json")
for _ in range(NUM_GAME):
    win_idx, players = game.run(rp, rl1_new_train)
    if win_idx != -1 and id(players[win_idx]) == id(rl1_new_train):
        win += 1
    elif win_idx == -1:
        ties += 1
print()
print(f"% of wins of RL: {win/NUM_GAME:.2%}")
print(f"% of ties: {ties/NUM_GAME:.2%}")
print(f"(% of wins other player: {1 - ((ties + win)/NUM_GAME):.2%})")

[32mPlayer 1[0m make your move:

      (0)   (1)   (2)
    -------------------
(0) | [31m X [0m | [31m X [0m | [32m O [0m |
    -------------------
(1) | [31m X [0m | [32m O [0m | [32m O [0m |
    -------------------
(2) | [32m O [0m |     |     |
    -------------------

% of wins of RL: 61.20%
% of ties: 3.90%
(% of wins other player: 34.90%)


In [None]:
hp = HumanPlayer()
game.run(rl1_new_train, hp)

# Roba Lore

In [39]:
rp = RandomPlayer()
rl1 = RLayer()
# rl2 = RLayer()

In [53]:
EPOCHS = 100_000
game = TicTacToe()
game.train(rl1, [rp], EPOCHS)

Epoch: 100%|██████████| 100000/100000 [02:57<00:00, 563.45it/s]


In [58]:
# analysis
from itertools import product

# Define the possible values
values = [0, 1, -1]

# Generate all possible combinations
combinations = product(values, repeat=9)


# Function to check if matrix meets the condition
def meets_condition(matrix):
    return np.sum(matrix == 1) == (np.sum(matrix == -1) + 1)


# Convert combinations to 3x3 matrices and print
counter = 0
for combo in combinations:
    matrix = np.array(combo).reshape(3, 3)
    if meets_condition(matrix):
        counter += 1
MAX_DISCOVERABLE_STATES = counter

print(
    f"Discovered states rl: {len(rl1.policy)}/{MAX_DISCOVERABLE_STATES} ({len(rl1.policy)/MAX_DISCOVERABLE_STATES:.2%})"
)
print(
    f"Discovered states rp: {len(rp.pol)}/{MAX_DISCOVERABLE_STATES} ({len(rp.pol)/MAX_DISCOVERABLE_STATES:.2%})"
)

Discovered states rl: 1045/2907 (35.95%)
Discovered states rp: 9829/2907 (338.11%)


In [59]:
NUM_GAME = 1000
win_rl1 = 0
ties = 0
for _ in range(NUM_GAME):
    win_idx, players = game.run(rl1, rp)
    if win_idx != -1 and id(players[win_idx]) == id(rl1):
        win_rl1 += 1
    elif win_idx == -1:
        ties += 1
print()
print(f"% of wins of RL: {win_rl1/NUM_GAME:.2%}")
print(f"% of ties: {ties/NUM_GAME:.2%}")
print(f"(% of wins other player: {1 - ((ties + win_rl1)/NUM_GAME):.2%})")

[32mPlayer 1[0m make your move:

      (0)   (1)   (2)
    -------------------
(0) | [31m X [0m | [31m X [0m | [32m O [0m |
    -------------------
(1) | [31m X [0m |     |     |
    -------------------
(2) | [32m O [0m | [32m O [0m | [32m O [0m |
    -------------------

% of wins of RL: 58.60%
% of ties: 4.20%
(% of wins other player: 37.20%)


In [62]:
hp = HumanPlayer()
game.run(hp, rp)

[32mPlayer 1[0m make your move:

      (0)   (1)   (2)
    -------------------
(0) |     | [32m O [0m |     |
    -------------------
(1) |     |     |     |
    -------------------
(2) |     |     |     |
    -------------------


ValueError: not enough values to unpack (expected 2, got 1)