Copyright **`(c)`** 2023 Giovanni Squillero `<giovanni.squillero@polito.it>`  
[`https://github.com/squillero/computational-intelligence`](https://github.com/squillero/computational-intelligence)  
Free for personal or classroom use; see [`LICENSE.md`](https://github.com/squillero/computational-intelligence/blob/master/LICENSE.md) for details.  

# LAB10

Use reinforcement learning to devise a tic-tac-toe player.

### Deadlines:

* Submission: [Dies Natalis Solis Invicti](https://en.wikipedia.org/wiki/Sol_Invictus)
* Reviews: [Befana](https://en.wikipedia.org/wiki/Befana)

Notes:

* Reviews will be assigned  on Monday, December 4
* You need to commit in order to be selected as a reviewer (ie. better to commit an empty work than not to commit)

In [12]:
from itertools import combinations
from collections import namedtuple
from random import choice, random
from copy import deepcopy
import numpy as np

from tqdm.auto import tqdm

import pickle

In [13]:
State = namedtuple('State', ['x', 'o'])

In [14]:
class TicTacToe:
    MAGIC = [2, 7, 6,
             9, 5, 1,
             4, 3, 8]
    
    # for the expert agent
    CENTER = 5
    CORNERS = [2, 6, 8, 4]
    SIDES = [7, 1, 3, 9]

    def __init__(self, init_player = 0):
        self.state = State(set(), set())
        self.available_moves = [2, 7, 6,
                                9, 5, 1,
                                4, 3, 8]

        # once this has been introduces, you can even remove from the expert agent the possibility to choose the player (is embedded here)
        self.current_player = init_player
        self.winner = None
        self.game_over = False

    def reset(self):
        self.state = State(set(), set())
        self.available_moves = [2, 7, 6,
                                9, 5, 1,
                                4, 3, 8]
        self.current_player = 0
        self.winner = None
        self.game_over = False

    def make_move(self, move):
        if self.check_available(move) == False or self.game_over:
            return False
        self.state[self.current_player].add(move)
        self.available_moves.remove(move)
        self.check_winner()
        self.switch_player()
        return True

    def switch_player(self):
        self.current_player = (self.current_player + 1) % 2

    def check_available(self, move):
        return move in self.available_moves

    def check_winner(self):
        if any(sum(c) == 15 for c in combinations(self.state[self.current_player], 3)):
            self.winner = self.current_player
            self.game_over = True
        elif not self.available_moves:
            # ties
            self.game_over = True

    @property
    def reward(self):
        if self.winner == 0:
            return 1
        elif self.winner == 1:
            return -1
        return 0

    def print_board(self):
        for r in range(3):
            for c in range(3):
                i = r * 3 + c
                if self.MAGIC[i] in self.state.x:
                    print('X', end='')
                elif self.MAGIC[i] in self.state.o:
                    print('O', end='')
                else:
                    print('.', end='')
            print()
        print()

In [15]:
from abc import abstractmethod

class Agent:
  @abstractmethod
  def choose_action(self, state: State, available_moves: list) -> int:
    pass

## expert agent
implemented following https://en.wikipedia.org/wiki/Tic-tac-toe

In [16]:
class ExpertAgent(Agent):
  def __init__(self, player = 0):
    self.player = player

  def choose_action(self, state: State, available_moves: list) -> int:
    move = self.win_move(available_moves, state)
    if move != 0:
      return move
    move = self.two_wins_move(available_moves, state)
    if move != 0:
      return move
    move = self.center_move(available_moves)
    if move != 0:
      return move
    move = self.corner_move(available_moves, state)
    if move != 0:
      return move
    return self.side_move(available_moves)

  # win else block
  def win_move(self, available, state):
    move = next((a for a in available for c in combinations(state[self.player], 2) if (sum(c+tuple([a]))==15)), 0)
    if move != 0:
      return move
    return next((a for a in available for c in combinations(state[(self.player+1)%2], 2) if (sum(c+tuple([a]))==15)), 0)

  # go to two wins else block two wins
  def two_wins_move(self, available, state):
    # per ogni available, faccio una copia dello stato, aggiungo l'available, conto i 15, se ce ne sono due la faccio
    move = next((a for a in available if sum(sum(c) == 15 for c in combinations(state[self.player].union([a]), 3)) == 2), 0)
    if move != 0:
      return move
    return next((a for a in available if sum(sum(c) == 15 for c in combinations(state[(self.player+1)%2].union([a]), 3)) == 2), 0)

  def center_move(self, available):
    if TicTacToe.CENTER in available:
      return TicTacToe.CENTER
    return 0

  # play the opposite corner of opponent, else any corner
  def corner_move(self, available, state):
    for i, c in enumerate(TicTacToe.CORNERS):
      if c in state[self.player]:
        opp_i = i + 2
        if opp_i >= len(TicTacToe.CORNERS):
          opp_i -= len(TicTacToe.CORNERS)
        if TicTacToe.CORNERS[opp_i] in available:
          return TicTacToe.CORNERS[opp_i]
    for c in TicTacToe.CORNERS:
      if c in available:
        return c
    return 0

  # choose any of the side
  def side_move(self, available):
    for s in TicTacToe.SIDES:
      if s in available:
        return s
    return 0

## model-free

### Q-learning
implemented following https://plainenglish.io/blog/building-a-tic-tac-toe-game-with-reinforcement-learning-in-python

In [17]:
class QLearningAgent:
    def __init__(self, alpha, epsilon, gamma, input_filename = None, output_filename = "Q"):
        if input_filename:
            with open(input_filename, "rb") as f:
                self.Q = pickle.load(f)
        else:
            self.Q = {}
        self.alpha = alpha
        self.epsilon = epsilon
        self.gamma = gamma

        self.output_filename = output_filename

    def get_value(self, state, action):
        if (state, action) not in self.Q:
            self.Q[(state, action)] = 0.0 # self.Q(( (x, o), action ))
        return self.Q[(state, action)]

    def choose_action(self, state: State, available_moves: list) -> int:
        hashable_state = (frozenset(state.x), frozenset(state.o))

        if np.random.uniform() < self.epsilon:
            return choice(available_moves)
        else:
            Q_values = [self.get_value(hashable_state, action) for action in available_moves]
            max_Q = max(Q_values)
            if Q_values.count(max_Q) > 1:
                best_moves = [i for i in range(len(available_moves)) if Q_values[i] == max_Q]
                i = choice(best_moves)
            else:
                i = Q_values.index(max_Q)
            return available_moves[i]

    # state, action, game.reward, next_state, game.available_moves
    def update(self, state, action, reward, next_state, next_available_moves):
        hashable_state = (frozenset(state.x), frozenset(state.o))
        hashable_next_state = (frozenset(next_state.x), frozenset(next_state.o))

        next_Q_values = [self.get_value(hashable_next_state, next_action) for next_action in next_available_moves]
        max_next_Q = max(next_Q_values) if next_Q_values else 0.0

        Q_value = self.get_value(hashable_state, action)
        self.Q[(hashable_state, action)] = Q_value + self.alpha * (reward + self.gamma * max_next_Q - Q_value)

    def set_epsilon(self, epsilon):
        self.epsilon = epsilon

    def save_Q(self):
        with open(self.output_filename, 'wb') as f:
            pickle.dump(self.Q, f)

### random

In [18]:
class RandomAgent:
  def choose_action(self, _: State, available_moves: list) -> int:
    return choice(available_moves)

## training Q-agent

In [19]:
# utils function to canonize, i.e. consider the symmetry
def canonize(state):
  pass
  # return canonic, transformation

In [20]:
def key(state, action):
  return ((frozenset(state.x), frozenset(state.x)), action)

In [21]:
def Q_train(num_episodes, alpha = .5, epsilon = .8, gamma = .8):
    agent_1 = QLearningAgent(alpha, epsilon, gamma)
    agent_2 = RandomAgent()
    e_range = np.linspace(1, 0.1, num_episodes)
    game = TicTacToe()
    for step in tqdm(range(num_episodes)):
        agent_1.set_epsilon(e_range[step])
        while not game.game_over:
            state = deepcopy(game.state)
            action = agent_1.choose_action(state, game.available_moves)
            game.make_move(action)

            if game.game_over:
                next_state = deepcopy(game.state)
                next_actions = game.available_moves
                reward = game.reward
                agent_1.update(state, action, reward, next_state, next_actions)

            else:
                reward = game.reward
                
                a2 = agent_2.choose_action(game.state, game.available_moves)
                game.make_move(a2)

                if game.game_over:
                    reward = game.reward

                next_state = deepcopy(game.state)
                next_actions = game.available_moves
                agent_1.update(state, action, reward, next_state, next_actions)
        game.reset()
    return agent_1

In [22]:
Q_train(100_000, .5, .8, .8).save_Q()

  1%|          | 1021/100000 [00:00<00:09, 10204.39it/s]

100%|██████████| 100000/100000 [00:07<00:00, 12510.62it/s]


## test policy

In [23]:
ties = 0
wins = 0
total = 100_000

alpha = .5
epsilon = 1
gamma = .8

players = [QLearningAgent(alpha, epsilon, gamma, input_filename="Q"), RandomAgent()]

game = TicTacToe()

for steps in tqdm(range(total)):
    while not game.game_over:
        move = players[game.current_player].choose_action(game.state, game.available_moves)
        game.make_move(move)
    if game.reward == 1:
        wins += 1
    elif game.reward == 0:
        ties += 1
    game.reset()

print(f"wins: {wins/total:.2%} ")

100%|██████████| 100000/100000 [00:01<00:00, 75821.34it/s]

wins: 58.45% 



