## Lab 10

Deadline by 25/12/2023
Devise a strategy to play tic-tac-toe by means of RL

In [88]:
import lab10_lib as ttt
import random as rand
from copy import deepcopy
from collections import defaultdict
import numpy

Example to show how to use the board

In [89]:
board = ttt.Board()
board.update((1,1), 1)
board.update((0,0), -1)
board.update((2,0), 1)
board.update((0,1), 1)
board.update((1,2), -1)
board.update((1,0), 1)
board.update((2,2), 1)
board.update((2,1), -1)
#board.update((0,2), -1)

print(board)
print(board.winner())

⭕️❌  
❌❌⭕️
❌⭕️❌

None


Some strategies: random, expert, learning

In [90]:
def possible_moves(board: ttt.Board) -> list[ttt.Move]:
    """returns a list of possible moves"""
    possible = []
    for i in range(3):
        for j in range(3):
            if board.state[i][j] == 0:
                possible.append((i,j))
    return possible    
            

class RandomAgent():
    """returns a random move"""
    def generate_move(self, board: ttt.Board) -> ttt.Move:
        possible = possible_moves(board)
        return possible[rand.randint(0, len(possible)-1)]


class ExpertAgent():
    """checks if it possible to block a winning move, else returns random"""
    def generate_move(self, board: ttt.Board) -> ttt.Move:
        possible = possible_moves(board)
        for m in possible:
            sim_board = ttt.Board(deepcopy(board.state))
            sim_board.update(m, 1)
            if sim_board.winner() == 1:
                return m
        return possible[rand.randint(0, len(possible)-1)]
    

class LearningAgent():
    """learns how to play by playing a lot of games"""
    def __init__(self) -> None:
        self.qtable = defaultdict(float)
        self.index = 0 
        self.trajectory = [] 
        self.epsilon = 0.1                      #probability to make a random move
        self.lr = 0.5                           #learning rate
        self.gamma = 0.8                        #discount rate

    def stop_exploring(self) -> None:
        self.epsilon = 0.0

    def generate_move(self, board: ttt.Board) -> ttt.Move:
        """generates a move using, if possible the best one"""
        possible = possible_moves(board)
        keys = [ (board.state, move) for move in possible ]
        if len(keys) == 0 or rand.random() < self.epsilon:
            return possible[rand.randint(0, len(possible)-1)]
        keys = [ (board.state, move) for move in possible ]
        vals = [ self.qtable[k] for k in keys ]
        choice = keys[numpy.argmax(vals)]
        self.trajectory.append(choice)
        return choice[1]
    
    def feedback(self, won: int) -> None:
        """updates qtable and resets trajectory"""
        won = [-5, 15, -10][won]      #draw, win, lose
        i = 0
        self.trajectory.reverse()
        for c in self.trajectory:
            maxq = max([self.qtable[res] for res in self.trajectory[0:i]]) if i != 0 else won
            self.qtable[c] = (1 - self.lr) * self.qtable[c] + self.lr * (won + self.gamma * maxq) 
            i += 1
        self.trajectory = []        

Toy example to show how it works

In [91]:
b = ttt.Board()
p1 = LearningAgent()
p2 = RandomAgent()
p3 = ExpertAgent()

ttt.play_games(b, p1, [p2, p3], n_games=2, logging=True, toy_games=True, print_board=True)

❌❌⭕️
❌  ⭕️
  ⭕️⭕️

❌❌⭕️
⭕️⭕️❌
❌❌⭕️

❌❌⭕️
❌⭕️⭕️
⭕️    

❌❌⭕️
❌⭕️  
⭕️⭕️❌

won: 0
lost: 3
draw: 1



## Now we learn and then try to play

In [92]:
b = ttt.Board()
p1 = LearningAgent()
p2 = RandomAgent()
p3 = ExpertAgent()

#create a baseline
ttt.play_games(b, p1, [p2, p3], n_games=100, logging=True, toy_games=True)

#learn
ttt.play_games(b, p1, [p2, p3], n_games=10000)

#play to confront with baseline
p1.stop_exploring                       #set epsilon to 0 to avoid random moves
ttt.play_games(b, p1, [p2, p3], n_games=100, logging=True, toy_games=True)

won: 77
lost: 104
draw: 19

won: 128
lost: 43
draw: 29

