## Lab 10

Deadline by 25/12/2023
Devise a strategy to play tic-tac-toe by means of RL

In [1]:
import lab10_lib as ttt
import random as rand
from copy import deepcopy
from collections import defaultdict
import numpy

Example to show how to use the board

In [2]:
board = ttt.Board()
board.update((1,1), 1)
board.update((0,0), -1)
board.update((2,0), 1)
board.update((0,1), 1)
board.update((1,2), -1)
board.update((1,0), 1)
board.update((2,2), 1)
board.update((2,1), -1)
#board.update((0,2), -1)

print(board)
print(board.winner())

⭕️❌  
❌❌⭕️
❌⭕️❌

None


Some strategies: random, expert, learning

In [3]:
class RandomAgent():
    """returns a random move"""
    def generate_move(self, board: ttt.Board) -> ttt.Move:
        possible = board.possible_moves()
        return possible[rand.randint(0, len(possible)-1)]


class ExpertAgent():
    """checks if it possible to block a winning move, else returns random"""
    def generate_move(self, board: ttt.Board) -> ttt.Move:
        possible = board.possible_moves()
        for m in possible:
            sim_board = ttt.Board(deepcopy(board.state))
            sim_board.update(m, 1)
            if sim_board.winner() == 1:
                return m
        return possible[rand.randint(0, len(possible)-1)]
    

class LearningAgent():
    """learns how to play by playing a lot of games"""
    def __init__(self) -> None:
        self.qtable = defaultdict(float)
        self.index = 0 
        self.trajectory = [] 
        self.epsilon = 0.1                      #probability to make a random move
        self.lr = 0.5                           #learning rate
        self.gamma = 0.8                        #discount rate

    def stop_exploring(self) -> None:
        """sets epsilon to 0 to avoid random moves from now on"""
        self.epsilon = 0.0

    def garbage_collect(self) -> None:
        """resets trajectory, use it to avoid memory leaks"""
        self.trajectory = []    

    def generate_move(self, board: ttt.Board) -> ttt.Move:
        """generates a move using, if possible the best one"""
        possible = board.possible_moves()
        keys = [ (board.state, move) for move in possible ]             #generate all the possible keys = (state, action)
        if len(keys) == 0 or rand.random() < self.epsilon:
            return possible[rand.randint(0, len(possible)-1)]           #return a random move if no idea on what to do
        vals = [ self.qtable[k] for k in keys ]
        choice = keys[numpy.argmax(vals)]                               #choose the highest quality move
        self.trajectory.append(choice)                                  #save the move for later
        return choice[1]
    
    def feedback(self, won: int) -> None:
        """updates qtable and resets trajectory"""
        won = [-1, 15, -5][won]      #draw, win, lose
        i = 0
        self.trajectory.reverse()
        for c in self.trajectory:
            maxq = 0
            if i != 0:                                              #apply q-learning formula
                board = ttt.Board(self.trajectory[i-1][0])          #take the board after applying the move
                possible = board.possible_moves()                    #compute all the possible (state, action) to find the max
                keys = [ (board.state, move) for move in possible ]
                maxq = max([self.qtable[k] for k in keys])
            self.qtable[c] = (1 - self.lr) * self.qtable[c] + self.lr * (won + self.gamma * maxq)   #update qtable
            i += 1
        self.trajectory = []                                        #reset trajectory    

Toy example to show how it works

In [4]:
b = ttt.Board()
p1 = LearningAgent()
p2 = RandomAgent()
p3 = ExpertAgent()

ttt.play_games(b, p1, [p2, p3], n_games=1, logging=True, toy_games=True, print_board=True)

⭕️❌⭕️
❌❌⭕️
⭕️⭕️❌

❌❌⭕️
❌    
⭕️⭕️⭕️

won: 0
lost: 1
draw: 1



## Now we learn and then try to play

In [5]:
b = ttt.Board()
p1 = LearningAgent()
p2 = RandomAgent()
p3 = ExpertAgent()

#create a baseline
print('Before learning:')
ttt.play_games(b, p1, [p2, p3], logging=True, toy_games=True)
p1.garbage_collect()                    #delete the trajectory to avoid updateing all the states after the first real game

#learn
ttt.play_games(b, p1, [p2, p3], n_games=10000)

#play to confront with baseline
print('After learning:')
p1.stop_exploring                       #set epsilon to 0 to avoid random moves
ttt.play_games(b, p1, [p2, p3], logging=True, toy_games=True)
p1.garbage_collect()

Before learning:
won: 73
lost: 97
draw: 30

After learning:
won: 139
lost: 28
draw: 33

