# Tic Tac Toe RL solving

In this notebook we try to build a reinforcement-learning model that can play Tic Tac Toe well against a human or another AI.

In [148]:
import numpy as np
import random

In [149]:
class TicTacToe(object):
    """Tic-tac-toe environment that plays two player objects against each other.

    The board is a flat array of 9 cells indexed 0..8, row by row. An empty
    cell holds "v"; played cells hold "x" or "o".

    A player object must provide:
      * ``step(history, choices)`` -> the index of the cell to play, which
        must be one of ``choices``;
      * ``update(reward)`` -> called once when the game ends, with 1 for the
        winner, -1 for the loser and 0 for both players on a draw.

    ``run_game`` does not clear the board; call ``reset_env`` between games.
    """

    # Marker stored in a cell nobody has played yet.
    _EMPTY = "v"
    # Number of cells on the board.
    _N_CELLS = 9
    # Every triple of cell indices that wins the game when held by one marker.
    _WIN_LINES = (
        (0, 1, 2), (3, 4, 5), (6, 7, 8),  # rows
        (0, 3, 6), (1, 4, 7), (2, 5, 8),  # columns
        (0, 4, 8), (2, 4, 6),             # diagonals
    )

    def __init__(self, player1, player2):
        super(TicTacToe, self).__init__()
        self.players = [player1, player2]
        self.markers = ["x", "o"]
        # reset_env creates the board and history and shuffles the play order.
        self.reset_env()

    def __str__(self):
        """Return the board drawn as a 3x3 text grid."""
        rule = "_____________"
        lines = [rule]
        for start in range(0, self.board.shape[0], 3):
            cells = "".join("| {} ".format(c) for c in self.board[start:start + 3])
            lines.append(cells + "|")
            lines.append(rule)
        return "\n".join(lines)

    def __repr__(self):
        return "<Object TicTacToe, Players:[{},{}]>".format(self.players[0], self.players[1])

    def reset_env(self):
        """Empty the board and history and reshuffle which player moves first."""
        self.board = np.array([self._EMPTY] * self._N_CELLS)
        # players[0] always plays markers[0] ("x") and moves first.
        random.shuffle(self.players)
        self.history = []

    def run_game(self, verbose=0):
        """Play moves until one player wins or the board is full.

        Args:
            verbose: 0 prints nothing, >= 1 prints the final result,
                2 additionally prints the board after every move.

        Raises:
            ValueError: if a player returns an action that is not among the
                currently free cells.
        """
        done = False
        while not done:
            # Whose turn it is follows from how many moves were played so far.
            turn = len(self.history) % 2
            p = self.players[turn]
            opponent = self.players[(turn + 1) % 2]
            m = self.markers[turn]

            choices = self.get_choices()
            action = p.step(self.history, choices)
            if action not in choices:
                raise ValueError("Error - You choose the action {} but were only available {}".format(action, choices))
            self.board[action] = m
            self.history.append(action)

            if verbose == 2:
                print(self)

            # Only the player who just moved can have completed a line.
            if self.has_win(m):
                p.update(1)
                opponent.update(-1)
                if verbose >= 1:
                    print("Player {} wins :\n{}".format(p, self))
                done = True
            elif self.is_board_full(verbose):
                p.update(0)
                opponent.update(0)
                done = True

    def is_board_full(self, verbose):
        """Return True when every cell has been played; announce it if verbose >= 1."""
        cond = len(self.history) == self.board.shape[0]
        if cond and verbose >= 1:
            print("Game is finished with no winner")
        return cond

    def has_win(self, marker):
        """Return True if `marker` fills any row, column or diagonal."""
        return any(
            all(self.board[cell] == marker for cell in line)
            for line in self._WIN_LINES
        )

    def get_choices(self):
        """Return the indices of the cells not played yet, in ascending order."""
        played = set(self.history)
        return [cell for cell in range(self.board.shape[0]) if cell not in played]
            
        

In [150]:
class Player(object):
    """Baseline player that picks a random move among the offered choices."""

    def __init__(self, name):
        super(Player, self).__init__()
        self.name = name

    def step(self, history, choices):
        """Return one element of `choices` drawn at random; `history` is not used."""
        return random.choice(choices)

    def update(self, reward):
        """Receive the end-of-game reward and ignore it (nothing is learned)."""
        return None

    def __str__(self):
        return self.name

    def __repr__(self):
        label = "<Object Player:Random, Name:{}>".format(self.name)
        return label

In [173]:
class Player_Q_Learning(object):
    """Tabular Q-learning player with an epsilon-greedy policy.

    A state is identified by the free cells offered to ``step`` (for example
    the key "0-1-2" for choices [0, 1, 2]). ``states`` maps that key to a row
    index in ``states_actions``; each row holds one Q-value per free cell, in
    the same order as the choices.

    NOTE(review): the key is built from the free cells only, so two boards
    with the same free cells but different marker placement share one row.

    Args:
        name: display name of the player.
        epsilon: initial probability of playing a random move.
        gamma: discount factor applied to the value of the next state.
        rates: three numbers ``[learning_rate, epsilon_decay, epsilon_min]``.
            ``epsilon`` is lowered by ``epsilon_decay`` after every game and
            never drops below ``epsilon_min``.
    """

    def __init__(self, name, epsilon=1., gamma=0.9, rates=(0.01, 0.01, 0.01)):
        super(Player_Q_Learning, self).__init__()
        self.name = name
        # Copy so the instance never aliases the default or the caller's list.
        self.rates = list(rates)
        self.gamma = gamma
        self.epsilon = epsilon
        self.states = {}
        self.states_actions = []
        # [row index, column index] of the last move played in the current game.
        self.last_state = None

    def step(self, history, choices):
        """Choose a cell among `choices` and learn from the previous move.

        With probability ``epsilon`` a random choice is played, otherwise the
        choice with the highest Q-value. `history` is not used.

        Returns:
            The chosen element of `choices`.
        """
        key = "-".join(str(cell) for cell in choices)
        state = [self.check_key_exists(key, len(choices)), -1]
        if np.random.rand() <= self.epsilon:
            state[1] = random.choice(range(len(choices)))
            action = choices[state[1]]
        else:
            state[1], action = self.argmax(self.states_actions[state[0]], choices)

        # The state we are in now is the successor of our previous move.
        self.update_from_previous(state)
        self.last_state = state
        return action

    def update(self, reward):
        """Apply the end-of-game reward to the last move and decay epsilon."""
        if self.last_state is not None:
            row, col = self.last_state
            current = self.states_actions[row][col]
            # Terminal transition: there is no next state, the target is the reward.
            self.states_actions[row][col] = current + self.rates[0] * (reward - current)
            # The game is over; the next game must not bootstrap from this move.
            self.last_state = None
        self.epsilon = max(self.epsilon - self.rates[1], self.rates[2])

    def update_from_previous(self, state):
        """Move the Q-value of the previous move towards the value of `state`.

        Implements Q(s, a) += lr * (0 + gamma * max_a' Q(s', a') - Q(s, a)),
        where (s, a) is ``last_state``, s' is ``state[0]`` and the immediate
        reward of a non-terminal move is 0. Does nothing on the first move of
        a game.
        """
        if self.last_state is None:
            return
        row, col = self.last_state
        best_next = max(self.states_actions[state[0]])
        current = self.states_actions[row][col]
        self.states_actions[row][col] = current + self.rates[0] * (self.gamma * best_next - current)

    def __str__(self):
        return self.name

    def __repr__(self):
        return "<Object Player:QLearning, Name:{}>".format(self.name)

    def check_key_exists(self, key, choices):
        """Return the row index for state `key`, creating a zero row if needed.

        Args:
            key: state key as built by ``step``.
            choices: number of actions available in that state (row length).
        """
        if key not in self.states:
            self.states[key] = len(self.states_actions)
            self.states_actions.append([0 for _ in range(choices)])
        return self.states[key]

    def argmax(self, array, choices):
        """Return ``(index, choices[index])`` of the first largest value in `array`."""
        max_i = 0
        max_val = -9999999
        for i, value in enumerate(array):
            if value > max_val:
                max_val = value
                max_i = i
        return (max_i, choices[max_i])

In [174]:
# Pit a random player against the Q-learning player.
p1 = Player("Tonclure")
p2 = Player_Q_Learning("Tonclurette")

# The constructor shuffles the two players; the first one shown in the repr
# moves first and plays "x".
ttt = TicTacToe(p1, p2)
ttt

<Object TicTacToe, Players:[Tonclurette,Tonclure]>

In [175]:
# Play one full game; verbose=1 prints only the final result.
ttt.run_game(verbose=1)

creating
not creating
creating
not creating
creating
not creating
creating
not creating
Player Tonclurette wins :
_____________
| x | v | o |
_____________
| x | o | v |
_____________
| x | o | x |
_____________


In [177]:
# States seen by the Q-learning player: key of free cells -> row index in p2.states_actions.
p2.states

{'0-1-2-3-4-5-6-7-8': 0, '0-1-2-3-4-5-6': 1, '1-3-4-5-6': 2, '1-5-6': 3}