# Solving Tic-Tac-Toe with Reinforcement Learning

In this notebook we will try to build a reinforcement-learning model that is able to play Tic-Tac-Toe correctly against a human and against another AI.

In [1]:
import numpy as np

In [3]:
class TicTacToe(object):
    """Tic-tac-toe match between two player objects.

    Cells are numbered 0-8 in row-major order (``cell = row * 3 + col``).
    ``history`` stores the cell numbers in the order they were played, so the
    player at ``players[0]`` owns the even-indexed entries and the player at
    ``players[1]`` the odd-indexed ones. On ``board`` a cell holds 0 when
    empty, 1 for ``players[0]`` and 2 for ``players[1]``.

    Each player object must provide ``Play_Turn(choices, history)`` returning
    a ``(row, col)`` pair and ``Update(reward)``.
    """

    # Every triple of cell numbers that forms a winning line.
    WINNING_LINES = (
        (0, 1, 2), (3, 4, 5), (6, 7, 8),  # rows
        (0, 3, 6), (1, 4, 7), (2, 5, 8),  # columns
        (0, 4, 8), (2, 4, 6),             # diagonals
    )

    def __init__(self, player1, player2):
        """Create an empty board and draw the playing order at random."""
        super(TicTacToe, self).__init__()
        self.board = np.zeros((3, 3))
        # np.random.shuffle works in place and returns None, so the list has
        # to be stored first and shuffled afterwards.
        self.players = [player1, player2]
        np.random.shuffle(self.players)
        self.history = np.array([], dtype=int)

    def reset(self):
        """Clear the board and history and redraw the playing order."""
        self.history = np.array([], dtype=int)
        np.random.shuffle(self.players)  # in place; the return value is None
        self.board = np.zeros((3, 3))

    def Play_Game(self):
        """Play turns until one player wins or the board is full.

        When a player completes a line, that player receives ``Update(1)``
        and the opponent ``Update(-1)``; nothing is sent on a draw. The final
        board is printed once the game is over.

        Returns:
            The winning player object, or None if the game is a draw.

        Raises:
            ValueError: if a player returns a position that is outside the
                board or already occupied.
        """
        winner = None
        # Do not continue a game that was already won (e.g. Play_Game called
        # twice without reset).
        for turn in (0, 1):
            if self.Is_Game_Ended(turn):
                winner = self.players[turn]

        while winner is None and not self.Is_Game_Ended(None):
            turn = len(self.history) % 2
            player = self.players[turn]
            choices = self.Find_Pos_Available()
            row, col = player.Play_Turn(choices, self.history)
            row, col = int(row), int(col)
            # Row-major encoding: the inverse of (cell // 3, cell % 3).
            cell = row * 3 + col
            if not (0 <= row < 3 and 0 <= col < 3) or cell not in choices:
                raise ValueError(
                    "illegal move {} played by {!r}".format((row, col), player))

            # np.append returns a new array, so the result must be kept.
            self.history = np.append(self.history, cell)
            self.board[row][col] = turn + 1

            # check if the player wins
            if self.Is_Game_Ended(turn):
                player.Update(1)
                self.players[(turn + 1) % 2].Update(-1)
                winner = player

        print(self.board)
        return winner

    def Is_Game_Ended(self, turn):
        """Tell whether the game is over.

        Args:
            turn: None to ask whether all nine cells have been played, or
                0 / 1 to ask whether the player moving at that parity owns a
                complete winning line.

        Returns:
            bool: True if the requested ending condition holds.
        """
        if turn is None:
            return len(self.history) == 9
        # Moves of one player are every second entry starting at `turn`.
        owned = set(int(cell) for cell in self.history[turn::2])
        return any(owned.issuperset(line) for line in self.WINNING_LINES)

    def Find_Pos_Available(self):
        """Return the still-empty cell numbers as a sorted integer array."""
        played = set(int(cell) for cell in self.history)
        return np.array([cell for cell in range(9) if cell not in played],
                        dtype=int)
                

In [7]:
class Player(object):
    """Baseline player that picks a uniformly random available position."""

    def __init__(self, name):
        """Store the display name of the player."""
        super(Player, self).__init__()
        self.name = name

    def Play_Turn(self, choices, history):
        """Choose a move among the available positions.

        Args:
            choices: non-empty sequence of available cell numbers (0-8,
                row-major).
            history: cells played so far; not used by the random player.

        Returns:
            tuple: ``(row, col)`` of the chosen cell, as plain ints.
        """
        action = int(np.random.choice(choices))
        return (action // 3, action % 3)

    def Update(self, reward):
        """Receive the end-of-game reward; the random player learns nothing."""
        pass

    def __str__(self):
        return self.name

    def __repr__(self):
        return "<Object Player:Random, Name:{}>".format(self.name)

In [38]:
class Q_Player(Player):
    """Player that chooses moves from tables of stored action values.

    ``Q`` is a list of nine arrays. ``Q[8 - n]`` is used when ``n`` moves have
    already been played; it has ``n + 1`` axes of sizes 9, 8, ..., 9 - n. The
    first ``n`` axes are indexed by the past moves and the last axis by the
    candidate move. Because axis ``k`` only has ``9 - k`` entries, every move
    is indexed by its rank among the cells that were still free when it was
    played, not by its absolute cell number.

    ``rates[0]`` is the probability of playing a random move instead of the
    best known one. The other entries of ``rates`` are stored but not read by
    this class.
    """

    def __init__(self, name, rates):
        super(Q_Player, self).__init__(name)
        # instantiate the Q tables: Q[i] has shape (9, 8, ..., i + 1)
        self.Q = []
        for i in range(9):
            self.Q.append(np.zeros([j for j in range(9, i, -1)]))
        # rates
        self.rates = rates
        # [table number, rank of each past move..., rank of the chosen move]
        # for the most recent call to Play_Turn; None before the first move.
        self.state = None

    def __repr__(self):
        return "<Object Player:Q_Player, Name:{}>".format(self.name)

    @staticmethod
    def _Index_History(history):
        """Convert past moves to table indices.

        Returns:
            tuple: (ranks, remaining) where ``ranks`` is a tuple holding, for
            each past move, its rank among the cells free at that time, and
            ``remaining`` is the sorted list of cells that are still free.
        """
        remaining = list(range(9))
        ranks = []
        for cell in history:
            rank = remaining.index(int(cell))
            ranks.append(rank)
            remaining.pop(rank)
        return tuple(ranks), remaining

    def Play_Turn(self, choices, history):
        """Choose a move, exploring at random with probability ``rates[0]``.

        Args:
            choices: non-empty sequence of available cell numbers (0-8,
                row-major).
            history: cell numbers played so far, in order.

        Returns:
            tuple: ``(row, col)`` of the chosen cell, as plain ints.

        Raises:
            ValueError: if ``choices`` is empty or contains a cell that
                already appears in ``history``.
        """
        candidates = [int(cell) for cell in choices]
        if not candidates:
            raise ValueError("no position available")
        ranks, remaining = self._Index_History(history)
        table = len(self.Q) - len(ranks) - 1
        # 1-D view of the stored values for every free cell in this state.
        values = self.Q[table][ranks]

        if np.random.uniform(0, 1) <= self.rates[0]:
            action = int(np.random.choice(candidates))
        # if using memory then maximize rewards
        else:
            scores = [values[remaining.index(cell)] for cell in candidates]
            action = candidates[int(np.argmax(scores))]

        # Remember the move actually played (random or not) so that Update
        # writes the reward to the right table entry.
        self.state = [table] + list(ranks) + [remaining.index(action)]
        return (action // 3, action % 3)

    def Update(self, reward):
        """Store ``reward`` as the value of the last move played.

        Does nothing if no move has been played yet.
        """
        if self.state is None:
            return
        self.Q[self.state[0]][tuple(self.state[1:])] = reward

    def Update_Q(self, reward, ):
        """Same as ``Update``; kept so existing callers keep working."""
        self.Update(reward)
        

In [39]:
# Build a Q-learning player. Of the three rates, only the first is read by
# Q_Player.Play_Turn (as the probability of playing a random move); 1.0 means
# the player always explores.
q_player = Q_Player(name="Tonclure", rates=[1.0, 1e-6, 0.1])
q_player

<Object Player:Q_Player, Name:Tonclure>

array([[0., 0.],
       [0., 0.]])

In [41]:
# Scratch cell: iterates over the tuple (1, 0, 1) without doing any work.
for i in (1, 0, 1):
    continue