# Tic Tac Toe RL solving

In this notebook we will try to provide a model which is able to play correctly against a human and an other AI

In [2]:
import numpy as np
import random
import json
import sys

In [3]:
class TicTacToe(object):
    
    def __init__(self, player1, player2):
        super(TicTacToe, self).__init__()
        self.board = np.array(["v", "v", "v", "v", "v", "v", "v", "v", "v"])
        
        self.players = [player1, player2]
        self.markers = ["x", "o"]
        random.shuffle(self.players)
        
        self.history = []
   
    def __str__(self):
        s = "_____________\n"
        for i in range(self.board.shape[0]):
            if i % 3 == 0 and i != 0:
                s += "|\n_____________\n"
            s += "| {} ".format(self.board[i])
        s += "|\n_____________"
        return s
    
    def __repr__(self):
        return "<Object TicTacToe, Players:[{},{}]>".format(self.players[0], self.players[1])
    
    def reset_env(self):
        self.board = np.array(["v", "v", "v", "v", "v", "v", "v", "v", "v"])
        random.shuffle(self.players)
        self.history = []
        
    def run_game(self, verbose=0):
        done = False
        while not done:
            turn = len(self.history)%2
            p = self.players[turn]
            m = self.markers[turn]
            
            # player give an action following the grid
            choices = self.get_choices()
            action = p.step(self.history, choices)
            if action in choices:
                self.board[action] = m
                self.history.append(action)
            else:
                raise Exception("Error - You choose the action {} but were only available {}".format(action, choices))
            # prints
            if verbose==2:
                print(self)
            #checks
            done = self.has_win(m)
            if not done:
                done = self.is_board_full(verbose)
                if done:
                    p.update(0)
                    self.players[(turn+1)%2].update(0)
            else:
                p.update(1)
                self.players[(turn+1)%2].update(-1)
                if verbose >= 1:
                    print("Player {} wins :\n{}".format(p, self))
                    
    def train(self, epochs=1, iter_save=1, iter_verbose=1, verbose = 0):
        for i in range(epochs):
            if (i+1) % iter_verbose == 0:
                if verbose >= 0:
                    sys.stdout.write("\repoch {} / {}".format(i+1, epochs))
                    sys.stdout.flush()
                    
            self.run_game(verbose)
            self.reset_env()
            
            if (i+1) % iter_save == 0:
                for p in self.players:
                    p.save_model()
                    
        #print("Training Done")
            
    
    def is_board_full(self, verbose):
        cond = len(self.history) == self.board.shape[0]
        if cond and verbose >= 1:
            print("Game is finished with no winner")
        return cond
    
    def has_win(self, marker):
        cond = False
        # row
        cond = cond or (self.board[0] == marker and self.board[1] == marker and self.board[2] == marker) or \
            (self.board[3] == marker and self.board[4] == marker and self.board[5] == marker) or \
            (self.board[6] == marker and self.board[7] == marker and self.board[8] == marker)
        # column
        cond = cond or (self.board[0] == marker and self.board[3] == marker and self.board[6] == marker) or \
            (self.board[1] == marker and self.board[4] == marker and self.board[7] == marker) or \
            (self.board[2] == marker and self.board[5] == marker and self.board[8] == marker)
        # diagonals
        cond = cond or (self.board[0] == marker and self.board[4] == marker and self.board[8] == marker) or \
            (self.board[2] == marker and self.board[4] == marker and self.board[6] == marker)
        return cond
        
    def get_choices(self):
        choices = [i for i in range(0,9)]
        for i in self.history:
            choices.remove(i)
        return choices
            
        

In [4]:
class Player(object):
    def __init__(self, name):
        super(Player, self).__init__()
        self.name = name
        self.game_played=0
        
    def step(self, history, choices):
        action = random.choice(choices)
        return action
    
    def update(self, reward):
        self.game_played += 1
        pass
    
    def save_model(self):
        data = {"game_played":self.game_played}
        
        with open('{}.json'.format(self.name), 'w') as outfile:
            json.dump(data, outfile)
    
    def __str__(self):
        return self.name
    def __repr__(self):
        return "<Object Player:Human, Name:{}>".format(self.name)

In [5]:
class Player_Human(Player):
    def __init__(self, name):
        super(Player_Human, self).__init__(name)
        self.name = name
        self.game_played=0
        
    def step(self, history, choices):
        action = int(input("Choose an action {} = ".format(choices)))
        return action
    
    def update(self, reward):
        self.game_played += 1
        pass
    
    def save_model(self):
        data = {"game_played":self.game_played}
        
        with open('{}.json'.format(self.name), 'w') as outfile:
            json.dump(data, outfile)
    
    def __str__(self):
        return self.name
    def __repr__(self):
        return "<Object Player:Random, Name:{}>".format(self.name)

In [6]:
class Player_Q_Learning(Player):
    def __init__(self, name, epsilon=1., gamma=0.9, rates=[0.01, 0.01, 0.01]):
        super(Player_Q_Learning, self).__init__(name)
        self.name = name
        self.rates = rates
        self.gamma = gamma
        self.epsilon = epsilon
        self.states = {}
        self.states_actions = []
        self.last_state = None
        
    def step(self, history, choices):
        action = 0
        state = [-1,-1]
        state[0] = self.check_key_exists(str(choices).replace("[", "").replace("]", "").replace(", ", "-"), len(choices))
        if np.random.rand() <= self.epsilon:
            state[1] = random.choice(range(len(choices)))
            action = choices[state[1]]
            
            # update weights
            self.update_from_previous(state)
        else:
            state[1], action = self.argmax(self.states_actions[state[0]], choices)
            
            # update weights
            self.update_from_previous(state)
        self.last_state = state
        return action
    
    def update(self, reward):
        Player.update(self, reward)
        if self.last_state != None:
            self.states_actions[self.last_state[0]][self.last_state[1]] = reward
        self.epsilon = self.epsilon - self.rates[1]
        if self.epsilon < self.rates[2]:
            self.epsilon = self.rates[2]
    
    def update_from_previous(self, state):
        if self.last_state != None:
            new = self.states_actions[self.last_state[0]][self.last_state[1]]
            self.states_actions[state[0]][state[1]] += self.rates[0] * (0 + self.gamma * new - self.states_actions[state[0]][state[1]])
        pass
    
    def __str__(self):
        return self.name
    def __repr__(self):
        return "<Object Player:QLearning, Name:{}>".format(self.name)
    
    def check_key_exists(self, key, choices):
        try:
            i = self.states[key]
            return i
        except:
            self.states[key] = len(self.states_actions)
            self.states_actions.append([0 for i in range(choices)])
            return self.check_key_exists(key, choices)
        
    def argmax(self, array, choices):
        max_i = 0
        max_val = -9999999
        for i in range(len(array)):
            if array[i] > max_val:
                max_val = array[i]
                max_i = i
        return (max_i, choices[max_i])
    
    def save_model(self):
        # Q's
        data = {"game_played":self.game_played, "Q":[]}
        for key in self.states:
            data["Q"].append([key, self.states[key], self.states_actions[self.states[key]]])
        
        with open('{}.json'.format(self.name), 'w') as outfile:
            json.dump(data, outfile)
            
    def load_model(self):
        with open('{}.json'.format(self.name)) as json_file:
            data = json.load(json_file)
            self.game_played = int(data["game_played"])
            for i in range(len(data['Q'])):
                self.states_actions.append(0)
            for state in data['Q']:
                self.states[state[0]] = int(state[1])
                self.states_actions[int(state[1])] = [float(i) for i in state[2]]

In [11]:
p1 = Player("RandomPlayer")
p2 = Player_Q_Learning("QLearningPlayer", epsilon=1., gamma=0.99, rates=[0.01, 0.00000001, 0.0001])

In [12]:
# if load model
p2.load_model()

In [13]:
ttt = TicTacToe(p1, p2)
ttt

<Object TicTacToe, Players:[RandomPlayer,QLearningPlayer]>

In [14]:
ttt.train(epochs=10_000_000, iter_save=10000, iter_verbose=10000, verbose=0)

epoch 10000000 / 10000000

In [15]:
p2.game_played

11000000

In [18]:
p2.states_actions

[[0.0024402897959738424,
  -0.07055253800110621,
  0.09589160086436807,
  0.10320766697960011,
  -0.1073857161675238,
  0.040552039562199126,
  0.01446399043215677,
  0.0051518215623756435],
 [-0.9304113521964521,
  -0.9578158895443376,
  -0.9684258816744501,
  -0.7791905975349652,
  -0.9681693276113277,
  -0.9898248595668939],
 [1, -0.999403583588252, 0.9634676276274561, -0.9890903283879019],
 [0, -1],
 [-0.10584168726739984,
  0.004218880989399645,
  0.07141305039863304,
  0.12052523922168518,
  0.011719914677601983,
  0.05302063072117422,
  -0.030069809426477218,
  0.10596584690705248],
 [-0.9438108199489631, -1, -0.8730851164228759, -0.982035086668297, -1, -1],
 [0.07009237060062423,
  -0.019938204290615824,
  -0.05973096952034849,
  -0.05116393562828293,
  0.06534377072213832,
  -0.09041600633397408,
  0.03666538174221627,
  -0.016575419010061015],
 [-1,
  -0.9896056842390211,
  -0.9430655693597424,
  -0.9894185433999403,
  -1,
  -0.9611292319736905],
 [-0.9749134710167688, -1, -0

# Play Against the Q_learning player

In [16]:
p3 = Player_Human("Camille")
ttt = TicTacToe(p3, p2)

In [19]:
ttt.reset_env()
ttt.run_game(verbose=2)


Choose an action [0, 1, 2, 3, 4, 5, 6, 7, 8] = 0
_____________
| x | v | v |
_____________
| v | v | v |
_____________
| v | v | v |
_____________
_____________
| x | o | v |
_____________
| v | v | v |
_____________
| v | v | v |
_____________
Choose an action [2, 3, 4, 5, 6, 7, 8] = 3
_____________
| x | o | v |
_____________
| x | v | v |
_____________
| v | v | v |
_____________
_____________
| x | o | v |
_____________
| x | v | o |
_____________
| v | v | v |
_____________
Choose an action [2, 4, 6, 7, 8] = 6
_____________
| x | o | v |
_____________
| x | v | o |
_____________
| x | v | v |
_____________
Player Camille wins :
_____________
| x | o | v |
_____________
| x | v | o |
_____________
| x | v | v |
_____________
