## **Playing TicTacToe with Reinforcement Learning & OpenAI Gym**

First we import the modules we need: gym (to create and interact with the TicTacToe environment), random (to make random choices when interacting with the environment) and gym_tictactoe (the TicTacToe environment itself).

In [1]:
import gym
import random
import gym_tictactoe # this is our tictactoe environment

Creating an instance of the installed environment

In [2]:
# Build the environment under the id "TicTacToe-v0"; this single `env`
# object is shared by every cell that follows.
env = gym.make("TicTacToe-v0")

Understanding our environment

In [3]:
env.state

[['-', '-', '-'], ['-', '-', '-'], ['-', '-', '-']]

In [4]:
env.hash()

'---------'

In [5]:
# Place an "X" on position 0 (the top-left square in the rendered board).
# step() hands back four values: the new board hash, a reward tuple, a done
# flag and an info dict -- each is displayed in the cells that follow.
new_state, reward, done, info = env.step(0, "X")

In [6]:
new_state

'X--------'

In [7]:
reward

(0, 0)

In [8]:
done

False

In [9]:
info

{}

Now we will render the current state of the game in the form of a TicTacToe board

In [10]:
env.render()

Board
['X', '-', '-']
['-', '-', '-']
['-', '-', '-']


Function to return a list of positions available based on the current state of the game.

In [11]:
env.available_actions()

[1, 2, 3, 4, 5, 6, 7, 8]

Function to return a list of (state, reward) tuples that the given player can reach with one move from the current state of the game. The agent uses it to see which states are reachable and what reward each one gives, which drives its decision making.

In [12]:
env.available_states("O")

[('XO-------', (0, 0)),
 ('X-O------', (0, 0)),
 ('X--O-----', (0, 0)),
 ('X---O----', (0, 0)),
 ('X----O---', (0, 0)),
 ('X-----O--', (0, 0)),
 ('X------O-', (0, 0)),
 ('X-------O', (0, 0))]

Function to check whether the game is over for a given board hash; it returns a boolean done flag together with the reward tuple.

In [13]:
env.check_done(env.hash())

(False, (0, 0))

Reset the board so a new game can be played

In [14]:
env.reset()
env.render()

Board
['-', '-', '-']
['-', '-', '-']
['-', '-', '-']


An example game

In [15]:
# Play one complete game in which both sides move at random, drawing the
# board after every move and printing the final reward tuple at the end.
done = False
env.reset()         # start from an empty board
env.render()        # show the initial board
mark = "X"          # X always opens; the marks then alternate
while not done:
    # Whoever is to move picks uniformly among the free positions.
    new_state, reward, done, info = env.step(
        random.choice(env.available_actions()), mark)
    env.render()    # show the board after this move
    # Hand the turn over; the loop stops as soon as a move ends the game,
    # so the other side never moves on a finished board.
    mark = "O" if mark == "X" else "X"

print(reward)       # reward tuple of the move that ended the game

Board
['-', '-', '-']
['-', '-', '-']
['-', '-', '-']
Board
['-', '-', '-']
['-', '-', '-']
['X', '-', '-']
Board
['-', '-', '-']
['-', '-', 'O']
['X', '-', '-']
Board
['-', '-', '-']
['-', '-', 'O']
['X', '-', 'X']
Board
['-', '-', '-']
['O', '-', 'O']
['X', '-', 'X']
Board
['X', '-', '-']
['O', '-', 'O']
['X', '-', 'X']
Board
['X', 'O', '-']
['O', '-', 'O']
['X', '-', 'X']
Board
['X', 'O', '-']
['O', 'X', 'O']
['X', '-', 'X']
(10, -10)


Below is an example of the states an agent can encounter in the environment. Each line is one TicTacToe state; remember that we use the hash of the board as the state.

In [16]:
# Play one random game and print the board hash after every move, to show
# what the states seen by an agent look like.
done = False
env.reset()
mark = "X"          # X always opens; the marks then alternate
while not done:
    new_state, reward, done, info = env.step(
        random.choice(env.available_actions()), mark)
    print(env.hash())   # state reached by this move
    mark = "O" if mark == "X" else "X"

---X-----
---X--O--
X--X--O--
X-OX--O--
X-OX--O-X
X-OX--OOX
XXOX--OOX
XXOXO-OOX


In [17]:
env.reset()
env.render()

Board
['-', '-', '-']
['-', '-', '-']
['-', '-', '-']


In [18]:
reward

(-10, 10)

In [19]:
env.available_actions()

[0, 1, 2, 3, 4, 5, 6, 7, 8]

Create a Temporal Difference Learning Agent

In [20]:
class Agent:
    """Value-learning agent playing one side of the TicTacToe environment.

    The constructor only stores configuration; behaviour is attached by the
    cells that follow, which extend this class.
    """

    def __init__(self, env, player="X", alpha=0.4, gamma=0.9):
        """Store the environment, the mark to play and the learning settings.

        Args:
            env: Environment the agent acts in (kept by reference).
            player: Mark played by this agent, "X" by default.
            alpha: Step size used when state values are updated.
            gamma: Discount applied to the value of the following state.

        The default ``alpha`` and ``gamma`` values were determined from
        testing.
        """
        self.env = env
        self.player = player
        # Index of this agent's entry in the reward tuple: 0 for "X",
        # 1 for any other mark.
        self.player_number = 1 if player != "X" else 0
        self.alpha = alpha
        self.gamma = gamma
        self.V = {}  # state values, empty until states are added

In [21]:
class Agent(Agent):
    def select_action(self, epsilon=0.1):
        """Choose the next move with an epsilon-greedy rule.

        With probability ``epsilon`` a uniformly random available action is
        returned. Otherwise every state reachable by this player is scored as
        ``gamma * V[state] + reward``, and one of the top-scoring moves is
        returned, ties being broken at random.

        Every reachable state must already be in ``self.V`` (``add_states``
        puts them there); a missing state raises ``KeyError``.
        """
        # Explore: ignore the learned values entirely.
        if random.random() < epsilon:
            return random.choice(self.env.available_actions())

        # Exploit: score each reachable state using this player's own entry
        # of the reward tuple.
        scores = [
            self.gamma * self.V[next_hash] + rewards[self.player_number]
            for next_hash, rewards in self.env.available_states(self.player)
        ]
        best = max(scores)
        best_slots = [slot for slot, score in enumerate(scores) if score == best]

        # Scores are mapped back to actions by list position, which relies on
        # available_states() and available_actions() sharing the same order.
        return self.env.available_actions()[random.choice(best_slots)]

In [22]:
class Agent(Agent):
    def add_states(self):
        """Make sure the current state and its successors have a value.

        The current board hash, and every state reachable from it by a move
        of either "X" or "O", is given the value 0 unless it already has one.
        Values learned earlier are never overwritten.
        """
        self.V.setdefault(self.env.hash(), 0)
        for mark in ("X", "O"):
            for successor in self.env.available_states(mark):
                # successor[0] is the board hash of the reachable state.
                self.V.setdefault(successor[0], 0)

In [23]:
class Agent(Agent):
    def update_state_values(self, new_state, old_state, reward):
        """Move the value of ``old_state`` towards its one-step target.

        The target is ``reward + gamma * V[new_state]``; the stored value is
        shifted by ``alpha`` times the gap between that target and the current
        estimate. Both states must already be present in ``self.V``.
        """
        current = self.V[old_state]
        td_error = reward + self.gamma * self.V[new_state] - current
        self.V[old_state] = current + self.alpha * td_error

In [24]:
def train(episodes):
    """Train an X agent and an O agent by letting them play each other.

    Both agents act with their default exploration rate, and both update
    their value tables after every single move, whichever side made it.

    Args:
        episodes: Number of complete games to play.

    Returns:
        Tuple ``(agent_x, agent_o)`` holding the two trained agents.
    """
    agent_x = Agent(env, "X")
    agent_o = Agent(env, "O")
    learners = (agent_x, agent_o)

    def play_turn(mover):
        # One move by `mover`, followed by a value update for both agents.
        for learner in learners:
            learner.add_states()
        before = env.hash()
        chosen = mover.select_action()
        after, reward, finished, _ = env.step(chosen, mover.player)
        for learner in learners:
            learner.update_state_values(
                after, before, reward[learner.player_number])
        return finished

    for _ in range(episodes):
        env.reset()
        finished = False
        while not finished:
            # O only gets its turn when X's move did not end the game.
            finished = play_turn(agent_x) or play_turn(agent_o)

    return agent_x, agent_o

In [25]:
%%time

agent_x, agent_o = train(110000)

CPU times: user 1min 31s, sys: 58.1 ms, total: 1min 31s
Wall time: 1min 31s


In [26]:
def test_x(episodes):
    """Evaluate the greedy ``agent_x`` against an O player moving at random.

    Args:
        episodes: Number of games to play.

    Returns:
        Tuple ``(win, loss, tie)`` counted from X's point of view.
    """
    x_wins, o_wins, draw = (10, -10), (-10, 10), (0, 0)
    tally = {x_wins: 0, o_wins: 0, draw: 0}

    for _ in range(episodes):
        env.reset()
        done = False
        while not done:
            agent_x.add_states()
            move = agent_x.select_action(epsilon=0)  # epsilon=0: no exploration
            _, reward, done, _ = env.step(move, agent_x.player)
            if done:
                break
            agent_x.add_states()
            reply = random.choice(env.available_actions())
            _, reward, done, _ = env.step(reply, "O")

        # The reward of the last move tells how the game ended.
        if reward in tally:
            tally[reward] += 1

    return tally[x_wins], tally[o_wins], tally[draw]

In [27]:
episodes = 10000
win, loss, tie = test_x(episodes)

# Raw counts first, then the same figures as a percentage of all games.
print("Win:", win, "Tie:", tie, "Loss:", loss)
win_rate, tie_rate, loss_rate = (
    count / episodes * 100 for count in (win, tie, loss))
print("Win Rate:", win_rate, "Tie Rate:", tie_rate, "Loss Rate:", loss_rate)

Win: 9902 Tie: 98 Loss: 0
Win Rate: 99.02 Tie Rate: 0.98 Loss Rate: 0.0


In [28]:
def test_o(episodes):
    """Evaluate the greedy ``agent_o`` against an X player moving at random.

    Args:
        episodes: Number of games to play.

    Returns:
        Tuple ``(win, loss, tie)`` counted from O's point of view.
    """
    o_wins, x_wins, draw = (-10, 10), (10, -10), (0, 0)
    tally = {o_wins: 0, x_wins: 0, draw: 0}

    for _ in range(episodes):
        env.reset()
        done = False
        while not done:
            agent_o.add_states()
            opening = random.choice(env.available_actions())
            _, reward, done, _ = env.step(opening, "X")
            if done:
                break
            agent_o.add_states()
            move = agent_o.select_action(epsilon=0)  # epsilon=0: no exploration
            _, reward, done, _ = env.step(move, agent_o.player)

        # The reward of the last move tells how the game ended.
        if reward in tally:
            tally[reward] += 1

    return tally[o_wins], tally[x_wins], tally[draw]

In [29]:
episodes = 10000
win, loss, tie = test_o(episodes)

# Raw counts first, then the same figures as a percentage of all games.
print("Win:", win, "Tie:", tie, "Loss:", loss)
win_rate, tie_rate, loss_rate = (
    count / episodes * 100 for count in (win, tie, loss))
print("Win Rate:", win_rate, "Tie Rate:", tie_rate, "Loss Rate:", loss_rate)

Win: 8520 Tie: 1473 Loss: 7
Win Rate: 85.2 Tie Rate: 14.729999999999999 Loss Rate: 0.06999999999999999


In [30]:
def test(episodes):
    """Let the greedy ``agent_x`` and ``agent_o`` play each other.

    Args:
        episodes: Number of games to play.

    Returns:
        Tuple ``(x_win, o_win, tie)`` with the number of games per outcome.
    """
    x_wins, o_wins, draw = (10, -10), (-10, 10), (0, 0)
    tally = {x_wins: 0, o_wins: 0, draw: 0}

    for _ in range(episodes):
        env.reset()
        done = False
        while not done:
            agent_x.add_states()
            agent_o.add_states()
            x_move = agent_x.select_action(epsilon=0)
            _, reward, done, _ = env.step(x_move, "X")
            if done:
                break
            agent_x.add_states()
            agent_o.add_states()
            o_move = agent_o.select_action(epsilon=0)
            _, reward, done, _ = env.step(o_move, "O")

        # The reward of the last move tells how the game ended.
        if reward in tally:
            tally[reward] += 1

    return tally[x_wins], tally[o_wins], tally[draw]

In [31]:
episodes = 10000
x_win, o_win, tie = test(episodes)

# Raw counts first, then the same figures as a percentage of all games.
print("X Win:", x_win, "Tie:", tie, "O Win:", o_win)
x_rate, tie_rate, o_rate = (
    count / episodes * 100 for count in (x_win, tie, o_win))
print("X Win Rate:", x_rate, "Tie Rate:", tie_rate, "O Win Rate:", o_rate)

X Win: 0 Tie: 10000 O Win: 0
X Win Rate: 0.0 Tie Rate: 100.0 O Win Rate: 0.0


In [32]:
def play_as_x(episodes=1):
    """Let a human play X from the console against the trained ``agent_o``.

    Before each human move the board and the list of free positions are
    printed, and the move is read from standard input as a position number.
    Input that is not a number, or not one of the free positions, is rejected
    and asked for again instead of crashing or reaching the environment.
    ``agent_o`` answers greedily (``epsilon=0``). The result of each game is
    printed from the human's point of view.

    Args:
        episodes: Number of games to play, one by default.
    """
    for _ in range(episodes):
        done = False
        env.reset()
        while not done:
            env.render()
            legal_moves = env.available_actions()
            print(legal_moves)
            agent_o.add_states()

            # Keep asking until the answer is one of the free positions.
            x_action = None
            while x_action not in legal_moves:
                try:
                    x_action = int(input())
                except ValueError:
                    x_action = None  # not a number at all
                if x_action not in legal_moves:
                    print("Invalid move, choose one of", legal_moves)

            new_state, reward, done, _ = env.step(x_action, "X")
            if not done:
                # Register the states O can reach before it looks them up.
                agent_o.add_states()
                o_action = agent_o.select_action(epsilon=0)
                new_state, reward, done, _ = env.step(o_action, "O")

        env.render()
        # The reward tuple of the final move encodes the outcome.
        if reward == (-10, 10):
            print("You Lose")
        elif reward == (10, -10):
            print("You Win")
        elif reward == (0, 0):
            print("Tie")

In [34]:
play_as_x()

Board
['-', '-', '-']
['-', '-', '-']
['-', '-', '-']
[0, 1, 2, 3, 4, 5, 6, 7, 8]
Board
['-', 'X', '-']
['-', 'O', '-']
['-', '-', '-']
[0, 2, 3, 5, 6, 7, 8]
Board
['O', 'X', 'X']
['-', 'O', '-']
['-', '-', '-']
[3, 5, 6, 7, 8]
Board
['O', 'X', 'X']
['X', 'O', '-']
['-', '-', 'O']
You Lose


In [35]:
def play_as_o(episodes=1):
    """Let a human play O from the console against the trained ``agent_x``.

    ``agent_x`` opens each round greedily (``epsilon=0``). If the game is not
    over, the board and the list of free positions are printed and the human
    move is read from standard input as a position number. Input that is not
    a number, or not one of the free positions, is rejected and asked for
    again instead of crashing or reaching the environment. The result of each
    game is printed from the human's point of view.

    Args:
        episodes: Number of games to play, one by default.
    """
    for _ in range(episodes):
        done = False
        env.reset()
        while not done:
            agent_x.add_states()
            x_action = agent_x.select_action(epsilon=0)
            new_state, reward, done, _ = env.step(x_action, "X")
            if not done:
                env.render()
                legal_moves = env.available_actions()
                print(legal_moves)
                agent_x.add_states()

                # Keep asking until the answer is one of the free positions.
                o_action = None
                while o_action not in legal_moves:
                    try:
                        o_action = int(input())
                    except ValueError:
                        o_action = None  # not a number at all
                    if o_action not in legal_moves:
                        print("Invalid move, choose one of", legal_moves)

                new_state, reward, done, _ = env.step(o_action, "O")

        env.render()
        # The reward tuple of the final move encodes the outcome.
        if reward == (-10, 10):
            print("You Win")
        elif reward == (10, -10):
            print("You Lose")
        elif reward == (0, 0):
            print("Tie")

In [36]:
play_as_o()

Board
['-', '-', '-']
['-', 'X', '-']
['-', '-', '-']
[0, 1, 2, 3, 5, 6, 7, 8]
Board
['O', 'X', '-']
['-', 'X', '-']
['-', '-', '-']
[2, 3, 5, 6, 7, 8]
Board
['O', 'X', '-']
['O', 'X', '-']
['-', 'X', '-']
You Lose


In [37]:
def train_o_against_random(episodes):
    """Train a fresh O agent against an X player that moves at random.

    The agent updates its values after X's random moves as well as after its
    own moves, which use its default exploration rate.

    Args:
        episodes: Number of complete games to play.

    Returns:
        The trained O agent.
    """
    agent_o = Agent(env, "O")

    def apply_and_learn(before, move, mark):
        # Play `move` for `mark`, then update O's value of the state we left.
        after, reward, finished, _ = env.step(move, mark)
        agent_o.update_state_values(
            after, before, reward[agent_o.player_number])
        return finished

    for _ in range(episodes):
        env.reset()
        done = False
        while not done:
            # X: uniformly random move.
            agent_o.add_states()
            before = env.hash()
            move = random.choice(env.available_actions())
            done = apply_and_learn(before, move, "X")

            if not done:
                # O: epsilon-greedy move chosen by the learning agent.
                agent_o.add_states()
                before = env.hash()
                move = agent_o.select_action()
                done = apply_and_learn(before, move, agent_o.player)

    return agent_o

In [38]:
agent_o = train_o_against_random(110000)

In [39]:
episodes = 10000
win, loss, tie = test_o(episodes)

# Raw counts first, then the same figures as a percentage of all games.
print("Win:", win, "Tie:", tie, "Loss:", loss)
win_rate, tie_rate, loss_rate = (
    count / episodes * 100 for count in (win, tie, loss))
print("Win Rate:", win_rate, "Tie Rate:", tie_rate, "Loss Rate:", loss_rate)

Win: 9094 Tie: 731 Loss: 175
Win Rate: 90.94 Tie Rate: 7.31 Loss Rate: 1.7500000000000002
