In [None]:
import gym
from gym import error, spaces, utils
from gym.spaces import space
from gym.utils import seeding

import random

class TicTacToeEnv(gym.Env):
  """Two-player tic-tac-toe board.

  The board is kept in ``self.state`` as a 3x3 list of one-character strings:
  "X", "O", or "-" for an empty cell. Cells are addressed by a flat index 0-8
  in row-major order (index ``i`` is row ``i // 3``, column ``i % 3``), and a
  board is serialised to a 9-character string in that same order.

  Rewards are always a 2-tuple ``(reward for X, reward for O)``.
  """
  metadata = {'render.modes': ['human']}

  # Flat-index triples that form a winning line: rows, columns, diagonals.
  _WIN_LINES = (
      (0, 1, 2), (3, 4, 5), (6, 7, 8),
      (0, 3, 6), (1, 4, 7), (2, 5, 8),
      (0, 4, 8), (2, 4, 6),
  )

  def __init__(self):
    self.state = self._empty_board()

  @staticmethod
  def _empty_board():
    """Return a fresh 3x3 board with every cell empty."""
    return [["-"] * 3 for _ in range(3)]

  def hash(self):
    """Return the current board as a 9-character row-major string."""
    return "".join(cell for row in self.state for cell in row)

  def available_actions(self):
    """Return the flat indices of all empty cells on the current board."""
    return [index for index, cell in enumerate(self.hash()) if cell == "-"]

  def available_states(self, player):
    """List the boards `player` can reach with one move, without moving.

    Returns:
      A list of ``(state_string, reward)`` pairs, one per available action and
      in the same order as ``available_actions()``.
    """
    board = self.hash()
    states = []
    for action in self.available_actions():
      next_state = board[:action] + player + board[action + 1:]
      _, reward = self.check_done(next_state)
      states.append((next_state, reward))
    return states

  def check_done(self, state):
    """Decide whether the board described by `state` is terminal.

    Args:
      state: 9-character row-major board string.

    Returns:
      ``(done, reward)``: ``(True, (10, -10))`` if X has a line,
      ``(True, (-10, 10))`` if O has a line, ``(True, (0, 0))`` for a full
      board with no winner, otherwise ``(False, (0, 0))``.
    """
    winner = ""
    # X is checked first and O second, so O takes precedence if both have a line.
    for player in ("X", "O"):
      if any(all(state[i] == player for i in line) for line in self._WIN_LINES):
        winner = player

    if winner == "X":
      return True, (10, -10)
    if winner == "O":
      return True, (-10, 10)
    # The draw test must look at the state being evaluated, not at the live
    # board, so that hypothetical states are judged correctly.
    if "-" not in state:
      return True, (0, 0)
    return False, (0, 0)

  def step(self, action, player):
    """Place `player`'s mark on cell `action` and report the outcome.

    The move is not validated: an occupied cell is simply overwritten.

    Returns:
      ``(state_string, reward, done, info)`` where `info` is an empty dict.
    """
    self.state[action // 3][action % 3] = player
    board = self.hash()
    done, reward = self.check_done(board)
    return board, reward, done, {}

  def reset(self):
    """Clear the board and return the empty board string."""
    self.state = self._empty_board()
    return self.hash()

  def render(self, mode='human'):
    """Print the board, one row per line, under a "Board" heading."""
    print("Board")
    for row in self.state:
      print(row)

In [None]:
# Create the single environment instance that every later cell shares
env = TicTacToeEnv()

In [None]:
# Inspect the raw board: a list of three rows, "-" marks an empty cell
env.state

[['-', '-', '-'], ['-', '-', '-'], ['-', '-', '-']]

In [None]:
# The same board flattened row by row into a 9-character string
env.hash()

'---------'

In [None]:
# Place an X on cell 0 (top-left) and capture everything step() returns
new_state, reward, done, info = env.step(0, "X")

In [None]:
# Board string returned by the step above
new_state

'X--------'

In [None]:
# Reward tuple from the step above: (reward for X, reward for O)
reward

(0, 0)

In [None]:
# Print the board row by row
env.render()

Board
['X', '-', '-']
['-', '-', '-']
['-', '-', '-']


In [None]:
# Flat indices of the cells that are still empty
env.available_actions()

[1, 2, 3, 4, 5, 6, 7, 8]

In [None]:
# Clear the board, then render it to confirm every cell is empty again
env.reset()
env.render()

Board
['-', '-', '-']
['-', '-', '-']
['-', '-', '-']


In [None]:
# Play one complete game in which both sides move uniformly at random,
# rendering the board after every move.
done = False   # flips to True once env.step reports the game is over
env.reset()    # always start from an empty board
env.render()   # show the initial board

# X moves first; the marks then alternate until the game ends.
mark = "X"
while not done:
    new_state, reward, done, info = env.step(random.choice(env.available_actions()), mark)
    env.render()
    mark = "O" if mark == "X" else "X"

# Final reward: the first value is for X, the second for O
print(reward)

Board
['-', '-', '-']
['-', '-', '-']
['-', '-', '-']
Board
['-', '-', 'X']
['-', '-', '-']
['-', '-', '-']
Board
['O', '-', 'X']
['-', '-', '-']
['-', '-', '-']
Board
['O', '-', 'X']
['-', '-', '-']
['-', 'X', '-']
Board
['O', '-', 'X']
['-', '-', 'O']
['-', 'X', '-']
Board
['O', '-', 'X']
['-', 'X', 'O']
['-', 'X', '-']
Board
['O', '-', 'X']
['-', 'X', 'O']
['-', 'X', 'O']
Board
['O', '-', 'X']
['-', 'X', 'O']
['X', 'X', 'O']
(10, -10)


In [None]:
# Play one complete random-vs-random game, printing the board string after
# every move instead of rendering the grid.
done = False   # flips to True once env.step reports the game is over
env.reset()    # always start from an empty board

# X moves first; the marks then alternate until the game ends.
mark = "X"
while not done:
    new_state, reward, done, info = env.step(random.choice(env.available_actions()), mark)
    print(env.hash())
    mark = "O" if mark == "X" else "X"

-------X-
---O---X-
-X-O---X-
-X-O--OX-
-XXO--OX-
-XXO--OXO
XXXO--OXO


In [None]:
# Clear the board left over from the game above and show the empty grid
env.reset()
env.render()

Board
['-', '-', '-']
['-', '-', '-']
['-', '-', '-']


In [None]:
# Global reward tuple still holding the result of the last game played above
reward

(10, -10)

In [None]:
# After the reset above every cell is available again
env.available_actions()

[0, 1, 2, 3, 4, 5, 6, 7, 8]

In [None]:
class Agent():
    """Holds the configuration and value table for one tic-tac-toe player.

    Args:
        env: environment object the agent is attached to.
        player: mark this agent plays; "X" maps to reward slot 0, any other
            value to slot 1.
        alpha: stored as the agent's step size.
        gamma: stored as the agent's discount factor.
    """

    def __init__(self, env, player="X", alpha=.4, gamma=.9):
        self.env = env
        self.player = player
        # Position of this player's entry inside an (X, O) reward tuple.
        self.player_number = int(player != "X")
        self.alpha, self.gamma = alpha, gamma
        # Value table, starts empty.
        self.V = {}

In [None]:
class Agent(Agent):
    """Extends the previously defined Agent with action selection and learning."""

    def select_action(self, epsilon=.1):
        """Pick a move with an epsilon-greedy policy.

        With probability `epsilon` a uniformly random available action is
        returned. Otherwise each available action is scored as
        ``gamma * V(S') + R`` for the state it leads to, and one of the
        top-scoring actions is chosen at random. States missing from the value
        table are scored as 0, the same value `add_states` would give them.

        Args:
            epsilon: exploration probability in [0, 1].

        Returns:
            The flat board index (0-8) of the chosen cell.
        """
        actions = self.env.available_actions()
        if random.random() < epsilon:
            return random.choice(actions)

        # available_states yields one (state, rewards) pair per available
        # action, in the same order as `actions`, so the two can be zipped.
        q_values = [
            self.gamma * self.V.get(next_state, 0) + rewards[self.player_number]
            for next_state, rewards in self.env.available_states(self.player)
        ]
        best_value = max(q_values)
        # Break ties randomly among all actions sharing the best score.
        best_actions = [a for a, q in zip(actions, q_values) if q == best_value]
        return random.choice(best_actions)

    def add_states(self):
        """Register the current state and every state one move away.

        Unknown states are added with value 0; known values are left untouched.
        Successors are registered for both X and O moves.
        """
        self.V.setdefault(self.env.hash(), 0)
        for player in ("X", "O"):
            for next_state, _ in self.env.available_states(player):
                self.V.setdefault(next_state, 0)

    def update_state_values(self, new_state, old_state, reward):
        """Apply one temporal-difference update to the value of `old_state`.

        V(S) <- V(S) + alpha * (R + gamma * V(S') - V(S))

        States that are not yet in the value table are treated as having
        value 0 rather than raising KeyError.

        Args:
            new_state: board string reached after the move (S').
            old_state: board string before the move (S).
            reward: scalar reward for this agent on that transition (R).
        """
        old_value = self.V.get(old_state, 0)
        td_target = reward + self.gamma * self.V.get(new_state, 0)
        self.V[old_state] = old_value + self.alpha * (td_target - old_value)

In [None]:
def train(episodes):
    """Train an X agent and an O agent against each other on the global `env`.

    Before every move both agents register the reachable states; the side to
    move then picks an action with its default policy, and both agents update
    their value tables from the resulting transition using their own entry of
    the reward tuple.

    Args:
        episodes: number of complete games to play.

    Returns:
        ``(agent_x, agent_o)`` after training.
    """
    agent_x = Agent(env, "X")
    agent_o = Agent(env, "O")
    learners = (agent_x, agent_o)

    for _episode in range(episodes):
        # every game starts from an empty board
        env.reset()
        done = False
        while not done:
            # X moves first, then O, unless X's move already ended the game
            for mover in learners:
                for learner in learners:
                    learner.add_states()

                # remember the state we are leaving before acting
                old_state = env.hash()
                action = mover.select_action()
                new_state, reward, done, _info = env.step(action, mover.player)

                # both agents learn from every transition
                for learner in learners:
                    learner.update_state_values(new_state, old_state, reward[learner.player_number])

                if done:
                    break

    return agent_x, agent_o

In [None]:
%%time

# Train both agents over 110,000 games; binds the global agent_x / agent_o
# that the test and play cells below rely on
agent_x, agent_o = train(110000)

CPU times: user 2min 38s, sys: 435 ms, total: 2min 39s
Wall time: 2min 41s


In [None]:
# number of games (episodes)
def test_x(episodes):
    # counters to keep track of results
    win = 0
    tie = 0
    loss = 0
    # loops for a certain number of games (episodes)
    for episode in range(episodes):
        # stops while loop when game is done
        done = False
        # resets environment when game is done
        env.reset()
        while not done:

            # adds states for X only because we are acting randomly and not updating state values for O
            agent_x.add_states()

            # always get the best action
            x_action = agent_x.select_action(epsilon=0)
            # performs an action
            new_state, reward, done, _ = env.step(x_action, agent_x.player)

            # if the game ends on X move, we don't want to make an O move
            if (not done):

                # O agents turn

                # adds states for X only because we are acting randomly and not updating state values for O
                agent_x.add_states()

                # O always makes a random action from the available actions
                o_action = random.choice(env.available_actions())
                new_state, reward, done, _ = env.step(o_action, "O")

        # record results when game is done
        if (reward == (10, -10)):
            win+=1
        elif (reward == (-10, 10)):
            loss+=1
        elif (reward == (0, 0)):
            tie+=1
    return win, loss, tie

In [None]:
# Evaluate the trained X agent against a random O player
episodes = 10000

# test_x returns its counts in (win, loss, tie) order
win, loss, tie = test_x(episodes)

print("Win:", win, "Tie:", tie, "Loss:", loss)
# Same counts expressed as a percentage of all games played
print("Win Rate:", win/episodes*100, "Tie Rate:", tie/episodes*100, "Loss Rate:", loss/episodes*100)

Win: 9648 Tie: 352 Loss: 0
Win Rate: 96.48 Tie Rate: 3.52 Loss Rate: 0.0


In [None]:
# Repeat the X evaluation over 110,000 games
episodes = 110000

# test_x returns its counts in (win, loss, tie) order
win, loss, tie = test_x(episodes)

print("Win:", win, "Tie:", tie, "Loss:", loss)
# Same counts expressed as a percentage of all games played
print("Win Rate:", win/episodes*100, "Tie Rate:", tie/episodes*100, "Loss Rate:", loss/episodes*100)

Win: 105991 Tie: 4009 Loss: 0
Win Rate: 96.35545454545455 Tie Rate: 3.6445454545454545 Loss Rate: 0.0


In [None]:
# number of games (episodes)
def test_o(episodes):
    # counters to keep track of results
    win = 0
    tie = 0
    loss = 0
    # loops for a certain number of games (episodes)
    for episode in range(episodes):
        # stops while loop when game is done
        done = False
        # resets environment when game is done
        env.reset()
        while not done:

            # adds states for O only because we are acting randomly and not updating state values for X
            agent_o.add_states()

            # X always makes a random action from the available actions
            x_action = random.choice(env.available_actions())
            # performs an action
            new_state, reward, done, _ = env.step(x_action, "X")

            # if the game ends on X move, we don't want to make an O move
            if (not done):

                # O agents turn

                # adds states for O only because we are acting randomly and not updating state values for X
                agent_o.add_states()

                # always get the best action
                o_action = agent_o.select_action(epsilon=0)
                new_state, reward, done, _ = env.step(o_action, agent_o.player)

        # record results when game is done
        if (reward == (-10, 10)):
            win+=1
        elif (reward == (10, -10)):
            loss+=1
        elif (reward == (0, 0)):
            tie+=1
    return win, loss, tie

In [None]:
# Evaluate the trained O agent against a random X player
episodes = 10000

# test_o returns its counts in (win, loss, tie) order
win, loss, tie = test_o(episodes)

print("Win:", win, "Tie:", tie, "Loss:", loss)
# Same counts expressed as a percentage of all games played
print("Win Rate:", win/episodes*100, "Tie Rate:", tie/episodes*100, "Loss Rate:", loss/episodes*100)

Win: 8862 Tie: 1138 Loss: 0
Win Rate: 88.62 Tie Rate: 11.379999999999999 Loss Rate: 0.0


In [None]:
# number of games (episodes)
def test(episodes):
    # counters to keep track of results
    x_win = 0
    o_win = 0
    tie = 0
    # loops for a certain number of games (episodes)
    for episode in range(episodes):
        # stops while loop when game is done
        done = False
        # resets environment when game is done
        env.reset()
        while not done:

            # adds states for both agents because we are using select_action on both
            agent_x.add_states()
            agent_o.add_states()

            # always get the best action
            x_action = agent_x.select_action(epsilon=0)
            # performs an action
            new_state, reward, done, _ = env.step(x_action, "X")

            # if the game ends on X move, we don't want to make an O move
            if (not done):

                # O agents turn

                # adds states for both agents because we are using select_action on both
                agent_x.add_states()
                agent_o.add_states()

                # always get the best action
                o_action = agent_o.select_action(epsilon=0)
                new_state, reward, done, _ = env.step(o_action, "O")

        # record results when game is done
        if (reward == (-10, 10)):
            o_win+=1
        elif (reward == (10, -10)):
            x_win+=1
        elif (reward == (0, 0)):
            tie+=1
    return x_win, o_win, tie

In [None]:
# Pit the two trained agents against each other
episodes = 10000

# test returns its counts in (x_win, o_win, tie) order
x_win, o_win, tie = test(episodes)

print("X Win:", x_win, "Tie:", tie, "O Win:", o_win)
# Same counts expressed as a percentage of all games played
print("X Win Rate:", x_win/episodes*100, "Tie Rate:", tie/episodes*100, "O Win Rate:", o_win/episodes*100)

X Win: 0 Tie: 10000 O Win: 0
X Win Rate: 0.0 Tie Rate: 100.0 O Win Rate: 0.0


In [None]:
def play_as_x(episodes=1):
    """Play interactively as X against the global `agent_o` on the global `env`.

    Before each of your moves the board and the list of free cell indices are
    printed, then one line is read from standard input. Anything that is not
    one of the listed free indices (non-numeric text, an out-of-range number,
    an occupied cell) is rejected and asked for again instead of crashing or
    overwriting a mark. O always answers with its best-valued move.

    Args:
        episodes: number of games to play in a row.
    """
    for _episode in range(episodes):
        # every game starts from an empty board
        done = False
        env.reset()
        while not done:

            # show the board and the legal moves before the human goes
            env.render()
            legal_actions = env.available_actions()
            print(legal_actions)

            # adds states for O only because we are controlling X
            agent_o.add_states()

            # keep asking until the answer is one of the free cells
            while True:
                try:
                    x_action = int(input())
                except ValueError:
                    x_action = None
                if x_action in legal_actions:
                    break
                print("Invalid move, choose one of:", legal_actions)

            new_state, reward, done, _ = env.step(x_action, "X")

            # if the game ends on X's move, O does not get to move
            if not done:
                agent_o.add_states()
                # always take the best-valued move
                o_action = agent_o.select_action(epsilon=0)
                new_state, reward, done, _ = env.step(o_action, "O")

        env.render()
        # report the result from the human (X) point of view
        if reward == (-10, 10):
            print("You Lose")
        elif reward == (10, -10):
            print("You Win")
        elif reward == (0, 0):
            print("Tie")

In [None]:
# Play one interactive game as X; moves are typed in as cell indices
play_as_x()

Board
['-', '-', '-']
['-', '-', '-']
['-', '-', '-']
[0, 1, 2, 3, 4, 5, 6, 7, 8]
0
Board
['X', '-', '-']
['-', 'O', '-']
['-', '-', '-']
[1, 2, 3, 5, 6, 7, 8]
8
Board
['X', '-', '-']
['-', 'O', 'O']
['-', '-', 'X']
[1, 2, 3, 6, 7]
3
Board
['X', '-', '-']
['X', 'O', 'O']
['O', '-', 'X']
[1, 2, 7]
2
Board
['X', 'O', 'X']
['X', 'O', 'O']
['O', '-', 'X']
[7]
8
Board
['X', 'O', 'X']
['X', 'O', 'O']
['O', 'O', 'X']
You Lose


In [None]:
# number of games (episodes)
def play_as_o(episodes=1):
    """Play interactively as O against the global `agent_x` on the global `env`.

    X always opens with its best-valued move. Before each of your moves the
    board and the list of free cell indices are printed, then one line is read
    from standard input. Anything that is not one of the listed free indices
    (non-numeric text, an out-of-range number, an occupied cell) is rejected
    and asked for again instead of crashing or overwriting a mark.

    Args:
        episodes: number of games to play in a row.
    """
    for _episode in range(episodes):
        # every game starts from an empty board
        done = False
        env.reset()
        while not done:

            # adds states for X only because we are controlling O
            agent_x.add_states()

            # always take the best-valued move
            x_action = agent_x.select_action(epsilon=0)
            new_state, reward, done, _ = env.step(x_action, "X")

            # if the game ends on X's move, the human does not get to move
            if not done:

                # show the board and the legal moves before the human goes
                env.render()
                legal_actions = env.available_actions()
                print(legal_actions)

                agent_x.add_states()

                # keep asking until the answer is one of the free cells
                while True:
                    try:
                        o_action = int(input())
                    except ValueError:
                        o_action = None
                    if o_action in legal_actions:
                        break
                    print("Invalid move, choose one of:", legal_actions)

                new_state, reward, done, _ = env.step(o_action, "O")

        env.render()
        # report the result from the human (O) point of view
        if reward == (-10, 10):
            print("You Win")
        elif reward == (10, -10):
            print("You Lose")
        elif reward == (0, 0):
            print("Tie")

In [None]:
# Play one interactive game as O; moves are typed in as cell indices
play_as_o()

Board
['-', '-', '-']
['-', 'X', '-']
['-', '-', '-']
[0, 1, 2, 3, 5, 6, 7, 8]
4
Board
['-', '-', '-']
['-', 'O', '-']
['-', '-', 'X']
[0, 1, 2, 3, 5, 6, 7]
0
Board
['O', '-', 'X']
['-', 'O', '-']
['-', '-', 'X']
[1, 3, 5, 6, 7]
1
Board
['O', 'O', 'X']
['-', 'O', 'X']
['-', '-', 'X']
You Lose


In [None]:
# Re-run the O evaluation against a random X player
episodes = 10000

# test_o returns its counts in (win, loss, tie) order
win, loss, tie = test_o(episodes)

print("Win:", win, "Tie:", tie, "Loss:", loss)
# Same counts expressed as a percentage of all games played
print("Win Rate:", win/episodes*100, "Tie Rate:", tie/episodes*100, "Loss Rate:", loss/episodes*100)

Win: 8861 Tie: 1139 Loss: 0
Win Rate: 88.61 Tie Rate: 11.39 Loss Rate: 0.0
