In [1]:
import random

In [2]:
BLANK = ' '
AI_PLAYER = 'X'
HUMAN_PLAYER = 'O'
TRAINING_EPOCHS = 40000
TRAINING_EPSILON = 0.4
REWARD_WIN = 10
REWARD_LOSE = -10
REWARD_TIE = 0

In [3]:
class Player:
    @staticmethod
    def show_board(board):
        print('|'.join(board[0:3]))
        print('|'.join(board[3:6]))
        print('|'.join(board[6:9]))

In [4]:
class HumanPlayer(Player):
    def reward(self, value, board):
        pass
    def make_move(self, board):
        while True:
            try:
                self.show_board(board)
                move = input('Your next move (cell index 1-9):')
                move = int(move)

                if not (move - 1 in range(9)):
                    raise ValueError
            except ValueError:
                print('Invalid move; try again:\n')
            else:
                return move-1

In [5]:
class AIPlayer(Player):
    def __init__(self, epsilon=0.4, alpha=0.3, gamma=0.9, default_q=1):
        self.EPSILON = epsilon
        self.ALPHA = alpha
        self.GAMMA = gamma
        self.DEAULT_Q = default_q
        self.q = {}
        self.move = None
        self.board = (' ',) * 9

    def available_moves(self, board):
        return [i for i in range(9) if board[i] == ' ']

    def get_q(self, state, action):
        if self.q.get((state, action)) is None:
            self.q[(state, action)] = self.DEAULT_Q

        return self.q[(state, action)]

    def make_move(self, board):
        self.board = tuple(board)
        actions = self.available_moves(board)

        if random.random() < self.EPSILON:
            self.move = random.choice(actions)
            return self.move
        q_values = [self.get_q(self.board, a) for a in actions]
        max_q_value = max(q_values)

        if q_values.count(max_q_value) > 1:
            best_actions = [i for i in range(len(actions)) if q_values[i] == max_q_value]
            best_move = actions[random.choice(best_actions)]
        else:
            best_move = actions[q_values.index(max_q_value)]

        self.move = best_move
        return self.move

    def reward(self, reward, board):
        if self.move:
            prev_q = self.get_q(self.board, self.move)
            max_q_new = max([self.get_q(tuple(board), a) for a in self.available_moves(self.board)])
            self.q[(self.board, self.move)] = prev_q + self.ALPHA * (reward + self.GAMMA * max_q_new - prev_q)

In [12]:
class TicTacToe:
    def __init__(self, player1, player2):
        self.player1 = player1
        self.player2 = player2
        self.first_player_turn = random.choice([True, False])
        self.board = [' '] * 9

    def play(self):
        while True:
            if self.first_player_turn:
                player = self.player1
                other_player = self.player2
                player_tickers = (AI_PLAYER, HUMAN_PLAYER)
            else:
                player = self.player2
                other_player = self.player1
                player_tickers = (HUMAN_PLAYER, AI_PLAYER)

            game_over, winner = self.is_game_over(player_tickers)

            if game_over:
                if winner == player_tickers[0]:
                    player.show_board(self.board[:])
                    print('\n %s won!' % player.__class__.__name__)
                    player.reward(REWARD_WIN, self.board[:])
                    other_player.reward(REWARD_LOSE, self.board[:])
                if winner == player_tickers[1]:
                    player.show_board(self.board[:])
                    print('\n %s won!' % other_player.__class__.__name__)
                    other_player.reward(REWARD_WIN, self.board[:])
                    player.reward(REWARD_LOSE, self.board[:])
                else:
                    player.show_board(self.board[:])
                    print('Tie!')
                    player.reward(REWARD_TIE, self.board[:])
                    other_player.reward(REWARD_TIE, self.board[:])
                break

            self.first_player_turn = not self.first_player_turn
            move = player.make_move(self.board)
            self.board[move] = player_tickers[0]

    def is_game_over(self, player_tickers):
        for player_ticker in player_tickers:
            for i in range(3):
                if self.board[3 * i + 0] == player_ticker and self.board[3 * i + 1] == player_ticker and self.board[3 * i +2] == player_ticker:
                    return True, player_ticker
            for j in range(3):
                if self.board[j + 0] == player_ticker and self.board[j + 3] == player_ticker and self.board[j + 6] == player_ticker:
                    return True, player_ticker
            if self.board[0] == player_ticker and self.board[4] == player_ticker and self.board[8] == player_ticker:
                return True, player_ticker
            if self.board[2] == player_ticker and self.board[4] == player_ticker and self.board[6] == player_ticker:
                return True, player_ticker

        if self.board.count(' ') == 0:
            return True, None
        else:
            return False, None

In [14]:
if __name__ == '__main__':
    ai_player1 = AIPlayer()
    ai_player2 = AIPlayer()

    print('Training the AI player(s)...')

    ai_player1.EPSILON = TRAINING_EPSILON
    ai_player2.EPSILON = TRAINING_EPSILON

    for i in range(TRAINING_EPOCHS):
        game = TicTacToe(ai_player1, ai_player2)
        game.play()

    print('\nTraining is Done')

    ai_player1.EPSILON = 0
    human_player = HumanPlayer()
    game = TicTacToe(ai_player1, human_player)
    game.play()

Training the AI player(s)...
O| | 
O|X|X
O|X|O

 AIPlayer won!
X|X|X
 |O| 
 | |O

 AIPlayer won!
 |X|O
X|X|O
 | |O

 AIPlayer won!
O|X| 
O|O|O
X| |X

 AIPlayer won!
X|O|O
X|O|X
X| |O

 AIPlayer won!
X|X|O
O|X|O
 | |X

 AIPlayer won!
X|O|O
X|X|O
 | |O

 AIPlayer won!
O|X|O
O|X|O
X|O|X
Tie!
X|O|O
O|X|X
O|X|O
Tie!
X|X|O
 |O|X
O|X|O

 AIPlayer won!
X|X|X
O|X|O
O|O|X

 AIPlayer won!
O|O|X
X|X|O
X|X|O

 AIPlayer won!
O|O|X
O|X| 
O|X| 

 AIPlayer won!
O| |X
 | |X
 |O|X

 AIPlayer won!
X|O|X
 |O| 
 |O|X

 AIPlayer won!
O|X|X
O|O|X
 | |X

 AIPlayer won!
X|X| 
O|X|O
O|X| 

 AIPlayer won!
X|O|O
X|X|O
O|X|X

 AIPlayer won!
 |X|X
 |X|O
O|O|O

 AIPlayer won!
O|O|X
X|X|X
O| |O

 AIPlayer won!
X| | 
 |X|O
 |O|X

 AIPlayer won!
 | |X
 |X|O
X|O| 

 AIPlayer won!
 |X|X
 | |X
O|O|O

 AIPlayer won!
O|X| 
 |X|O
X|X|O

 AIPlayer won!
O|X|O
X|X|O
O|X| 

 AIPlayer won!
X|O|X
O|O|O
X|X|O

 AIPlayer won!
O|X|X
X|O|O
O|O|X
Tie!
X|X|X
X|O|O
X|O|O

 AIPlayer won!
 | |X
O|O|O
 | |X

 AIPlayer won!
X| |O
 |O|X
O|X|O
