In [7]:
import numpy as np
import random

class TicTacToe:
    """Tic-tac-toe game state on a 3x3 integer NumPy board.

    Cell encoding: 0 = empty, 1 = player 1 (X), -1 = player 2 (O).
    ``current_winner`` holds the winning player's value once a winning
    move has been made through ``make_move``; it is ``None`` until then.
    """

    def __init__(self):
        """Start with an empty board and no recorded winner."""
        self.board = np.zeros((3, 3), dtype=int)
        self.current_winner = None

    def print_board(self):
        """Print the board to stdout, one text row per board row."""
        glyphs = {1: 'X', -1: 'O', 0: ' '}
        for cells in self.board:
            rendered = ' | '.join(glyphs[value] for value in cells)
            print('| ' + rendered + ' |')

    def available_moves(self):
        """Return the (row, col) tuples of all empty cells in row-major order."""
        open_cells = []
        for r in range(3):
            for c in range(3):
                if self.board[r, c] == 0:
                    open_cells.append((r, c))
        return open_cells

    def make_move(self, row, col, player):
        """Place ``player`` at (row, col) if that cell is empty.

        Returns True when the mark was placed and False when the cell was
        already occupied. A move that completes a line also records
        ``player`` in ``current_winner``.
        """
        if self.board[row, col] != 0:
            return False

        self.board[row, col] = player
        if self.check_winner(player):
            self.current_winner = player
        return True

    def check_winner(self, player):
        """Return True if any row, column, or diagonal is filled by ``player``."""
        # Rows first, then columns, then the main and anti diagonals.
        lines = [
            *self.board,
            *self.board.T,
            np.diag(self.board),
            np.diag(np.fliplr(self.board)),
        ]
        return any(np.all(line == player) for line in lines)

    def is_full(self):
        """Return True when no empty cell remains on the board."""
        return bool(np.all(self.board != 0))

    def reset(self):
        """Replace the board with a fresh empty one and clear the winner."""
        self.current_winner = None
        self.board = np.zeros((3, 3), dtype=int)


In [8]:
def get_features(board, player):
    """Build the 5-element float feature vector describing ``board`` for ``player``.

    Entries, in order:
      0. open two-in-a-row lines held by ``player``
      1. open two-in-a-row lines held by the opponent (``-player``)
      2. number of empty cells
      3. number of cells marked by ``player``
      4. number of cells marked by the opponent
    """
    opponent = -player
    return np.array(
        [
            count_two_in_a_row(board, player),
            count_two_in_a_row(board, opponent),
            np.count_nonzero(board == 0),
            np.count_nonzero(board == player),
            np.count_nonzero(board == opponent),
        ],
        dtype=float,
    )

def count_two_in_a_row(board, player):
    """Count lines holding exactly two ``player`` marks and exactly one empty cell.

    The lines examined are every row, every column, and both diagonals of
    ``board``. Returns the count as a plain int.
    """
    lines = [
        *board,
        *board.T,
        np.diag(board),
        np.diag(np.fliplr(board)),
    ]
    return sum(
        1
        for line in lines
        if np.count_nonzero(line == player) == 2 and np.count_nonzero(line == 0) == 1
    )


In [9]:
class QLearningAgent:
    """Epsilon-greedy agent whose value estimate is linear in a feature vector.

    Attributes:
        player: the mark this agent places on the board.
        weights: 5-element weight vector, drawn from a standard normal.
        lr: step size used by ``update``.
        gamma: discount applied to the next state's value in ``update``.
        epsilon: probability of choosing a random move in ``select_action``.
    """

    def __init__(self, player, learning_rate=0.01, discount=0.9, epsilon=0.1):
        self.player = player
        self.lr = learning_rate
        self.gamma = discount
        self.epsilon = epsilon
        # One weight per entry of the feature vector.
        self.weights = np.random.randn(5)

    def q_value(self, features):
        """Return the dot product of the weight vector and ``features``."""
        return self.weights.dot(features)

    def select_action(self, game):
        """Pick a move from ``game.available_moves()`` with an epsilon-greedy rule.

        With probability ``epsilon`` a uniformly random available move is
        returned. Otherwise each available move is tried on ``game.board``
        in place, scored from the resulting position, and undone; the
        highest-scoring move is returned (first one on ties).
        """
        if np.random.rand() < self.epsilon:
            # Explore.
            return random.choice(game.available_moves())

        # Exploit: score the position reached by each candidate move.
        candidates = game.available_moves()
        scores = []
        for candidate in candidates:
            game.board[candidate] = self.player
            resulting = get_features(game.board, self.player)
            scores.append(self.q_value(resulting))
            game.board[candidate] = 0  # restore the cell before trying the next one
        best = np.argmax(scores)
        return candidates[best]

    def update(self, state_features, reward, next_state_features):
        """Adjust the weights in place by one temporal-difference step.

        The error is ``reward + gamma * Q(next) - Q(current)`` and the weights
        move along ``state_features`` scaled by ``lr`` times that error.
        """
        estimate = self.q_value(state_features)
        bootstrap = self.q_value(next_state_features)
        td_error = reward + self.gamma * bootstrap - estimate
        self.weights += self.lr * td_error * state_features


In [13]:
import numpy as np
import matplotlib.pyplot as plt
def play_game(agent1, agent2, game):
    """Play one game between two agents and report the outcome.

    ``game`` is reset first, then the agents alternate moves starting with
    ``agent1``. Each turn the current agent's ``select_action(game)`` result
    is applied through ``game.make_move``.

    Args:
        agent1: agent that moves first; must expose ``player`` and
            ``select_action(game)``.
        agent2: agent that moves second; same interface as ``agent1``.
        game: game object exposing ``reset``, ``make_move``,
            ``current_winner`` and ``is_full``.

    Returns:
        The ``player`` value of the agent whose move set
        ``game.current_winner``, or 0 if the board fills with no winner.

    Raises:
        ValueError: if ``game.make_move`` rejects the selected move. The
            rejection used to be ignored, which silently skipped that
            agent's turn and let the game continue in an inconsistent state.
    """
    game.reset()
    current_agent = agent1  # Player 1 starts
    next_agent = agent2

    while True:
        move = current_agent.select_action(game)
        if not game.make_move(*move, current_agent.player):
            raise ValueError(
                f"Illegal move {move!r} selected by player {current_agent.player!r}"
            )

        if game.current_winner:
            return current_agent.player  # The current agent wins

        if game.is_full():
            return 0

        current_agent, next_agent = next_agent, current_agent

# Seed both RNG sources used by the agents (`np.random` for weight
# initialisation and the explore/exploit draw, `random` for the exploratory
# move choice) so the tallies below are reproducible on a fresh kernel.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

agent1 = QLearningAgent(1)
agent2 = QLearningAgent(-1)

num_games = 10000
report_every = 1000  # print a running tally after this many games
win_counts = {1: 0, -1: 0, 0: 0}  # keyed by play_game's result: 1, -1, or 0 (draw)

# NOTE(review): nothing in this loop, nor in play_game as written, calls
# QLearningAgent.update, so both agents keep their initial random weights for
# the whole run. The tallies measure two fixed epsilon-greedy policies, not
# learning progress.
for i in range(num_games):
    # A fresh board per game; play_game also resets the game it is given.
    result = play_game(agent1, agent2, TicTacToe())
    win_counts[result] += 1

    if (i + 1) % report_every == 0:
        print(f"Games played: {i+1}, Win counts: {win_counts}")


Games played: 1000, Win counts: {1: 198, -1: 107, 0: 695}
Games played: 2000, Win counts: {1: 397, -1: 222, 0: 1381}
Games played: 3000, Win counts: {1: 601, -1: 327, 0: 2072}
Games played: 4000, Win counts: {1: 807, -1: 435, 0: 2758}
Games played: 5000, Win counts: {1: 1036, -1: 549, 0: 3415}
Games played: 6000, Win counts: {1: 1256, -1: 644, 0: 4100}
Games played: 7000, Win counts: {1: 1471, -1: 746, 0: 4783}
Games played: 8000, Win counts: {1: 1692, -1: 855, 0: 5453}
Games played: 9000, Win counts: {1: 1930, -1: 961, 0: 6109}
Games played: 10000, Win counts: {1: 2147, -1: 1058, 0: 6795}
