In [1]:
import numpy as np
import random
from collections import defaultdict


In [2]:
class TicTacToe:
    def __init__(self):
        self.board = np.zeros((3, 3), dtype=int)  # 3x3 board initialized to 0
        self.done = False
        self.current_player = 1  # Player 1 starts (1 for Player 1, -1 for Player 2)

    def reset(self):
        self.board = np.zeros((3, 3), dtype=int)
        self.done = False
        self.current_player = 1
        return self.board.flatten()

    def available_actions(self):
        return [(i, j) for i in range(3) for j in range(3) if self.board[i, j] == 0]

    def step(self, action):
        if self.done:
            raise ValueError("Game is over")
        
        row, col = action
        self.board[row, col] = self.current_player
        
        if self.check_winner():
            reward = 1
            self.done = True
        elif len(self.available_actions()) == 0:
            reward = 0  # Draw
            self.done = True
        else:
            reward = 0  # Game continues
            self.current_player *= -1  # Switch players
        
        return self.board.flatten(), reward, self.done

    def check_winner(self):
        for i in range(3):
            if abs(sum(self.board[i, :])) == 3 or abs(sum(self.board[:, i])) == 3:
                return True
        if abs(self.board.trace()) == 3 or abs(np.fliplr(self.board).trace()) == 3:
            return True
        return False


In [3]:
class QLearningAgent:
    def __init__(self, alpha=0.1, gamma=0.9, epsilon=0.1):
        self.q_table = defaultdict(lambda: np.zeros(9))  # 3x3 board flattened to 9 actions
        self.alpha = alpha  # Learning rate
        self.gamma = gamma  # Discount factor
        self.epsilon = epsilon  # Exploration rate

    def choose_action(self, state, available_actions):
        if random.uniform(0, 1) < self.epsilon:  # Exploration
            return random.choice(available_actions)
        else:  # Exploitation
            state_key = tuple(state)
            action_values = self.q_table[state_key]
            action_index = np.argmax(action_values)
            return (action_index // 3, action_index % 3)

    def update_q_value(self, state, action, reward, next_state):
        state_key = tuple(state)
        next_state_key = tuple(next_state)
        action_index = action[0] * 3 + action[1]

        # Q-learning update rule
        best_next_action = np.max(self.q_table[next_state_key])
        self.q_table[state_key][action_index] += self.alpha * (reward + self.gamma * best_next_action - self.q_table[state_key][action_index])


In [4]:
def train(agent, env, episodes=5000):
    for episode in range(episodes):
        state = env.reset()
        done = False

        while not done:
            available_actions = env.available_actions()
            action = agent.choose_action(state, available_actions)
            next_state, reward, done = env.step(action)
            agent.update_q_value(state, action, reward, next_state)
            state = next_state


In [5]:
def test(agent, env, episodes=100):
    wins = 0
    draws = 0
    losses = 0

    for episode in range(episodes):
        state = env.reset()
        done = False
        agent.current_player = 1  # Our agent is player 1
        
        while not done:
            if env.current_player == 1:
                action = agent.choose_action(state, env.available_actions())
            else:
                action = random.choice(env.available_actions())  # Random opponent
            next_state, reward, done = env.step(action)
            state = next_state

        if reward == 1 and env.current_player == -1:
            wins += 1  # Agent won
        elif reward == 0:
            draws += 1  # Draw
        else:
            losses += 1  # Agent lost

    print(f"Results after {episodes} games:")
    print(f"Wins: {wins}, Draws: {draws}, Losses: {losses}")


In [6]:
# Initialize environment and agent
env = TicTacToe()
agent = QLearningAgent()

# Train the agent
train(agent, env, episodes=10000)

# Test the trained agent
test(agent, env, episodes=100)


Results after 100 games:
Wins: 52, Draws: 7, Losses: 41
