In [1]:
import numpy as np
import random
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque

In [None]:
class TicTacToe:
    def __init__(self):
        self.reset()

    def reset(self):
        self.board = np.zeros(9, dtype=int)  # 0 = empty, 1 = agent, -1 = opponent
        self.done = False
        self.winner = None
        return self.get_state()

    def return_inverted_board(self):
        return -self.board

    def get_state(self):
        return self.board.copy()

    def available_actions(self):
        return [i for i in range(9) if self.board[i] == 0]

    def check_winner(self):
        combos = [(0,1,2),(3,4,5),(6,7,8),(0,3,6),(1,4,7),(2,5,8),(0,4,8),(2,4,6)]
        for a,b,c in combos:
            s = self.board[a] + self.board[b] + self.board[c]
            if s == 3:
                return 1  # Agent wins
            elif s == -3:
                return -1  # Opponent wins
        if 0 not in self.board:
            return 0  # Draw
        return None  # Game continues

    def step(self, action):
        if self.board[action] != 0 or self.done:
            return self.get_state(), -10000, True  # Invalid move penalty
        self.board[action] = 1  # Agent move
        winner = self.check_winner()
        if winner is not None:
            self.done = True
            return self.get_state(), 1 if winner == 1 else 0, True
        return self.get_state(), 0.5, False

    def step_opp(self):
        # Opponent move (random)
        opp_actions = self.available_actions()
        if opp_actions:
            opp_action = random.choice(opp_actions)
            self.board[opp_action] = -1
        winner = self.check_winner()
        if winner is not None:
            self.done = True
            return self.get_state(), -1 if winner == -1 else 0, True
        return self.get_state(), 0, False

    def step_opp_action(self,action):
        self.board[action] = -1  # Agent move
        winner = self.check_winner()
        if winner is not None:
            self.done = True
            return self.get_state(), -1 if winner == -1 else 0, True
        return self.get_state(), 0, False


# --- DQN Model ---
class DQN(nn.Module):
    def __init__(self):
        super(DQN, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(9, 128),
            nn.ReLU(),
            nn.Linear(128, 9)
        )

    def forward(self, x):
        return self.net(x)

In [None]:
class ReplayBuffer:
    def __init__(self, capacity=10000):
        self.buffer = deque(maxlen=capacity)

    def push(self, transition):
        self.buffer.append(transition)

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        state, action, reward, next_state, done = map(np.array, zip(*batch))
        return (
            torch.tensor(state, dtype=torch.float32),
            torch.tensor(action),
            torch.tensor(reward, dtype=torch.float32),
            torch.tensor(next_state, dtype=torch.float32),
            torch.tensor(done, dtype=torch.float32),
        )

    def __len__(self):
        return len(self.buffer)

In [None]:
# --- Training ---
def train(episodes = 1000):
    env = TicTacToe()
    model = DQN()
    target_model = DQN()
    target_model.load_state_dict(model.state_dict())

    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    buffer = ReplayBuffer()
    batch_size = 64
    gamma = 0.9999
    epsilon = 1.0
    epsilon_min = 0.1
    epsilon_decay = 0.999
    update_target_every = 20
    total_reward = 0

    for episode in range(episodes):
        state = env.reset()
        done = False

        while not done:
            if random.random() < epsilon:
                action = random.choice(env.available_actions())
            else:
                with torch.no_grad():
                    q_values = model(torch.tensor(state, dtype=torch.float32))
                    # mask = torch.tensor([float('-inf')] * 9)
                    # for a in env.available_actions():
                        # mask[a] = 0
                    action = torch.argmax(q_values).item()
                    if action not in env.available_actions():
                        buffer.push((state, action, -10, state, True))
                        action = random.choice(env.available_actions())

            next_state, reward, done = env.step(action)
            buffer.push((state, action, reward, next_state, done))
            state = next_state
            total_reward += reward
            if not done:
                _, _, done = env.step_opp()

            if len(buffer) >= batch_size:
                s, a, r, s_, d = buffer.sample(batch_size)
                q_values = model(s).gather(1, a.unsqueeze(1)).squeeze()
                with torch.no_grad():
                    q_next = target_model(s_).max(1)[0]
                    q_target = r + gamma * q_next * (1 - d)
                loss = nn.MSELoss()(q_values, q_target)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        if epsilon > epsilon_min:
            epsilon *= epsilon_decay

        if episode % update_target_every == 0:
            target_model.load_state_dict(model.state_dict())

        if episode % 100 == 0:
            print(f"Episode {episode}, Total Reward: {total_reward}, Epsilon: {epsilon:.3f}")
            total_reward = 0

    # torch.save(model.state_dict(), "dqn_tictactoe.pth")
    print("Training complete.")
    return model


In [None]:
# --- Training ---
def train_against_self(model, episodes = 1000):
    env = TicTacToe()
    target_model = DQN()
    target_model.load_state_dict(model.state_dict())

    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    buffer = ReplayBuffer()
    batch_size = 64
    gamma = 0.999
    epsilon = 1.0
    epsilon_min = 0.1
    epsilon_decay = 0.9999
    update_target_every = 20
    total_reward = 0

    for episode in range(episodes):
        state = env.reset()
        done = False

        while not done:
            if random.random() < epsilon:
                action = random.choice(env.available_actions())
            else:
                with torch.no_grad():
                    q_values = model(torch.tensor(state, dtype=torch.float32))
                    # mask = torch.tensor([float('-inf')] * 9)
                    # for a in env.available_actions():
                        # mask[a] = 0
                    action = torch.argmax(q_values).item()
                    if action not in env.available_actions():
                        buffer.push((state, action, -10, state, True))
                        action = random.choice(env.available_actions())

            next_state, reward, done = env.step(action)
            buffer.push((state, action, reward, next_state, done))
            state = next_state
            total_reward += reward
            if not done:
                q_values = target_model(torch.tensor(-state, dtype=torch.float32))
                mask = torch.tensor([float('-inf')] * 9)
                for a in env.available_actions():
                    mask[a] = 0
                action_op = torch.argmax(q_values + mask).item()
                _, _, done = env.step_opp_action(action_op)

            if len(buffer) >= batch_size:
                s, a, r, s_, d = buffer.sample(batch_size)
                q_values = model(s).gather(1, a.unsqueeze(1)).squeeze()
                with torch.no_grad():
                    q_next = target_model(s_).max(1)[0]
                    q_target = r + gamma * q_next * (1 - d)
                loss = nn.MSELoss()(q_values, q_target)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        if epsilon > epsilon_min:
            epsilon *= epsilon_decay

        if episode % update_target_every == 0:
            target_model.load_state_dict(model.state_dict())

        if episode % 100 == 0:
            print(f"Episode {episode}, Total Reward: {total_reward}, Epsilon: {epsilon:.3f}")
            total_reward = 0

    # torch.save(model.state_dict(), "dqn_tictactoe.pth")
    print("Training complete.")
    return model


In [34]:
dqn_model = train(episodes=10000)
dqn_model = train_against_self(dqn_model, episodes=10000)

Episode 0, Total Reward: 1, Epsilon: 0.999
Episode 100, Total Reward: 52, Epsilon: 0.904
Episode 200, Total Reward: 50, Epsilon: 0.818
Episode 300, Total Reward: 61, Epsilon: 0.740
Episode 400, Total Reward: 61, Epsilon: 0.670
Episode 500, Total Reward: 63, Epsilon: 0.606
Episode 600, Total Reward: 60, Epsilon: 0.548
Episode 700, Total Reward: 58, Epsilon: 0.496
Episode 800, Total Reward: 62, Epsilon: 0.449
Episode 900, Total Reward: 69, Epsilon: 0.406
Episode 1000, Total Reward: 59, Epsilon: 0.367
Episode 1100, Total Reward: 62, Epsilon: 0.332
Episode 1200, Total Reward: 67, Epsilon: 0.301
Episode 1300, Total Reward: 69, Epsilon: 0.272
Episode 1400, Total Reward: 78, Epsilon: 0.246
Episode 1500, Total Reward: 69, Epsilon: 0.223
Episode 1600, Total Reward: 67, Epsilon: 0.202
Episode 1700, Total Reward: 67, Epsilon: 0.182
Episode 1800, Total Reward: 73, Epsilon: 0.165
Episode 1900, Total Reward: 82, Epsilon: 0.149
Episode 2000, Total Reward: 69, Epsilon: 0.135
Episode 2100, Total Reward

In [35]:
dqn_model = train_against_self(dqn_model, episodes=1000)
dqn_model = train_against_self(dqn_model, episodes=1000)
dqn_model = train_against_self(dqn_model, episodes=1000)
dqn_model = train_against_self(dqn_model, episodes=1000)
dqn_model = train_against_self(dqn_model, episodes=1000)

Episode 0, Total Reward: 0, Epsilon: 1.000
Episode 100, Total Reward: 43, Epsilon: 0.990
Episode 200, Total Reward: 51, Epsilon: 0.980
Episode 300, Total Reward: 55, Epsilon: 0.970
Episode 400, Total Reward: 50, Epsilon: 0.961
Episode 500, Total Reward: 50, Epsilon: 0.951
Episode 600, Total Reward: 52, Epsilon: 0.942
Episode 700, Total Reward: 44, Epsilon: 0.932
Episode 800, Total Reward: 55, Epsilon: 0.923
Episode 900, Total Reward: 57, Epsilon: 0.914
Training complete.
Episode 0, Total Reward: 1, Epsilon: 1.000
Episode 100, Total Reward: 48, Epsilon: 0.990
Episode 200, Total Reward: 60, Epsilon: 0.980
Episode 300, Total Reward: 55, Epsilon: 0.970
Episode 400, Total Reward: 60, Epsilon: 0.961
Episode 500, Total Reward: 56, Epsilon: 0.951
Episode 600, Total Reward: 49, Epsilon: 0.942
Episode 700, Total Reward: 59, Epsilon: 0.932
Episode 800, Total Reward: 48, Epsilon: 0.923
Episode 900, Total Reward: 63, Epsilon: 0.914
Training complete.
Episode 0, Total Reward: 0, Epsilon: 1.000
Episo

In [37]:
def evaluate(model, games=100):
    env = TicTacToe()
    model.eval()

    wins, losses, draws = 0, 0, 0

    for _ in range(games):
        state = env.reset()
        done = False

        while not done:
            # Agent move
            with torch.no_grad():
                q_values = model(torch.tensor(state, dtype=torch.float32))
                mask = torch.tensor([float('-inf')] * 9)
                for a in env.available_actions():
                    mask[a] = 0
                action = torch.argmax(q_values + mask).item()

            next_state, reward, done = env.step(action)
            if not done:
                next_state, reward, done = env.step_opp()
            state = next_state

        # Final result
        if reward == 1:
            wins += 1
        elif reward == -1:
            losses += 1
        else:
            draws += 1

    print(f"Results over {games} games:")
    print(f"Wins:   {wins}")
    print(f"Losses: {losses}")
    print(f"Draws:  {draws}")


if __name__ == "__main__":
    # Uncomment to run evaluation
    # train()
    evaluate(model=dqn_model, games=100)


Results over 100 games:
Wins:   44
Losses: 49
Draws:  7


In [27]:
def print_board(state):
    symbols = {1: 'X', -1: 'O', 0: ' '}
    board = [symbols[val] for val in state]
    print("\nBoard:")
    for i in range(3):
        row = " | ".join(board[i * 3:(i + 1) * 3])
        print(row)
        if i < 2:
            print("---------")
    print()

In [39]:
def play_vs_model(model_path="dqn_tictactoe.pth", user_starts=True):
    env = TicTacToe()
    model = DQN()
    model.load_state_dict(torch.load(model_path))
    model.eval()

    state = env.reset()
    done = False

    print("Welcome to Tic-Tac-Toe!")
    print("You are O (opponent), model is X (agent).")
    print("Board positions are 0-8:")
    print("0 | 1 | 2\n3 | 4 | 5\n6 | 7 | 8\n")

    if not user_starts:
        print("Model goes first.")
        with torch.no_grad():
            q_values = model(torch.tensor(state, dtype=torch.float32))
            mask = torch.tensor([float('-inf')] * 9)
            for a in env.available_actions():
                mask[a] = 0
            action = torch.argmax(q_values + mask).item()
        state, _, done = env.step(action)
        print_board(state)

    while not done:
        # --- User move ---
        user_action = -1
        valid = env.available_actions()
        while user_action not in valid:
            try:
                user_action = int(input(f"Your move (available: {valid}): "))
            except ValueError:
                continue

        # Apply user move
        env.board[user_action] = -1
        result = env.check_winner()
        if result is not None:
            done = True
            print_board(env.get_state())
            if result == -1:
                print("You win!")
            elif result == 0:
                print("It's a draw!")
            else:
                print("Model wins!")
            return

        # --- Model move ---
        state = env.get_state()
        with torch.no_grad():
            q_values = model(torch.tensor(state, dtype=torch.float32))
            mask = torch.tensor([float('-inf')] * 9)
            for a in env.available_actions():
                mask[a] = 0
            action = torch.argmax(q_values + mask).item()

        state, _, done = env.step(action)
        print_board(state)

        if done:
            result = env.check_winner()
            if result == 1:
                print("Model wins!")
            elif result == -1:
                print("You win!")
            else:
                print("It's a draw!")


if __name__ == "__main__":
    # train()
    # evaluate()
    play_vs_model(user_starts=True)

  model.load_state_dict(torch.load(model_path))


Welcome to Tic-Tac-Toe!
You are O (opponent), model is X (agent).
Board positions are 0-8:
0 | 1 | 2
3 | 4 | 5
6 | 7 | 8

Your move (available: [0, 1, 2, 3, 4, 5, 6, 7, 8]): 0

Board:
O |   |  
---------
  |   |  
---------
X |   |  

Your move (available: [1, 2, 3, 4, 5, 7, 8]): 5

Board:
O |   |  
---------
X |   | O
---------
X |   |  

Your move (available: [1, 2, 4, 7, 8]): 4

Board:
O |   |  
---------
X | O | O
---------
X | X |  

Your move (available: [1, 2, 8]): 8

Board:
O |   |  
---------
X | O | O
---------
X | X | O

You win!
