In [1]:
import random
from collections import deque
import numpy as np
import copy
import tensorflow as tf
from tensorflow.keras import layers, models
import matplotlib.pyplot as plt
from tqdm import tqdm

In [2]:
class TicTacToe:
    def __init__(self):
        self.reset()

    def reset(self):
        # 0 = empty, 1 = X, -1 = O
        self.board = [0] * 9
        self.current_player = 1  # 1 for X, -1 for O
        self.done = False
        self.winner = None

    def render(self):
        symbols = {1: 'X', -1: 'O', 0: ' '}
        for i in range(3):
            row = [symbols[self.board[j]] for j in range(i*3, (i+1)*3)]
            print('|'.join(row))
            if i < 2:
                print('-'*5)

    def get_valid_actions(self):
        return [i for i, x in enumerate(self.board) if x == 0]

    def step(self, action):
        if self.done or self.board[action] != 0:
            raise ValueError("Invalid move")

        self.board[action] = self.current_player
        self._check_game_over()
        reward = 0
        if self.done:
            if self.winner == self.current_player:
                reward = 1
            elif self.winner == 0:
                reward = 0.5  # draw
            else:
                reward = -1

        self.current_player *= -1  # Switch players
        return self.board.copy(), reward, self.done

    def _check_game_over(self):
        winning_combinations = [
            [0,1,2], [3,4,5], [6,7,8],  # Rows
            [0,3,6], [1,4,7], [2,5,8],  # Columns
            [0,4,8], [2,4,6]            # Diagonals
        ]
        for combo in winning_combinations:
            total = self.board[combo[0]] + self.board[combo[1]] + self.board[combo[2]]
            if total == 3:
                self.winner = 1
                self.done = True
                return
            elif total == -3:
                self.winner = -1
                self.done = True
                return

        if all(cell != 0 for cell in self.board):
            self.winner = 0  # Draw
            self.done = True


In [6]:
class DQNAgent:
    def __init__(self, state_size=9, action_size=9, gamma=0.95, epsilon=1.0,
                 epsilon_min=0.1, epsilon_decay=0.995, learning_rate=0.001, batch_size=64):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)

        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay

        self.learning_rate = learning_rate
        self.batch_size = batch_size

        self.model = self._build_model()
        self.target_model = self._build_model()
        self.update_target_model()

    def _build_model(self):
        model = tf.keras.Sequential([
            layers.Input(shape=(self.state_size,)),
            layers.Dense(64, activation='relu'),
            layers.Dense(64, activation='relu'),
            layers.Dense(self.action_size)
        ])
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=self.learning_rate),
                      loss=tf.keras.losses.MeanSquaredError())
        return model

    def update_target_model(self):
        self.target_model.set_weights(self.model.get_weights())

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state, valid_actions):
        if np.random.rand() <= self.epsilon:
            return random.choice(valid_actions)

        q_values = self.model.predict(np.array([state]), verbose=0)[0]
        # Mask invalid actions
        masked_q_values = np.full(self.action_size, -np.inf)
        for a in valid_actions:
            masked_q_values[a] = q_values[a]
        return np.argmax(masked_q_values)

    def replay(self):
        if len(self.memory) < self.batch_size:
            return

        minibatch = random.sample(self.memory, self.batch_size)
        states = np.zeros((self.batch_size, self.state_size))
        targets = np.zeros((self.batch_size, self.action_size))

        for i, (state, action, reward, next_state, done) in enumerate(minibatch):
            states[i] = state
            target = self.model.predict(np.array([state]), verbose=0)[0]
            if done:
                target[action] = reward
            else:
                next_q = self.target_model.predict(np.array([next_state]), verbose=0)[0]
                target[action] = reward + self.gamma * np.max(next_q)
            targets[i] = target

        self.model.fit(states, targets, epochs=1, verbose=0)

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

In [8]:
env = TicTacToe()
env.render()
print("Valid actions:", env.get_valid_actions())
state, reward, done = env.step(0)  # Player X moves
env.render()

 | | 
-----
 | | 
-----
 | | 
Valid actions: [0, 1, 2, 3, 4, 5, 6, 7, 8]
X| | 
-----
 | | 
-----
 | | 


In [15]:
import os

def train_dqn(episodes=1000, target_update_freq=10, save_dir="/content/saved_models"):
    env = TicTacToe()
    agent = DQNAgent()
    update_target_counter = 0

    # Create save directory if it doesn't exist
    os.makedirs(save_dir, exist_ok=True)
    print("Training started...")

    for episode in tqdm(range(1, episodes + 1)):
        env.reset()
        state = np.array(env.board, dtype=np.float32)
        total_reward = 0

        while not env.done:
            valid_actions = env.get_valid_actions()

            if env.current_player == 1:
                action = agent.act(state, valid_actions)
                next_board, reward, done = env.step(action)
                next_state = np.array(next_board, dtype=np.float32)
                agent.remember(state, action, reward, next_state, done)
                state = next_state
                total_reward += reward

                if done:
                    break
            else:
                action = random.choice(valid_actions)
                env.step(action)
                state = np.array(env.board, dtype=np.float32)

        agent.replay()
        update_target_counter += 1
        if update_target_counter >= target_update_freq:
            agent.update_target_model()
            update_target_counter = 0

        # Save model every 100 episodes
        if episode % 100 == 0:
            model_path = os.path.join(save_dir, f"dqn_model_ep{episode}.keras")
            agent.model.save(model_path)
            print(f"Episode {episode}, Reward: {total_reward:.2f}, Epsilon: {agent.epsilon:.3f} — Model saved to {model_path}")

    return agent

In [None]:
trained_agent = train_dqn(episodes=1000, save_dir="/content/drive/MyDrive/Models/TicTacToe")

Training started...


  5%|▌         | 54/1000 [09:47<3:27:02, 13.13s/it]

In [23]:
def play_game(agent):
    env = TicTacToe()
    env.reset()
    state = np.array(env.board, dtype=np.float32)
    env.render()

    while not env.done:
        if env.current_player == 1:
            action = agent.act(state, env.get_valid_actions())
        else:
            action = random.choice(env.get_valid_actions())
        env.step(action)
        state = np.array(env.board, dtype=np.float32)
        # print("\nMove:")
        env.render()

    result = "Draw" if env.winner == 0 else "Agent wins!" if env.winner == 1 else "Opponent wins!"
    print("\nGame Over:", result)


In [34]:
def play_games(agent, num_games):
    env = TicTacToe()
    agent_wins = 0
    opponent_wins = 0
    draws = 0
    for i in tqdm(range(num_games)):
        env.reset()
        state = np.array(env.board, dtype=np.float32)
        # env.render()
        while not env.done:
            if env.current_player == 1:
                action = agent.act(state, env.get_valid_actions())
            else:
                action = random.choice(env.get_valid_actions())
            env.step(action)
            state = np.array(env.board, dtype=np.float32)
            # print("\nMove:")
            # env.render()
        # result = "Draw" if env.winner == 0 else "Agent wins!" if env.winner == 1 else "Opponent wins!"
        if env.winner == 1:
            agent_wins += 1
        elif env.winner == -1:
            opponent_wins += 1
        else:
            draws += 1
    print("\nAgent wins: ", agent_wins/num_games, "%")
    print("\nOpponent wins: ", opponent_wins/num_games, "%")
    print("Draws:", draws/num_games, "%")

In [35]:
from tensorflow.keras.models import load_model

# Path to your saved model file
model_path = "/content/drive/MyDrive/Models/TicTacToe/dqn_model_ep100.keras"

# Load the model
loaded_model = load_model(model_path)
loadedAgent = DQNAgent()
loadedAgent.epsilon = 0.0
loadedAgent.model = loaded_model

In [37]:
play_games(loadedAgent,100)

100%|██████████| 100/100 [00:53<00:00,  1.87it/s]


Agent wins:  0.44 %

Opponent wins:  0.41 %
Draws: 0.15 %



