In [28]:
import sys

sys.path.append("../src/")

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Import all packages.

In [None]:
# Import necessary libraries
import random
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import shutil

from torch.distributions import Categorical
from pettingzoo.classic import tictactoe_v3


import cv2
import imageio

from utils import show
from summary_writer import LocalSummaryWriter


In [30]:
import torch

device = torch.device(
    "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
)

print(f"Using device: {device}")

Using device: mps


In [31]:
class DQN(nn.Module):
    """Deep Q Network."""

    def __init__(self, num_actions, input_dim, num_hidden=128):
        super(DQN, self).__init__()
        self.layer1 = nn.Linear(input_dim, num_hidden)
        self.layer2 = nn.Linear(num_hidden, num_hidden)
        self.layer3 = nn.Linear(num_hidden, num_actions)

    def forward(self, x):
        """Forward pass through the network."""
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        x = self.layer3(x)
        return x

In [32]:
class ActorNetwork(nn.Module):
    """Actor Network for the policy gradient method."""

    def __init__(self, state_size, action_size):
        super(ActorNetwork, self).__init__()
        self.state_size = state_size
        self.action_size = action_size
        self.linear1 = nn.Linear(self.state_size, 128)
        self.linear2 = nn.Linear(128, 256)
        self.linear3 = nn.Linear(256, self.action_size)

    def forward(self, state):
        """Forward pass through the network."""
        output = F.relu(self.linear1(state))
        output = F.relu(self.linear2(output))
        output = self.linear3(output)
        distribution = Categorical(F.softmax(output, dim=-1))
        return distribution


In [33]:
dqn_network = DQN(num_actions=9, input_dim=9).to(device)
acm_network = ActorNetwork(state_size=9, action_size=9).to(device)

In [34]:
def load_model(model, model_path):
    """Load a model from a given path."""
    model.load_state_dict(torch.load(model_path))
    model.eval()
    return model

In [None]:
def state_preprocess(obs: any):
    board = obs["observation"]
    board_processed = board[:, :, 0] - board[:, :, 1]
    return board_processed.flatten()


def detect_winner(obs: any):
    board = obs["observation"].reshape(3, 3, 2)
    board = board.argmax(axis=2)  # 0 for X, 1 for O, 2 = empty
    for player, label in [(0, "player_1"), (1, "player_2")]:
        for i in range(3):
            if all(board[i, :] == player):
                return label
            if all(board[:, i] == player):
                return label
        if all(np.diag(board) == player):
            return label
        if all(np.diag(np.fliplr(board)) == player):
            return label

    return None

In [54]:
def run(env, agent_a, agent_b, video_path, seed=42):
    """Run/test the agent in the environment."""
    env.reset(seed=seed)
    obs, reward, done, truncation, info = env.last()
    state = state_preprocess(obs)

    done = False
    truncation = False
    step_size = 0
    episode_reward = 0
    frames = []

    frame = env.render()
    frames.append(frame)
    while not done and not truncation:
        dist = agent_a(torch.tensor(state, dtype=torch.float32, device=device))
        valid_moves = np.where(state == 0)[0]
        action = dist.sample()
        while action.item() not in valid_moves:
            action = dist.sample()

        env.step(action.item())
        obs, _, done, truncation, info = env.last()
        state = state_preprocess(obs)

        last_obs = obs

        # Self-play with random opponent
        if not done and not truncation:
            with torch.no_grad():
                q_values = agent_b(torch.tensor(state, dtype=torch.float32, device=device))
                q_values = q_values.cpu().numpy()

            valid_moves = np.where(state == 0)[0]
            masked_q_values = np.full(q_values.shape, -np.inf)
            masked_q_values[valid_moves] = q_values[valid_moves]
            action = int(np.argmax(masked_q_values))

            env.step(action)
            obs, _, done, truncation, info = env.last()
            state = state_preprocess(obs)
            if done or truncation:
                last_obs = obs

        step_size += 1

        frame = env.render()
        frames.append(frame)

    winner = detect_winner(last_obs)
    print(f"Winner: {winner}")
    episode_reward = 1 if winner == "player_1" else -1 if winner == "player_2" else 0

    # Print log
    result = f"Steps: {step_size:}, Reward: {episode_reward:.2f}, "
    print(result)

    print("\tSaving Animation ...")

    fps = 30
    imageio.mimsave(video_path, frames, fps=fps)

    video = cv2.VideoCapture(video_path)

    frame_per_second = video.get(cv2.CAP_PROP_FPS)
    assert frame_per_second == fps, f"FPS should be {fps} but got {frame_per_second}"
    frame_count = video.get(cv2.CAP_PROP_FRAME_COUNT)

    return frame_count / frame_per_second if frame_per_second > 0 else 0, episode_reward

In [None]:
def main(RL_hyperparams):
    """Train and Run the DQN agent."""
    torch.manual_seed(RL_hyperparams["random_seed"])
    np.random.seed(RL_hyperparams["random_seed"])
    random.seed(RL_hyperparams["random_seed"])

    draw_rewards = []
    agent_a_rewards = []
    agent_b_rewards = []
    run_times = []

    agent_a = load_model(acm_network, "./tictactoe_acm_model.pt")
    agent_b = load_model(dqn_network, "./tictactoe_dqn_model.pt")

    for i in range(1, RL_hyperparams["number_of_experiments"] + 1):
        print("[Experiment]\t{} of {}".format(i, RL_hyperparams["number_of_experiments"]))

        path = f"../experiments/run_tictactoe/run-{i}"
        shutil.rmtree(path, ignore_errors=True)
        writer = LocalSummaryWriter(log_dir=path)
        video_path = f"{writer.log_dir}/TicTacToe-movie-{i}.mp4"

        print("[Env]\tCreating Environment ...")
        env = tictactoe_v3.env(render_mode="human" if RL_hyperparams["render"] else "rgb_array")
        env.reset()

        print("[Run]\tRunning Simulation ...")
        run_time, reward = run(env, agent_a, agent_b, video_path)
        if reward == 0:
            draw_rewards.append(1)
        if reward == 1:
            agent_a_rewards.append(1)
        if reward == -1:
            agent_b_rewards.append(1)

        run_times.append(run_time)

        print("[Video]\tShow Video ...")
        show(video_path)

    print("[End]\tDone. Congratulations!")

    print("[Agent A]\tWins: ", (np.asarray(agent_a_rewards) == 1).sum())
    print("[Agent B]\tWins: ", (np.asarray(agent_b_rewards) == 1).sum())
    print("[Reward]\tDraws: ", (np.asarray(draw_rewards) == 1).sum())

    print("[RunTime]\tAverage Run Time: ", np.mean(run_times))
    print("[RunTime]\tStandard Deviation of Run Time: ", np.std(run_times))
    print("[RunTime]\tMax Run Time: ", np.max(run_times))
    print("[RunTime]\tMin Run Time: ", np.min(run_times))


In [None]:
if __name__ == "__main__":
    RL_hyperparams = {"number_of_experiments": 50, "random_seed": 42, "render": False}
    main(RL_hyperparams=RL_hyperparams)



[Experiment]	1 of 50
[Env]	Creating Environment ...
[Run]	Running Simulation ...
Winner: player_1
Steps: 3, Reward: 1.00, 
	Saving Animation ...
[Video]	Show Video ...


frame per second=30.0
frame count=4.0
duration=0.13333333333333333
[Experiment]	2 of 50
[Env]	Creating Environment ...
[Run]	Running Simulation ...




Winner: player_1
Steps: 4, Reward: 1.00, 
	Saving Animation ...
[Video]	Show Video ...


frame per second=30.0
frame count=5.0
duration=0.16666666666666666
[Experiment]	3 of 50
[Env]	Creating Environment ...
[Run]	Running Simulation ...




Winner: player_1
Steps: 4, Reward: 1.00, 
	Saving Animation ...
[Video]	Show Video ...


frame per second=30.0
frame count=5.0
duration=0.16666666666666666
[Experiment]	4 of 50
[Env]	Creating Environment ...
[Run]	Running Simulation ...




Winner: None
Steps: 5, Reward: 0.00, 
	Saving Animation ...
[Video]	Show Video ...




frame per second=30.0
frame count=6.0
duration=0.2
[Experiment]	5 of 50
[Env]	Creating Environment ...
[Run]	Running Simulation ...
Winner: player_1
Steps: 4, Reward: 1.00, 
	Saving Animation ...
[Video]	Show Video ...


frame per second=30.0
frame count=5.0
duration=0.16666666666666666
[Experiment]	6 of 50
[Env]	Creating Environment ...
[Run]	Running Simulation ...




Winner: player_1
Steps: 4, Reward: 1.00, 
	Saving Animation ...
[Video]	Show Video ...


frame per second=30.0
frame count=5.0
duration=0.16666666666666666
[Experiment]	7 of 50
[Env]	Creating Environment ...
[Run]	Running Simulation ...




Winner: player_1
Steps: 3, Reward: 1.00, 
	Saving Animation ...
[Video]	Show Video ...


frame per second=30.0
frame count=4.0
duration=0.13333333333333333
[Experiment]	8 of 50
[Env]	Creating Environment ...
[Run]	Running Simulation ...




Winner: player_1
Steps: 3, Reward: 1.00, 
	Saving Animation ...
[Video]	Show Video ...


frame per second=30.0
frame count=4.0
duration=0.13333333333333333
[Experiment]	9 of 50
[Env]	Creating Environment ...
[Run]	Running Simulation ...




Winner: player_1
Steps: 4, Reward: 1.00, 
	Saving Animation ...
[Video]	Show Video ...




frame per second=30.0
frame count=5.0
duration=0.16666666666666666
[Experiment]	10 of 50
[Env]	Creating Environment ...
[Run]	Running Simulation ...
Winner: player_1
Steps: 3, Reward: 1.00, 
	Saving Animation ...
[Video]	Show Video ...


frame per second=30.0
frame count=4.0
duration=0.13333333333333333
[Experiment]	11 of 50
[Env]	Creating Environment ...
[Run]	Running Simulation ...




Winner: player_1
Steps: 4, Reward: 1.00, 
	Saving Animation ...
[Video]	Show Video ...




frame per second=30.0
frame count=5.0
duration=0.16666666666666666
[Experiment]	12 of 50
[Env]	Creating Environment ...
[Run]	Running Simulation ...
Winner: player_1
Steps: 3, Reward: 1.00, 
	Saving Animation ...
[Video]	Show Video ...


frame per second=30.0
frame count=4.0
duration=0.13333333333333333
[Experiment]	13 of 50
[Env]	Creating Environment ...
[Run]	Running Simulation ...




Winner: player_1
Steps: 3, Reward: 1.00, 
	Saving Animation ...
[Video]	Show Video ...




frame per second=30.0
frame count=4.0
duration=0.13333333333333333
[Experiment]	14 of 50
[Env]	Creating Environment ...
[Run]	Running Simulation ...
Winner: player_1
Steps: 4, Reward: 1.00, 
	Saving Animation ...
[Video]	Show Video ...


frame per second=30.0
frame count=5.0
duration=0.16666666666666666
[Experiment]	15 of 50
[Env]	Creating Environment ...
[Run]	Running Simulation ...




Winner: player_2
Steps: 5, Reward: -1.00, 
	Saving Animation ...
[Video]	Show Video ...


frame per second=30.0
frame count=6.0
duration=0.2
[Experiment]	16 of 50
[Env]	Creating Environment ...
[Run]	Running Simulation ...




Winner: player_2
Steps: 5, Reward: -1.00, 
	Saving Animation ...
[Video]	Show Video ...




frame per second=30.0
frame count=6.0
duration=0.2
[Experiment]	17 of 50
[Env]	Creating Environment ...
[Run]	Running Simulation ...
Winner: player_1
Steps: 3, Reward: 1.00, 
	Saving Animation ...
[Video]	Show Video ...


frame per second=30.0
frame count=4.0
duration=0.13333333333333333
[Experiment]	18 of 50
[Env]	Creating Environment ...
[Run]	Running Simulation ...




Winner: player_1
Steps: 4, Reward: 1.00, 
	Saving Animation ...
[Video]	Show Video ...




frame per second=30.0
frame count=5.0
duration=0.16666666666666666
[Experiment]	19 of 50
[Env]	Creating Environment ...
[Run]	Running Simulation ...
Winner: player_1
Steps: 3, Reward: 1.00, 
	Saving Animation ...
[Video]	Show Video ...


frame per second=30.0
frame count=4.0
duration=0.13333333333333333
[Experiment]	20 of 50
[Env]	Creating Environment ...
[Run]	Running Simulation ...




Winner: player_1
Steps: 4, Reward: 1.00, 
	Saving Animation ...
[Video]	Show Video ...




frame per second=30.0
frame count=5.0
duration=0.16666666666666666
[Experiment]	21 of 50
[Env]	Creating Environment ...
[Run]	Running Simulation ...
Winner: player_1
Steps: 3, Reward: 1.00, 
	Saving Animation ...
[Video]	Show Video ...


frame per second=30.0
frame count=4.0
duration=0.13333333333333333
[Experiment]	22 of 50
[Env]	Creating Environment ...
[Run]	Running Simulation ...




Winner: player_1
Steps: 3, Reward: 1.00, 
	Saving Animation ...
[Video]	Show Video ...


frame per second=30.0
frame count=4.0
duration=0.13333333333333333
[Experiment]	23 of 50
[Env]	Creating Environment ...
[Run]	Running Simulation ...




Winner: player_1
Steps: 4, Reward: 1.00, 
	Saving Animation ...
[Video]	Show Video ...




frame per second=30.0
frame count=5.0
duration=0.16666666666666666
[Experiment]	24 of 50
[Env]	Creating Environment ...
[Run]	Running Simulation ...
Winner: player_2
Steps: 5, Reward: -1.00, 
	Saving Animation ...
[Video]	Show Video ...




frame per second=30.0
frame count=6.0
duration=0.2
[Experiment]	25 of 50
[Env]	Creating Environment ...
[Run]	Running Simulation ...
Winner: player_1
Steps: 3, Reward: 1.00, 
	Saving Animation ...
[Video]	Show Video ...




frame per second=30.0
frame count=4.0
duration=0.13333333333333333
[Experiment]	26 of 50
[Env]	Creating Environment ...
[Run]	Running Simulation ...
Winner: player_1
Steps: 4, Reward: 1.00, 
	Saving Animation ...
[Video]	Show Video ...


frame per second=30.0
frame count=5.0
duration=0.16666666666666666
[Experiment]	27 of 50
[Env]	Creating Environment ...
[Run]	Running Simulation ...




Winner: player_1
Steps: 4, Reward: 1.00, 
	Saving Animation ...
[Video]	Show Video ...


frame per second=30.0
frame count=5.0
duration=0.16666666666666666
[Experiment]	28 of 50
[Env]	Creating Environment ...
[Run]	Running Simulation ...




Winner: player_1
Steps: 3, Reward: 1.00, 
	Saving Animation ...
[Video]	Show Video ...




frame per second=30.0
frame count=4.0
duration=0.13333333333333333
[Experiment]	29 of 50
[Env]	Creating Environment ...
[Run]	Running Simulation ...
Winner: player_1
Steps: 3, Reward: 1.00, 
	Saving Animation ...
[Video]	Show Video ...


frame per second=30.0
frame count=4.0
duration=0.13333333333333333
[Experiment]	30 of 50
[Env]	Creating Environment ...
[Run]	Running Simulation ...




Winner: player_1
Steps: 4, Reward: 1.00, 
	Saving Animation ...
[Video]	Show Video ...




frame per second=30.0
frame count=5.0
duration=0.16666666666666666
[Experiment]	31 of 50
[Env]	Creating Environment ...
[Run]	Running Simulation ...
Winner: player_1
Steps: 3, Reward: 1.00, 
	Saving Animation ...
[Video]	Show Video ...


frame per second=30.0
frame count=4.0
duration=0.13333333333333333
[Experiment]	32 of 50
[Env]	Creating Environment ...
[Run]	Running Simulation ...




Winner: player_1
Steps: 3, Reward: 1.00, 
	Saving Animation ...
[Video]	Show Video ...


frame per second=30.0
frame count=4.0
duration=0.13333333333333333
[Experiment]	33 of 50
[Env]	Creating Environment ...
[Run]	Running Simulation ...




Winner: player_1
Steps: 4, Reward: 1.00, 
	Saving Animation ...
[Video]	Show Video ...




frame per second=30.0
frame count=5.0
duration=0.16666666666666666
[Experiment]	34 of 50
[Env]	Creating Environment ...
[Run]	Running Simulation ...
Winner: player_1
Steps: 4, Reward: 1.00, 
	Saving Animation ...
[Video]	Show Video ...




frame per second=30.0
frame count=5.0
duration=0.16666666666666666
[Experiment]	35 of 50
[Env]	Creating Environment ...
[Run]	Running Simulation ...
Winner: player_1
Steps: 3, Reward: 1.00, 
	Saving Animation ...
[Video]	Show Video ...




frame per second=30.0
frame count=4.0
duration=0.13333333333333333
[Experiment]	36 of 50
[Env]	Creating Environment ...
[Run]	Running Simulation ...
Winner: player_1
Steps: 3, Reward: 1.00, 
	Saving Animation ...
[Video]	Show Video ...




frame per second=30.0
frame count=4.0
duration=0.13333333333333333
[Experiment]	37 of 50
[Env]	Creating Environment ...
[Run]	Running Simulation ...
Winner: player_1
Steps: 3, Reward: 1.00, 
	Saving Animation ...
[Video]	Show Video ...




frame per second=30.0
frame count=4.0
duration=0.13333333333333333
[Experiment]	38 of 50
[Env]	Creating Environment ...
[Run]	Running Simulation ...
Winner: player_1
Steps: 3, Reward: 1.00, 
	Saving Animation ...
[Video]	Show Video ...


frame per second=30.0
frame count=4.0
duration=0.13333333333333333
[Experiment]	39 of 50
[Env]	Creating Environment ...
[Run]	Running Simulation ...




Winner: player_2
Steps: 5, Reward: -1.00, 
	Saving Animation ...
[Video]	Show Video ...




frame per second=30.0
frame count=6.0
duration=0.2
[Experiment]	40 of 50
[Env]	Creating Environment ...
[Run]	Running Simulation ...
Winner: player_1
Steps: 3, Reward: 1.00, 
	Saving Animation ...
[Video]	Show Video ...


frame per second=30.0
frame count=4.0
duration=0.13333333333333333
[Experiment]	41 of 50
[Env]	Creating Environment ...
[Run]	Running Simulation ...




Winner: player_1
Steps: 3, Reward: 1.00, 
	Saving Animation ...
[Video]	Show Video ...


frame per second=30.0
frame count=4.0
duration=0.13333333333333333
[Experiment]	42 of 50
[Env]	Creating Environment ...
[Run]	Running Simulation ...




Winner: player_1
Steps: 4, Reward: 1.00, 
	Saving Animation ...
[Video]	Show Video ...




frame per second=30.0
frame count=5.0
duration=0.16666666666666666
[Experiment]	43 of 50
[Env]	Creating Environment ...
[Run]	Running Simulation ...
Winner: player_1
Steps: 4, Reward: 1.00, 
	Saving Animation ...
[Video]	Show Video ...


frame per second=30.0
frame count=5.0
duration=0.16666666666666666
[Experiment]	44 of 50
[Env]	Creating Environment ...
[Run]	Running Simulation ...




Winner: player_1
Steps: 3, Reward: 1.00, 
	Saving Animation ...
[Video]	Show Video ...


frame per second=30.0
frame count=4.0
duration=0.13333333333333333
[Experiment]	45 of 50
[Env]	Creating Environment ...
[Run]	Running Simulation ...




Winner: player_1
Steps: 4, Reward: 1.00, 
	Saving Animation ...
[Video]	Show Video ...


frame per second=30.0
frame count=5.0
duration=0.16666666666666666
[Experiment]	46 of 50
[Env]	Creating Environment ...
[Run]	Running Simulation ...




Winner: player_1
Steps: 3, Reward: 1.00, 
	Saving Animation ...
[Video]	Show Video ...


frame per second=30.0
frame count=4.0
duration=0.13333333333333333
[Experiment]	47 of 50
[Env]	Creating Environment ...
[Run]	Running Simulation ...




Winner: None
Steps: 5, Reward: 0.00, 
	Saving Animation ...
[Video]	Show Video ...


frame per second=30.0
frame count=6.0
duration=0.2
[Experiment]	48 of 50
[Env]	Creating Environment ...
[Run]	Running Simulation ...




Winner: player_2
Steps: 5, Reward: -1.00, 
	Saving Animation ...
[Video]	Show Video ...




frame per second=30.0
frame count=6.0
duration=0.2
[Experiment]	49 of 50
[Env]	Creating Environment ...
[Run]	Running Simulation ...
Winner: player_1
Steps: 4, Reward: 1.00, 
	Saving Animation ...
[Video]	Show Video ...


frame per second=30.0
frame count=5.0
duration=0.16666666666666666
[Experiment]	50 of 50
[Env]	Creating Environment ...
[Run]	Running Simulation ...




Winner: player_1
Steps: 4, Reward: 1.00, 
	Saving Animation ...
[Video]	Show Video ...


frame per second=30.0
frame count=5.0
duration=0.16666666666666666
[End]	Done. Congratulations!
[Agent A]	Wins:  43
[Agent B]	Wins:  5
[Reward]	Draws:  2
[RunTime]	Average Run Time:  0.156
[RunTime]	Standard Deviation of Run Time:  0.02351358945139787
[RunTime]	Max Run Time:  0.2
[RunTime]	Min Run Time:  0.13333333333333333
