# Google Colab notebook (some edits were made to the class to accommodate the Colab environment)

## Scroll to the very bottom of the notebook for the latest model training and tournament play

## Original Preliminary Setup (Obsolete):

In [1]:
# !pip install stable-baselines3[extra] gym torch --quiet
# !pip install shimmy
#!pip install gym==0.26.2 gymnasium shimmy --upgrade

# Don't rely on these commands installing cleanly as written: getting the setup
#   to work took several attempts, so some of these packages may not be needed.

In [5]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np

class HexEnv(gym.Env):
    """Hex on a ``size`` x ``size`` board where a single policy moves for both sides.

    Cells hold 0 (empty), 1 (player 1) or -1 (player -1). Player 1 wins by linking
    row 0 to row ``size - 1``; player -1 wins by linking column 0 to column
    ``size - 1``. Observations are the flattened board as float32 and actions are
    row-major cell indices (``action = row * size + col``).

    Rewards: -10.0 (episode ends) for playing on an occupied cell, 1.0 (episode ends)
    when the stone just placed completes a winning chain, 0.0 otherwise.
    """

    # Offsets of the six hex neighbours when the board is stored as a square array.
    _NEIGHBOURS = ((-1, 0), (-1, 1), (0, -1), (0, 1), (1, -1), (1, 0))

    def __init__(self, size=5):
        super().__init__()
        self.size = size
        self.action_space = spaces.Discrete(size * size)
        self.observation_space = spaces.Box(low=-1, high=1, shape=(size * size,), dtype=np.float32)
        self.board = None
        self.current_player = 1

    def _observation(self):
        """Return the board flattened to a float32 vector."""
        return self.board.flatten().astype(np.float32)

    def reset(self, seed=None, options=None):
        """Clear the board, hand the move to player 1 and return ``(observation, info)``."""
        # Forward the seed to gym.Env so the environment's RNG is seeded as the
        # Gymnasium reset contract requires (the original silently dropped it).
        super().reset(seed=seed)
        self.board = np.zeros((self.size, self.size), dtype=np.int8)
        self.current_player = 1
        return self._observation(), {}

    def step(self, action):
        """Place the current player's stone on cell ``action``.

        Returns the Gymnasium 5-tuple ``(observation, reward, terminated, truncated, info)``.
        ``truncated`` is always False and ``info`` is always empty.
        """
        # Policies hand back NumPy integers / 0-d arrays; normalise to a plain int.
        x, y = divmod(int(action), self.size)

        if self.board[x][y] != 0:
            # Occupied cell: penalise and end the episode without touching the board
            # or the side to move.
            return self._observation(), -10.0, True, False, {}

        self.board[x][y] = self.current_player
        # check_win() inspects self.current_player, so it must run before the swap.
        won = self.check_win()
        reward = 1.0 if won else 0.0
        self.current_player *= -1
        return self._observation(), reward, won, False, {}

    def check_win(self):
        """Return True if ``self.current_player`` has a chain joining its two edges.

        The search is an explicit-stack depth-first traversal, so it does not depend
        on Python's recursion limit for large boards.
        """
        size = self.size
        player = self.current_player

        # Seed the search with the player's stones on its starting edge.
        if player == 1:
            frontier = [(0, col) for col in range(size) if self.board[0][col] == player]
        else:
            frontier = [(row, 0) for row in range(size) if self.board[row][0] == player]
        visited = set(frontier)

        while frontier:
            x, y = frontier.pop()
            if player == 1 and x == size - 1:
                return True
            if player == -1 and y == size - 1:
                return True

            for dx, dy in self._NEIGHBOURS:
                nx, ny = x + dx, y + dy
                if not (0 <= nx < size and 0 <= ny < size):
                    continue
                if (nx, ny) in visited or self.board[nx][ny] != player:
                    continue
                visited.add((nx, ny))
                frontier.append((nx, ny))
        return False


## Create and train


In [6]:
from stable_baselines3 import PPO
from stable_baselines3.common.env_checker import check_env

# Build the 5x5 environment and run stable-baselines3's environment checker on it
# before any training is attempted.
env = HexEnv(size=5)
check_env(env)

# Train a PPO agent with the "MlpPolicy" network for 100k environment steps.
# `model` is reused by the testing cells that follow.
model = PPO("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=100_000)


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 7.11     |
|    ep_rew_mean     | -10      |
| time/              |          |
|    fps             | 924      |
|    iterations      | 1        |
|    time_elapsed    | 2        |
|    total_timesteps | 2048     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 6.67         |
|    ep_rew_mean          | -10          |
| time/                   |              |
|    fps                  | 714          |
|    iterations           | 2            |
|    time_elapsed         | 5            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0029331576 |
|    clip_fraction        | 0.00269      |
|    clip_range           | 0.2          |
|    en

<stable_baselines3.ppo.ppo.PPO at 0x7bd70366db10>

## Preliminary Testing (ensure working)

### Simple move

In [7]:
# Fresh 5x5 board for a one-move smoke test of the trained policy.
env = HexEnv(size=5)
obs, _ = env.reset()

# Ask the policy for a single action on the empty board.
action, _ = model.predict(obs)

# Actions index the flattened board row-major: row = action // size, col = action % size.
x, y = divmod(action, env.size)
print("Model move: ({}, {})".format(x, y))

# Write the stone straight onto the board (env.step is bypassed here) and display it.
env.board[x][y] = env.current_player
print(env.board)


Model move: (2, 3)
[[0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 1 0]
 [0 0 0 0 0]
 [0 0 0 0 0]]


### Play against it

In [8]:
env = HexEnv(size=5)
obs, _ = env.reset()

def print_board(b):
    """Print board `b` one row per line, followed by a blank line.

    Each cell is converted with int() and drawn as "." (0), "X" (1) or "O" (-1).
    """
    glyphs = {0: ".", 1: "X", -1: "O"}
    for cells in b:
        line = " ".join(glyphs[int(value)] for value in cells)
        print(line)
    print()

print("You are Player 1 (X), bot is Player -1 (O)")
print_board(env.board)


You are Player 1 (X), bot is Player -1 (O)
. . . . .
. . . . .
. . . . .
. . . . .
. . . . .



In [9]:
# === Human Player 1 (X) Move ===
# Keep prompting until the human names an empty cell that lies on the board.
while True:
    try:
        x = int(input(f"Enter your row (0-{env.size - 1}): "))
        y = int(input(f"Enter your column (0-{env.size - 1}): "))
    except ValueError:
        # Non-numeric text. Anything else (e.g. a kernel interrupt) is left to propagate.
        print("Invalid input. Try again.")
        continue

    # Explicit bounds check: NumPy would otherwise accept negative indices and
    # silently place the stone on the opposite side of the board.
    if not (0 <= x < env.size and 0 <= y < env.size):
        print("Invalid input. Try again.")
        continue

    if env.board[x][y] != 0:
        print("Cell already taken! Try again.")
        continue

    env.board[x][y] = 1
    env.current_player = -1  # switch to bot
    break

print("You moved:")
print_board(env.board)


Enter your row (0-4): 0
Enter your column (0-4): 0
You moved:
X . . . .
. . . . .
. . . . .
. . . . .
. . . . .



### Full game testing

In [10]:
env = HexEnv(size=5)
obs, _ = env.reset()

def print_board(b):
    """Print board `b` one row per line ("." empty, "X" player 1, "O" player -1)."""
    symbols = {0: ".", 1: "X", -1: "O"}
    for row in b:
        print(" ".join(symbols[int(cell)] for cell in row))
    print()

print("Welcome to Hex! You are Player 1 (X, top to bottom).")
print("Bot is Player -1 (O, left to right).")
print_board(env.board)

done = False
while not done:
    # === Human Turn ===
    # Prompt until the human names an empty, on-board cell.
    while True:
        try:
            x = int(input(f"Enter your row (0-{env.size - 1}): "))
            y = int(input(f"Enter your column (0-{env.size - 1}): "))
        except ValueError:
            # Only non-numeric text is retried; a bare `except` here would also
            # swallow KeyboardInterrupt and make the cell impossible to stop.
            print("Invalid input. Try again.")
            continue
        # Negative indices would wrap around in NumPy, so reject them explicitly.
        if not (0 <= x < env.size and 0 <= y < env.size):
            print("Invalid input. Try again.")
            continue
        if env.board[x][y] != 0:
            print("Cell already taken! Try again.")
            continue
        env.board[x][y] = 1
        break

    # check_win() tests env.current_player, so point it at the human first.
    env.current_player = 1
    if env.check_win():
        print_board(env.board)
        print("You win!")
        done = True
        break
    env.current_player = -1

    print_board(env.board)

    # === Bot Turn ===
    obs = env.board.flatten().astype(np.float32)
    action, _ = model.predict(obs)
    bot_x, bot_y = divmod(int(action), env.size)
    print(f"Bot plays: ({bot_x}, {bot_y})")

    if env.board[bot_x][bot_y] != 0:
        # The raw policy is not masked, so it can pick an occupied cell; stop the game.
        print("Bot chose an illegal move")
        break

    env.board[bot_x][bot_y] = -1
    env.current_player = -1
    # Show the position once, whether or not this move ended the game.
    print_board(env.board)
    if env.check_win():
        print("Bot wins")
        done = True
    else:
        env.current_player = 1


Welcome to Hex! You are Player 1 (X, top to bottom).
Bot is Player -1 (O, left to right).
. . . . .
. . . . .
. . . . .
. . . . .
. . . . .

Enter your row (0-4): 0
Enter your column (0-4): 0
X . . . .
. . . . .
. . . . .
. . . . .
. . . . .

Bot plays: (1, 4)
X . . . .
. . . . O
. . . . .
. . . . .
. . . . .

Enter your row (0-4): 1
Enter your column (0-4): 0
X . . . .
X . . . O
. . . . .
. . . . .
. . . . .

Bot plays: (4, 2)
X . . . .
X . . . O
. . . . .
. . . . .
. . O . .

Enter your row (0-4): 2
Enter your column (0-4): 0
X . . . .
X . . . O
X . . . .
. . . . .
. . O . .

Bot plays: (4, 1)
X . . . .
X . . . O
X . . . .
. . . . .
. O O . .

Enter your row (0-4): 4
Enter your column (0-4): 0
X . . . .
X . . . O
X . . . .
. . . . .
X O O . .

Bot plays: (0, 3)
X . . O .
X . . . O
X . . . .
. . . . .
X O O . .

Enter your row (0-4): 3
Enter your column (0-4): 0
X . . O .
X . . . O
X . . . .
X . . . .
X O O . .

🎉 You win!


## More training (Obsolete, scroll down for self-play training)

In [11]:
import os

# Re-initialize just for sanity check
#   Note: I'll start randomizing different dimensions next time I look at this
env = HexEnv(size=5)

# Directory to save models
save_dir = "hex_bot_checkpoints"
os.makedirs(save_dir, exist_ok=True)

# load/initialize model
# NOTE(review): no cell visible in this notebook writes hex_bot_0.zip, so unless that
# file was placed there by hand this always takes the "Starting fresh." branch.
model_path = f"{save_dir}/hex_bot_0.zip"
if os.path.exists(model_path):
    model = PPO.load(model_path, env=env)
    print("Loaded model from previous checkpoint.")
else:
    model = PPO("MlpPolicy", env, verbose=1)
    print("Starting fresh.")

# TRAINING LOOP
total_checkpoints = 5
timesteps_per_checkpoint = 100_000

for i in range(1, total_checkpoints + 1):
    # "\n" (not "\T", an invalid escape sequence) puts a blank line before each banner.
    print(f"\nTraining: Checkpoint {i} ({i * timesteps_per_checkpoint} steps total)")
    model.learn(total_timesteps=timesteps_per_checkpoint)

    # Save a checkpoint named after the cumulative number of steps trained in this loop.
    checkpoint_path = f"{save_dir}/hex_bot_{i * timesteps_per_checkpoint}.zip"
    model.save(checkpoint_path)
    print(f"Saved model to {checkpoint_path}")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
|    fps                  | 653         |
|    iterations           | 6           |
|    time_elapsed         | 18          |
|    total_timesteps      | 12288       |
| train/                  |             |
|    approx_kl            | 0.012805634 |
|    clip_fraction        | 0.103       |
|    clip_range           | 0.2         |
|    entropy_loss         | -3.17       |
|    explained_variance   | -0.418      |
|    learning_rate        | 0.0003      |
|    loss                 | -0.0218     |
|    n_updates            | 50          |
|    policy_gradient_loss | -0.0378     |
|    value_loss           | 0.101       |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 7.14        |
|    ep_rew_mean          | -10         |
| time/                   |             |
|    fps                  | 634         |
|    iterat

In [12]:
# Reloading
# Rebuild a 5x5 environment and attach it to the final checkpoint written by the
# training loop above (hex_bot_500000 = 5 checkpoints x 100_000 steps).
env = HexEnv(size=5)
model = PPO.load("hex_bot_checkpoints/hex_bot_500000", env=env)

print("Loaded bot from checkpoint with 500k steps.")


Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
✅ Loaded bot from checkpoint with 500k steps.


In [16]:
from torch import tensor, softmax
import torch

env = HexEnv(size=5)
obs, _ = env.reset()

def print_board(b):
    """Print board `b` one row per line ("." empty, "X" player 1, "O" player -1)."""
    symbols = {0: ".", 1: "X", -1: "O"}
    for row in b:
        print(" ".join(symbols[int(cell)] for cell in row))
    print()

print("Welcome to Hex! You are Player 1 (X, top to bottom).")
print("Bot is Player -1 (O, left to right).")
print_board(env.board)

done = False
while not done:
    # === Human Turn ===
    # Prompt until the human names an empty, on-board cell.
    while True:
        try:
            x = int(input(f"Enter your row (0-{env.size - 1}): "))
            y = int(input(f"Enter your column (0-{env.size - 1}): "))
        except ValueError:
            # Only non-numeric text is retried; a bare `except` here would also
            # swallow KeyboardInterrupt and make the cell impossible to stop.
            print("Invalid input. Try again.")
            continue
        # Negative indices would wrap around in NumPy, so reject them explicitly.
        if not (0 <= x < env.size and 0 <= y < env.size):
            print("Invalid input. Try again.")
            continue
        if env.board[x][y] != 0:
            print("Cell already taken! Try again.")
            continue
        env.board[x][y] = 1
        break

    # check_win() tests env.current_player, so point it at the human first.
    env.current_player = 1
    if env.check_win():
        print_board(env.board)
        print("You win!")
        done = True
        break
    env.current_player = -1

    print_board(env.board)

    # === Bot Turn (with legal move filtering cuz it still does illegal stuff) ===
    obs = env.board.flatten().astype(np.float32)
    # Build the batch on the policy's own device so this also works on a GPU runtime.
    obs_tensor = torch.as_tensor(obs, device=model.policy.device).unsqueeze(0)  # shape: [1, obs_dim]

    # Flat (row-major) indices of every empty cell.
    legal_indices = np.flatnonzero(env.board.flatten() == 0)
    if len(legal_indices) == 0:
        print("No legal moves left.")
        break

    # Get action probabilities (inference only, so no autograd graph is needed).
    with torch.no_grad():
        dist = model.policy.get_distribution(obs_tensor)
    action_probs = dist.distribution.probs.detach().cpu().numpy().flatten()

    # Mask illegal moves. float64 keeps the renormalised vector within the
    # sum-to-one tolerance that np.random.choice enforces.
    masked_probs = np.zeros(action_probs.shape, dtype=np.float64)
    masked_probs[legal_indices] = action_probs[legal_indices]
    legal_mass = masked_probs.sum()
    if legal_mass > 0:
        masked_probs /= legal_mass
    else:
        # The policy put zero mass on every legal move; fall back to a uniform
        # choice instead of dividing by zero and sampling from NaNs.
        masked_probs[legal_indices] = 1.0 / len(legal_indices)

    # Sample from legal actions
    action = np.random.choice(len(masked_probs), p=masked_probs)
    bot_x, bot_y = divmod(int(action), env.size)
    print(f"Bot plays: ({bot_x}, {bot_y})")

    # Apply it
    env.board[bot_x][bot_y] = -1
    env.current_player = -1
    # Show the position once, whether or not this move ended the game.
    print_board(env.board)
    if env.check_win():
        print("Bot wins!")
        done = True
    else:
        env.current_player = 1



Welcome to Hex! You are Player 1 (X, top to bottom).
Bot is Player -1 (O, left to right).
. . . . .
. . . . .
. . . . .
. . . . .
. . . . .

Enter your row (0-4): 0
Enter your column (0-4): 0
X . . . .
. . . . .
. . . . .
. . . . .
. . . . .

Bot plays: (2, 0)
X . . . .
. . . . .
O . . . .
. . . . .
. . . . .

Enter your row (0-4): 2
Enter your column (0-4): 1
X . . . .
. . . . .
O X . . .
. . . . .
. . . . .

Bot plays: (2, 2)
X . . . .
. . . . .
O X O . .
. . . . .
. . . . .

Enter your row (0-4): 1
Enter your column (0-4): 1
X . . . .
. X . . .
O X O . .
. . . . .
. . . . .

Bot plays: (1, 0)
X . . . .
O X . . .
O X O . .
. . . . .
. . . . .

Enter your row (0-4): 0
Enter your column (0-4): 1
X X . . .
O X . . .
O X O . .
. . . . .
. . . . .

Bot plays: (4, 3)
X X . . .
O X . . .
O X O . .
. . . . .
. . . O .

Enter your row (0-4): 3
Enter your column (0-4): 1
X X . . .
O X . . .
O X O . .
. X . . .
. . . O .

Bot plays: (4, 1)
X X . . .
O X . . .
O X O . .
. X . . .
. O . O .

Ente

# **Self Play Training - LATEST**

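The environment code is pasted again below (with changes) along with its imports, for ease of access. Note that the opponent in this environment plays uniformly random legal moves.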

In [34]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np
import random

class SelfPlayHexEnv(gym.Env):
    """Hex environment in which the agent faces an opponent that moves at random.

    Cells hold 0 (empty), 1 (player 1) or -1 (player -1). In this class player 1
    wins by linking column 0 to column ``size - 1`` and player -1 wins by linking
    row 0 to row ``size - 1``. The agent's side is drawn at random on every reset;
    when it is player -1 the opponent's opening stone is already on the board in the
    first observation.

    Rewards: -10.0 for playing on an occupied cell, 1.0 when the agent wins, -1.0
    when the opponent's reply wins, 0.0 otherwise.
    """

    # Offsets of the six hex neighbours when the board is stored as a square array.
    _NEIGHBOURS = ((-1, 0), (-1, 1), (0, -1), (0, 1), (1, -1), (1, 0))

    def __init__(self, size=5):
        super().__init__()
        self.size = size
        self.action_space = spaces.Discrete(size * size)
        self.observation_space = spaces.Box(low=-1, high=1, shape=(size * size,), dtype=np.float32)
        self.board = None
        self.current_player = 1  # side that check_win() inspects
        self.agent_player = 1  # will be randomly assigned in reset()
        self.done = False

    def _observation(self):
        """Return the board flattened to a float32 vector."""
        return self.board.flatten().astype(np.float32)

    def _finish(self, reward):
        """Mark the episode as over and build the terminal step tuple."""
        self.done = True
        return self._observation(), reward, True, False, {}

    def reset(self, seed=None, options=None):
        """Start a new game and return ``(observation, info)``.

        All randomness is drawn from ``self.np_random`` so that passing ``seed``
        makes the side assignment and the opponent's moves reproducible.
        """
        # Seeds self.np_random as the Gymnasium reset contract requires.
        super().reset(seed=seed)
        self.board = np.zeros((self.size, self.size), dtype=np.int8)
        self.agent_player = 1 if int(self.np_random.integers(2)) == 0 else -1
        self.current_player = 1  # game always starts with Player 1
        self.done = False

        # If agent is Player -1, let Player 1 (opponent) make a random opening move.
        if self.agent_player == -1:
            cell = int(self.np_random.integers(self.size * self.size))
            row, col = divmod(cell, self.size)
            self.board[row][col] = 1

        return self._observation(), {}

    def step(self, action):
        """Play the agent's stone on cell ``action`` and then the opponent's random reply.

        Returns the Gymnasium 5-tuple ``(observation, reward, terminated, truncated, info)``.

        Raises:
            RuntimeError: if called after the episode has ended without a reset.
        """
        if self.done:
            raise RuntimeError("Episode has ended, please reset.")

        # Policies hand back NumPy integers / 0-d arrays; normalise to a plain int.
        x, y = divmod(int(action), self.size)

        # Illegal move
        if self.board[x][y] != 0:
            return self._finish(-10.0)

        self.board[x][y] = self.agent_player

        # Check if agent won (check_win() reads self.current_player).
        self.current_player = self.agent_player
        if self.check_win():
            return self._finish(1.0)

        # Opponent random move
        opp_player = -self.agent_player
        legal = [(i, j) for i in range(self.size) for j in range(self.size) if self.board[i][j] == 0]
        if not legal:
            return self._finish(0.0)

        move = legal[int(self.np_random.integers(len(legal)))]
        self.board[move] = opp_player

        # Check if opponent won
        self.current_player = opp_player
        if self.check_win():
            return self._finish(-1.0)

        # Game continues
        return self._observation(), 0.0, False, False, {}

    def check_win(self):
        """Return True if ``self.current_player`` has a chain joining its two edges.

        The search is an explicit-stack depth-first traversal, so it does not depend
        on Python's recursion limit for large boards.
        """
        size = self.size
        player = self.current_player

        # Seed the search with the player's stones on its starting edge.
        if player == 1:
            frontier = [(row, 0) for row in range(size) if self.board[row][0] == player]
        else:
            frontier = [(0, col) for col in range(size) if self.board[0][col] == player]
        visited = set(frontier)

        while frontier:
            x, y = frontier.pop()
            if player == 1 and y == size - 1:  # Player 1 goal: reach right edge
                return True
            if player == -1 and x == size - 1:  # Player -1 goal: reach bottom edge
                return True

            for dx, dy in self._NEIGHBOURS:
                nx, ny = x + dx, y + dy
                if not (0 <= nx < size and 0 <= ny < size):
                    continue
                if (nx, ny) in visited or self.board[nx][ny] != player:
                    continue
                visited.add((nx, ny))
                frontier.append((nx, ny))
        return False




## Variable board size, so bots can be trained for different scenarios

In [35]:
from stable_baselines3 import PPO
import os

def train_self_play_bot(size=5, total_checkpoints=5, timesteps_per_checkpoint=100_000):
    """Train a PPO agent on ``SelfPlayHexEnv`` and save a checkpoint after every stage.

    Checkpoints are written to ``selfplay_hex_{size}x{size}/hex_bot_{steps}.zip`` where
    ``steps`` is the cumulative step count (``i * timesteps_per_checkpoint``).

    The function is resumable: it loads the most advanced checkpoint already on disk
    (up to the requested total) and trains only the stages that are still missing, so
    a file name always matches the training the model inside has received. Calling it
    again once every requested checkpoint exists does nothing.

    Args:
        size: Board edge length passed to ``SelfPlayHexEnv``.
        total_checkpoints: Number of checkpoints the finished run should have.
        timesteps_per_checkpoint: Environment steps trained between two checkpoints.
    """
    print(f"\nTraining self-play bot on {size}x{size} board")

    # Create directory and naming convention
    save_dir = f"selfplay_hex_{size}x{size}"
    os.makedirs(save_dir, exist_ok=True)

    def checkpoint_file(steps):
        return f"{save_dir}/hex_bot_{steps}.zip"

    # Find the furthest stage that has already been saved.
    completed = 0
    for stage in range(total_checkpoints, 0, -1):
        if os.path.exists(checkpoint_file(stage * timesteps_per_checkpoint)):
            completed = stage
            break

    if completed >= total_checkpoints > 0:
        print(f"Already trained: {checkpoint_file(completed * timesteps_per_checkpoint)}")
        return

    # Set up environment
    env = SelfPlayHexEnv(size=size)

    # Load the best available starting point, or start from scratch.
    # NOTE(review): nothing visible in this notebook writes hex_bot_0.zip; the branch
    # is kept so a hand-placed seed model is still picked up.
    seed_model_path = checkpoint_file(0)
    if completed:
        resume_path = checkpoint_file(completed * timesteps_per_checkpoint)
        model = PPO.load(resume_path, env=env)
        print(f"Resumed from: {resume_path}")
    elif os.path.exists(seed_model_path):
        model = PPO.load(seed_model_path, env=env)
        print(f"Resumed from: {seed_model_path}")
    else:
        model = PPO("MlpPolicy", env, verbose=1)
        print("Starting new model.")

    # Training loop: only the stages that are not on disk yet.
    for i in range(completed + 1, total_checkpoints + 1):
        total_steps = i * timesteps_per_checkpoint
        print(f"\nCheckpoint {i}: {total_steps} steps")
        # Keep the model's step counter running across stages so the logged
        # total_timesteps lines up with the checkpoint names.
        model.learn(total_timesteps=timesteps_per_checkpoint, reset_num_timesteps=False)

        # Save checkpoint
        checkpoint_path = checkpoint_file(total_steps)
        model.save(checkpoint_path)
        print(f"Saved to {checkpoint_path}")


In [36]:
# Train the 6x6 bot; with the default 100_000 steps per checkpoint this writes
# hex_bot_100000/200000/300000.zip under selfplay_hex_6x6/.
train_self_play_bot(size=6, total_checkpoints=3)  # 300k steps on 6x6
# train_self_play_bot(size=7, total_checkpoints=3)  # 300k steps


Training self-play bot on 6x6 board
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Starting new model.

Checkpoint 1: 100000 steps
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 5.67     |
|    ep_rew_mean     | -10      |
| time/              |          |
|    fps             | 1102     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 2048     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 5.83         |
|    ep_rew_mean          | -9.91        |
| time/                   |              |
|    fps                  | 797          |
|    iterations           | 2            |
|    time_elapsed         | 5            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0038422863 |
|    cli

# **TOURNAMENT DAY GAMEPLAY SETUP**

In [32]:
# Load Libraries needed
from stable_baselines3 import PPO
import numpy as np
import torch

In [33]:
# === SETUP ===
# Match parameters: edit these two values before each tournament game.
board_size = 6
goes_first = False

# Whoever moves first is player 1; the other side is player -1.
if goes_first:
    your_player = 1
else:
    your_player = -1
opponent_player = your_player * -1

In [38]:
# env and model
# Load the 300k-step checkpoint trained for the configured board size.
model = PPO.load(f"selfplay_hex_{board_size}x{board_size}/hex_bot_300000.zip")

# The environment is used only as a board container plus win checker here. The board
# is created by hand instead of via env.reset(), because reset() randomly assigns the
# agent's side and may place a random opening stone.
env = SelfPlayHexEnv(size=board_size)
env.board = np.zeros((board_size, board_size), dtype=np.int8)
env.current_player = 1  # Hex always starts with Player 1

In [27]:
def print_board(board):
    """Print `board` as rows of space-separated glyphs, then one blank line.

    Each cell is converted with int() and drawn as "." for 0, "X" for 1 and
    "O" for -1; any other value raises KeyError.
    """
    glyph_for = {0: ".", 1: "X", -1: "O"}
    for cells in board:
        glyphs = [glyph_for[int(value)] for value in cells]
        print(" ".join(glyphs))
    print()

In [39]:
# === GAME ON!!! ===
import torch

print("Starting Hex Match!")
print(f"You are playing as {'Player 1 (X)' if your_player == 1 else 'Player 2 (O)'}")
print_board(env.board)

done = False
while not done:
    # Remember who is moving: the win check below must test THIS player, and
    # env.current_player is only handed over once the game is known to continue.
    mover = env.current_player

    # === Opponent's Turn (manually input this) ===
    if mover == opponent_player:
        while True:
            try:
                x = int(input("Opponent move - Row: "))
                y = int(input("Opponent move - Column: "))
            except ValueError:
                # Only non-numeric text is retried; a bare `except` here would also
                # swallow KeyboardInterrupt and make the cell impossible to stop.
                print("Invalid input. Try again.")
                continue
            # Negative indices would wrap around in NumPy, so reject them explicitly.
            if not (0 <= x < board_size and 0 <= y < board_size):
                print("Invalid input. Try again.")
                continue
            if env.board[x][y] != 0:
                print("Cell already occupied. Try again.")
                continue
            env.board[x][y] = opponent_player
            break
        print("Opponent moved:")
        print_board(env.board)

    # === My SuperBot's Turn ===
    else:
        obs = env.board.flatten().astype(np.float32)
        # Flat (row-major) indices of every empty cell.
        legal_indices = np.flatnonzero(env.board.flatten() == 0)
        if len(legal_indices) == 0:
            print("No legal moves left.")
            break

        # Query the policy on its own device; inference only, so no autograd graph.
        obs_tensor = torch.as_tensor(obs, device=model.policy.device).unsqueeze(0)
        with torch.no_grad():
            dist = model.policy.get_distribution(obs_tensor)
        probs = dist.distribution.probs.detach().cpu().numpy().flatten()

        # Keep only the probability mass on empty cells and renormalise. float64 keeps
        # the result within the sum-to-one tolerance that np.random.choice enforces.
        masked_probs = np.zeros(probs.shape, dtype=np.float64)
        masked_probs[legal_indices] = probs[legal_indices]
        legal_mass = masked_probs.sum()
        if legal_mass > 0:
            masked_probs /= legal_mass
        else:
            # Zero mass on every legal move: choose uniformly rather than divide by zero.
            masked_probs[legal_indices] = 1.0 / len(legal_indices)

        action = np.random.choice(len(masked_probs), p=masked_probs)
        x, y = divmod(int(action), board_size)
        env.board[x][y] = your_player

        print(f"Bot moves: ({x}, {y})")
        print_board(env.board)

    # === Check Win ===
    # env.current_player still names the side that just moved, which is the side
    # check_win() has to test. Testing after the hand-over reported wins one move
    # late and credited them to the wrong player.
    if env.check_win():
        winner = "Bot (You)" if mover == your_player else "Opponent"
        print(f"Game Over! Winner: {winner}")
        done = True
    else:
        env.current_player = -mover

Starting Hex Match!
You are playing as Player 2 (O)
. . . . . .
. . . . . .
. . . . . .
. . . . . .
. . . . . .
. . . . . .

Opponent move - Row: 0
Opponent move - Column: 0
Opponent moved:
X . . . . .
. . . . . .
. . . . . .
. . . . . .
. . . . . .
. . . . . .

Bot moves: (0, 5)
X . . . . O
. . . . . .
. . . . . .
. . . . . .
. . . . . .
. . . . . .

Opponent move - Row: 1
Opponent move - Column: 0
Opponent moved:
X . . . . O
X . . . . .
. . . . . .
. . . . . .
. . . . . .
. . . . . .

Bot moves: (4, 4)
X . . . . O
X . . . . .
. . . . . .
. . . . . .
. . . . O .
. . . . . .

Opponent move - Row: 1
Opponent move - Column: 1
Opponent moved:
X . . . . O
X X . . . .
. . . . . .
. . . . . .
. . . . O .
. . . . . .

Bot moves: (3, 1)
X . . . . O
X X . . . .
. . . . . .
. O . . . .
. . . . O .
. . . . . .

Opponent move - Row: 1
Opponent move - Column: 2
Opponent moved:
X . . . . O
X X X . . .
. . . . . .
. O . . . .
. . . . O .
. . . . . .

Bot moves: (3, 4)
X . . . . O
X X X . . .
. . . . 

### Remember to save before exiting if you are in Colab

In [40]:
# Colab-specific: google.colab is presumably only importable inside a Colab runtime.
# Downloads the 300k-step 6x6 checkpoint so it is not lost when the session ends.
from google.colab import files
files.download("selfplay_hex_6x6/hex_bot_300000.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>