# Hybrid RL-Solver Training

The agent places the first 5 pieces; a solver then checks whether the remaining 3 pieces can complete the puzzle.

Rationale: train an early-/mid-game agent. The hope is that the agent learns to leave the board in a winnable state without having to worry about the end game, which is delegated to the solver.

In [1]:
import random

import numpy as np
from sb3_contrib import MaskablePPO
from stable_baselines3.common.vec_env import SubprocVecEnv

In [4]:
from torch.distributions import Distribution

from training_utils import GradNormCallback, TimerCallback, make_hybrid_env

# Turn off argument validation globally for torch distributions.
# NOTE(review): presumably done to avoid validation overhead/errors on the masked
# action distributions used by MaskablePPO — confirm the original motivation.
Distribution.set_default_validate_args(False)

# Training

The strategy here is to train on one fixed set of boards (8 at a time, one per parallel environment) for many timesteps, and then switch to a freshly sampled set.

In [5]:
# Overall timestep budget that the training loop iterates over.
total_timesteps = 1_000_000
# Timesteps requested per `model.learn()` call (the stride of the training loop).
checkpoint_interval = 100_000
# The training boards are resampled when the loop counter is a multiple of this.
board_switch_interval = 500_000

In [7]:
# Generate the pool of (month, day) board configurations to sample from.
n_boards = 100
n_envs = 8  # number of parallel environments; each one gets its own board
board_configs = [(random.randint(1, 12), random.randint(1, 30)) for _ in range(n_boards)]


def _make_board_vec_env(boards):
    """Build a SubprocVecEnv with one hybrid environment per (month, day) board."""
    return SubprocVecEnv(
        [
            # Bind mo/day as lambda defaults so each factory keeps its own board
            # instead of all of them sharing the loop's final values.
            lambda mo=mo, day=day: make_hybrid_env(mo, day, agent_pieces=5, mask_islands=True)
            for mo, day in boards
        ]
    )


# Select initial boards
selected_boards = random.sample(board_configs, n_envs)
env = _make_board_vec_env(selected_boards)
print(f"Initial boards: {selected_boards}")

model = MaskablePPO(
    "MlpPolicy",
    env,
    tensorboard_log="./hybrid_ppo_logs_1/",
    verbose=1,
    batch_size=1000,
    n_steps=2048 * 2,
    ent_coef=0.15,
)

try:
    for i in range(0, total_timesteps, checkpoint_interval):
        remaining_steps = min(checkpoint_interval, total_timesteps - i)

        # Switch boards periodically. Step 0 is skipped: the initial env was created
        # just above, and replacing it immediately would orphan its worker processes.
        if i > 0 and i % board_switch_interval == 0:
            selected_boards = random.sample(board_configs, n_envs)
            new_env = _make_board_vec_env(selected_boards)
            model.set_env(new_env)
            # Shut down the previous workers only once the model points at the new env.
            env.close()
            env = new_env
            print(f"Switching to boards: {selected_boards}")

        # NOTE(review): SB3 collects whole rollouts (n_steps * n_envs steps), so a
        # learn() call may overshoot `remaining_steps` — compare with
        # model.num_timesteps if an exact budget matters.
        model.learn(
            total_timesteps=remaining_steps,
            # Only the first call starts the timestep counter (and logging) from zero.
            reset_num_timesteps=(i == 0),
            callback=[TimerCallback(), GradNormCallback()],
        )

        # if i > 0 and i % 500_000 == 0:
        #    model.save(f"hybrid_model_v1_{i}")

    model.save("hybrid_model_v1")
finally:
    # Always release the subprocess workers, even if training is interrupted.
    env.close()

Using cpu device
Switching to boards: [(9, 8), (6, 21), (8, 25), (8, 2), (11, 18), (6, 5), (5, 12), (9, 7)]
Logging to ./hybrid_ppo_logs_1/PPO_3
Step 1000, 5s elapsed, 457s remaining
Step 2000, 9s elapsed, 431s remaining
Step 3000, 13s elapsed, 430s remaining
Step 4000, 18s elapsed, 421s remaining
Step 5000, 22s elapsed, 415s remaining
Step 6000, 26s elapsed, 407s remaining
Step 7000, 30s elapsed, 402s remaining
Step 8000, 34s elapsed, 396s remaining
Step 9000, 39s elapsed, 392s remaining
Step 10000, 43s elapsed, 385s remaining
Step 11000, 47s elapsed, 380s remaining
Step 12000, 51s elapsed, 376s remaining
Step 13000, 56s elapsed, 372s remaining
Step 14000, 60s elapsed, 368s remaining
Step 15000, 64s elapsed, 364s remaining
Step 16000, 68s elapsed, 359s remaining
Step 17000, 73s elapsed, 356s remaining
Step 18000, 77s elapsed, 352s remaining
Step 19000, 82s elapsed, 348s remaining
Step 20000, 86s elapsed, 344s remaining
Step 21000, 90s elapsed, 340s remaining
Step 22000, 95s elapsed, 3

# Evaluation

In [None]:
from hybrid_env import HybridAPADEnv


def _play_episode(env, choose_action, max_steps):
    """Play up to ``max_steps`` actions in ``env`` and report the final solver verdict.

    Args:
        env: Environment whose ``reset()`` returns ``(obs, info)`` and whose ``step()``
            returns ``(obs, reward, terminated, truncated, info)``; ``info["action_mask"]``
            marks the currently valid actions.
        choose_action: Callable ``(obs, action_mask, valid_actions) -> action``.
        max_steps: Maximum number of actions to take.

    Returns:
        ``(solved, solver_time_ms)``. When no step was taken or the last ``info`` has
        no ``"solver_time_ms"`` entry, the result is ``(False, None)``.
    """
    obs, info = env.reset()
    reward = None  # stays None if the episode is a dead end before the first step
    for _ in range(max_steps):
        action_mask = info["action_mask"]
        valid_actions = np.where(action_mask)[0]
        if len(valid_actions) == 0:
            # Dead end: nothing can be placed, so never query the policy with an
            # all-False mask.
            break
        action = choose_action(obs, action_mask, valid_actions)
        obs, reward, terminated, truncated, info = env.step(action)
        if terminated or truncated:
            break

    if reward is None or "solver_time_ms" not in info:
        return False, None
    return bool(reward > 0), info["solver_time_ms"]


def evaluate_agent(model, n_puzzles=50, agent_pieces=5, env_factory=None):
    """Evaluate agent on ability to create solvable partial-board states.

    For each puzzle a random (month, day) board is drawn and played twice on fresh
    environments: once by ``model`` (deterministic, masked predictions) and once by a
    uniformly random valid-action policy used as the baseline. A rollout counts as
    solvable when its final ``info`` contains ``"solver_time_ms"`` and the final
    reward is positive.

    Args:
        model: Policy exposing ``predict(obs, action_masks=..., deterministic=...)``.
        n_puzzles: Number of random boards to evaluate; must be at least 1.
        agent_pieces: Maximum number of placements per rollout; also passed to the
            default environment constructor.
        env_factory: Optional callable ``(month, day) -> env``. Defaults to
            ``HybridAPADEnv(month, day, agent_pieces=agent_pieces)``. Supply it to
            evaluate on environments configured like the training ones.

    Returns:
        dict with agent_solvable_rate, random_solvable_rate, mean_solver_time_ms
        (mean over the agent rollouts that reached the solver, 0 if none did) and
        agent_advantage (agent rate minus random rate).

    Raises:
        ValueError: If ``n_puzzles`` is less than 1.
    """
    if n_puzzles < 1:
        raise ValueError(f"n_puzzles must be at least 1, got {n_puzzles}")

    if env_factory is None:

        def env_factory(month, day):
            return HybridAPADEnv(month, day, agent_pieces=agent_pieces)

    def agent_policy(obs, action_mask, valid_actions):
        action, _ = model.predict(obs, action_masks=action_mask, deterministic=True)
        return action

    def random_policy(obs, action_mask, valid_actions):
        return np.random.choice(valid_actions)

    agent_solvable = 0
    random_solvable = 0
    solver_times = []

    for _ in range(n_puzzles):
        month = random.randint(1, 12)
        day = random.randint(1, 30)

        # Agent's placement on a fresh board
        solved, solver_time_ms = _play_episode(env_factory(month, day), agent_policy, agent_pieces)
        if solver_time_ms is not None:
            agent_solvable += solved
            solver_times.append(solver_time_ms)

        # Random baseline on the same board configuration
        solved, _ = _play_episode(env_factory(month, day), random_policy, agent_pieces)
        random_solvable += solved

    return {
        "agent_solvable_rate": agent_solvable / n_puzzles,
        "random_solvable_rate": random_solvable / n_puzzles,
        "mean_solver_time_ms": np.mean(solver_times) if solver_times else 0,
        "agent_advantage": (agent_solvable - random_solvable) / n_puzzles,
    }


# Run the evaluation on 100 random boards and print a four-line summary.
results = evaluate_agent(model, n_puzzles=100)
report_lines = [
    f"Agent solvable rate: {results['agent_solvable_rate']:.1%}",
    f"Random solvable rate: {results['random_solvable_rate']:.1%}",
    f"Agent advantage: {results['agent_advantage']:+.1%}",
    f"Mean solver time: {results['mean_solver_time_ms']:.1f}ms",
]
print("\n".join(report_lines))