# Curriculum Learning - Late Game First

Train agent on partial board states from known solutions for April 14.
Start with 2 pieces remaining (easiest), progress to 7 pieces remaining (hardest).

Episode structure: agent sees board with N-M pieces placed, chooses 1 action, gets reward based on whether it's in valid solution set.

In [1]:
from sb3_contrib import MaskablePPO
from stable_baselines3.common.vec_env import SubprocVecEnv
from torch.distributions import Distribution

from apad_puzzle_rl.training_utils import (
    CurriculumProgressionCallback,
    GradNormCallback,
    TimerCallback,
    make_curriculum_env,
)

# Disable validation to avoid Simplex constraint issues
Distribution.set_default_validate_args(False)

# Training

In [6]:
# Create vectorized environments - start at easiest level (2 pieces remaining)
n_envs = 6
starting_level = 2  # pieces remaining

env = SubprocVecEnv(
    [lambda: make_curriculum_env(pieces_remaining=starting_level) for _ in range(n_envs)]
)

# Create model
model = MaskablePPO(
    "MlpPolicy",
    env,
    tensorboard_log="./curriculum_ppo_logs_1/",
    verbose=1,
    batch_size=256,
    n_steps=256,
    ent_coef=0.0001,
    learning_rate=3e-3,
)

# Train with automatic curriculum progression
model.learn(
    total_timesteps=2_000_000,
    callback=[
        TimerCallback(),
        GradNormCallback(),
        CurriculumProgressionCallback(
            env,
            min_episodes=100,
            verbose=1,  # level_thresholds={2: 0.9, 3: 0.97, 4: 0.98, 5: 0.97, 6: 0.85, 7: 0.80}
        ),
    ],
)

model.save("curriculum_model_v6")

Using cpu device
Logging to ./curriculum_ppo_logs_1/PPO_14
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.63     |
| time/              |          |
|    fps             | 1015     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 1536     |
| train/             |          |
|    grad_norm       | 0        |
---------------------------------
Step 3000, 3s elapsed, 2256s remaining
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 1          |
|    ep_rew_mean          | 0.68       |
| time/                   |            |
|    fps                  | 890        |
|    iterations           | 2          |
|    time_elapsed         | 3          |
|    total_timesteps      | 3072       |
| train/                  |            |
|    approx_kl            | 0.09031832 |
|    clip_fraction        | 0.659      |
| 

KeyboardInterrupt: 

# Evaluation

In [17]:
import numpy as np

from apad_puzzle_rl.envs.curriculum_env import CurriculumAPADEnv


def evaluate_curriculum_agent(model, pieces_remaining=2, n_episodes=100):
    """Evaluate agent at specific curriculum level.

    Returns:
        dict with success_rate, random_baseline, advantage
    """
    agent_success = 0
    random_success = 0

    for _ in range(n_episodes):
        # Agent episode
        test_env = CurriculumAPADEnv(4, 14, pieces_remaining=pieces_remaining)
        obs, info = test_env.reset()

        action_masks = info["action_mask"]
        action, _ = model.predict(obs, action_masks=action_masks, deterministic=True)
        obs, reward, terminated, truncated, info = test_env.step(action)

        agent_success += reward > 0.5

        # Random baseline
        random_env = CurriculumAPADEnv(4, 14, pieces_remaining=pieces_remaining)
        obs, info = random_env.reset()

        valid_actions = np.where(info["action_mask"])[0]
        if len(valid_actions) > 0:
            action = np.random.choice(valid_actions)
            obs, reward, terminated, truncated, info = random_env.step(action)
            random_success += reward > 0.5

    agent_rate = agent_success / n_episodes
    random_rate = random_success / n_episodes

    return {
        "success_rate": agent_rate,
        "random_baseline": random_rate,
        "advantage": agent_rate - random_rate,
    }


# Evaluate at each curriculum level
for pieces_remaining in range(2, 8):
    results = evaluate_curriculum_agent(model, pieces_remaining=pieces_remaining)
    print(f"\nLevel: {pieces_remaining} pieces remaining")
    print(f"  Agent success:  {results['success_rate']:.1%}")
    print(f"  Random baseline: {results['random_baseline']:.1%}")
    print(f"  Advantage:      {results['advantage']:+.1%}")


Level: 2 pieces remaining
  Agent success:  89.0%
  Random baseline: 54.0%
  Advantage:      +35.0%

Level: 3 pieces remaining
  Agent success:  85.0%
  Random baseline: 34.0%
  Advantage:      +51.0%

Level: 4 pieces remaining
  Agent success:  87.0%
  Random baseline: 11.0%
  Advantage:      +76.0%

Level: 5 pieces remaining
  Agent success:  88.0%
  Random baseline: 3.0%
  Advantage:      +85.0%

Level: 6 pieces remaining
  Agent success:  71.0%
  Random baseline: 1.0%
  Advantage:      +70.0%

Level: 7 pieces remaining
  Agent success:  86.0%
  Random baseline: 2.0%
  Advantage:      +84.0%


# Complete Game Tests

Test curriculum-trained model on full 8-piece games

In [29]:
from apad_puzzle_rl.envs.apad_env import APADEnv

mean_step_count = 0

# Play until we get a win on April 14
env = APADEnv(4, 14)
obs, info = env.reset()
step_count = 0
i = 0
interval = 500.0

while step_count < 8:
    i += 1
    if i % interval == 0:
        print(i, round(mean_step_count, 3))

    obs, info = env.reset()
    done = False
    step_count = 0

    while not done:
        action, _ = model.predict(obs, deterministic=False, action_masks=info["action_mask"])
        obs, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        step_count += 1
        if done:
            mean_step_count = (mean_step_count + step_count) / 2.0

    if terminated:
        break

print(f"Found win after {i} attempts")
env.visualize()

500 4.004
1000 4.004
1500 4.008
2000 4.5
2500 4.813
3000 4.501
3500 4.517
4000 4.766
4500 4.013
5000 4.474
5500 4.5
6000 4.25
6500 4.57
7000 4.067
7500 4.063
8000 4.798
8500 4.075


KeyboardInterrupt: 

In [30]:
from apad_puzzle_rl.envs.apad_env import APADEnv


def evaluate_win_rate(model, month, day, n_episodes=100):
    """Evaluate model win rate on complete games."""
    env = APADEnv(month, day)
    wins = 0

    for i in range(n_episodes):
        if i % 10 == 0:
            print(i)
        obs, info = env.reset()
        done = False

        while not done:
            action, _ = model.predict(obs, deterministic=False, action_masks=info["action_mask"])
            obs, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated

        wins += terminated

    return wins / n_episodes


# Test on April 14 (trained date)
print("April 14 (trained):")
win_rate_414 = evaluate_win_rate(model, 4, 14, n_episodes=100)
print(f"Win rate: {win_rate_414:.1%}\n")

# Test on April 15 (generalization)
print("April 15 (unseen):")
win_rate_415 = evaluate_win_rate(model, 4, 15, n_episodes=100)
print(f"Win rate: {win_rate_415:.1%}")

April 14 (trained):
0
10
20
30
40
50
60
70
80
90
Win rate: 0.0%

April 15 (unseen):
0
10
20
30
40
50
60
70
80
90
Win rate: 0.0%
