# Setup

In [1]:
%load_ext autoreload
%autoreload 2
#!pip install sb3-contrib

In [2]:
from sb3_contrib import MaskablePPO
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.monitor import Monitor
import time
from collections import deque
from stable_baselines3.common.vec_env import SubprocVecEnv
from math import exp, log
import random

In [3]:
class TimerCallback(BaseCallback):
    """Print elapsed wall-clock time and an ETA for the current ``learn()`` call.

    A progress line is printed every time ``num_timesteps`` reaches or crosses a
    multiple of ``interval``, so it also works when the number of parallel
    environments does not divide ``interval`` evenly.

    Args:
        interval: number of timesteps between progress lines (default 1000).
    """

    def __init__(self, interval=1000):
        super().__init__()
        if interval <= 0:
            raise ValueError("interval must be a positive number of timesteps")
        self.interval = interval
        self.start_time = time.time()
        self.start_timesteps = 0
        self._next_report = interval

    def _on_training_start(self):
        # Restart the clock when training actually begins and remember where the
        # step counter started. With reset_num_timesteps=False the counter carries
        # over from earlier learn() calls, so dividing the raw counter by this
        # call's elapsed time would overstate throughput and understate the ETA.
        self.start_time = time.time()
        self.start_timesteps = self.model.num_timesteps
        self._next_report = (self.start_timesteps // self.interval + 1) * self.interval

    def _on_step(self):
        if self.num_timesteps < self._next_report:
            return True

        # Schedule the next line at the next multiple of `interval` above the current count.
        self._next_report = (self.num_timesteps // self.interval + 1) * self.interval

        elapsed = time.time() - self.start_time
        steps_done = self.num_timesteps - self.start_timesteps
        if elapsed > 0 and steps_done > 0:
            rate = steps_done / elapsed
            # Clamp at zero: the step counter can run past the requested total.
            steps_left = max(self.locals['total_timesteps'] - self.num_timesteps, 0)
            remaining = steps_left / rate
            print(f"Step {self.num_timesteps}, {elapsed:.0f}s elapsed, {remaining:.0f}s remaining")
        return True

class GradNormCallback(BaseCallback):
    """Log the global L2 norm of the policy's gradients as ``train/grad_norm``.

    The norm is computed from whatever is currently stored in ``p.grad`` for each
    policy parameter, i.e. the gradients left behind by the most recent optimizer
    update. Parameters without a gradient are skipped, so 0 is logged before the
    first update.

    NOTE(review): if the algorithm clips gradients in place before the optimizer
    step, this is the post-clipping norm of the last minibatch — confirm against
    the algorithm's ``train()`` implementation.
    """

    def _on_step(self):
        # Gradients do not change while a rollout is being collected, so there is
        # nothing to compute per environment step.
        return True

    def _on_rollout_end(self):
        # Compute once per rollout instead of once per step; the value recorded
        # here is the same one the per-step version would have left in the logger.
        if not hasattr(self.model.policy, 'parameters'):
            return

        squared_sum = 0.0
        for p in self.model.policy.parameters():
            if p.grad is not None:
                squared_sum += p.grad.detach().norm(2).item() ** 2

        self.logger.record("train/grad_norm", squared_sum ** 0.5)

from stable_baselines3.common.callbacks import BaseCallback

# up the difficulty upon reaching certain (rolling) mean episode length thresholds
class CurriculumCallback(BaseCallback):
    """Raise the environment difficulty when the rolling mean episode length clears a threshold.

    Episode lengths are read from the ``"episode"`` entry of each step's ``infos``.
    Once the rolling window is full and its mean exceeds the threshold of the
    current stage, the stage is incremented, ``env.set_difficulty(stage)`` is
    called, and the window is cleared so the next stage is judged only on
    episodes played at the new difficulty.

    Args:
        env: object exposing ``set_difficulty(stage)``.
            NOTE(review): a vectorized env probably needs the call forwarded to
            its workers instead — confirm before using with SubprocVecEnv.
        verbose: when truthy, print a line on every difficulty switch.
        thresholds: mean episode length required to leave stage 0, 1, ...;
            defaults to ``[7.5, 5]``.
        window: number of most recent episodes averaged (default 30).
    """

    def __init__(self, env, verbose = 1, thresholds = None, window = 30):
        super().__init__(verbose = verbose)
        if window < 1:
            raise ValueError("window must be at least 1")
        self.env = env
        self.verbose = verbose
        # episode length required before upping difficulty
        self.thresholds = list(thresholds) if thresholds is not None else [7.5, 5]
        self.current_stage = 0
        self.ep_lengths = deque(maxlen=window)

    def _on_step(self):
        for info in self.locals.get("infos", []):
            if "episode" not in info:
                continue

            self.ep_lengths.append(info["episode"]["l"])

            if self.current_stage >= len(self.thresholds):
                continue  # already at the final difficulty
            if len(self.ep_lengths) < self.ep_lengths.maxlen:
                continue  # not enough episodes yet for a meaningful rolling mean

            avg_len = sum(self.ep_lengths) / len(self.ep_lengths)
            if avg_len > self.thresholds[self.current_stage]:
                self.current_stage += 1
                self.env.set_difficulty(self.current_stage)
                # Drop lengths recorded at the easier difficulty. Without this, a
                # mean that just cleared one threshold would immediately clear any
                # lower threshold that follows it.
                self.ep_lengths.clear()
                if self.verbose:
                    print(f"Average episode length: {avg_len:.2f} — Switched to difficulty: {self.current_stage}")
        return True

In [4]:
from apad_env import APADEnv

In [5]:
def make_env(mo=None, day=None):
    """Return a Monitor-wrapped APADEnv.

    When both ``mo`` and ``day`` are given the env is built as ``APADEnv(mo, day)``
    (fixed board); if either one is None, both are ignored and ``APADEnv()`` is
    built with no arguments (random board).
    """
    fixed_board = mo is not None and day is not None
    env = APADEnv(mo, day) if fixed_board else APADEnv()
    return Monitor(env)

# Training strategy: repeat boards + increased batch size

In [6]:
# Pool of (month, day) targets that training repeatedly samples boards from.
# NOTE(review): `random` is unseeded, so the pool differs on every kernel run, and
# days are drawn from 1..30 only — confirm whether day 31 should be included.
n_boards = 100
board_configs = []
while len(board_configs) < n_boards:
    mo, day = random.randint(1, 12), random.randint(1, 30)
    board_configs.append((mo, day))

In [None]:
# Train MaskablePPO on 8 fixed boards at a time (one subprocess per board),
# drawing a fresh set of 8 boards from `board_configs` for every chunk.
selected_boards = random.sample(board_configs, 8)
env = SubprocVecEnv([lambda mo=mo, day=day: make_env(mo, day) for mo, day in selected_boards])
model = MaskablePPO(
    "MlpPolicy",
    env,
    tensorboard_log="./maskable_ppo_logs_16/",
    #ent_coef= ent_coef_i,
    #learning_rate=0.0004,
    verbose=1,
    # NOTE(review): one rollout is n_steps * 8 envs = 32768 samples, which is not a
    # multiple of batch_size, and is also the granularity learn() advances in —
    # confirm both are intended.
    batch_size=1000,
    n_steps=2048*2,
    # gamma=0.999,
)

total_timesteps = 1000000
checkpoint_interval = 50000
try:
    for i in range(0, total_timesteps, checkpoint_interval):
        remaining_steps = min(checkpoint_interval, total_timesteps - i)

        # The env built for the constructor serves the first chunk. Every later
        # chunk gets new boards, and the previous workers are shut down first so
        # subprocesses do not pile up across chunks.
        if i > 0:
            env.close()
            selected_boards = random.sample(board_configs, 8)
            env = SubprocVecEnv([lambda mo=mo, day=day: make_env(mo, day) for mo, day in selected_boards])
            model.set_env(env)
        print(f"Switching to boards: {selected_boards}")

        # Only the first chunk resets the step counter; later chunks continue it.
        model.learn(total_timesteps=remaining_steps, reset_num_timesteps=(i == 0), callback=[TimerCallback(),GradNormCallback()])

        # Intermediate snapshot every 1M steps of the schedule (only i == 0 for this run length).
        if i % 1000000 == 0:
            model.save(f"mppo_model_v1.3_{i}")
finally:
    # Also runs on KeyboardInterrupt, so an aborted run leaves no worker processes behind.
    env.close()

model.save("mppo_model_v1.3")

# Old experiments

## Training Strategy: entropy rampdown + repeat boards + increased batch size

In [12]:
# Constants for an exponential entropy-coefficient decay of the form
# ent_coef_i * exp(-D * progress).
ent_coef_i, ent_coef_f = 0.2, 0.015  # value at progress 0, value the curve reaches at progress 1
D = log(ent_coef_i / ent_coef_f)     # decay rate that maps ent_coef_i onto ent_coef_f over progress 0 -> 1

In [13]:
# Pool of (month, day) targets for the repeat-boards runs.
# NOTE(review): unseeded, and days are limited to 1..30 — confirm day 31 is excluded on purpose.
n_boards = 100
board_configs = [
    (random.randint(1, 12), random.randint(1, 30))  # month is drawn first, then day
    for _ in range(n_boards)
]
mo, day = board_configs[-1]  # keep the last draw bound to `mo` / `day`, as the plain loop did

In [10]:
# Train MaskablePPO on 8 fixed boards at a time, resampling the boards every chunk
# and decaying the entropy coefficient exponentially with overall progress.
# Alternative kept for reference: run the same single board in all 8 workers.
#current_mo, current_day = board_configs[0]
#env = SubprocVecEnv([lambda mo=current_mo, day=current_day: make_env(mo, day) for _ in range(8)])
selected_boards = random.sample(board_configs, 8)
env = SubprocVecEnv([lambda mo=mo, day=day: make_env(mo, day) for mo, day in selected_boards])
model = MaskablePPO(
    "MlpPolicy",
    env,
    tensorboard_log="./maskable_ppo_logs_16/",
    ent_coef= ent_coef_i,
    #learning_rate=0.0004,
    verbose=1,
    batch_size=1000,
    #gamma=0.999,
)

total_timesteps = 3000000
checkpoint_interval = 50000
try:
    for i in range(0, total_timesteps, checkpoint_interval):
        remaining_steps = min(checkpoint_interval, total_timesteps - i)
        progress = i / total_timesteps

        # The env built for the constructor serves the first chunk. Every later
        # chunk gets new boards, and the previous workers are shut down first so
        # subprocesses do not pile up across the 60 chunks.
        if i > 0:
            env.close()
            selected_boards = random.sample(board_configs, 8)
            env = SubprocVecEnv([lambda mo=mo, day=day: make_env(mo, day) for mo, day in selected_boards])
            model.set_env(env)
        print(f"Switching to boards: {selected_boards}")

        # Exponential decay from ent_coef_i towards ent_coef_f; the coefficient is
        # frozen at its last value for the final 10% of training.
        if progress < 0.9:
            model.ent_coef = ent_coef_i * exp(-1*D*progress)

        print(f"ENTCOEF {model.ent_coef}")

        # Only the first chunk resets the step counter; later chunks continue it.
        model.learn(total_timesteps=remaining_steps, reset_num_timesteps=(i == 0), callback=[TimerCallback(),GradNormCallback()])

        # Intermediate snapshot every 1M steps of the schedule.
        if i % 1000000 == 0:
            model.save(f"mppo_model_v1.2_{i}")
finally:
    # Also runs on KeyboardInterrupt, so an aborted run leaves no worker processes behind.
    env.close()

model.save("mppo_model_v1.2")

Using cpu device
Switching to boards: [(7, 21), (6, 4), (11, 17), (7, 23), (6, 23), (2, 5), (4, 8), (8, 22)]
ENTCOEF 0.2
Logging to ./maskable_ppo_logs_16/PPO_3
Step 1000, 6s elapsed, 278s remaining
Step 2000, 11s elapsed, 268s remaining
Step 3000, 17s elapsed, 260s remaining
Step 4000, 22s elapsed, 253s remaining
Step 5000, 27s elapsed, 247s remaining
Step 6000, 33s elapsed, 242s remaining
Step 7000, 38s elapsed, 236s remaining
Step 8000, 44s elapsed, 232s remaining
Step 9000, 50s elapsed, 226s remaining
Step 10000, 55s elapsed, 221s remaining
Step 11000, 61s elapsed, 215s remaining
Step 12000, 66s elapsed, 210s remaining
Step 13000, 72s elapsed, 205s remaining
Step 14000, 77s elapsed, 199s remaining
Step 15000, 83s elapsed, 193s remaining
Step 16000, 88s elapsed, 187s remaining
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 2.48     |
|    ep_rew_mean     | -0.0712  |
| time/              |          |
|    fps             | 181      |
|    

KeyboardInterrupt: 

## Training strategy: incremental difficulty rampup

### step 1: no date, 6 pieces = win
v0

In [None]:
# Step 1: train a fresh agent on APADEnv(-1, -1, 2) for 75k steps.
# NOTE(review): this saves to "mppo_model_v0_2", while the step-2 cells load
# "mppo_model_v0" — confirm which checkpoint they are meant to continue from.
env = APADEnv(-1,-1,2)
model = None  # drop the reference to any previous model before building the new one
model = MaskablePPO(
    "MlpPolicy",
    env,
    learning_rate=0.003,
    ent_coef=0.1,
    tensorboard_log="./maskable_ppo_logs_14/",
    verbose=1,
)
model.learn(
    total_timesteps=75000,
    callback=[TimerCallback(), GradNormCallback()],
)
model.save(f"mppo_model_v0_2")

### step 2:

a) no date, 7 pieces

v0a


In [None]:
# Step 2a: continue from the saved v0 checkpoint on APADEnv(-1, -1, 1)
# for 100k steps (step counter restarted) and store the result as v0a.
env = APADEnv(-1,-1,1)
model = MaskablePPO.load("mppo_model_v0")
model.set_env(env)
model.learn(total_timesteps=100000, reset_num_timesteps=True)
model.save(f"mppo_model_v0a")

b) day only, 6 pieces

v0b

In [None]:
# Step 2b: continue from the saved v0 checkpoint on APADEnv(-1, None, 2)
# for 100k steps (step counter restarted) and store the result as v0b.
env = APADEnv(-1,None,2)
model = MaskablePPO.load("mppo_model_v0")
model.set_env(env)
model.learn(total_timesteps=100000, reset_num_timesteps=True)
model.save(f"mppo_model_v0b")

c) month only, 6 pieces

v0c

In [None]:
# Step 2c: continue from the saved v0 checkpoint on APADEnv(None, -1, 2)
# for 100k steps (step counter restarted) and store the result as v0c.
env = APADEnv(None,-1,2)
model = MaskablePPO.load("mppo_model_v0")
model.set_env(env)
model.learn(total_timesteps=100000, reset_num_timesteps=True)
model.save(f"mppo_model_v0c")

## Training strategy: Vanilla

In [None]:
#total_timesteps = 100000
#checkpoint_interval = 25000
#for i in range(0, total_timesteps, checkpoint_interval):
#    remaining_steps = min(checkpoint_interval, total_timesteps - i)
#    if i == 0:
#        model.learn(total_timesteps=remaining_steps, reset_num_timesteps=True, callback=[TimerCallback(),GradNormCallback()])
#    else:
#        model.learn(total_timesteps=remaining_steps, reset_num_timesteps=False, callback=[TimerCallback(),GradNormCallback()])
#

In [None]:
# Vanilla run: a fresh agent on APADEnv(-1), trained in 50k-step chunks up to 300k steps.
env = APADEnv(-1)
#env.reset()
model = None  # drop the reference to any previous model before building the new one
model = MaskablePPO(
    "MlpPolicy",
    env,
    ent_coef=0.03,
    tensorboard_log="./maskable_ppo_logs_10/",
    verbose=1,
)
total_timesteps = 300000
checkpoint_interval = 50000
for i in range(0, total_timesteps, checkpoint_interval):
    remaining_steps = min(checkpoint_interval, total_timesteps - i)
    # Only the first chunk resets the step counter; later chunks continue it.
    model.learn(
        total_timesteps=remaining_steps,
        reset_num_timesteps=(i == 0),
        callback=[TimerCallback(),GradNormCallback()],
    )
    # `{11}` is a literal, so every chunk overwrites the same file, mppo_model_11.
    model.save(f"mppo_model_{11}")

In [None]:
# Single 50k-step run of a fresh agent on APADEnv(-1, -1), saved as mppo_model_9.
env = APADEnv(-1,-1)
env.reset()
model = None  # drop the reference to any previous model before building the new one
model = MaskablePPO(
    "MlpPolicy",
    env,
    verbose=1,
    tensorboard_log="./maskable_ppo_logs_9/",
    #n_steps = 512,
)
model.learn(
    total_timesteps=50000,
    reset_num_timesteps=False,
    callback=[TimerCallback(), GradNormCallback()],
)
model.save(f"mppo_model_9")

In [None]:
# Train four independent agents, 50k steps each, saved as mppo_model_0 .. mppo_model_3.
# NOTE(review): `env` is not created in this cell; it is whatever an earlier cell
# left bound in the kernel — confirm which env these runs are meant to use.
for i in range(4):
    env.reset()
    model = None  # drop the reference to the previous agent before building the next one
    model = MaskablePPO(
        "MlpPolicy",
        env,
        verbose=1,
        tensorboard_log="./maskable_ppo_logs_9/",
        #n_steps = 512,
    )
    model.learn(
        total_timesteps=50000,
        reset_num_timesteps=True,
        callback=[TimerCallback(), GradNormCallback()],
    )
    model.save(f"mppo_model_{i}")

In [None]:
# Manual snapshot of whichever `model` is currently bound in the kernel, saved under a
# timestamped name. NOTE(review): this cell does not show which run the "25k" refers to.
model.save("mppo_model_25k_2025-06-19_1500")