# Setup

In [1]:
%load_ext autoreload
%autoreload 2
#!pip install sb3-contrib

In [2]:
from sb3_contrib import MaskablePPO
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.monitor import Monitor
import time
from collections import deque
from stable_baselines3.common.vec_env import SubprocVecEnv
from math import exp, log
import random

In [3]:
#from stable_baselines3.common.utils import get_schedule_fn

In [4]:
class TimerCallback(BaseCallback):
    """Print elapsed and estimated remaining wall-clock time while training.

    A progress line is printed whenever the timestep counter crosses a
    multiple of ``print_every``. The throughput estimate only counts the
    timesteps collected since the current ``learn()`` call started, so the
    estimate stays meaningful when training is resumed with
    ``reset_num_timesteps=False`` (where ``num_timesteps`` is cumulative).

    Args:
        print_every: Number of timesteps between progress lines.

    Raises:
        ValueError: If ``print_every`` is not positive.
    """

    def __init__(self, print_every=1000):
        super().__init__()
        if print_every <= 0:
            raise ValueError("print_every must be a positive number of timesteps")
        self.print_every = print_every
        self.start_time = time.time()
        # Timestep counter at the moment the current learn() call started.
        self.start_timesteps = 0
        # Index of the last `print_every` bucket that was reported.
        self._last_bucket = 0

    def _on_training_start(self):
        # Restart the clock here rather than at construction time, so setup
        # work between creating the callback and training is not counted.
        self.start_time = time.time()
        self.start_timesteps = self.model.num_timesteps
        self._last_bucket = self.start_timesteps // self.print_every

    def _on_step(self):
        # Compare buckets instead of testing `% print_every == 0`: with several
        # parallel envs the counter advances by more than one per step and may
        # never land exactly on a multiple.
        bucket = self.num_timesteps // self.print_every
        if bucket == self._last_bucket:
            return True
        self._last_bucket = bucket

        elapsed = time.time() - self.start_time
        steps_done = self.num_timesteps - self.start_timesteps
        if elapsed <= 0 or steps_done <= 0:
            # No throughput estimate is possible yet; skip this report.
            return True

        rate = steps_done / elapsed
        remaining = max(self.locals['total_timesteps'] - self.num_timesteps, 0) / rate
        print(f"Step {self.num_timesteps}, {elapsed:.0f}s elapsed, {remaining:.0f}s remaining")
        return True

class GradNormCallback(BaseCallback):
    """Record the global L2 norm of the policy gradients as ``train/grad_norm``.

    The norm is taken over every policy parameter that currently holds a
    gradient, so it reflects whatever gradients the most recent update left
    on the parameters (``0.0`` when no parameter has a gradient yet).

    The value is computed once per rollout instead of on every environment
    step: gradients are not modified while a rollout is being collected, so
    per-step recomputation only repeats the same number.
    """

    def _policy_grad_norm(self):
        """Return the L2 norm over all policy parameters that have a gradient."""
        squared_sum = sum(
            p.grad.data.norm(2).item() ** 2
            for p in self.model.policy.parameters()
            if p.grad is not None
        )
        return squared_sum ** (1. / 2)

    def _on_rollout_end(self):
        # Guard kept from the original: skip policies without `parameters()`.
        if hasattr(self.model.policy, 'parameters'):
            self.logger.record("train/grad_norm", self._policy_grad_norm())

    def _on_step(self):
        # Required by BaseCallback; returning True lets training continue.
        return True

from stable_baselines3.common.callbacks import BaseCallback

# Raise the difficulty once the rolling mean episode length exceeds the
# threshold of the current stage.
class CurriculumCallback(BaseCallback):
    """Advance the environment's difficulty as episodes get longer.

    Episode lengths are read from the ``"episode"`` entry of each step's
    ``infos`` and kept in a rolling window of the last 30 episodes. When the
    window mean exceeds the threshold for the current stage, the stage counter
    is incremented and passed to ``env.set_difficulty``.

    Args:
        env: Object exposing ``set_difficulty(stage)``.
            NOTE(review): a vectorized env would likely need a different call
            to reach its sub-environments; confirm against the caller.
        verbose: When truthy, print a line on every difficulty switch.
    """

    def __init__(self, env, verbose = 1):
        # BaseCallback stores `verbose` itself, so it is not re-assigned here.
        super().__init__(verbose = verbose)
        self.env = env
        self.thresholds = [7.5, 5] # episode length required before upping difficulty
        self.current_stage = 0
        self.ep_lengths = deque(maxlen=30)

    def _on_step(self):
        for info in self.locals.get("infos", []):
            if "episode" not in info:
                continue

            self.ep_lengths.append(info["episode"]["l"])
            if self.current_stage >= len(self.thresholds):
                # Final stage reached; nothing left to unlock.
                continue

            avg_len = sum(self.ep_lengths) / len(self.ep_lengths)
            if avg_len > self.thresholds[self.current_stage]:
                self.current_stage += 1
                self.env.set_difficulty(self.current_stage)
                # Start a fresh window: lengths gathered at the previous
                # difficulty would otherwise satisfy the next (lower)
                # threshold immediately and skip a stage.
                self.ep_lengths.clear()
                if self.verbose:
                    print(f"Average episode length: {avg_len:.2f} — Switched to difficulty: {self.current_stage}")
        return True

In [5]:
# !! The discussions linked below indicate that the failing Simplex constraint check is a PyTorch bug, not a problem with my env or training.
# The commented-out lines at the bottom of this cell turn off many validation checks, including the Simplex one causing us problems.
#
# Alternatively, the discussions also suggest modifying the check threshold in torch/distributions/constraints.py:: class _Simplex.
#        # Current:
#        return torch.all(value >= 0, dim=-1) & ((value.sum(-1) - 1).abs() < 1e-6)
#        
#        # Fix:
#        tol = torch.finfo(value.dtype).eps * 10 * value.size(-1) ** 0.5
#        return torch.all(value >= 0, dim=-1) & ((value.sum(-1) - 1).abs() < tol)
#
# Both seem to work. I'm going for the latter. It is possibly making my training slower and less stable, though.
#
# Discussion:
# https://discuss.pytorch.org/t/distributions-categorical-fails-with-constraint-simplex-but-manual-check-passes/163209/9
# https://github.com/pytorch/pytorch/issues/87468
# https://github.com/Stable-Baselines-Team/stable-baselines3-contrib/issues/81

# from torch.distributions import Distribution
# Distribution.set_default_validate_args(False)

# Training Strategy: entropy rampdown

In [6]:
from apad_env import APADEnv

In [7]:
def make_env(mo=None, day=None):
    """Create an APADEnv wrapped in a Monitor.

    When both ``mo`` and ``day`` are provided they are passed to ``APADEnv``
    (fixed board); if either one is ``None`` the env is built with no
    arguments (random board).
    """
    has_fixed_board = mo is not None and day is not None
    if has_fixed_board:
        return Monitor(APADEnv(mo, day))
    return Monitor(APADEnv())

In [8]:
# Entropy-coefficient schedule endpoints: start value and target end value.
ent_coef_i = 0.2
ent_coef_f = 0.015
# Decay constant such that ent_coef_i * exp(-D * progress) equals ent_coef_f
# at progress == 1.
D = log(ent_coef_i/ent_coef_f)

# Fixed seed so the same list of training boards is produced on every run.
BOARD_SEED = 0
n_boards = 100

# A dedicated generator keeps board sampling reproducible without touching the
# global `random` state used elsewhere.
board_rng = random.Random(BOARD_SEED)
# NOTE(review): days are drawn from 1-30, so day 31 is never sampled — confirm
# this is intended.
board_configs = [
    (board_rng.randint(1, 12), board_rng.randint(1, 30))
    for _ in range(n_boards)
]

In [None]:
N_ENVS = 8  # number of parallel worker processes per vectorized env


def make_board_vec_env(mo, day, n_envs=N_ENVS):
    """Build a SubprocVecEnv of ``n_envs`` monitored envs fixed to board (mo, day)."""
    # mo/day are bound as lambda defaults so every factory captures the values.
    return SubprocVecEnv([lambda mo=mo, day=day: make_env(mo, day) for _ in range(n_envs)])


current_mo, current_day = board_configs[0]
env = make_board_vec_env(current_mo, current_day)
model = MaskablePPO(
    "MlpPolicy",
    env,
    tensorboard_log="./maskable_ppo_logs_16/",
    ent_coef= ent_coef_i,
    #learning_rate=0.0004,
    verbose=1,
)

total_timesteps = 2000000
checkpoint_interval = 50000
try:
    for i in range(0, total_timesteps, checkpoint_interval):
        # Only the first chunk resets the timestep counter; later chunks resume.
        reset = i == 0
        remaining_steps = min(checkpoint_interval, total_timesteps - i)
        progress = i / total_timesteps

        # new board to train on
        current_mo, current_day = random.choice(board_configs)
        next_env = make_board_vec_env(current_mo, current_day)
        model.set_env(next_env)
        # Shut down the previous worker processes; otherwise every checkpoint
        # interval leaves another set of subprocesses running.
        env.close()
        env = next_env
        print(f"Switching to board: ({current_mo}, {current_day})")

        # Exponential decay from ent_coef_i towards ent_coef_f; the coefficient
        # is frozen for the last 10% of training.
        if progress < 0.9:
            model.ent_coef = ent_coef_i * exp(-1*D*progress)

        print(f"ENTCOEF {model.ent_coef}")

        model.learn(total_timesteps=remaining_steps, reset_num_timesteps=reset, callback=[TimerCallback(),GradNormCallback()])

        # NOTE: the suffix is the timestep at which this chunk *started*.
        model.save(f"mppo_model_v0_{i}")

    model.save("mppo_model_v0")
finally:
    # Release the worker processes even if training is interrupted. The model
    # needs set_env() again before any further learn() call.
    env.close()

Using cpu device
Switching to board: (6, 7)
ENTCOEF 0.2
Logging to ./maskable_ppo_logs_16/PPO_1
Step 1000, 6s elapsed, 270s remaining
Step 2000, 11s elapsed, 263s remaining
Step 3000, 17s elapsed, 259s remaining
Step 4000, 22s elapsed, 255s remaining
Step 5000, 28s elapsed, 249s remaining
Step 6000, 33s elapsed, 245s remaining
Step 7000, 39s elapsed, 239s remaining
Step 8000, 44s elapsed, 233s remaining
Step 9000, 50s elapsed, 227s remaining
Step 10000, 55s elapsed, 221s remaining
Step 11000, 61s elapsed, 216s remaining
Step 12000, 67s elapsed, 211s remaining
Step 13000, 72s elapsed, 205s remaining
Step 14000, 77s elapsed, 199s remaining
Step 15000, 83s elapsed, 193s remaining
Step 16000, 88s elapsed, 188s remaining
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 2.5      |
|    ep_rew_mean     | -0.073   |
| time/              |          |
|    fps             | 181      |
|    iterations      | 1        |
|    time_elapsed    | 90       |
| 

# Old

## Training strategy: incremental difficulty rampup

### step 1: no date, 6 pieces = win
v0

In [None]:
# Step 1: train a fresh model from scratch on APADEnv(-1, -1, 2).
# NOTE(review): per the section heading this is the "no date, 6 pieces"
# setting — confirm the argument meanings against APADEnv.
env = APADEnv(-1, -1, 2)

# Drop the reference to any earlier model before building the new one.
model = None
model = MaskablePPO(
    policy="MlpPolicy",
    env=env,
    learning_rate=0.003,
    ent_coef=0.1,
    tensorboard_log="./maskable_ppo_logs_14/",
    verbose=1,
)

model.learn(
    total_timesteps=75000,
    callback=[TimerCallback(), GradNormCallback()],
)
model.save("mppo_model_v0_2")

### step 2:

a) no date, 7 pieces

v0a


In [None]:
# Step 2a: continue from the saved "mppo_model_v0" weights on APADEnv(-1, -1, 1).
env = APADEnv(-1, -1, 1)
model = MaskablePPO.load("mppo_model_v0")
model.set_env(env)
# The timestep counter restarts from zero for this fine-tuning run.
model.learn(total_timesteps=100000, reset_num_timesteps=True)
model.save("mppo_model_v0a")

b) day only, 6 pieces

v0b

In [None]:
# Step 2b: continue from the saved "mppo_model_v0" weights on APADEnv(-1, None, 2).
env = APADEnv(-1, None, 2)
model = MaskablePPO.load("mppo_model_v0")
model.set_env(env)
# The timestep counter restarts from zero for this fine-tuning run.
model.learn(total_timesteps=100000, reset_num_timesteps=True)
model.save("mppo_model_v0b")

c) month only, 6 pieces

v0c

In [None]:
# Step 2c: continue from the saved "mppo_model_v0" weights on APADEnv(None, -1, 2).
env = APADEnv(None, -1, 2)
model = MaskablePPO.load("mppo_model_v0")
model.set_env(env)
# The timestep counter restarts from zero for this fine-tuning run.
model.learn(total_timesteps=100000, reset_num_timesteps=True)
model.save("mppo_model_v0c")

## Training strategy: Vanilla

In [None]:
#total_timesteps = 100000
#checkpoint_interval = 25000
#for i in range(0, total_timesteps, checkpoint_interval):
#    remaining_steps = min(checkpoint_interval, total_timesteps - i)
#    if i == 0:
#        model.learn(total_timesteps=remaining_steps, reset_num_timesteps=True, callback=[TimerCallback(),GradNormCallback()])
#    else:
#        model.learn(total_timesteps=remaining_steps, reset_num_timesteps=False, callback=[TimerCallback(),GradNormCallback()])
#

In [None]:
# Vanilla run: a fresh model trained in fixed-size chunks, saving after each.
env = APADEnv(-1)

# Drop the reference to any earlier model before building the new one.
model = None
model = MaskablePPO(
    policy="MlpPolicy",
    env=env,
    ent_coef=0.03,
    tensorboard_log="./maskable_ppo_logs_10/",
    verbose=1,
)

total_timesteps = 300000
checkpoint_interval = 50000
for i in range(0, total_timesteps, checkpoint_interval):
    remaining_steps = min(checkpoint_interval, total_timesteps - i)
    # Only the first chunk resets the timestep counter; later chunks resume.
    model.learn(
        total_timesteps=remaining_steps,
        reset_num_timesteps=(i == 0),
        callback=[TimerCallback(), GradNormCallback()],
    )
    # The same file is overwritten after every chunk.
    model.save("mppo_model_11")

In [None]:
# Single 50k-step run of a fresh model on APADEnv(-1, -1).
env = APADEnv(-1, -1)
env.reset()

# Drop the reference to any earlier model before building the new one.
model = None
model = MaskablePPO(
    policy="MlpPolicy",
    env=env,
    tensorboard_log="./maskable_ppo_logs_9/",
    verbose=1,
)

model.learn(
    total_timesteps=50000,
    reset_num_timesteps=False,
    callback=[TimerCallback(), GradNormCallback()],
)
model.save("mppo_model_9")

In [None]:
# Train four independent fresh models on the `env` left over from an earlier
# cell, saving each one under its run index.
for i in range(4):
    env.reset()

    # Drop the previous run's model before building the next one.
    model = None
    model = MaskablePPO(
        policy="MlpPolicy",
        env=env,
        tensorboard_log="./maskable_ppo_logs_9/",
        verbose=1,
    )

    model.learn(
        total_timesteps=50000,
        reset_num_timesteps=True,
        callback=[TimerCallback(), GradNormCallback()],
    )
    model.save(f"mppo_model_{i}")

In [None]:
# Persist whichever model is currently bound to `model` under a dated name.
model.save(path="mppo_model_25k_2025-06-19_1500")