In [1]:
import time
import numpy as np
from stable_baselines3 import PPO

import sys
import os

# Resolve the directory two levels above the notebook's working directory
# so that the local modules imported below can be found.
cwd = os.getcwd()
grandparent_dir = os.path.abspath(os.path.join(cwd, os.pardir, os.pardir))

# Prepend it once only; re-running this cell must not add duplicate entries.
already_on_path = grandparent_dir in sys.path
if not already_on_path:
    sys.path.insert(0, grandparent_dir)

from SpotmicroEnv import SpotmicroEnv
from reward_function import reward_function, RewardState

pybullet build time: Apr  4 2025 18:56:19


# Training
The following cells will launch a training session for a policy.
The first cell will only load the necessary assets and set everything up, while the second one will load TensorBoard to visualize useful data about the ongoing training.

## Parameters
- You can set the name of the policy being trained by assigning it to the "run" variable.
- You can set the number of checkpoints that will be saved by changing the number used for `save_freq` in "checkpoint_callback".
- You can adjust the learning rate, entropy coefficient, clip range and the rest of the hyperparameters in the MODEL section at the end of the setup cell below.

In [2]:
from stable_baselines3.common.callbacks import CheckpointCallback
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.logger import configure

# ========= CONFIG ==========
# Values a reader is expected to tune before launching a run.
TOTAL_STEPS = 3_000_000  # passed to model.learn() as total_timesteps; also sets the checkpoint interval
run = "stand"  # run name, embedded in the log, checkpoint, state-file and policy paths
log_dir = f"./logs/{run}"  # where this run's CSV and TensorBoard logs are written

def clipped_linear_schedule(initial_value, min_value=1e-5):
    """Build a linearly scaled schedule that never drops below ``min_value``.

    The returned callable is used below as the PPO learning-rate schedule.
    Per the Stable-Baselines3 documentation, such schedules receive the
    remaining training progress, which decreases from 1 to 0.

    Args:
        initial_value: Factor multiplied by the remaining progress.
        min_value: Lower bound applied to the scaled value.

    Returns:
        A one-argument callable mapping ``progress_remaining`` to
        ``max(progress_remaining * initial_value, min_value)``.
    """
    def schedule(progress_remaining):
        # Scale with the remaining progress first, then apply the floor.
        decayed = progress_remaining * initial_value
        return max(decayed, min_value)

    return schedule

# Periodic checkpoints: the interval is derived from TOTAL_STEPS, so the
# divisor below controls how many checkpoints a run produces.
# The floor of 1 keeps the interval valid when TOTAL_STEPS is lowered below
# the divisor (e.g. for a quick smoke run); an interval of 0 would make the
# callback's periodic check divide by zero.
checkpoint_callback = CheckpointCallback(
    save_freq=max(TOTAL_STEPS // 10, 1),
    save_path=f"./policies/{run}_checkpoints",
    name_prefix=f"ppo_{run}"
)

# ========= ENV ==========
# Training environment, created with the GUI switched off.
# NOTE(review): dest_save_file is presumably where SpotmicroEnv writes its
# saved state -- confirm against the SpotmicroEnv implementation.
train_state_file = f"states/{run}.pkl"
train_reward_state = RewardState()
env = SpotmicroEnv(
    use_gui=False,
    reward_fn=reward_function,
    reward_state=train_reward_state,
    dest_save_file=train_state_file,
)

# Validate the environment before handing it to PPO; warnings stay enabled.
check_env(env, warn=True)

# ========= MODEL ==========
# Hyperparameters are collected in one mapping so they are easy to locate
# and adjust; they are forwarded unchanged to the PPO constructor.
ppo_hyperparams = {
    "verbose": 0,  # no default printouts
    "learning_rate": clipped_linear_schedule(3e-4),
    "ent_coef": 0.002,
    "clip_range": 0.1,
    "tensorboard_log": log_dir,
}
model = PPO("MlpPolicy", env, **ppo_hyperparams)

# Replace the model's logger with one that writes only CSV and TensorBoard
# output into the run's log directory (no stdout table).
log_formats = ["csv", "tensorboard"]
new_logger = configure(log_dir, log_formats)
model.set_logger(new_logger)

rear_right_leg_link_cover

In [3]:
# Load the TensorBoard notebook extension and open a dashboard on ./logs,
# the parent of the per-run log directory configured in the setup cell.
%load_ext tensorboard
%tensorboard --logdir ./logs

In [None]:

# ========= TRAIN ==========
# Run the training session and store the final policy. The environment is
# closed in `finally` so it is released even when training raises or the
# kernel is interrupted part-way through.
try:
    model.learn(
        total_timesteps=TOTAL_STEPS,
        # Keep the model's current timestep counter instead of resetting it.
        reset_num_timesteps=False,
        callback=checkpoint_callback
    )
    # Only reached when training completed without an error.
    model.save(f"policies/ppo_{run}")
finally:
    env.close()

# Testing
The following cells let you test the policy we have just trained. All you have to do is assign the name of the trained policy to the "policy" variable.


In [2]:
policy = "stand"

# Read from the same locations the training cells write to
# (`states/<run>.pkl` and `policies/ppo_<run>`), so the policy that was just
# trained is the one that gets tested after a fresh Restart & Run All.
# NOTE(review): assumes SpotmicroEnv resolves src_save_file the same way as
# dest_save_file -- confirm against the SpotmicroEnv implementation.
env = SpotmicroEnv(
    use_gui=True,
    reward_fn=reward_function,
    src_save_file=f"states/{policy}.pkl"
    )
# Keep the first observation; the rollout cell below starts from it.
obs, _ = env.reset()

model = PPO.load(f"policies/ppo_{policy}")
print("Loaded env")

Reward state is None
Loaded env


In [5]:
terminated = truncated = False

# Roll out one episode with the deterministic policy. The episode is over
# when the environment reports either termination or truncation; checking
# only `terminated` would keep stepping an episode that was already cut off.
while not (terminated or truncated):
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, terminated, truncated, info = env.step(action)
    time.sleep(1/60.)  # pause 1/60 s between steps

#env.plot_reward_components()
# Reset so `obs` is fresh if this cell is run again.
obs, _ = env.reset()

Reward state is None


In [6]:
# Close the test environment once the rollout is finished.
env.close()