# Shuffling behaviour
The purpose of this notebook is to train a policy that has already learned to _stand_ so that it starts to move. 
This is the first step towards walking, and it usually results in a policy that moves around by dragging its feet (hence "shuffling").
As before, we provide a training section and an evaluation section, along with the reward function and the "stand" policy used as the starting point.

In [1]:
import time
import numpy as np
from stable_baselines3 import PPO

import sys
import os

# Resolve the directory two levels above the kernel's working directory
# (normally the folder that holds this notebook).
cwd = os.getcwd()
grandparent_dir = os.path.abspath(os.path.join(cwd, os.pardir, os.pardir))

# Put that directory at the front of the import search path, but only once,
# so re-running this cell does not pile up duplicate entries.
if grandparent_dir not in sys.path:
    sys.path.insert(0, grandparent_dir)

from SpotmicroEnv import SpotmicroEnv
from reward_function import reward_function, RewardState

pybullet build time: Apr  4 2025 18:56:19


# Training

In [None]:
from stable_baselines3.common.callbacks import CheckpointCallback
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.logger import configure

# ========= CONFIG ==========
# `run` names the artefacts written by this notebook (logs, checkpoints,
# saved model); `base` names the previously saved artefacts that are loaded.
run = "shuffle"
base = "stand"

# Step budget handed to `model.learn` and used to space the checkpoints.
TOTAL_STEPS = 8 * 10**6

# Per-run directory for the csv / tensorboard logs.
log_dir = f"./logs/{run}"

def clipped_linear_schedule(initial_value, min_value=1e-5):
    """Build a linearly decaying schedule that never drops below a floor.

    Args:
        initial_value: Value returned when ``progress_remaining`` is 1.
        min_value: Lower bound applied to the scaled value.

    Returns:
        A callable ``schedule(progress_remaining)`` that returns
        ``progress_remaining * initial_value``, or ``min_value`` when that
        product is smaller than ``min_value``.
    """
    def schedule(progress_remaining):
        scaled = progress_remaining * initial_value
        # Only fall back to the floor when it is strictly larger.
        return min_value if min_value > scaled else scaled
    return schedule

# Periodic checkpointing: the save interval is one tenth of TOTAL_STEPS,
# and files go to "<run>_checkpoints" with the "ppo_<run>" prefix.
checkpoint_callback = CheckpointCallback(
    name_prefix=f"ppo_{run}",
    save_path=f"{run}_checkpoints",
    save_freq=TOTAL_STEPS // 10,
)

# ========= ENV ==========
# Headless training environment. It reads "<base>.pkl" and writes "<run>.pkl"
# (presumably the environment's saved state -- confirm in SpotmicroEnv).
env = SpotmicroEnv(
    src_save_file=f"{base}.pkl",
    dest_save_file=f"{run}.pkl",
    reward_fn=reward_function,
    reward_state=RewardState(),
    use_gui=False,
)

# Validate the environment against the Gym API before training, with warnings on.
check_env(env, warn=True)

# ========= MODEL ==========
# Resume from the saved "ppo_<base>" policy and attach it to the new environment.
model = PPO.load(f"ppo_{base}")
model.set_env(env)
model.tensorboard_log = log_dir

# Custom logger: ONLY csv + tensorboard (no stdout table)
new_logger = configure(folder=log_dir, format_strings=["csv", "tensorboard"])
model.set_logger(new_logger)

In [None]:
%load_ext tensorboard
%tensorboard --logdir ./logs

# Continue training the loaded policy for TOTAL_STEPS timesteps.
# reset_num_timesteps=False keeps the timestep counter of the loaded model
# instead of restarting it from zero.
# The try/finally makes sure the simulator is closed even if training fails
# or the cell is interrupted. The final model is only saved when `learn`
# completes, so an aborted run never overwrites "ppo_<run>".
try:
    model.learn(
        total_timesteps=TOTAL_STEPS,
        reset_num_timesteps=False,
        callback=checkpoint_callback,
    )
    model.save(f"ppo_{run}")
finally:
    env.close()

# Evaluation

In [5]:
policy = "shuffle"

# Rollout length and per-step pause. Sleeping 1/60 s per step caps the loop
# at roughly 60 steps per second (presumably so the GUI can be followed by eye).
N_EVAL_STEPS = 3001
STEP_DELAY = 1 / 60

# GUI-enabled environment that reads "<policy>.pkl".
env = SpotmicroEnv(
    use_gui=True,
    reward_fn=reward_function,
    reward_state=RewardState(),
    src_save_file=f"{policy}.pkl",
)

# Everything after the environment is created runs under try/finally, so the
# simulator connection is always released, even when loading the model or
# stepping the environment raises (e.g. a lost physics-server connection).
try:
    obs, _ = env.reset()

    # === Load model ===
    model = PPO.load(f"ppo_{policy}")

    # === Run rollout ===
    for _step in range(N_EVAL_STEPS):
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, terminated, truncated, info = env.step(action)

        if terminated or truncated:
            # Episode ended: show the reward breakdown, then start a new episode.
            print("Terminated")
            env.plot_reward_components()
            obs, _ = env.reset()
        time.sleep(STEP_DELAY)
finally:
    env.close()



b3Printf: front_right_leg_link_cover

b3Printf: No inertial data for link, using mass=1, localinertiadiagonal = 1,1,1, identity local inertial frame

b3Printf: rear_left_leg_link_cover

b3Printf: No inertial data for link, using mass=1, localinertiadiagonal = 1,1,1, identity local inertial frame

b3Printf: rear_right_leg_link_cover
numActiveThreads = 0
stopping threads
Thread with taskId 0 exiting
Thread TERMINATED
destroy semaphore
semaphore destroyed
destroy main semaphore
main semaphore destroyed
finished
numActiveThreads = 0
btShutDownExampleBrowser stopping threads
Thread with taskId 0 exiting
Thread TERMINATED
destroy semaphore
semaphore destroyed
destroy main semaphore
main semaphore destroyed
startThreads creating 1 threads.
starting thread 0
started thread 0 
argc=2
argv[0] = --unused
argv[1] = --start_demo_name=Physics Server
ExampleBrowserThreadFunc started
X11 functions dynamically loaded using dlopen/dlsym OK!
X11 functions dynamically loaded using dlopen/dlsym OK!
Creat

error: Not connected to physics server.