In [1]:
%pip install -qr https://raw.githubusercontent.com/anton-dergunov/hugging-face-deep-rl/main/requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [2]:
import gymnasium as gym
from gymnasium.wrappers import RecordVideo
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
import torch
import cv2
from IPython.display import Video
import imageio
import os
import shutil

### Create Environment

In [3]:
# Build a vectorized CartPole-v1 environment (a simple classic-control task).
# n_envs=4 runs four copies of the environment in parallel.
vec_env = make_vec_env(env_id="CartPole-v1", n_envs=4)

### Define & Train PPO Agent

In [4]:
# Location the trained model is written to (and later reloaded from)
MODEL_PATH = "models/ppo_cartpole"

# PPO agent with a Multi-Layer Perceptron policy acting on the vectorized env;
# verbose=1 prints the training logs
model = PPO(policy="MlpPolicy", env=vec_env, verbose=1)

# Train for 25,000 timesteps
model.learn(total_timesteps=25_000)

# Make sure the target folder exists, then save the trained model
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
model.save(MODEL_PATH)

Using cpu device
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 20.2     |
|    ep_rew_mean     | 20.2     |
| time/              |          |
|    fps             | 21884    |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 8192     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 30.6        |
|    ep_rew_mean          | 30.6        |
| time/                   |             |
|    fps                  | 8109        |
|    iterations           | 2           |
|    time_elapsed         | 2           |
|    total_timesteps      | 16384       |
| train/                  |             |
|    approx_kl            | 0.014538579 |
|    clip_fraction        | 0.202       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.682      |
|    explained_variance   | 0.00233     |
|    learning

### Load Trained Model

In [5]:
# Discard the in-memory model before reloading it from disk
del model

# Restore the model from the file saved at MODEL_PATH
model = PPO.load(path=MODEL_PATH)

### Run Trained Agent

In [6]:
def record_cartpole_video(model, video_path: str, steps: int = 1000) -> None:
    """Roll out ``model`` on CartPole-v1 and save the rendered frames as one video.

    The agent acts for ``steps`` environment steps. Whenever an episode
    terminates or is truncated the environment is reset and the rollout
    continues, so several episodes can end up in the same file. Every frame is
    stamped with the current episode index and the step index within it.

    Args:
        model: Object exposing ``predict(obs)`` that returns an
            ``(action, state)`` pair, e.g. a Stable-Baselines3 PPO model.
        video_path: Destination file for the video. Its parent directory is
            created when missing; a bare file name is written to the current
            working directory.
        steps: Total number of environment steps (one frame each) to record.
    """
    # Frames are taken straight from render(), so no RecordVideo wrapper or
    # scratch directory is needed.
    env = gym.make("CartPole-v1", render_mode="rgb_array")

    frames = []
    episode, step = 0, 0
    try:
        obs, info = env.reset()

        for _ in range(steps):
            action, _state = model.predict(obs)
            obs, reward, terminated, truncated, info = env.step(action)

            # copy() gives a writable, C-contiguous array so cv2.putText can
            # draw on it in place whatever memory layout render() returns.
            frame = env.render().copy()

            # --- Overlay text (small, anti-aliased) ---
            cv2.putText(frame, f"Ep: {episode}", (10, 20),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 1, cv2.LINE_AA)
            cv2.putText(frame, f"Step: {step}", (10, 40),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 1, cv2.LINE_AA)

            frames.append(frame)
            step += 1

            if terminated or truncated:
                episode += 1
                step = 0
                obs, info = env.reset()
    finally:
        # Release the environment even if the rollout raises.
        env.close()

    # os.path.dirname() is "" for a bare file name, and os.makedirs("") raises,
    # so only create the directory when there is one.
    out_dir = os.path.dirname(video_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Save a single video file
    imageio.mimsave(video_path, frames, fps=30, macro_block_size=None)
    print(f"Saved video to {video_path}")

In [7]:
# Destination file for the recorded rollout
VIDEO_PATH = "videos/cartpole_demo.mp4"

# Ensure the output folder exists before recording
os.makedirs(os.path.dirname(VIDEO_PATH), exist_ok=True)

# Run the loaded model for 1000 steps and write the frames to VIDEO_PATH
record_cartpole_video(model, VIDEO_PATH, steps=1000)

  from pkg_resources import resource_stream, resource_exists


Saved video to videos/cartpole_demo.mp4


In [9]:
# Display the saved video inline; embed=True is passed so the video data is embedded in the output
Video(VIDEO_PATH, embed=True)