<a href="https://colab.research.google.com/github/adamclark/jupyter-notebooks/blob/main/RL/delayed-reward/RL_delayed_reward_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install stable-baselines3 -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m184.5/184.5 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m42.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m28.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import gymnasium as gym
import numpy as np
from gymnasium import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import BaseCallback

In [10]:
# 🛠️ Custom environment with only terminal reward
class TerminalRewardEnv(gym.Env):
    def __init__(self):
        super(TerminalRewardEnv, self).__init__()
        self.observation_space = spaces.Box(low=0, high=1, shape=(4,), dtype=np.float32)
        self.action_space = spaces.Discrete(2)
        self.current_step = 0
        self.max_steps = 20

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        self.current_step = 0
        return self._get_obs(), {}

    def step(self, action):
        self.current_step += 1
        done = self.current_step >= self.max_steps
        truncated = False  # Use True if episode truncated by timeout or external reason

        reward = 0.0
        if done:
            # Reward is only given at the end of the episode
            reward = np.random.choice([1.0, 0.0], p=[0.3, 0.7])  # Random success
        return self._get_obs(), reward, done, truncated, {}

    def _get_obs(self):
        return np.random.rand(4).astype(np.float32)

    def render(self, mode='human'):
        print(f"Step: {self.current_step}, State: {self.state}")

# 📋 Callback to print final reward of each episode
class TerminalRewardLogger(BaseCallback):
    def __init__(self, verbose=0):
        super().__init__(verbose)
        self.episode_rewards = []
        self.episode_step_count = 0

    def _on_step(self) -> bool:
        self.episode_step_count += 1
        done = self.locals["dones"][0]
        reward = self.locals["rewards"][0]
        if done:
            self.episode_rewards.append(reward)
            print(f"Episode {len(self.episode_rewards)} took {self.episode_step_count} steps and ended with reward: {reward}")
            self.episode_step_count = 0
        return True

In [12]:
# 🎮 Instantiate env and model
env = TerminalRewardEnv()
model = PPO("MlpPolicy", env, verbose=0)

# 🚀 Train with reward logger
logger = TerminalRewardLogger()
model.learn(total_timesteps=5000, callback=logger)

Episode 1 took 20 steps and ended with reward: 1.0
Episode 2 took 20 steps and ended with reward: 0.0
Episode 3 took 20 steps and ended with reward: 0.0
Episode 4 took 20 steps and ended with reward: 0.0
Episode 5 took 20 steps and ended with reward: 1.0
Episode 6 took 20 steps and ended with reward: 0.0
Episode 7 took 20 steps and ended with reward: 1.0
Episode 8 took 20 steps and ended with reward: 0.0
Episode 9 took 20 steps and ended with reward: 0.0
Episode 10 took 20 steps and ended with reward: 0.0
Episode 11 took 20 steps and ended with reward: 0.0
Episode 12 took 20 steps and ended with reward: 0.0
Episode 13 took 20 steps and ended with reward: 0.0
Episode 14 took 20 steps and ended with reward: 1.0
Episode 15 took 20 steps and ended with reward: 0.0
Episode 16 took 20 steps and ended with reward: 0.0
Episode 17 took 20 steps and ended with reward: 1.0
Episode 18 took 20 steps and ended with reward: 0.0
Episode 19 took 20 steps and ended with reward: 0.0
Episode 20 took 20 st

<stable_baselines3.ppo.ppo.PPO at 0x79a9984fb950>

In [13]:
obs, info = env.reset()  # Reset environment and get initial observation

for _ in range(20):  # Run for 20 steps (or full episode length)
    action, _states = model.predict(obs, deterministic=True)  # Predict action given observation
    obs, reward, done, truncated, info = env.step(action)  # Take action in environment
    env.render()  # (Optional) render the environment if supported

    if done:
        print("Episode finished")
        break

AttributeError: 'TerminalRewardEnv' object has no attribute 'state'