In [1]:
import gymnasium as gym
from gymnasium.wrappers import TransformReward
from stable_baselines3 import DDPG, PPO
from stable_baselines3.common.noise import OrnsteinUhlenbeckActionNoise
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
import numpy as np
import torch
import os

# Directory handed to the Monitor wrapper further down; created up front so
# logging never fails on a missing path.
log_dir = "./ddpg_logs/"
os.makedirs(log_dir, exist_ok=True)

# Preferred torch backend, in priority order: CUDA, then Apple MPS, then CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

### Energy-Based Reward (Often Considered Best):

This approach is theoretically well-grounded and directly encourages the physics needed to solve the task: the agent needs to build up potential energy (height) and kinetic energy (velocity).

- Concept: Reward the agent based on its mechanical energy (potential + kinetic). Potential energy is proportional to height (sin(3 * position) in this environment's dynamics), and kinetic energy is proportional to velocity^2.
- Potential Function (`phi(state)`): Define a potential function based on energy. A common form is:
  `phi(position, velocity) = C1 * sin(3 * position) + C2 * velocity**2`
  where `C1` and `C2` are positive scaling constants that you need to tune (e.g., `C1=1, C2=1` or `C1=10, C2=1`).
- Shaped Reward (`r_shaped`): Use potential-based reward shaping. The reward at each step is the original reward plus the discounted change in potential, where `gamma` is the RL algorithm's discount factor:
  `r_shaped = r_original + gamma * phi(next_position, next_velocity) - phi(position, velocity)`
- Implementation:
  - Remove the original action penalty (so `r_original = 0` except at the goal).
  - Keep the +100 goal reward.
  - The resulting reward is `reward = (100 if goal_reached else 0) + gamma * phi(next_state) - phi(current_state)`.


**Why it's good: Directly rewards increasing energy, which is precisely the strategy needed to climb the hill. It provides dense feedback on every step based on progress in energy.**

In [2]:
class MountainCarContinuousRewardWrapper(gym.Wrapper):
    """Energy-based potential shaping for ``MountainCarContinuous-v0``.

    The environment's own reward is replaced by::

        r = goal_bonus * [goal reached] + gamma * phi(s') - phi(s)
        phi(position, velocity) = c1 * sin(3 * position) + c2 * 0.5 * velocity**2

    The unshaped environment reward is preserved in ``info["original_reward"]``.

    Args:
        env: Environment to wrap. Observations must index as
            ``(position, velocity)``.
        gamma: Discount applied to the next-state potential. It should match
            the discount factor of the learning algorithm.
        c1: Weight of the height term ``sin(3 * position)``.
        c2: Weight of the kinetic term ``0.5 * velocity**2``.
        goal_position: Position threshold used to detect the goal only when
            ``reward()`` is called without a ``terminated`` flag.
        goal_bonus: Reward added on the step that reaches the goal.
    """

    def __init__(self, env, gamma=0.99, c1=10.0, c2=1.0,
                 goal_position=0.45, goal_bonus=100.0):
        super().__init__(env)
        self.gamma = gamma
        self.c1 = c1
        self.c2 = c2
        self.goal_position = goal_position
        self.goal_bonus = goal_bonus
        # State seen on the previous step; filled in by reset().
        self.prev_pos = None
        self.prev_vel = None

    def reset(self, **kwargs):
        """Reset the wrapped env and remember the initial state for shaping."""
        observation, info = self.env.reset(**kwargs)
        # Stored as Python floats so the potential is evaluated in double
        # precision regardless of the observation dtype.
        self.prev_pos = float(observation[0])
        self.prev_vel = float(observation[1])
        return observation, info

    def potential(self, position, velocity):
        """Return ``c1 * sin(3 * position) + c2 * 0.5 * velocity**2``."""
        # Height approximation based on environment dynamics
        height = np.sin(3 * position)
        # Kinetic energy approximation; mass is absorbed into c2.
        kinetic_energy = 0.5 * velocity**2
        return self.c1 * height + self.c2 * kinetic_energy

    def reward(self, reward, obs, terminated=None):
        """Compute the shaped reward for the transition ending in ``obs``.

        Args:
            reward: Reward from the wrapped env. It is ignored here; only the
                goal bonus is re-added.
            obs: Observation after the step, indexable as (position, velocity).
            terminated: Termination flag from the wrapped env. When given, it
                is the goal signal; when ``None`` the goal is detected with
                ``position >= goal_position``.

        Returns:
            The shaped reward as a Python float.
        """
        position, velocity = float(obs[0]), float(obs[1])

        if terminated is None:
            goal_reached = position >= self.goal_position
        else:
            # Trust the env's own goal test instead of duplicating its threshold.
            goal_reached = bool(terminated)

        # Original reward (only the goal part matters)
        original_reward = self.goal_bonus if goal_reached else 0.0

        # Potentials before and after the transition.
        prev_potential = self.potential(self.prev_pos, self.prev_vel)
        current_potential = self.potential(position, velocity)

        # Update history for next step
        self.prev_pos = position
        self.prev_vel = velocity

        # The shaping term is also applied on the final transition of an
        # episode; the potential of the terminal state is not zeroed.
        return float(original_reward + self.gamma * current_potential - prev_potential)

    def step(self, action):
        """Step the wrapped env and swap its reward for the shaped one."""
        obs, reward, terminated, truncated, info = super().step(action)
        info["original_reward"] = reward  # Store original reward

        reward = self.reward(reward, obs, terminated)

        return obs, reward, terminated, truncated, info

In [3]:
# Training uses a single, non-vectorized environment. The commented-out line
# below is the vectorized alternative (or use `make_vec_env` for parallelism).
# env = gym.make_vec("MountainCarContinuous-v0", num_envs=4)  # Classic continuous control task
env = gym.make("MountainCarContinuous-v0")

def shaped_reward(reward):
    """Hand-tuned reward shaping, intended for use with ``TransformReward``.

    Reads the current state from the module-level ``env`` and returns
    ``position + 0.1 * velocity + reward + goal_bonus``, where ``goal_bonus``
    is 100 once ``position >= 0.45`` and 0 otherwise.

    Args:
        reward: Reward produced by the underlying environment for this step.

    Returns:
        The shaped reward.
    """
    state = env.unwrapped.state
    position = state[0]
    velocity = state[1]
    base_shaping = position + 0.1 * velocity
    # The unused `_elapsed_steps` lookup was removed: that counter belongs to
    # the TimeLimit wrapper, not to the unwrapped env, so reading it here fails.

    goal_bonus = 0.0
    if position >= 0.45:
        goal_bonus = 100.0  # Manual reward for reaching the goal

    return base_shaping + reward + goal_bonus

# env = TransformReward(env, shaped_reward)
# Shape the reward first, then add Monitor on the outside so the episode
# statistics written to `log_dir` reflect the shaped reward.
env = Monitor(MountainCarContinuousRewardWrapper(env), log_dir)

model = PPO(
    policy="MlpPolicy",
    env=env,
    # Optimisation
    learning_rate=3e-4,  # PPO's default LR
    n_steps=2048,        # Steps per environment per update
    batch_size=64,
    n_epochs=10,         # Number of optimization epochs per update
    # Objective
    gamma=0.99,
    clip_range=0.2,      # PPO's clipping parameter
    ent_coef=0.01,       # Encourages exploration
    # Runtime / logging
    device="auto",
    tensorboard_log="./ppo_tensorboard/",
    verbose=0,
)

# Train, then persist the policy to "ppo_mountaincar" and release the env.
model.learn(total_timesteps=200_000, progress_bar=True)
model.save("ppo_mountaincar")
env.close()



Output()

In [None]:
# Evaluate the trained agent on the unshaped environment, so the reported
# return is measured with the task's original reward.
eval_env = Monitor(gym.make("MountainCarContinuous-v0"))
try:
    model = PPO.load("ppo_mountaincar")
    mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10)
finally:
    # Release the environment even if loading or evaluation raises.
    eval_env.close()

print(f"Mean reward: {mean_reward:.2f} ± {std_reward:.2f}")


Mean reward: 93.25 ± 0.62


In [5]:
# Visualize the trained agent
def visualize_agent(model, env, episodes=5):
    """Roll out ``model`` in ``env`` and render every step.

    Runs ``episodes`` episodes with deterministic actions. If rendering
    raises, a message is printed and the rollout stops early. The
    environment is closed when the function exits, whatever the outcome.

    Args:
        model: Object exposing ``predict(obs, deterministic=True)`` that
            returns ``(action, state)``.
        env: Environment exposing ``reset``, ``step``, ``render`` and ``close``.
        episodes: Number of episodes to play.
    """
    try:
        for _ in range(episodes):
            observation, _info = env.reset()
            while True:
                action, _state = model.predict(observation, deterministic=True)
                observation, _reward, terminated, truncated, _info = env.step(action)
                episode_over = terminated or truncated
                try:
                    env.render()
                except Exception as e:
                    print(f"Render failed (probably closed window): {e}")
                    return  # Stop visualizing; the finally clause still closes env
                if episode_over:
                    break
    finally:
        env.close()

# Create a new environment for visualization (with rendering).
# `model` is the policy loaded in an earlier cell; `visualize_agent` closes
# `vis_env` itself when it returns, so no explicit close is needed here.
vis_env = gym.make("MountainCarContinuous-v0", render_mode="human")
visualize_agent(model, vis_env, episodes=1)

2025-04-27 19:28:03.715 python[97076:14017939] +[IMKClient subclass]: chose IMKClient_Modern
2025-04-27 19:28:03.715 python[97076:14017939] +[IMKInputSession subclass]: chose IMKInputSession_Modern


In [None]:
from src.generate_demonstrations import collect_paired_demonstrations

print("--- Running Example ---")

# Define parameters based on the user request
MODEL_PATH = "ppo_mountaincar.zip" # Use Continuous version
ENV_ID = "MountainCarContinuous-v0"
DIR_NAME = "ppo_mountaincar_continuous_rollouts"
NUM_EPISODES = 5
# NOTE: the two settings below are not passed to collect_paired_demonstrations
# in this cell; they are kept so existing references to them keep working.
CSV_FILE = "ppo_mountaincar_continuous_rollouts.csv"
DETERMINISTIC_ROLLOUT = False # Use stochastic actions for variety

device = "cuda" if torch.cuda.is_available() else "cpu"
# NOTE(review): both models are loaded from the same checkpoint, while the
# recorded output labels the two rollouts "partial" and "full" — confirm which
# positional argument of collect_paired_demonstrations is which.
model = PPO.load(MODEL_PATH, device=device)
model_2 = PPO.load(MODEL_PATH, device=device)

print(f"Model loaded from {MODEL_PATH}")

# Create environment
env = gym.make(ENV_ID)
# NOTE: we use the original reward

# The message now names the function that is actually called.
print("\nCalling collect_paired_demonstrations...")

try:
    collect_paired_demonstrations(
        model_2,
        model,
        env,
        DIR_NAME,
        NUM_EPISODES,  # was a literal 5 that ignored the constant above
    )
finally:
    # Release the environment even if data collection fails.
    env.close()

--- Running Example ---
Model loaded from ppo_mountaincar.zip

Calling generate_and_save_rollouts...

Collecting data from partial model...
Model 'partial' - Episode 1/5: Return=93.49, Length=77
Model 'partial' - Episode 2/5: Return=93.67, Length=85
Model 'partial' - Episode 3/5: Return=93.02, Length=79
Model 'partial' - Episode 4/5: Return=93.54, Length=79
Model 'partial' - Episode 5/5: Return=91.54, Length=113
Saved trajectories for partial model to ppo_mountaincar_continuous_rollouts/partial_model_trajectories.csv

Collecting data from full model...
Model 'full' - Episode 1/5: Return=93.12, Length=78
Model 'full' - Episode 2/5: Return=93.26, Length=77
Model 'full' - Episode 3/5: Return=91.36, Length=110
Model 'full' - Episode 4/5: Return=93.02, Length=79
Model 'full' - Episode 5/5: Return=93.49, Length=77
Saved trajectories for full model to ppo_mountaincar_continuous_rollouts/full_model_trajectories.csv

Created 35 preference pairs based on trajectory returns.
Preference pairs save

: 