# SAC-Dyna-Q Algorithm

## Initialization
1. Initialize environment $ \text{env} $.
2. Initialize SAC agent $ \pi_\theta, Q_\phi^1, Q_\phi^2 $ and replay buffer $ \mathcal{D} $.
3. Set $ N_\text{real\_episodes}, N_\text{synthetic\_samples} $.

## Training Loop
For $ \text{episode} $ in $ N_\text{real\_episodes} $:
1. **Collect Real Data:**
   - Reset environment $ s_0 \leftarrow \text{env.reset()} $.
   - For each step in the environment:
     1. Select $ a_t \sim \pi_\theta(a|s_t) $.
     2. Execute $ a_t $, observe $ s_{t+1}, r_t, \text{done} $.
     3. Store $ (s_t, a_t, r_t, s_{t+1}, \text{done}) $ in $ \mathcal{D} $.

2. **Model-Free SAC Update:**
   - Sample $ (s, a, r, s', \text{done}) $ from $ \mathcal{D} $.
   - Update $ Q_\phi^1, Q_\phi^2, \pi_\theta $ using SAC objectives.

3. **Generate Synthetic Data:**
   - For $ i $ in $ N_\text{synthetic\_samples} $:
     1. Sample $ s $ from $ \mathcal{D} $ or the observation space.
     2. Predict $ a \sim \pi_\theta(a|s) $.
     3. Simulate $ s', r $ using the known dynamics:
        $$
        v_{t+1} = v_t + 0.0015 a_t - 0.0025 \cos(3 x_t), \quad x_{t+1} = x_t + v_t
        $$
     4. Store $ (s, a, r, s', \text{done}) $ in $ \mathcal{D} $.

4. **SAC Update with Synthetic Data:**
   - Repeat step 2 with synthetic data added to $ \mathcal{D} $.

## Testing
- Evaluate $ \pi_\theta $ by running multiple episodes in the real environment.
- Compute the average reward over all episodes.

In [235]:
from os import path
from typing import Optional

import numpy as np

import gymnasium as gym
from gymnasium import spaces
from gymnasium.envs.classic_control import utils

class ModelEnv(gym.Env):

    def __init__(self, render_mode: Optional[str] = None, g=10.0):
        self.max_speed = 8
        self.max_torque = 2.0
        self.dt = 0.05
        self.g = g
        self.m = 1.0
        self.l = 1.0

        self.render_mode = render_mode

        self.screen_dim = 500
        self.screen = None
        self.clock = None
        self.isopen = True

        high = np.array([1.0, 1.0, self.max_speed], dtype=np.float32)
        # This will throw a warning in tests/envs/test_envs in utils/env_checker.py as the space is not symmetric
        #   or normalised as max_torque == 2 by default. Ignoring the issue here as the default settings are too old
        #   to update to follow the gymnasium api
        self.action_space = spaces.Box(
            low=-self.max_torque, high=self.max_torque, shape=(1,), dtype=np.float32
        )
        self.observation_space = spaces.Box(low=-high, high=high, dtype=np.float32)

    def step(self, u):
        th, thdot = self.state  # th := theta

        g = self.g
        m = self.m
        l = self.l
        dt = self.dt

        u = np.clip(u, -self.max_torque, self.max_torque)[0]
        self.last_u = u  # for rendering
        costs = angle_normalize(th) ** 2 + 0.1 * thdot**2 + 0.001 * (u**2)

        newthdot = thdot + (3 * g / (2 * l) * np.sin(th) + 3.0 / (m * l**2) * u) * dt
        newthdot = np.clip(newthdot, -self.max_speed, self.max_speed)
        newth = th + newthdot * dt

        self.state = np.array([newth, newthdot])

        # truncation=False as the time limit is handled by the `TimeLimit` wrapper added during `make`
        return self._get_obs(), -costs, False, False, {}

    def reset(self, *, seed: Optional[int] = None, options: Optional[dict] = None):
        super().reset(seed=seed)
        if options is None:
            high = np.array([np.pi, 1.])
        low = -high  # We enforce symmetric limits.
        self.state = self.np_random.uniform(low=low, high=high)
        self.last_u = None

        return self._get_obs(), {}

    def _get_obs(self):
        theta, thetadot = self.state
        return np.array([np.cos(theta), np.sin(theta), thetadot], dtype=np.float32)

    def render(self):
        pass


def angle_normalize(x):
    return ((x + np.pi) % (2 * np.pi)) - np.pi

In [236]:
from stable_baselines3 import SAC
import numpy as np
import gymnasium as gym

def generate_synthetic_data(env, policy, replay_buffer, num_samples):
    """
    Generate synthetic transitions using the ModelEnv and add them to the SAC replay buffer.

    Parameters:
    - env: The ModelEnv instance used to simulate transitions.
    - policy: The SAC policy used to predict actions.
    - replay_buffer: The SAC replay buffer where synthetic data will be stored.
    - num_samples: Number of synthetic transitions to generate.
    """
    for _ in range(num_samples):
        # Reset the environment to a random initial state
        state, _ = env.reset()

        # Predict action using the current policy
        action = policy.predict(state, deterministic=False)[0]

        # Step through the environment
        next_state, reward, terminated, truncated, _ = env.step(action)

        if terminated or truncated:
            done = True
        else:
            done = False

        # Add synthetic transition to the replay buffer (correct argument order)
        replay_buffer.add(state, next_state, action, reward, done, [{}])  # Correct argument order


In [243]:
from stable_baselines3.common.logger import configure

# Create the real environment (MountainCarContinuous-v0)
real_env = gym.make("Pendulum-v1")

# Create the planning environment (ModelEnv)
model_env = ModelEnv()

# Configure logger
logger = configure("./logs", ["stdout", "csv"])

# Initialize the SAC agent
model = SAC(
    "MlpPolicy",
    real_env,
    buffer_size=10000,
    # learning_rate=1e-3,
    verbose=2,
)

model.set_logger(logger)  # Ensure logger is set

# Training parameters
num_real_episodes = 20 # use 40 without synthetic data
num_synthetic_samples = 100 
steps_per_episode = 200  # Max steps per episode
gradient_steps = 2

def train():
    # Training loop
    for episode in range(num_real_episodes):
        print(f"Episode {episode + 1}/{num_real_episodes}")

        total_reward = 0.
        # Real environment interaction
        state, _ = real_env.reset()

        for _ in range(steps_per_episode):
            # Predict action using the current policy
            action, _ = model.predict(state, deterministic=True)

            # Take action in the real environment
            next_state, reward, done, truncated, _ = real_env.step(action)
            # Store real transition in the replay buffer
            model.replay_buffer.add(state, next_state, action, reward, done, [{}])

            # Update the agent using real data
            model.train(gradient_steps=gradient_steps)  # Specify the number of gradient steps

            state = next_state
            total_reward += reward
            if done or truncated:
                break

        print(total_reward)
        # Generate synthetic data for planning
        generate_synthetic_data(model_env, model.policy, model.replay_buffer, num_synthetic_samples)

        # Update the agent using synthetic data
        for _ in range(num_synthetic_samples):
            model.train(gradient_steps=gradient_steps) # Specify the number of gradient steps


Logging to ./logs
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [244]:
train()

Episode 1/20
-1616.9456392136676
Episode 2/20
-1260.8314701933825
Episode 3/20
-1498.4258610966913
Episode 4/20
-1004.0343376745576
Episode 5/20
-1310.1021712788888
Episode 6/20
-1182.2698668967444
Episode 7/20
-1269.0745698775866
Episode 8/20
-948.7601088815923
Episode 9/20
-3.3820523950600925
Episode 10/20
-882.1633047755436
Episode 11/20
-125.80697756636599
Episode 12/20
-392.51316708245866
Episode 13/20
-126.38799574354539
Episode 14/20
-449.9740361074566
Episode 15/20
-123.92257508164793
Episode 16/20
-384.0280549167835
Episode 17/20
-245.66296672527835
Episode 18/20
-114.83711341060992
Episode 19/20
-126.67620911377342
Episode 20/20
-251.446452651077


In [245]:
# Evaluate the trained agent
real_env = gym.make("Pendulum-v1", render_mode="human")
# real_env = gym.make("Pendulum-v1")
total_rewards = []
for _ in range(1):  # Evaluate for 10 episodes
    state, _ = real_env.reset()
    total_reward = 0
    done = False
    while not done:
        action, _ = model.predict(state, deterministic=True)
        action = np.array(action, dtype=np.float32).reshape(real_env.action_space.shape)
        state, reward, terminated, truncated, _ = real_env.step(action)
        total_reward += reward
        real_env.render()
        if terminated or truncated:
            done = True
    total_rewards.append(total_reward)

print(f"Average reward over 100 evaluation episodes: {np.mean(total_rewards)}")

Average reward over 100 evaluation episodes: -371.866648247885


In [110]:
import gymnasium as gym
from stable_baselines3 import PPO

def main():
    # Create the environment
    env = gym.make("Pendulum-v1")

    # Initialize the PPO model
    model = PPO(
        "MlpPolicy",  # Use a Multi-Layer Perceptron policy
        env,
        learning_rate=1e-3,
        verbose=1,
    )

    # Train the model
    model.learn(total_timesteps=100000)

    # Save the trained model
    model.save("ppo_mountaincar_continuous")

    # Load the trained model
    model = PPO.load("ppo_mountaincar_continuous")

    # Evaluate the model
    episodes = 100
    for ep in range(episodes):
        obs, _ = env.reset()
        done = False
        total_reward = 0
        while not done:
            action, _ = model.predict(obs, deterministic=True)
            obs, reward, done, truncated, _ = env.step(action)
            total_reward += reward
            if done or truncated:
                break
        print(f"Episode {ep + 1}: Total Reward = {total_reward}")

if __name__ == "__main__":
    main()

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/           |           |
|    ep_len_mean     | 200       |
|    ep_rew_mean     | -1.46e+03 |
| time/              |           |
|    fps             | 7058      |
|    iterations      | 1         |
|    time_elapsed    | 0         |
|    total_timesteps | 2048      |
----------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 200          |
|    ep_rew_mean          | -1.35e+03    |
| time/                   |              |
|    fps                  | 4785         |
|    iterations           | 2            |
|    time_elapsed         | 0            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0037082315 |
|    clip_fraction        | 0.033        |
|    clip_range           | 0.2         