In [None]:
import os

import gymnasium as gym
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from stable_baselines3 import DDPG
from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
from stable_baselines3.common.monitor import Monitor

# Function to run Stable-Baselines3 DDPG
def run_SB3_DDPG(env_type: str, steps: int, save_dir: str) -> list:
    """Train a Stable-Baselines3 DDPG agent and return its training rewards.

    Args:
        env_type: Gymnasium environment id passed to ``gym.make``.
        steps: Total number of timesteps to train for.
        save_dir: Directory the trained model is saved into, under the
            name ``SB3_DDPG_<env_type>``.

    Returns:
        The per-episode rewards collected by the ``Monitor`` wrapper
        while the model was training.
    """
    # Monitor wraps the raw environment so episode rewards can be read back
    # after training.
    env = Monitor(gym.make(env_type))
    try:
        # The noise objects for DDPG: one noise dimension per action component.
        n_actions = env.action_space.shape[-1]
        action_noise = OrnsteinUhlenbeckActionNoise(
            mean=np.zeros(n_actions),
            sigma=0.2 * np.ones(n_actions),
        )

        model = DDPG(
            "MlpPolicy",
            env,
            action_noise=action_noise,
            batch_size=64,
            tau=0.001,
            verbose=1,
        )
        model.learn(total_timesteps=steps, log_interval=10)
        model.save(f"{save_dir}/SB3_DDPG_{env_type}")

        # training rewards
        return env.get_episode_rewards()
    finally:
        # This function is called once per environment; release the
        # simulator even when training or saving raises.
        env.close()
    

# Train SB3 DDPG on each environment, then save a reward plot and a CSV of
# the per-episode training rewards for it.
environments = ['Hopper-v5', 'HalfCheetah-v5', 'BipedalWalker-v3']
# Only referenced by the disabled custom-DDPG run inside the loop.
render_mode = ['human', 'human', 'human']
steps = 1_000  # Total training steps
save_dir = os.getcwd()

results = {}

for i, env_type in enumerate(environments):
    # # Run custom DDPG
    # print(f"Running custom DDPG on {env_type}...")
    # custom_scores = run_custom_DDPG(env_type, render_mode[i])

    # Run Stable-Baselines3 DDPG
    print(f"Running Stable-Baselines3 DDPG on {env_type}...")
    sb3_scores = run_SB3_DDPG(env_type, steps, save_dir)

    # Save results
    results[env_type] = {"sb3": sb3_scores}

    # Plot comparison on an explicit figure so exactly that figure is closed.
    episodes = np.arange(1, len(sb3_scores) + 1)
    fig, ax = plt.subplots()
    # ax.plot(np.arange(1, len(custom_scores) + 1), custom_scores, label="Custom DDPG")
    ax.plot(episodes, sb3_scores, label="SB3 DDPG")
    ax.set_ylabel('Cumulative Reward')
    ax.set_xlabel('Episodes')
    ax.set_title(f"Comparison of DDPG Implementations on {env_type}")
    ax.legend()
    fig.savefig(f"{save_dir}/comparison_{env_type}.png")
    plt.close(fig)

    # Save results to CSV right away, so a failure on a later environment
    # does not discard the rewards already collected.
    pd.DataFrame({
        "SB3_DDPG": sb3_scores
    }).to_csv(f"{save_dir}/results_comparison_{env_type}.csv", index=False)


In [None]:
#### RENDERING FROM STABLE BASELINES 3 ####

from stable_baselines3 import DDPG
import gymnasium as gym
import os

# Define environments and model paths
environments = ['Hopper-v5', 'HalfCheetah-v5', 'BipedalWalker-v3']
save_dir = os.getcwd()

# Loop through environments, load the matching saved model, and play one
# rendered episode with the deterministic policy.
for env_type in environments:
    # With render_mode="human" the environment draws itself during step(),
    # so no explicit env.render() call is needed in the loop below.
    env = gym.make(env_type, render_mode="human")
    try:
        # Load the model
        model_path = f"{save_dir}/SB3_DDPG_{env_type}.zip"
        model = DDPG.load(model_path, env=env)

        # Start evaluation
        obs, _info = env.reset()
        terminated = False
        truncated = False
        cumulative_reward = 0
        print(f"Evaluating {env_type}...")

        # An episode ends on termination OR truncation (e.g. a time limit).
        # Checking only termination would loop forever on environments that
        # end solely by truncation (HalfCheetah is documented as such).
        while not (terminated or truncated):
            # Predict action and step through the environment
            action, _states = model.predict(obs, deterministic=True)
            obs, reward, terminated, truncated, info = env.step(action)
            cumulative_reward += reward

        print(f"Cumulative reward for {env_type}: {cumulative_reward}")
    finally:
        # Always release the render window, even if loading or stepping fails.
        env.close()
