In [1]:
# Import Reinforcement Learning Algorithms
from stable_baselines3 import A2C, DDPG, DQN, HER, PPO, SAC, TD3
from sb3_contrib import ARS, MaskablePPO, QRDQN, RecurrentPPO, TQC, TRPO

# Import Logger and Recorder
from stable_baselines3.common.logger import configure
from stable_baselines3.common.vec_env import VecVideoRecorder

# Import OpenAI Gym & Other Important Libraries
import gymnasium as gym
import time
from typing import Type

# Import Data Analysis & Visualization Libraries
import numpy  as np
import pandas as pd
import matplotlib.pyplot as plt

2023-09-05 11:12:30.189093: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
def test_algorithm(alg: Type['abc.ABCMeta'], env_name : str, timesteps : int):

    """
    Description:
        This function takes an SB3 Reinforcement Learning Algorithm as a parameter 
        and trains the specified OpenAI Gym environment using this algoirthm. The 
        trained model is then executed and a video recording of the agent is saved.

    Parameters:
        - alg (abc.ABCMeta) : Indicates which algorithm to use for training
        - env_name (str)    : Name of the Environment to test in
        - timesteps (int)   : Number of Timesteps for the Learning Stage

    Returns:
        - timesteps (int)      : Number of Timesteps the trained agent performed during execution
        - total_time (float)   : Number of Seconds the Training / Execution Took
        - total_reward (float) : The Total Reward From the Execution Stage of the Trained Agent
    """

    # Start Algorithm Timer
    alg_start = time.time()
    
    # Create environment
    env = None
    try: env = gym.make(env_name, render_mode = "rgb_array")
    except: raise ValueError(f"Unknown Environment: {env_name}")

    # Instantiate the agent
    model = None
    try: model = alg("MlpPolicy", env, verbose = 0)
    except: raise ValueError(f"Unknown Algorithm: {alg}")

    # Set up a new logger
    print("Setting up Logger:")
    log_path = f'./{env_name}/{alg.__name__}/log/'
    formats = ["stdout", "csv", "tensorboard"]
    new_logger = configure(log_path, formats)
    model.set_logger(new_logger)
    
    # Train the agent and display a progress bar
    model.learn(total_timesteps = int(timesteps), progress_bar = True)
    
    # Initialize total reward and timesteps
    total_rew : float = 0
    timesteps : int   = 0
    
    # Create a vectorized environment for recording
    vec_env = VecVideoRecorder(
        venv                 = model.get_env(),
        video_folder         = f'./{env_name}/{alg.__name__}/videos/',
        record_video_trigger = 10_000,
        video_length         = 10_000,
    )
    
    # Execute the Trained Agent
    obs = vec_env.reset()
    while True:
        action, _states = model.predict(obs, deterministic=True)
        obs, rewards, done, info = vec_env.step(action)
        vec_env.render("rgb_array")
        total_rew += rewards
        timesteps += 1
        if done: break
    
    # Close the vectorized Environment - Stops Recording
    vec_env.close()

    # Stop the Algorithm Timer
    total_time = time.time() - alg_start

    # Return the timesteps, execution time, and reward
    return timesteps, total_time, total_rew[0]

    # Delete Local Variables for Next Iteration
    del model, env, vec_env

In [3]:
# ------------------------------------------------------------------
# Bookkeeping state shared with the cells below -- leave these as-is
# ------------------------------------------------------------------

results_dict = dict()   # Per-algorithm result summary, keyed by algorithm name
test_num     = 1        # Number of the test currently being run
it           = 0        # Cursor into the `colors` list used when plotting

# Single-letter matplotlib colour codes, one per plotted algorithm
colors = list("bgrcmyk")


# ------------------------------------------------------------------
# Experiment settings -- edit these to change what gets run
# ------------------------------------------------------------------

train_steps = 500_000            # Learning budget (timesteps) per algorithm
env_name    = "LunarLander-v2"   # Gym environment id to train and evaluate on

# Algorithm classes to compare, in the order they will be tested
rl_algs = [A2C, DQN, PPO, TRPO]

In [None]:
# Start the Program Timer
prog_start = time.time()

# Test each RL algorithm with the defined parameters.
# The test number comes from enumerate() rather than a manually incremented
# global, so re-running this cell always starts again at Test #1.
for test_num, alg in enumerate(rl_algs, start = 1):

    # Program Status Output
    print("=============================================")
    print(f"Starting Test #{test_num}")
    print(f"   Environment: {env_name}")
    print(f"   Algorithm:   {alg.__name__}")
    print(f"   Timesteps:   {int(train_steps)}")
    print("=============================================")
    
    # Test the Algorithm with the defined parameters
    total_timesteps, total_time, total_reward = test_algorithm(
        alg       = alg,
        env_name  = env_name,
        timesteps = train_steps
    )

    # Add the Results to the Results Dictionary (stored as a printable summary,
    # keyed by the algorithm's class name)
    results_dict[alg.__name__] = ("--------------------\n"
            f"The {alg.__name__} algorithm achieved a total reward of {total_reward:.2f} in {total_timesteps} timesteps.\n"
            f"The {alg.__name__} algorithm took {total_time:.2f} s ({(total_time/60):.2f} min) to execute.\n")

    # Print the Results
    print(results_dict[alg.__name__])

# Stop the Program Timer (total wall-clock seconds across all algorithms)
tot_time = time.time() - prog_start

Starting Test #1
   Environment: LunarLander-v2
   Algorithm:   A2C
   Timesteps:   500000
Setting up Logger:
Logging to ./LunarLander-v2/A2C/log/


Output()

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 94.4     |
|    ep_rew_mean        | -189     |
| time/                 |          |
|    fps                | 517      |
|    iterations         | 100      |
|    time_elapsed       | 0        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -1.28    |
|    explained_variance | 0.0149   |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | -9.22    |
|    value_loss         | 55.1     |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 101      |
|    ep_rew_mean        | -214     |
| time/                 |          |
|    fps                | 620      |
|    iterations         | 200      |
|    time_elapsed       | 1        |
|    total_timesteps    | 1000     |
| train/                |          |
|

Moviepy - Building video /Users/aadhilmsyed839/Documents/Seeloz/notebooks/LunarLander-v2/A2C/videos/rl-video-step-0-to-step-10000.mp4.
Moviepy - Writing video /Users/aadhilmsyed839/Documents/Seeloz/notebooks/LunarLander-v2/A2C/videos/rl-video-step-0-to-step-10000.mp4



                                                                                                                                                                                                                                              

Moviepy - Done !
Moviepy - video ready /Users/aadhilmsyed839/Documents/Seeloz/notebooks/LunarLander-v2/A2C/videos/rl-video-step-0-to-step-10000.mp4
--------------------
The A2C algorithm achieved a total reward of -46.90 in 1000 timesteps.
The A2C algorithm took 1164.83 s (19.41 min) to execute.

Starting Test #2
   Environment: LunarLander-v2
   Algorithm:   DQN
   Timesteps:   500000
Setting up Logger:
Logging to ./LunarLander-v2/DQN/log/


Output()

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 97.2     |
|    ep_rew_mean      | -224     |
|    exploration_rate | 0.993    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 2460     |
|    time_elapsed     | 0        |
|    total_timesteps  | 389      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 94       |
|    ep_rew_mean      | -167     |
|    exploration_rate | 0.986    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 2872     |
|    time_elapsed     | 0        |
|    total_timesteps  | 752      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 91.4     |
|    ep_rew_mean      | -173     |
|    exploration_rate | 0.979    |
| time/               |          |
|    episodes       

Moviepy - Building video /Users/aadhilmsyed839/Documents/Seeloz/notebooks/LunarLander-v2/DQN/videos/rl-video-step-0-to-step-10000.mp4.
Moviepy - Writing video /Users/aadhilmsyed839/Documents/Seeloz/notebooks/LunarLander-v2/DQN/videos/rl-video-step-0-to-step-10000.mp4



                                                                                                                                                                                                                                              

Moviepy - Done !
Moviepy - video ready /Users/aadhilmsyed839/Documents/Seeloz/notebooks/LunarLander-v2/DQN/videos/rl-video-step-0-to-step-10000.mp4
--------------------
The DQN algorithm achieved a total reward of -81.82 in 260 timesteps.
The DQN algorithm took 3928.94 s (65.48 min) to execute.

Starting Test #3
   Environment: LunarLander-v2
   Algorithm:   PPO
   Timesteps:   500000
Setting up Logger:
Logging to ./LunarLander-v2/PPO/log/


Output()

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 93.1     |
|    ep_rew_mean     | -186     |
| time/              |          |
|    fps             | 1345     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 2048     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 92           |
|    ep_rew_mean          | -168         |
| time/                   |              |
|    fps                  | 3            |
|    iterations           | 2            |
|    time_elapsed         | 1080         |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0050799632 |
|    clip_fraction        | 0.0164       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.38        |
|    explained_variance   | -0.0048      |
|    learning_r

In [None]:
# Print the Results Dictionary HEADER
print("\n=============================================")
print("                Final Results                ")
print("=============================================\n")

# One shared figure: every algorithm's learning curve is drawn on the same axes
fig, ax = plt.subplots()

# The colour index is derived from the loop position instead of a global cursor,
# so re-running this cell assigns the same colour to each algorithm every time.
for color_idx, alg in enumerate(rl_algs):

    # Print the Result of Each Algorithm
    print(f"{alg.__name__}:\n{results_dict[alg.__name__]}\n")

    # Read the Progress CSV file written by the training logger for this algorithm
    filename = f'./{env_name}/{alg.__name__}/log/progress.csv'
    df = pd.read_csv(filename)

    # Extract the Necessary Columns
    x = df['time/total_timesteps']
    y = df['rollout/ep_rew_mean']

    # Add the Data to the Plot (colours wrap around if there are more
    # algorithms than entries in `colors`)
    ax.plot(x, y, label = alg.__name__, color = colors[color_idx % len(colors)], linestyle='-', marker='o')

# Add Labels and Title to the Plot; the title follows the configured environment
ax.set_xlabel('Total Timesteps')
ax.set_ylabel('Total Mean Reward')
ax.set_title(f'Algorithm Performance for {env_name} Environment')

# Display the Plot and its Legend
ax.legend()
plt.show()

# Print the Termination Message
print("\n=============================================")
print(f"This program took {(tot_time/60):.2f} mins to execute.\n")
print("Terminating Program...")
print("=============================================\n")