# AIPI 531 - HW 1 Q2

## Elisa Chen
NetID: eyc11

# Q1: Training An Agent To Play The Cart Pole Game

In [80]:
#!sudo apt-get update && sudo apt-get install ffmpeg freeglut3-dev xvfb  # For visualization
#!pip install pyglet==1.5.27

In [81]:
#importing necessary libraries
import stable_baselines3
import gym
import numpy as np
import os
from stable_baselines3 import PPO, A2C #policy gradient
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import VecVideoRecorder, DummyVecEnv
from stable_baselines3.common.results_plotter import load_results, ts2xy
from stable_baselines3.common.callbacks import BaseCallback, EvalCallback
import base64
from pathlib import Path
from IPython import display as ipythondisplay

## Creating the Gym environment and instantiating an agent

In [84]:
#HYPERPARAMETERS
# Gym environment id handed to gym.make() when the environments are created
game = "CartPole-v1"
# Total number of timesteps handed to model.learn() for training
training_steps = 10000

In [87]:
# Training environment
env = gym.make(game)

# RL agent: A2C with an MLP policy, logging training metrics for TensorBoard.
# A fixed seed is passed so that a Restart & Run All reproduces the same run
# (up to platform-level nondeterminism) instead of a different random one.
model = A2C('MlpPolicy', env, verbose=0, seed=0,
            tensorboard_log="./a2c_cartpole_tensorboard/")

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [88]:
# Create a separate environment for evaluation, so evaluation episodes do not
# share state with the environment the agent is trained on.

# Create log dir
log_dir = "/tmp/gym/"
os.makedirs(log_dir, exist_ok=True)

eval_env = gym.make(game)
# Wrap the freshly created evaluation env -- not the training `env` -- in the
# Monitor; wrapping `env` here would discard the new env and evaluate on the
# training environment instead.
eval_env = Monitor(eval_env, filename=log_dir, allow_early_resets=True)

# Use deterministic actions for evaluation; the callback evaluates every
# 500 steps and stores the best model and evaluation logs under ./logs/
eval_callback = EvalCallback(eval_env, best_model_save_path='./logs/',
                             log_path='./logs/', eval_freq=500,
                             deterministic=True, render=False)

# Untrained agent, before training (baseline to compare against)
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=100)

print(f"The mean_reward before training the current policy is: {mean_reward:.2f} +/- {std_reward:.2f}")

The mean_reward before training the current policy is: 8.96 +/- 0.72


## Training the Agent and Evaluating Performance

In [89]:
# Fit the agent for `training_steps` timesteps; `eval_callback` is invoked
# during training to evaluate the policy and keep the best checkpoint.
model.learn(callback=eval_callback, total_timesteps=training_steps)

Logging to ./a2c_cartpole_tensorboard/A2C_1
Eval num_timesteps=500, episode_reward=26.60 +/- 3.72
Episode length: 26.60 +/- 3.72
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 26.6     |
|    mean_reward        | 26.6     |
| time/                 |          |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -0.642   |
|    explained_variance | -0.669   |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | 1.47     |
|    value_loss         | 10.4     |
------------------------------------
New best mean reward!
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 34.4     |
|    ep_rew_mean     | 34.4     |
| time/              |          |
|    fps             | 894      |
|    iterations      | 100      |
|    time_elapsed    | 0        |
|    total_timesteps | 500      |
-------------------------

<stable_baselines3.a2c.a2c.A2C at 0x7f53d79ed7c0>

## Monitor Training with Tensorboard

In [90]:
!tensorboard --logdir ./a2c_cartpole_tensorboard/

TensorFlow installation not found - running with reduced feature set.

NOTE: Using experimental fast data loading logic. To disable, pass
    "--load_fast=false" and report issues on GitHub. More details:
    https://github.com/tensorflow/tensorboard/issues/4784

Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.11.2 at http://localhost:6006/ (Press CTRL+C to quit)
^C


## Training Evaluation

In [91]:
# Evaluate the trained agent

# Reload the best checkpoint that was written to ./logs/ during training
model = A2C.load("./logs/best_model.zip")

# Average the episode reward over 100 evaluation episodes
mean_reward, std_reward = evaluate_policy(model=model, env=eval_env, n_eval_episodes=100)

print(f"The mean_reward after training the model:{mean_reward:.2f} +/- {std_reward:.2f}")

The mean_reward after training the model:497.57 +/- 20.52


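As we can observe, the mean reward has increased significantly after training the model: in the run shown above it rose from 8.96 +/- 0.72 before training to 497.57 +/- 20.52 after training (each averaged over 100 evaluation episodes), which is what we'd hope to see.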

## Saving to Video

In [92]:
# Start an Xvfb server on display :1 with a 1024x768x24 screen; the trailing
# `&` runs it in the background so this cell returns immediately.
# NOTE(review): presumably needed so rendering for the video recording below
# works on a machine without a physical display -- confirm.
os.system("Xvfb :1 -screen 0 1024x768x24 &")
# Point this process at that display
os.environ['DISPLAY'] = ':1'

In [93]:
#helper function to record videos

def record_video(env_id, model, video_length=500, prefix='', video_folder='videos/'):
  """
  Roll out ``model`` in a freshly created environment and save the run as a video.

  :param env_id: (str) Gym environment id used to build the recording environment
  :param model: (RL model) object whose ``predict(obs)`` returns ``(action, state)``
  :param video_length: (int) number of environment steps to record
  :param prefix: (str) name prefix given to the saved video file
  :param video_folder: (str) folder the video is written to
  """
  # Build the environment from `env_id` so the helper records whichever
  # environment the caller asked for rather than a fixed one.
  eval_env = DummyVecEnv([lambda: gym.make(env_id)])
  # Start the video at step=0 and record `video_length` steps
  eval_env = VecVideoRecorder(eval_env, video_folder=video_folder,
                              record_video_trigger=lambda step: step == 0, video_length=video_length,
                              name_prefix=prefix)

  try:
    obs = eval_env.reset()
    for _ in range(video_length):
      action, _state = model.predict(obs)
      obs, _reward, _done, _info = eval_env.step(action)
  finally:
    # Always close the recorder, even if the rollout raises, so the
    # recorder and the underlying environment are released.
    eval_env.close()

#show the video

def show_videos(video_path='', prefix=''):
  """
  Embed every matching ``.mp4`` file in the notebook output as an HTML5 video.

  Taken from https://github.com/eleurent/highway-env

  :param video_path: (str) Path to the folder containing videos
  :param prefix: (str) Filter the videos, showing only those whose file name
    starts with this prefix
  """
  # One <video> element per file; the file bytes are inlined as a base64 data URI.
  video_template = '''<video alt="{}" autoplay 
                    loop controls style="height: 400px;">
                    <source src="data:video/mp4;base64,{}" type="video/mp4" />
                </video>'''
  video_tags = [
      video_template.format(mp4, base64.b64encode(mp4.read_bytes()).decode('ascii'))
      for mp4 in Path(video_path).glob(f"{prefix}*.mp4")
  ]
  page = "<br>".join(video_tags)
  ipythondisplay.display(ipythondisplay.HTML(data=page))

In [94]:
# Record 500 steps of the loaded agent. The 'ppo-cartpole' prefix is only a
# file-name label (the agent recorded here is the A2C model loaded earlier).
record_video(env_id='CartPole-v1', model=model, video_length=500, prefix='ppo-cartpole')

Saving video to /workspaces/Reinforcement-Learning-Repo/hw1/rl-baselines3-zoo/videos/ppo-cartpole-step-0-to-step-500.mp4


In [95]:
# Display every recorded video in the 'videos' folder whose name starts with 'ppo'
show_videos(video_path='videos', prefix='ppo')