# **SYDE 552 Notebook**

In [None]:
# Mount Google Drive inside the Colab runtime at /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')
# Move into the project folder on Drive. The path is relative, so this assumes
# the current directory is the one containing 'gdrive' (presumably /content —
# TODO confirm). Relative paths used later in the notebook ("tmp/", "data/tb/",
# "videos/") are resolved from this folder.
%cd 'gdrive'/'MyDrive'/'4B'/'SYDE552'/'project'


In [None]:
# System packages: ffmpeg, freeglut3-dev and xvfb (xvfb is started later in the
# notebook to provide a virtual display for rendering).
!apt-get install -qq ffmpeg freeglut3-dev xvfb  # For visualization

# Install stable-baselines3 with its [extra] dependencies straight from GitHub.
# NOTE(review): no version/commit is pinned, so the installed code can change
# between runs.
!pip install -q git+https://github.com/DLR-RM/stable-baselines3#egg=stable-baselines3[extra]

# Box2D bindings — presumably needed by the LunarLander environment created
# below; TODO confirm whether both packages are actually required.
!pip install Box2D
!pip install box2d-py
# pip3 install gym[all]
# pip3 install gym[Box_2D]

In [None]:
import gym
import numpy as np
import torch as th
import matplotlib.pyplot as plt

from stable_baselines3 import A2C, DQN, PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.env_util import make_atari_env

from stable_baselines3.common import results_plotter
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.results_plotter import load_results, ts2xy, plot_results

In [None]:
# Output locations, relative to the current working directory.
# log_dir is only referenced by the commented-out Monitor / os.makedirs lines.
log_dir = "tmp/"
# Passed to the model constructor as `tensorboard_log` and to %tensorboard --logdir.
tensorboard_log = "data/tb/"

# Active environment. Swap the commented lines to run another task.
# env = gym.make("CartPole-v1")
env = gym.make("LunarLander-v2")

# Atari alternative: 8 parallel Breakout envs with 4-frame stacking.
# env = make_atari_env('BreakoutNoFrameskip-v4', n_envs=8, seed=661550378)
# # Frame-stacking with 4 frames
# env = VecFrameStack(env, n_stack=4)

# Optional: wrap the env in a Monitor that writes to log_dir.
# env = Monitor(env, log_dir)

## Algorithm Init

In [None]:
# model = A2C("MlpPolicy",
#             env,
#             verbose=0,
#             gamma=0.99,
#             learning_rate=4e-3,
#             policy_kwargs=dict(net_arch=[256, 256]),
#             tensorboard_log=tensorboard_log,
#             seed=2)

# model = A2C("MlpPolicy",
#             env,
#             verbose=0,
#             learning_rate=4e-3,
#             tensorboard_log=tensorboard_log,
#             seed=2)

# model = DQN("MlpPolicy",
#             env,
#             verbose=1,
#             train_freq=16,
#             gradient_steps=8,
#             gamma=0.99,
#             exploration_fraction=0.2,
#             exploration_final_eps=0.07,
#             target_update_interval=600,
#             learning_starts=1000,
#             buffer_size=10000,
#             batch_size=128,
#             learning_rate=4e-3,
#             policy_kwargs=dict(net_arch=[256, 256]),
#             tensorboard_log=tensorboard_log,
#             seed=2)

# model = DQN("MlpPolicy",
#             env,
#             verbose=0,
#             learning_rate=4e-3,
#             tensorboard_log=tensorboard_log,
#             seed=2)

# model = PPO("MlpPolicy",
#             env,
#             verbose=0,
#             learning_rate=4e-3,
#             tensorboard_log=tensorboard_log,
#             policy_kwargs=dict(net_arch=[64, 64]),
#             seed=2)

# Active configuration: PPO with an MLP policy on `env`, using the library's
# default hyperparameters apart from the fixed seed and the TensorBoard log dir.
model = PPO(
    policy="MlpPolicy",
    env=env,
    seed=0,
    verbose=0,
    tensorboard_log=tensorboard_log,
)


# ppo breakout 
# model = PPO(policy = "CnnPolicy",
#             env = env,
#             batch_size = 256,
#             clip_range = 0.1,
#             ent_coef = 0.01,
#             gae_lambda = 0.9,
#             gamma = 0.99,
#             learning_rate = 2.5e-4,
#             max_grad_norm = 0.5,
#             n_epochs = 4,
#             n_steps = 128,
#             vf_coef = 0.5,
#             tensorboard_log=tensorboard_log,
#             verbose=1,
#             )

# model = A2C("CnnPolicy",
#             env,
#             verbose=0,
#             tensorboard_log=tensorboard_log,
#             seed=2)

# model = DQN("CnnPolicy",
#             env,
#             verbose=1,
#             tensorboard_log=tensorboard_log,
#             buffer_size=40000,
#             seed=0)

In [None]:
# Sanity check: show which environment and model objects are currently active.
print(f'env: {env}\nmodel: {model}')

In [None]:
# Baseline: evaluate the policy over 20 episodes with deterministic actions.
# This cell sits before the training cell, so on a top-to-bottom run it scores
# the model as constructed.
mean_reward, std_reward = evaluate_policy(
    model,
    model.get_env(),
    n_eval_episodes=20,
    deterministic=True,
)

print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

mean_reward:9.25 +/- 0.89


In [None]:
# import os
# os.makedirs(log_dir, exist_ok=True)

In [None]:
# Reload the TensorBoard IPython extension (useful when re-running the notebook
# in a kernel where it was already loaded).
%reload_ext tensorboard

In [None]:
# Optional: Monitor training in tensorboard
# Loads the extension, then opens TensorBoard on the directory held in the
# Python variable `tensorboard_log` ($name is expanded by IPython).
%load_ext tensorboard
%tensorboard --logdir $tensorboard_log

In [None]:
# Training budget in environment steps (3e5).
timesteps = 300_000
# Train the model in place, passing log_interval=10 through to learn().
model.learn(total_timesteps=timesteps, log_interval=10)

In [None]:
# Re-evaluate with the same settings as the earlier evaluation cell
# (20 episodes, deterministic actions) so the two scores are comparable.
mean_reward, std_reward = evaluate_policy(
    model,
    model.get_env(),
    n_eval_episodes=20,
    deterministic=True,
)

print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

In [None]:
# Set up fake display; otherwise rendering will fail
import os
# Start an Xvfb server on display :1 (1024x768, 24-bit) in the background
# (trailing '&'). The return code is not checked, so a failed start is silent.
os.system("Xvfb :1 -screen 0 1024x768x24 &")
# Point this process at that virtual display.
os.environ['DISPLAY'] = ':1'
import base64
from pathlib import Path
from IPython import display as ipythondisplay

def show_videos(video_path='', prefix=''):
  """Embed every matching ``.mp4`` file inline in the notebook output.

  Files are looked up with the glob pattern ``<prefix>*.mp4`` inside
  ``video_path``. Each file is base64-encoded into a ``data:`` URI so the
  resulting ``<video>`` tags are self-contained, and all tags are displayed
  together as a single HTML output.

  Args:
    video_path: Directory to search. The default ``''`` is the current
      working directory.
    prefix: Only files whose name starts with this string are shown.

  Returns:
    None. If no file matches, a short notice is printed and nothing is
    displayed.
  """
  # Path.glob yields entries in arbitrary order; sort so the videos always
  # appear in the same (lexicographic) order from run to run.
  video_files = sorted(Path(video_path).glob("{}*.mp4".format(prefix)))
  if not video_files:
    # Make a wrong folder/prefix visible instead of emitting an empty output.
    print("No videos matching '{}*.mp4' found in '{}'".format(prefix, video_path))
    return

  html = []
  for mp4 in video_files:
    video_b64 = base64.b64encode(mp4.read_bytes())
    html.append('''<video alt="{}" autoplay 
                    loop controls style="height: 400px;">
                    <source src="data:video/mp4;base64,{}" type="video/mp4" />
                </video>'''.format(mp4, video_b64.decode('ascii')))
  ipythondisplay.display(ipythondisplay.HTML(data="<br>".join(html)))

In [None]:
from stable_baselines3.common.vec_env import VecVideoRecorder, DummyVecEnv

def record_video(env_id, model, video_length=500, prefix='', video_folder='videos/'):
  """Run ``model`` in a new ``env_id`` environment while recording a video.

  A single environment is created with ``gym.make(env_id)``, wrapped in a
  ``DummyVecEnv`` and then in a ``VecVideoRecorder`` that starts recording at
  step 0. The model is then stepped ``video_length`` times.

  Args:
    env_id: Environment id passed to ``gym.make``.
    model: Object exposing ``predict(obs, deterministic=...)`` that returns
      ``(action, state)``.
    video_length: Number of steps to run and record.
    prefix: Passed to the recorder as ``name_prefix``.
    video_folder: Passed to the recorder as ``video_folder``.

  Returns:
    None.
  """
  eval_env = DummyVecEnv([lambda: gym.make(env_id)])
  # Start the video at step=0 and record `video_length` steps
  eval_env = VecVideoRecorder(eval_env, video_folder=video_folder,
                              record_video_trigger=lambda step: step == 0, video_length=video_length,
                              name_prefix=prefix)

  try:
    obs = eval_env.reset()
    for _ in range(video_length):
      # Actions are requested with deterministic=False.
      action, _ = model.predict(obs, deterministic=False)
      obs, _, _, _ = eval_env.step(action)
  finally:
    # Close the video recorder even if the rollout raises, so the recorder and
    # the wrapped environment are not left open.
    eval_env.close()

In [None]:
### Cartpole 

# record_video('CartPole-v1', model, video_length=500, prefix='a2c-cartpole')
# show_videos('videos', prefix='a2c-cartpole')

# record_video('CartPole-v1', model, video_length=500, prefix='dqn-cartpole')
# show_videos('videos', prefix='dqn-cartpole')

# record_video('CartPole-v1', model, video_length=500, prefix='ppo-cartpole')
# show_videos('videos', prefix='ppo-cartpole')


### Lunar Lander

# record_video('LunarLander-v2', model, video_length=500, prefix='a2c-lunarlander')
# show_videos('videos', prefix='a2c-lunarlander')

# record_video('LunarLander-v2', model, video_length=500, prefix='dqn-lunarlander')
# show_videos('videos', prefix='dqn-lunarlander')

# record_video('LunarLander-v2', model, video_length=500, prefix='ppo-lunarlander3')
# show_videos('videos', prefix='ppo-lunarlander3')


### Breakout

# record_video('BreakoutNoFrameskip-v4', model, video_length=500, prefix='a2c-breakout')
# show_videos('videos', prefix='a2c-breakout')

# record_video('BreakoutNoFrameskip-v4', model, video_length=100, prefix='dqn-breakout1')
# show_videos('videos', prefix='dqn-breakout1')

# record_video('BreakoutNoFrameskip-v4', model, video_length=500, prefix='ppo-breakout')
# show_videos('videos', prefix='ppo-breakout')



Saving video to /content/gdrive/MyDrive/4B/SYDE552/project/videos/ppo-cartpole-step-0-to-step-500.mp4
