In [None]:
# Stable Baselines only supports tensorflow 1.x for now
%tensorflow_version 1.x
!pip install stable-baselines[mpi]==2.10.0

from stable_baselines.common.cmd_util import make_atari_env
from stable_baselines.common.evaluation import evaluate_policy
from stable_baselines.common.policies import CnnPolicy  # CNN Policy to learn from images
from stable_baselines.common.vec_env import VecFrameStack, VecVideoRecorder  # Method to stack frames together
# from stable_baselines import ACER  # Actor Critic with Experience Replay
from stable_baselines import PPO2
# from stable_baselines.common.callbacks import BaseCallback
from stable_baselines.common.callbacks import CallbackList, CheckpointCallback, EvalCallback
# from stable_baselines.results_plotter import load_results, ts2xy
# from stable_baselines import results_plotter
# from stable_baselines.bench import Monitor
# import tensorflow as tf


import gym
import numpy as np
import matplotlib.pyplot as plt
import os
import datetime
from google.colab import files


TensorFlow 1.x selected.
Collecting stable-baselines[mpi]==2.10.0
[?25l  Downloading https://files.pythonhosted.org/packages/e5/fe/db8159d4d79109c6c8942abe77c7ba6b6e008c32ae55870a35e73fa10db3/stable_baselines-2.10.0-py3-none-any.whl (248kB)
[K     |████████████████████████████████| 256kB 16.2MB/s 
Installing collected packages: stable-baselines
  Found existing installation: stable-baselines 2.2.1
    Uninstalling stable-baselines-2.2.1:
      Successfully uninstalled stable-baselines-2.2.1
Successfully installed stable-baselines-2.10.0
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [None]:
# Root output folder for this run. The checkpoint, TensorBoard, model and video
# paths used by callback_model_save(), train() and test() are all built from it.
# NOTE(review): presumably a mounted Google Drive folder relative to the
# notebook's working directory - confirm the mount point before running.
base_path = "./drive/MyDrive/Spring 2021/CS 354/project/pacman0to80_real/run0to50/output"

# Checkpoint callback handed to model.learn() to save agents during training
def callback_model_save(_locals, _globals):
    """Write a temporary checkpoint on every 50000th invocation.

    Relies on two module-level names: the call counter ``n_steps`` (it must be
    initialised before training starts) and the output root ``base_path``.

    :param _locals: dict of the caller's local variables; ``_locals['self']``
        is the object whose ``save`` method is called.
    :param _globals: dict of the caller's global variables (unused).
    :return: always True.
    """
    global n_steps
    calls_so_far = n_steps
    # The counter tracks calls of this callback, not environment timesteps.
    if calls_so_far % 50000 == 0:
        print("Saving model after {} steps".format(calls_so_far))
        checkpoint_path = "{}/tmp/tmp_model_{}calls.pkl".format(base_path, calls_so_far)
        _locals['self'].save(checkpoint_path)
    n_steps = calls_so_far + 1
    return True
# Train a PPO2 agent from scratch and save it
def train(envName, num_env=16, timesteps=50000000):
    """Train a fresh PPO2 CnnPolicy agent on an Atari game and save the result.

    TensorBoard logs go to ``base_path + '/tensorboard/'`` under the run name
    ``logs_0to50``; ``callback_model_save`` is passed as the training callback;
    the final model is written to ``base_path + '/models/ppo2_0to50.pkl'``.

    :param envName: gym id of the Atari environment (e.g. 'MsPacmanNoFrameskip-v4').
    :param num_env: number of concurrent environments being created.
    :param timesteps: total timesteps the agent will learn for.
    """
    # make_atari_env handles converting frames to grayscale and 84x84
    env = make_atari_env(envName, num_env=num_env, seed=0)
    # Stack 4 frames together
    env = VecFrameStack(env, n_stack=4)

    tb_path = base_path + '/tensorboard/'
    try:
        # To continue from a saved agent instead of starting fresh, build the
        # model with PPO2.load(<path>, env=env, tensorboard_log=tb_path, verbose=1).
        model = PPO2(
            policy=CnnPolicy,  # the policy -> CNN
            env=env,  # environment to learn on
            verbose=1,
            tensorboard_log=tb_path,  # log location
        )

        # Train model
        model.learn(total_timesteps=timesteps,
                    callback=callback_model_save,
                    tb_log_name='logs_0to50',
                    reset_num_timesteps=False)

        # Save model
        model.save(base_path + "/models/ppo2_0to50.pkl")
    finally:
        # Release the emulators even when training is interrupted or fails;
        # test() builds its own environments afterwards.
        env.close()


# Test Model 300 seconds for 180k
def test(envName):
    """Record a video of the saved PPO2 agent, then print evaluation rewards.

    Loads ``base_path + '/models/ppo2_0to50.pkl'``, records ``video_len`` steps
    into ``base_path + '/videos'``, then prints two ``mean_reward`` lines from
    ``evaluate_policy`` over 10 episodes each: the first for the loaded
    (trained) model, the second for a freshly constructed, untrained PPO2.

    :param envName: gym id of the Atari environment to test on.
    """
    # Variables
    model_path = base_path + "/models/ppo2_0to50.pkl"
    video_len = 5000  # steps in env, if episode terminates will start another
    video_folder = base_path + '/videos'
    num_env = 2  # number of videos playing side by side

    # Create Env
    env = make_atari_env(envName, num_env=num_env, seed=0)
    env = VecFrameStack(env, n_stack=4)
    try:
        # Load model
        model = PPO2.load(model_path, env=env, verbose=1)

        # Wrap environment to record a video
        now = datetime.datetime.now()  # get a timestamp for video
        date_time = now.strftime("%m-%d-%Y_%H%M")
        env = VecVideoRecorder(env,
                               video_folder,
                               record_video_trigger=lambda x: x == 0,
                               video_length=video_len,
                               name_prefix='ppo2_{}_pacman_0to50_{}'.format(envName, date_time))

        # Record video starting at first timestep
        state = env.reset()
        for _ in range(video_len + 1):
            action, _states = model.predict(state)
            state, reward, done, info = env.step(action)

        print('Video done recording')
    finally:
        # Closing the (possibly wrapped) env also covers the failure path, so
        # the emulators are released if loading or stepping raises.
        env.close()

    # Evaluate Policy with training vs with no training
    eval_env = make_atari_env(envName, num_env=1, seed=0)
    eval_env = VecFrameStack(eval_env, n_stack=4)
    try:
        # Trained model
        mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10)
        print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

        # Untrained baseline: a newly constructed PPO2 that has never learned
        untrained_model = PPO2(
            policy=CnnPolicy,  # the policy -> CNN
            env=eval_env,
            verbose=1
        )
        mean_reward, std_reward = evaluate_policy(untrained_model, eval_env, n_eval_episodes=10)
        print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")
    finally:
        # The evaluation env was never closed in the original code.
        eval_env.close()


In [None]:
if __name__ == '__main__':
    # Output sub-folders: pickled models, tb log, recorded videos output by
    # test(), and the temporary checkpoints written by callback_model_save.
    for subdir in ("/models/", "/tensorboard/", "/videos/", "/tmp/"):
        os.makedirs(base_path + subdir, exist_ok=True)

    # Call counter read and incremented by callback_model_save
    n_steps = 1

    # Train the model and save it
    train('MsPacmanNoFrameskip-v4')
    # Record a video of the model and output some stats about its test
    test('MsPacmanNoFrameskip-v4')


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
| time_elapsed       | 8.47e+04     |
| total_timesteps    | 48146432     |
| value_loss         | 0.50126386   |
-------------------------------------
-------------------------------------
| approxkl           | 0.010568599  |
| clipfrac           | 0.09118652   |
| ep_len_mean        | 1.09e+03     |
| ep_reward_mean     | 2.7e+03      |
| explained_variance | 0.985        |
| fps                | 548          |
| n_updates          | 23510        |
| policy_entropy     | 0.90970933   |
| policy_loss        | -0.017122615 |
| serial_timesteps   | 3009280      |
| time_elapsed       | 8.47e+04     |
| total_timesteps    | 48148480     |
| value_loss         | 0.6993804    |
-------------------------------------
-------------------------------------
| approxkl           | 0.013368022  |
| clipfrac           | 0.12988281   |
| ep_len_mean        | 1.09e+03     |
| ep_reward_mean     | 2.7e+03      |
| explained_variance | 

In [None]:
# Observe training
# Load the TensorBoard notebook extension, then open it on the log directory
# of the 0to50 Pacman run.
%load_ext tensorboard
# Alternative log directories (presumably from earlier ppo2_v5 runs), kept for reference:
# %tensorboard --logdir '/content/drive/MyDrive/Spring 2021/CS 354/project/pacman/ppo2_v5/'
# %tensorboard --logdir '/content/drive/MyDrive/Spring 2021/CS 354/project/pacman/ppo2_v5/output/tensorboard/ppo2_v5_run40to80_1/'
%tensorboard --logdir '/content/drive/MyDrive/Spring 2021/CS 354/project/pacman0to80_real/run0to50/output/tensorboard/'

In [None]:
# An implementation of TD3 that I did not end up using
from stable_baselines import TD3
from stable_baselines.ddpg.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise

def callback_model_save(_locals, _globals):
    """Write a temporary checkpoint on every 50000th invocation.

    Re-defines the same checkpoint callback as the earlier PPO2 cell. Relies
    on the module-level call counter ``n_steps`` and output root ``base_path``.

    :param _locals: dict of the caller's local variables; ``_locals['self']``
        is the object whose ``save`` method is called.
    :param _globals: dict of the caller's global variables (unused).
    :return: always True.
    """
    global n_steps
    call_index = n_steps
    is_checkpoint_call = call_index % 50000 == 0
    if is_checkpoint_call:
        print("Saving model after {} steps".format(call_index))
        target = "{}/tmp/tmp_model_{}calls.pkl".format(base_path, call_index)
        _locals['self'].save(target)
    n_steps = call_index + 1
    return True

def train():
    """Train a TD3 agent on MsPacman and save it under ``base_path``.

    Logs to ``base_path + '/tensorboard/'`` and writes the final model to
    ``base_path + '/models/final_td3_pacman_v1_1.pkl'``.

    NOTE(review): the stable-baselines documentation lists TD3 as supporting
    only continuous (Box) action spaces, and ``CnnPolicy`` in scope here is the
    one imported from ``stable_baselines.common.policies`` at the top of the
    notebook, not ``stable_baselines.td3.policies`` - confirm this experiment
    can run as written before relying on it.
    """
    timesteps = 10000000  # total timesteps agent will learn for
    n_actions = 9  # hard-coded size of the action-noise vector

    # make_atari_env handles converting frames to grayscale and 84x84
    env = make_atari_env('MsPacmanNoFrameskip-v4', num_env=1, seed=0)
    # Stack 4 frames together
    env = VecFrameStack(env, n_stack=4)

    try:
        # The noise objects for TD3
        action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))

        tb_path = base_path + '/tensorboard/'
        model = TD3(policy=CnnPolicy, env=env, buffer_size=1000000, learning_starts=10000,
                    tensorboard_log=tb_path, action_noise=action_noise, verbose=1)
        model.learn(total_timesteps=timesteps, log_interval=10)
        model.save(base_path + '/models/final_td3_pacman_v1_1.pkl')
    finally:
        # Release the emulator even if model construction or training fails.
        env.close()

def test(envName, model_path, base_path):
    """Record a video of a saved TD3 agent, then print evaluation rewards.

    Prints two ``mean_reward`` lines from ``evaluate_policy`` over 10 episodes
    each: the first for the loaded (trained) model, the second for a freshly
    constructed, untrained TD3.

    :param envName: gym id of the Atari environment to test on.
    :param model_path: path of the saved TD3 model to load.
    :param base_path: output root; videos are written to ``base_path + '/videos'``.
    """
    # Variables
    video_len = 5000  # steps in env, if episode terminates will start another
    video_folder = base_path + '/videos'
    num_env = 1  # number of videos playing side by side

    # Create Env
    env = make_atari_env(envName, num_env=num_env, seed=0)
    env = VecFrameStack(env, n_stack=4)
    try:
        # Load model
        model = TD3.load(model_path, env=env, verbose=1)

        # Wrap environment to record a video
        now = datetime.datetime.now()  # get a timestamp for video
        date_time = now.strftime("%m-%d-%Y_%H%M")
        env = VecVideoRecorder(env,
                               video_folder,
                               record_video_trigger=lambda x: x == 0,
                               video_length=video_len,
                               # The loaded model is TD3, so label the video as
                               # such (the prefix previously said 'ppo2').
                               name_prefix='td3_{}_{}'.format(envName, date_time))

        # Record video starting at first timestep
        state = env.reset()
        for _ in range(video_len + 1):
            action, _states = model.predict(state)
            state, reward, done, info = env.step(action)

        print('Video done recording')
    finally:
        # Also covers the failure path so the emulator is always released.
        env.close()

    # Evaluate Policy with training vs with no training
    eval_env = make_atari_env(envName, num_env=1, seed=0)
    eval_env = VecFrameStack(eval_env, n_stack=4)
    try:
        # Trained model
        mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10)
        print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

        # Untrained baseline. `action_noise` is a local of train() and is not
        # defined here, so passing it raised NameError; it is omitted.
        # NOTE(review): exploration noise should not matter for this evaluation
        # if evaluate_policy predicts deterministically - confirm.
        untrained_model = TD3(CnnPolicy, eval_env, buffer_size=1000000, learning_starts=10000, verbose=1)
        mean_reward, std_reward = evaluate_policy(untrained_model, eval_env, n_eval_episodes=10)
        print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")
    finally:
        # The evaluation env was never closed in the original code.
        eval_env.close()


# Output root for the TD3 experiment. Re-binding the module-level base_path also
# changes where callback_model_save writes its checkpoints.
# NOTE(review): this path starts with "./gdrive" while the other cells use
# "./drive" - confirm which mount point is correct.
base_path = "./gdrive/MyDrive/Spring 2021/CS 354/project/td3/td3_v1/output"

# Create the output folders up front (idempotent thanks to exist_ok=True) so the
# final model.save() is not at risk of hitting a missing directory after the
# long training run.
os.makedirs(base_path+ "/models/", exist_ok=True)  # for pickled models
os.makedirs(base_path+ "/tensorboard/", exist_ok=True)  # for tb log
os.makedirs(base_path+ "/videos/",
            exist_ok=True)  # for recorded videos outputed in test
os.makedirs(base_path+ "/tmp/", exist_ok=True)

n_steps = 1  # call counter used by callback_model_save
train()
model_path = base_path + '/models/final_td3_pacman_v1_1.pkl'
test('MsPacmanNoFrameskip-v4',model_path,base_path )

In [None]:
# GPU 
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

Wed May 12 22:04:44 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P0    24W / 300W |      0MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Transfer learning from Alien to Bank Heist using 120M steps trained PPO2

# Output root for this transfer run; the models/, tensorboard/, videos/ and tmp/
# sub-folders used by the code in this cell live under it.
base_path = './drive/MyDrive/Spring 2021/CS 354/project/alien_to_pacman/alien120M_to_Bank_80M/output'

def train():
    """Continue training a previously saved PPO2 model on Bank Heist.

    Loads the model stored under the ``alien120M_to_Bank_40M`` output folder,
    trains it for another 40,000,000 timesteps on 'BankHeistNoFrameskip-v4'
    with ``callback_model_save`` as callback, logs to
    ``base_path + '/tensorboard/'`` under the run name
    ``ppo2_transfer_40Mto80M``, and saves the result to
    ``base_path + '/models/final_ppo2_transfer_v1.pkl'``.
    """
    # Create env
    transfer_env = 'BankHeistNoFrameskip-v4'
    num_env = 16
    env = make_atari_env(transfer_env, num_env=num_env, seed=0)
    env = VecFrameStack(env, n_stack=4)

    try:
        # Import model
        tb_path = base_path + '/tensorboard/'
        model_path = './drive/MyDrive/Spring 2021/CS 354/project/alien_to_pacman/alien120M_to_Bank_40M/output/models/final_ppo2_transfer_v1.pkl'
        model = PPO2.load(model_path,
                          env=env,
                          tensorboard_log=tb_path,
                          verbose=1)

        # Train model
        timesteps = 40000000
        model.learn(total_timesteps=timesteps,
                    callback=callback_model_save,
                    tb_log_name='ppo2_transfer_40Mto80M',
                    reset_num_timesteps=True)

        # Save model
        model.save(base_path + "/models/final_ppo2_transfer_v1.pkl")
    finally:
        # Release the emulators even when loading or training fails or is
        # interrupted; test() builds its own environments afterwards.
        env.close()


 
# Test Model
def test(envName):
    """Record a video of the saved transfer model, then print evaluation rewards.

    Loads ``base_path + '/models/final_ppo2_transfer_v1.pkl'``, records
    ``video_len`` steps into ``base_path + '/videos'``, then prints two
    ``mean_reward`` lines from ``evaluate_policy`` over 10 episodes each: the
    first for the loaded (trained) model, the second for a freshly constructed,
    untrained PPO2.

    :param envName: gym id of the Atari environment to test on.
    """
    # Variables
    model_path = base_path + "/models/final_ppo2_transfer_v1.pkl"
    video_len = 5000  # steps in env, if episode terminates will start another
    video_folder = base_path + '/videos'
    num_env = 2  # number of videos playing side by side

    # Create Env
    env = make_atari_env(envName, num_env=num_env, seed=0)
    env = VecFrameStack(env, n_stack=4)
    try:
        # Load model
        model = PPO2.load(model_path, env=env, verbose=1)

        # Wrap environment to record a video
        now = datetime.datetime.now()  # get a timestamp for video
        date_time = now.strftime("%m-%d-%Y_%H%M")
        env = VecVideoRecorder(env,
                               video_folder,
                               record_video_trigger=lambda x: x == 0,
                               video_length=video_len,
                               name_prefix='ppo2_{}_transfer_0to40M_{}'.format(envName, date_time))

        # Record video starting at first timestep
        state = env.reset()
        for _ in range(video_len + 1):
            action, _states = model.predict(state)
            state, reward, done, info = env.step(action)

        print('Video done recording')
    finally:
        # Also covers the failure path so the emulators are always released.
        env.close()

    # Evaluate Policy with training vs with no training
    eval_env = make_atari_env(envName, num_env=1, seed=0)
    eval_env = VecFrameStack(eval_env, n_stack=4)
    try:
        # Trained model
        mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10)
        print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

        # Untrained baseline: a newly constructed PPO2 that has never learned
        untrained_model = PPO2(
            policy=CnnPolicy,  # the policy -> CNN
            env=eval_env,
            verbose=1
        )
        mean_reward, std_reward = evaluate_policy(untrained_model, eval_env, n_eval_episodes=10)
        print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")
    finally:
        # The evaluation env was never closed in the original code.
        eval_env.close()


if __name__ == '__main__':
    # Output sub-folders: pickled models, tb log, recorded videos output by
    # test(), and the temporary checkpoints written by callback_model_save.
    for subdir in ("/models/", "/tensorboard/", "/videos/", "/tmp/"):
        os.makedirs(base_path + subdir, exist_ok=True)

    # Call counter read and incremented by callback_model_save
    n_steps = 1

    # Train the model and save it
    train()
    # Record a video of the model and output some stats, once per game
    test('BankHeistNoFrameskip-v4')
    test('AlienNoFrameskip-v4')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
| ep_len_mean        | 1.96e+03     |
| ep_reward_mean     | 1.25e+03     |
| explained_variance | 0.988        |
| fps                | 563          |
| n_updates          | 19199        |
| policy_entropy     | 0.91759795   |
| policy_loss        | -0.009112738 |
| serial_timesteps   | 2457472      |
| time_elapsed       | 7.26e+04     |
| total_timesteps    | 39319552     |
| value_loss         | 0.035577968  |
-------------------------------------
-------------------------------------
| approxkl           | 0.011449854  |
| clipfrac           | 0.12792969   |
| ep_len_mean        | 1.96e+03     |
| ep_reward_mean     | 1.25e+03     |
| explained_variance | 0.986        |
| fps                | 548          |
| n_updates          | 19200        |
| policy_entropy     | 0.9540668    |
| policy_loss        | -0.008506415 |
| serial_timesteps   | 2457600      |
| time_elapsed       | 7.26e+04     |
| total_timesteps    | 

In [None]:
# Open TensorBoard on the transfer-learning logs.
# NOTE(review): this log directory (.../ppo2/transfer/output/tensorboard/) is not
# the base_path + '/tensorboard/' folder that the transfer-learning cell writes
# to - confirm it points at the intended run.
%load_ext tensorboard
%tensorboard --logdir './drive/MyDrive/Spring 2021/CS 354/project/ppo2/transfer/output/tensorboard/'

In [None]:
# Print the action space of one game; sizes noted so far for the candidate games:
print(gym.make('WizardOfWorNoFrameskip-v4').action_space)
# Alien and Bank Heist have the same action space - 18
# MsPacman has 9
# Amidar 10
# Wizard of Wor - 10


Discrete(10)
