In [1]:
import os
from google.colab import drive
# Mount Google Drive at /content/gdrive; DRIVE_PATH is the project root that
# the training cell builds its log directory from.
drive.mount('/content/gdrive', force_remount=True)
# DRIVE_PATH = '/content/gdrive/MyDrive/RL-proj'
DRIVE_PATH = '/content/gdrive/MyDrive/cs182/RL-proj'
# DRIVE_PATH with any shell-style backslash escapes removed, for use from Python.
DRIVE_PYTHON_PATH = DRIVE_PATH.replace('\\', '')
# Pure-Python replacement for `%mkdir $DRIVE_PATH`: creates missing parent
# folders too (plain mkdir fails if e.g. `cs182/` does not exist yet) and is a
# no-op when the directory is already there, so the cell is safe to re-run.
os.makedirs(DRIVE_PYTHON_PATH, exist_ok=True)

## the space in `My Drive` causes some issues,
## make a symlink to avoid this
# SYM_PATH = '/content/cs182/RL-proj'
# if not os.path.exists(SYM_PATH):
#   !ln -s $DRIVE_PATH $SYM_PATH

Mounted at /content/gdrive


In [None]:
# Optional override: re-point DRIVE_PATH at local /tmp so that `log_dir`
# (built from DRIVE_PATH in the training cell) does not land on the mounted
# Drive. This cell has no execution count, and the recorded run logged to the
# Drive path, so it was not executed for the saved outputs.
DRIVE_PATH = "/tmp"

In [2]:
# Select the TensorFlow 1.x runtime (Colab-specific magic); the model code
# below uses the TF1 `tf.layers` / `tf.Session` APIs.
%tensorflow_version 1.x
# Procgen environments (provides ProcgenEnv).
!pip install procgen -q
# OpenAI baselines (ppo2, vec_env wrappers, logger), installed from the
# repository's default branch.
# NOTE(review): none of these installs are version-pinned, so a later re-run
# may pull different code than the recorded run used — consider pinning.
!pip install -q git+https://github.com/openai/baselines #> ~/pip_install_baselines.log
# MPI bindings imported by the training cell (MPI.COMM_WORLD).
!pip install mpi4py -q

TensorFlow 1.x selected.
  Building wheel for baselines (setup.py) ... [?25l[?25hdone


In [3]:
import tensorflow as tf
from baselines.ppo2 import ppo2
from baselines.common.models import build_impala_cnn
from baselines.common.mpi_util import setup_mpi_gpus
from procgen import ProcgenEnv
from baselines.common.vec_env import (
    VecExtractDictObs,
    VecMonitor,
    VecFrameStack,
    VecNormalize
)
from baselines import logger
from mpi4py import MPI

import time, os, gym

In [4]:
import numpy as np
import tensorflow as tf
from baselines.a2c import utils
from baselines.a2c.utils import conv, fc, conv_to_fc, batch_to_seq, seq_to_batch

def build_model(unscaled_images, depths=(16, 32, 32), bn=False, emb_size=256, **conv_kwargs):
    """
    Residual CNN encoder following the model used in the paper "IMPALA:
    Scalable Distributed Deep-RL with Importance Weighted Actor-Learner
    Architectures" https://arxiv.org/abs/1802.01561, with an optional
    batch-normalization layer after each pooling step.

    Args:
        unscaled_images: image tensor with raw pixel values; it is cast to
            float32 and divided by 255 before the first convolution.
        depths: iterable of channel counts, one per conv sequence. Each
            sequence is conv -> max-pool (stride 2) -> [batch norm] ->
            two residual blocks.
        bn: if True, insert `tf.layers.batch_normalization` after the max-pool
            of every conv sequence.
        emb_size: width of the final fully connected layer, i.e. the size of
            the returned embedding. Previously this was hard-coded to 256 and
            an `emb_size` passed by the caller was silently swallowed by
            `**conv_kwargs`.
        **conv_kwargs: accepted for call-site compatibility; not used.

    Returns:
        The ReLU-activated output of the final dense layer (last dimension
        `emb_size`).

    Conv and dense layers are named 'layer_<n>' with n increasing in creation
    order, so the variable names are unchanged from the previous version.
    """

    layer_num = 0

    def get_layer_num_str():
        # Hands out consecutive layer indices ("0", "1", ...) for layer names.
        nonlocal layer_num
        num_str = str(layer_num)
        layer_num += 1
        return num_str

    def conv_layer(out, depth):
        # 3x3 'same' convolution with a unique sequential name.
        return tf.layers.conv2d(out, depth, 3, padding='same', name='layer_' + get_layer_num_str())

    def residual_block(inputs):
        # Pre-activation residual block: relu -> conv -> relu -> conv, plus skip.
        # The channel count is read from the input so the skip addition matches.
        depth = inputs.get_shape()[-1].value

        out = tf.nn.relu(inputs)
        out = conv_layer(out, depth)
        out = tf.nn.relu(out)
        out = conv_layer(out, depth)
        return out + inputs

    def conv_sequence(inputs, depth):
        # One downsampling stage; `bn` only toggles the normalization step.
        out = conv_layer(inputs, depth)
        out = tf.layers.max_pooling2d(out, pool_size=3, strides=2, padding='same')
        if bn:
            # NOTE(review): no `training=` argument is passed, so the layer is
            # built with the library default. Per the TF1 docs that default is
            # inference mode (moving statistics), and nothing in this function
            # runs the UPDATE_OPS that would refresh them. Confirm this is the
            # batch-norm behavior the experiment intends.
            out = tf.layers.batch_normalization(out)
        out = residual_block(out)
        out = residual_block(out)
        return out

    # Scale raw pixel values into [0, 1] before the first convolution.
    out = tf.cast(unscaled_images, tf.float32) / 255.

    for depth in depths:
        out = conv_sequence(out, depth)

    out = tf.layers.flatten(out)
    out = tf.nn.relu(out)
    out = tf.layers.dense(out, emb_size, activation=tf.nn.relu, name='layer_' + get_layer_num_str())

    return out

In [None]:
# ---------------------------------------------------------------------------
# Experiment configuration
# ---------------------------------------------------------------------------
num_levels = 100                 # number of distinct training levels
timesteps_per_proc = int(1e6)    # total environment steps passed to ppo2.learn
num_envs = 64                    # parallel environments per vectorized env
nsteps = 256                     # rollout length per environment per update
save_interval = 1                # passed to ppo2.learn as save_interval
bn = True                        # enable batch norm in build_model

env_name = "fruitbot"
distribution_mode = "easy"
start_level = 0
test_worker_interval = 0
is_test_worker = False

# PPO hyperparameters
learning_rate = 5e-4
ent_coef = .01
gamma = .999
lam = .95
nminibatches = 8
ppo_epochs = 3
clip_range = .2
use_vf_clipping = True


def conv_fn(x):
    """Policy network builder handed to ppo2.learn; wraps build_model with this run's settings."""
    return build_model(x, depths=[16,32,32], bn=bn, emb_size=256)


# ---------------------------------------------------------------------------
# Logging directory: <DRIVE_PATH>/log/MultiPPO_<N>lvl[_BN]/<timestamp>
# ---------------------------------------------------------------------------
run_name = 'MultiPPO_{}lvl'.format(num_levels)
if bn:
    run_name += "_BN"
log_dir = os.path.join(DRIVE_PATH, 'log', run_name, time.strftime("%d%m%y_%H:%M:%S", time.localtime()))
# A single makedirs call creates the run folder and every missing parent.
os.makedirs(log_dir, exist_ok=True)

# ---------------------------------------------------------------------------
# MPI / logger setup
# ---------------------------------------------------------------------------
comm = MPI.COMM_WORLD
rank = comm.Get_rank()

mpi_rank_weight = 0 if is_test_worker else 1
# A test worker uses num_levels=0 instead of the restricted training set.
num_levels = 0 if is_test_worker else num_levels

# Test and train workers are split into separate communicators; only rank 0 of
# each group writes csv/stdout logs.
log_comm = comm.Split(1 if is_test_worker else 0, 0)
format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else []
logger.configure(comm=log_comm, dir=log_dir, format_strs=format_strs)


def make_procgen_venv(first_level):
    """Build a wrapped Procgen vec-env whose level range starts at `first_level`.

    Reads env_name / num_envs / num_levels / distribution_mode from the cell's
    configuration. Wrappers, in order: extract the "rgb" entry of the dict
    observation, monitor episode statistics (keep_buf=100), then VecNormalize
    with ob=False.
    """
    env = ProcgenEnv(num_envs=num_envs, env_name=env_name, num_levels=num_levels, start_level=first_level, distribution_mode=distribution_mode)
    env = VecExtractDictObs(env, "rgb")
    env = VecMonitor(venv=env, filename=None, keep_buf=100)
    return VecNormalize(venv=env, ob=False)


logger.info("creating environment")
venv = make_procgen_venv(start_level)

# Evaluate on levels that come right after the training range so the two sets
# never overlap. The previous offset was `start_level + num_envs`, i.e. the
# number of parallel environments, which made eval levels [64, 164) overlap
# training levels [0, 100) for the settings above.
eval_env = make_procgen_venv(start_level + num_levels)

# ---------------------------------------------------------------------------
# TensorFlow session
# ---------------------------------------------------------------------------
logger.info("creating tf session")
setup_mpi_gpus()
config = tf.ConfigProto()
config.gpu_options.allow_growth = True #pylint: disable=E1101
sess = tf.Session(config=config)
# Enter the session context without a `with` block so it stays active for the
# ppo2.learn call below and for later cells.
sess.__enter__()

# ---------------------------------------------------------------------------
# Training
# ---------------------------------------------------------------------------
logger.info("training")
model = ppo2.learn(
    env=venv,
    eval_env=eval_env,
    network=conv_fn,
    total_timesteps=timesteps_per_proc,
    save_interval=save_interval,
    nsteps=nsteps,
    nminibatches=nminibatches,
    lam=lam,
    gamma=gamma,
    noptepochs=ppo_epochs,
    log_interval=1,
    ent_coef=ent_coef,
    mpi_rank_weight=mpi_rank_weight,
    # NOTE(review): `clip_vf` does not appear to be a named parameter of
    # upstream baselines `ppo2.learn`; verify that it has any effect here.
    clip_vf=use_vf_clipping,
    comm=comm,
    lr=learning_rate,
    cliprange=clip_range,
    update_fn=None,
    init_fn=None,
    vf_coef=0.5,
    max_grad_norm=0.5,
)

Logging to /content/gdrive/MyDrive/cs182/RL-proj/log/MultiPPO_100lvl_BN/120521_03:47:46
creating environment
creating tf session
training






Instructions for updating:
Use `tf.cast` instead.
Instructions for updating:
Use `tf.keras.layers.Conv2D` instead.
Instructions for updating:
Please use `layer.__call__` method instead.
Instructions for updating:
Use keras.layers.MaxPooling2D instead.
Instructions for updating:
Use keras.layers.BatchNormalization instead.  In particular, `tf.control_dependencies(tf.GraphKeys.UPDATE_OPS)` should not be used (consult the `tf.keras.layers.batch_normalization` documentation).
Instructions for updating:
Use keras.layers.flatten instead.
Instructions for updating:
Use keras.layers.Dense instead.




Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where





Stepping environment...
Done.
---------------------------------------
| eplenmean               | 85.8      |
| eprewmean               | -2.6      |
| eval