In [1]:
import gym
import procgen
import time
import numpy as np

from abc import ABC, abstractmethod

In [17]:
# Play a single episode with uniformly random actions, rendering every frame.
env = gym.make('procgen:procgen-fruitbot-v0', distribution_mode='easy')
try:
    obs = env.reset()
    while True:
        obs, rew, done, info = env.step(env.action_space.sample())
        env.render()
        if done:
            break
        # Throttle the loop so the rendered frames are watchable.
        time.sleep(.05)
finally:
    # Close the environment even if the cell is interrupted or a step raises,
    # so the environment is not left open in the kernel.
    env.close()

In [2]:
class FruitbotModel(ABC):
    """Abstract interface for agents acting in a vectorized Fruitbot environment.

    Subclasses must implement ``step`` (action selection) and ``train``
    (learning update). ``step_with_explore`` adds epsilon-greedy exploration
    on top of whatever ``step`` returns.
    """

    # Number of discrete actions; valid actions are the integers in range(0, 15).
    NUM_ACTIONS = 15

    def __init__(self):
        super().__init__()

    @abstractmethod
    def step(self, state):
        """
        TO IMPLEMENT:

        Compute actions according to the model, return a vector of actions to
        take in the vectorized environment.

        ==Inputs==
        state: State vector of size (N, 64, 64, 3)

        ==Outputs==
        action: An integer vector of actions of size (N), in which each action
            value is contained within range(0, 15), i.e. it must be a valid
            input for venv.step()
        """
        pass

    def step_with_explore(self, state, epsilon):
        """
        Simple implementation of epsilon-greedy exploration in vectorized form.
        Depends on the subclass implementation of ``step``.

        ==Inputs==
        state: Passed through unchanged to ``step``.
        epsilon: Each entry of the agent's action vector is kept when a
            uniform sample from [0, 1) is greater than ``epsilon`` and is
            otherwise replaced by a uniformly random action.

        ==Outputs==
        action: Array with one entry per element of the agent's action vector.
        """
        agent_step = self.step(state)

        # True where the agent's own action is kept.
        keep_agent = np.random.sample(agent_step.size) > epsilon

        # Random replacement actions, drawn for every slot and used only where
        # the mask is False.
        random_step = np.random.choice(a=self.NUM_ACTIONS, size=agent_step.size)

        return np.where(keep_agent, agent_step, random_step)

    # The earlier zero-argument abstract ``train(self)`` was removed: it was
    # immediately shadowed by this definition and so never part of the class.
    @abstractmethod
    def train(self, state0, action, state1, reward):
        """
        TO IMPLEMENT:

        Train the model according to state, action, reward information
        obtained from the environment.

        ==Inputs==
        state0: State the action was taken from.
        action: Action vector that was passed to the environment.
        state1: State returned by the environment after the action.
        reward: Reward returned by the environment for the step.
        """
        pass
    
    

In [3]:
class BaseModel(FruitbotModel):
    """Baseline agent that picks actions at random and never learns."""

    def __init__(self, num_envs):
        # One action is drawn per environment on every step.
        self.N = num_envs

    def step(self, state):
        """Return ``self.N`` actions drawn from range(15); ``state`` is not used."""
        return np.random.choice(a=15, size=self.N)

    def train(self, state0, action, state1, reward):
        """No-op: the random baseline has nothing to update."""
        return None
    
    

In [4]:
"""
Sample training loop

TODO:
* Update to track timesteps

"""

num_envs = 16
epsilon = .25          # exploration rate handed to step_with_explore
max_timesteps = 10000  # upper bound on the number of loop iterations

venv = procgen.ProcgenEnv(num_envs=num_envs, env_name="fruitbot", distribution_mode='easy')
model = BaseModel(num_envs)

try:
    state0 = venv.reset()

    # Bounded loop: the only other exit requires every environment to report
    # `done` on the same step, which may never happen, so cap the run instead
    # of relying on a manual interrupt.
    for timestep in range(max_timesteps):

        # Advance environment by one timestep
        action = model.step_with_explore(state0, epsilon)
        state1, rew, done, info = venv.step(action)

        # Train the model on the observed transition
        model.train(state0, action, state1, rew)
        state0 = state1
        venv.render()

        if np.all(done):
            break
        # time.sleep(.05)
finally:
    # Close the environment even when the cell is interrupted or a step raises.
    venv.close()

KeyboardInterrupt: 

In [5]:
# Explicitly close the vectorized environment.
# NOTE(review): the preceding cell's recorded output is a KeyboardInterrupt, so this
# looks like manual cleanup after an interrupted run.
venv.close()

In [4]:
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
from mpi4py import MPI
from baselines.ppo2 import ppo2
from baselines.common.models import build_impala_cnn
from baselines.common.mpi_util import setup_mpi_gpus
from procgen import ProcgenEnv
from baselines.common.vec_env import (
    VecExtractDictObs,
    VecMonitor,
    VecFrameStack,
    VecNormalize
)
from baselines import logger
from mpi4py import MPI
import argparse

Instructions for updating:
non-resource variables are not supported in the long term


In [5]:
# Prepare the GPU/TensorFlow runtime used by the cells below.
# NOTE(review): setup_mpi_gpus is imported from baselines.common.mpi_util; presumably it
# assigns GPUs per MPI rank and must run before the session is created — confirm.
setup_mpi_gpus()
config = tf.ConfigProto()
# Let TensorFlow grow its GPU memory allocation on demand instead of reserving it all up front.
config.gpu_options.allow_growth = True #pylint: disable=E1101
sess = tf.Session(config=config)
# Enter the session's context manager by hand; no matching __exit__ appears in the visible
# cells. Presumably this installs `sess` as the default session for later cells — confirm.
sess.__enter__()
    

<tensorflow.python.client.session.Session at 0x7f7168592e80>

In [6]:
# PPO hyperparameters shared by the training and model-construction cells.
num_envs = 64
learning_rate = 5e-4    # passed to ppo2.learn as `lr`
ent_coef = .01
gamma = .999
lam = .95
nsteps = 256
nminibatches = 8
ppo_epochs = 3          # passed to ppo2.learn as `noptepochs`
clip_range = .2         # passed to ppo2.learn as `cliprange`
use_vf_clipping = True  # passed to ppo2.learn as `clip_vf`

# These three previously ended in a trailing comma, which bound each name to a
# 1-tuple (e.g. vf_coef == (0.5,)) instead of the scalar value.
update_fn = None
init_fn = None
vf_coef = 0.5
max_grad_norm = 0.5
comm = None

In [7]:
# Level selection and parallelism for the training environment.
num_levels, start_level = 50, 0
num_envs = 64

# Create the vectorized Fruitbot environment, then wrap it one layer at a time.
venv = ProcgenEnv(
    num_envs=num_envs,
    env_name="fruitbot",
    num_levels=num_levels,
    start_level=start_level,
    distribution_mode='easy',
)

# NOTE(review): presumably selects the "rgb" entry of the dict observation — confirm.
venv = VecExtractDictObs(venv, "rgb")

# No monitor file is written (filename=None); keep_buf=100 is passed through unchanged.
venv = VecMonitor(venv=venv, filename=None, keep_buf=100)

# ob=False is passed so the wrapper is not asked to normalize observations.
venv = VecNormalize(venv=venv, ob=False)


In [8]:
# Network builder handed to baselines: wraps build_impala_cnn with fixed depths and embedding size.
conv_fn = lambda x: build_impala_cnn(x, depths=[16,32,32], emb_size=256)



# Launch PPO2 training on the wrapped environment using the hyperparameters defined above.
# NOTE(review): timesteps_per_proc, save_interval and mpi_rank_weight are not assigned in any
# visible cell, and load_path is only assigned in a later cell, so on a fresh
# Restart & Run All this call would raise NameError — confirm where they should come from.
# NOTE(review): update_fn, init_fn, vf_coef and max_grad_norm are passed as literals here
# rather than through the same-named variables from the hyperparameter cell.
ppo2.learn(
        env=venv,
        network=conv_fn,
        total_timesteps=timesteps_per_proc,
        save_interval=save_interval,
        nsteps=nsteps,
        nminibatches=nminibatches,
        lam=lam,
        gamma=gamma,
        noptepochs=ppo_epochs,
        log_interval=1,
        ent_coef=ent_coef,
        mpi_rank_weight=mpi_rank_weight,
        clip_vf=use_vf_clipping,
        comm=comm,
        lr=learning_rate,
        cliprange=clip_range,
        update_fn=None,
        init_fn=None,
        vf_coef=0.5,
        max_grad_norm=0.5,
        load_path=load_path
    )

In [9]:
from collections import deque
from baselines.common import explained_variance, set_global_seeds
from baselines.common.policies import build_policy
try:
    from mpi4py import MPI
except ImportError:
    MPI = None
from baselines.ppo2.runner import Runner

In [10]:
# Timestep budget; a later cell normalises it with int().
total_timesteps = 20_000

In [11]:
# Checkpoint path to restore; a later cell calls model.load(load_path) only when this is not None.
# NOTE(review): the None assignment is overwritten on the very next line — presumably the
# pair is a manual toggle (comment out the second line to skip loading). The path is
# relative to the notebook's working directory.
load_path = None
load_path = '../train_procgen/models/sample-gpu/checkpoints/03051'

In [12]:
# Make sure the timestep budget is a plain int.
total_timesteps = int(total_timesteps)

# Policy factory built from the environment and the network builder.
policy = build_policy(venv, conv_fn)

# Environment count and spaces, read from the vectorized env.
nenvs = venv.num_envs
ob_space, ac_space = venv.observation_space, venv.action_space

# One batch is nsteps transitions from each env; training consumes it in
# nminibatches equally sized slices.
nbatch = nenvs * nsteps
nbatch_train = nbatch // nminibatches

# Instantiate the model object (that creates act_model and train_model)
from baselines.ppo2.model import Model
model_fn = Model

model = model_fn(
    policy=policy,
    ob_space=ob_space,
    ac_space=ac_space,
    nbatch_act=nenvs,
    nbatch_train=nbatch_train,
    nsteps=nsteps,
    ent_coef=ent_coef,
    vf_coef=vf_coef,
    max_grad_norm=max_grad_norm,
    comm=comm,
    mpi_rank_weight=0,
)

# Restore saved weights only when a checkpoint path was configured.
if load_path is not None:
    model.load(load_path)




Instructions for updating:
Use `tf.cast` instead.
Instructions for updating:
Use `tf.keras.layers.Conv2D` instead.
Instructions for updating:
Please use `layer.__call__` method instead.
Instructions for updating:
Use keras.layers.MaxPooling2D instead.
Instructions for updating:
Use keras.layers.Flatten instead.
Instructions for updating:
Use keras.layers.Dense instead.


In [29]:
# Reset the vectorized environment.
# NOTE(review): the recorded output is an AttributeError ('NoneType' has no attribute
# 'libenv_reset'); presumably venv had already been closed when this ran — rebuild venv
# before resetting.
venv.reset()

AttributeError: 'NoneType' object has no attribute 'libenv_reset'

In [14]:
# NOTE(review): the recorded output is a TypeError — step() requires an `observation`
# argument; the rollout cell further down calls model.step(obs, S=states, M=dones).
model.step()

TypeError: step() missing 1 required positional argument: 'observation'

In [27]:
# Build the baselines Runner from the environment, model and rollout settings.
# NOTE(review): the recorded output is an AttributeError on 'libenv_reset' of a NoneType;
# presumably venv had already been closed when this ran — confirm.
runner = Runner(
    env=venv,
    model=model,
    nsteps=nsteps,
    gamma=gamma,
    lam=lam,
)

AttributeError: 'NoneType' object has no attribute 'libenv_reset'

In [19]:
# NOTE(review): the recorded output is a NameError ('runner' is not defined), consistent
# with the Runner(...) cell above having failed before `runner` was bound.
runner.run()

NameError: name 'runner' is not defined

In [20]:
# Initial model state; fed back into model.step as `S` by the rollout loop.
states = model.initial_state

# Pre-allocate an observation buffer with one slot per environment, then fill it
# from the initial reset.
obs = np.zeros(
    (num_envs, *venv.observation_space.shape),
    dtype=venv.observation_space.dtype.name,
)
obs[:] = venv.reset()

# One flag per environment, all initially False.
dones = [False] * num_envs

AttributeError: 'NoneType' object has no attribute 'libenv_reset'

In [17]:
# Roll the loaded policy forward for at most 10000 steps, rendering every frame.
try:
    for steps in range(10000):

        # Query the model for actions, feeding back its state and the done flags.
        actions, values, states, neglogpacs = model.step(obs, S=states, M=dones)

        # Advance environment by one timestep, writing observations into the
        # pre-allocated buffer in place.
        obs[:], rewards, dones, infos = venv.step(actions)

        venv.render()

        # Stop early only if every environment reports done on the same step.
        if np.all(dones):
            break
        # time.sleep(.05)
finally:
    # Close the environment even when the cell is interrupted or a step raises.
    venv.close()

KeyboardInterrupt: 

In [18]:
# Explicitly close the vectorized environment.
# NOTE(review): the preceding cell's recorded output is a KeyboardInterrupt, so this
# looks like manual cleanup after the rollout loop was stopped by hand.
venv.close()