# Training Commands:

```bash
python train_procgen.py --env-name fruitbot --num-levels 50 --start-level 100 --distribution-mode=easy --exp-name=trialname --num-envs 8 --max-steps 5000000
```

## Rendering Environment in Notebook

Credits to:
https://hub.docker.com/r/jaimeps/rl-gym


Example code:
```python
import gym
from IPython import display
import matplotlib.pyplot as plt
%matplotlib inline

env = gym.make('Breakout-v0')
env.reset()
for _ in range(1000):
    plt.imshow(env.render(mode='rgb_array'))
    display.clear_output(wait=True)
    display.display(plt.gcf())
    env.step(env.action_space.sample())
```

In [4]:
from collections import deque
import argparse
import os
import time
import torch
import numpy as np

from procgen import ProcgenEnv
from vec_env import VecExtractDictObs
from vec_env import VecMonitor
from vec_env import VecNormalize
from util import logger

from policies import ImpalaCNN
from ppo import PPO

In [5]:
from IPython import display
import matplotlib.pyplot as plt
%matplotlib inline

In [6]:
from train_procgen import *

In [7]:
import argparse
import torch

In [8]:
parser = argparse.ArgumentParser(
    description='Process procgen training arguments.')

# Every command-line option as (flag, add_argument keyword arguments).
# They are registered in exactly this order, so the generated help text
# lists them in the same sequence.
_ARG_SPECS = (
    # Experiment parameters.
    ('--distribution-mode', dict(
        type=str, default='easy',
        choices=['easy', 'hard', 'exploration', 'memory', 'extreme'])),
    ('--env-name', dict(type=str, default='starpilot')),
    ('--num-envs', dict(type=int, default=64)),
    ('--num-levels', dict(type=int, default=0)),
    ('--start-level', dict(type=int, default=0)),
    ('--num-threads', dict(type=int, default=4)),
    ('--exp-name', dict(type=str, default='trial01')),
    ('--log-dir', dict(type=str, default='./log')),
    ('--model-file', dict(type=str, default=None)),
    ('--method-label', dict(type=str, default='vanilla')),
    # PPO parameters.
    ('--gpu', dict(type=int, default=0)),
    ('--lr', dict(type=float, default=5e-4)),
    ('--ent-coef', dict(type=float, default=0.01)),
    ('--vf-coef', dict(type=float, default=0.5)),
    ('--gamma', dict(type=float, default=0.999)),
    ('--lam', dict(type=float, default=0.95)),
    ('--clip-range', dict(type=float, default=0.2)),
    ('--max-grad-norm', dict(type=float, default=0.5)),
    ('--nsteps', dict(type=int, default=256)),
    ('--batch-size', dict(type=int, default=8)),
    ('--nepochs', dict(type=int, default=3)),
    ('--max-steps', dict(type=int, default=25_000_000)),
    ('--save-interval', dict(type=int, default=100)),
)
for _flag, _options in _ARG_SPECS:
    parser.add_argument(_flag, **_options)

# Settings for this notebook run, passed explicitly instead of being read
# from sys.argv.
# NOTE(review): the checkpoint directory is named nlev_200 while
# --num-levels is 500 here -- presumably deliberate (evaluating on a
# different number of levels than the model was trained on); confirm.
_NOTEBOOK_ARGS = [
    '--env-name', 'fruitbot',
    '--num-envs', '1',
    '--num-levels', '500',
    '--start-level', '0',
    '--distribution-mode', 'easy',
    '--max-steps', '1000',
    '--model-file', './log/fruitbot/nlev_200_easy/vanilla/trial01/model_final.pt',
]
configs = parser.parse_args(args=_NOTEBOOK_ARGS)

In [9]:
# Echo the parsed namespace; its recorded repr follows this cell.
configs

Namespace(batch_size=8, clip_range=0.2, distribution_mode='easy', ent_coef=0.01, env_name='fruitbot', exp_name='trial01', gamma=0.999, gpu=0, lam=0.95, log_dir='./log', lr=0.0005, max_grad_norm=0.5, max_steps=1000, method_label='vanilla', model_file='./log/fruitbot/nlev_200_easy/vanilla/trial01/model_final.pt', nepochs=3, nsteps=256, num_envs=1, num_levels=500, num_threads=4, save_interval=100, start_level=0, vf_coef=0.5)

In [10]:
def create_venv(config, is_valid=False):
    """Build the wrapped Procgen vector environment described by `config`.

    Args:
        config: Namespace providing `num_envs`, `env_name`, `num_levels`,
            `start_level`, `distribution_mode` and `num_threads`.
        is_valid: When true, `num_levels` and `start_level` are both passed
            to the environment as 0 instead of the configured values.
            (Procgen documents num_levels=0 as "unlimited levels" --
            confirm against the installed version.)

    Returns:
        The `ProcgenEnv` wrapped, in order, by `VecExtractDictObs` (key
        "rgb"), `VecMonitor` and `VecNormalize` (called with `ob=False`).
    """
    if is_valid:
        num_levels, start_level = 0, 0
    else:
        num_levels, start_level = config.num_levels, config.start_level

    base_env = ProcgenEnv(
        num_envs=config.num_envs,
        env_name=config.env_name,
        num_levels=num_levels,
        start_level=start_level,
        distribution_mode=config.distribution_mode,
        num_threads=config.num_threads,
        render_mode='rgb_array',
    )
    # Each wrapper takes the previous stage as its underlying env.
    rgb_env = VecExtractDictObs(base_env, "rgb")
    monitored_env = VecMonitor(venv=rgb_env, filename=None, keep_buf=100)
    return VecNormalize(venv=monitored_env, ob=False)

In [11]:
# Training env uses the configured level range (is_valid defaults to False).
train_venv = create_venv(config=configs)
# Validation env: create_venv passes num_levels=0 and start_level=0 instead.
valid_venv = create_venv(config=configs, is_valid=True)

In [13]:
# Shape of one observation from the wrapped training env.
train_venv.observation_space.shape

(64, 64, 3)

In [15]:
# Size of the action space (later passed to ImpalaCNN as num_outputs).
train_venv.action_space.n

15

In [9]:
# Policy network sized from the training env's observation and action spaces.
policy = ImpalaCNN(
    num_outputs=train_venv.action_space.n,
    obs_space=train_venv.observation_space,
)

optimizer = torch.optim.Adam(policy.parameters(), eps=1e-5, lr=configs.lr)

# PPO hyperparameters, all taken from the parsed command-line configuration.
ppo_hyperparams = dict(
    gamma=configs.gamma,
    lambd=configs.lam,
    value_func_coef=configs.vf_coef,
    entropy_coef=configs.ent_coef,
    # One update per `nsteps` steps in each of the `num_envs` environments.
    update_interval=configs.num_envs * configs.nsteps,
    minibatch_size=configs.batch_size,
    epochs=configs.nepochs,
    # The same clip range is used for the policy and the value function.
    clip_eps=configs.clip_range,
    clip_eps_vf=configs.clip_range,
    max_grad_norm=configs.max_grad_norm,
)

ppo_agent = PPO(
    model=policy,
    optimizer=optimizer,
    gpu=configs.gpu,
    **ppo_hyperparams,
)

In [10]:
# Short aliases for the names the evaluation cells below refer to.
# train_venv and log_dir are deliberately not aliased here -- presumably
# leftovers from a training entry point's argument list; confirm.
agent, config, test_env = ppo_agent, configs, valid_venv
# log_dir

In [11]:
# Load checkpoint weights when a path was supplied; otherwise the freshly
# initialised policy is evaluated.
if config.model_file is None:
    print('Testing model from scratch')
else:
    agent.model.load_from_file(config.model_file)
    print('Loaded model')

# Evaluation state: per-env step counters, the first batch of observations,
# and a bounded buffer for episode infos.
test_steps = np.zeros(config.num_envs, dtype=int)
test_obs = test_env.reset()
test_epinfo_buf = deque(maxlen=100)

# Batch bookkeeping derived from the configuration. Within this cell only
# `nupdates` is read (by the message below); `n_ops_per_update` and
# `max_steps` are not used by the code visible in this notebook.
nbatch = config.nsteps * config.num_envs
nupdates = config.max_steps // nbatch
max_steps = config.max_steps // config.num_envs
n_ops_per_update = nbatch * config.nepochs / (nbatch // config.batch_size)

print(
    'Start testing for {} steps (approximately {} updates)'.format(
        config.max_steps, nupdates
    )
)

Loaded model
Start testing for 1000 steps (approximately 3 updates)


In [12]:
import random

from tqdm import tqdm

# Seed the torch, stdlib and NumPy random number generators.
torch.manual_seed(0)
random.seed(0)
np.random.seed(0)

# Per-environment step limit. This loop inlines the project's
# rollout_one_step(agent, env, obs, steps, env_max_steps=1000) helper so
# that rewards and episode ends can be tallied after every step.
env_max_steps = 1000

done_count = 0        # episodes finished so far, summed over all envs
step_cnt = 0          # environment steps taken so far, summed over all envs
total_rewards = 0.0   # sum of every per-step reward received

# NOTE(review): create_venv wraps the env in VecNormalize, so `reward` may be
# a normalised value rather than the raw game reward -- confirm before
# reporting "AR" as a game score.
#
# Optional rendering: every few steps, fetch a frame with
# test_env.render(mode='rgb_array'), skip it if it is None, and show it via
# plt.imshow(...), display.clear_output(wait=True), display.display(plt.gcf()).

# The progress bar counts finished episodes; the context manager closes it
# even if the loop raises. Evaluation mode is entered once for the whole run
# instead of once per step.
with tqdm(total=config.num_levels, position=0, leave=True) as pbar, \
        agent.eval_mode():
    assert not agent.training

    while done_count < config.num_levels:
        # Step every environment once with the agent's chosen actions.
        action = agent.batch_act(test_obs)
        new_obs, reward, done, infos = test_env.step(action)

        # Advance the per-env step counters in place, flag the envs that hit
        # the step limit, then zero the counters of envs whose episode ended.
        test_steps += 1
        reset = test_steps == env_max_steps
        test_steps[done] = 0

        # Report the transition back to the agent.
        agent.batch_observe(
            batch_obs=new_obs,
            batch_reward=reward,
            batch_done=done,
            batch_reset=reset,
        )
        test_obs = new_obs

        # Tally over all environments, so the statistics stay correct (and
        # `done` is never used as a bare truth value) when num_envs > 1.
        step_cnt += len(done)
        total_rewards += float(np.sum(reward))
        finished = int(np.sum(done))
        if finished:
            done_count += finished

            pbar.update(finished)
            # AR: average summed reward per finished episode.
            # AL: average number of env steps per finished episode.
            pbar.set_description(
                f"AR: {total_rewards / done_count:.3f} "
                f"AL: {step_cnt / done_count:.3f}"
            )

AR: 10.739 AL: 359.718: 100%|██████████| 500/500 [11:57<00:00,  1.09s/it]