# Understand Procgen Env Wrappers

### Summary

1. `VecNormalize` from OpenAI baselines can normalize both observations and rewards; PPO uses it to normalize and clip rewards.
2. PPO uses `VecPyTorchProcgen` to normalize observations (scaling pixel values from [0, 255] to [0, 1]).
3. `VecMonitor` saves the sum of the raw rewards of each completed episode into `info` (under `info["episode"]["r"]`) before the rewards are normalized or clipped.

## Model

In [1]:
import torch
from online.behavior_policies import PPOnet

# Rebuild the policy with the architecture arguments that match the checkpoint.
# (3, 64, 64) is the observation shape; 15 is presumably the size of the
# Procgen action space -- confirm against the PPOnet signature.
model = PPOnet((3, 64, 64), 15, base_kwargs={"hidden_size": 256})
cp_path = "/checkpoint/qingfeiyou/offlinerl/ppo/miner/miner-easy-200-ppo-lr0.0005-epoch3-mb8-v0.5-ha0.01_0/final/model_12.70.pt"

# Deserialize onto the CPU first so that loading does not depend on the device
# the checkpoint was saved from; the weights are moved to the GPU afterwards.
checkpoint_states = torch.load(cp_path, map_location="cpu")
model.load_state_dict(checkpoint_states["model_state_dict"])

# Kept as the last expression of the cell so the notebook still displays the
# module structure.
model.to("cuda:0")

  from .autonotebook import tqdm as notebook_tqdm


PPOnet(
  (base): ResNetBase(
    (layer1): Sequential(
      (0): Conv2d_tf(3, 16, kernel_size=(3, 3), stride=(1, 1), padding=SAME)
      (1): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (2): BasicBlock(
        (conv1): Conv2d_tf(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (relu): ReLU(inplace=True)
        (conv2): Conv2d_tf(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      )
      (3): BasicBlock(
        (conv1): Conv2d_tf(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (relu): ReLU(inplace=True)
        (conv2): Conv2d_tf(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      )
    )
    (layer2): Sequential(
      (0): Conv2d_tf(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=SAME)
      (1): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (2): BasicBlock(
        (conv1): Conv2d_tf(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))


## Envs

### 1. Normalized Observations and Rewards

In [2]:
import torch
from online.behavior_policies import PPOnet, make_venv

# One "miner" environment restricted to 200 easy levels starting at level 0,
# with both return normalization and observation normalization switched on.
venv = make_venv(
    num_envs=1,
    env_name="miner",
    device="cuda:0",
    num_levels=200,
    start_level=0,
    distribution_mode="easy",
    ret_normalization=True,
    obs_normalization=True,
)

# First observation, consumed by the rollout cell that follows.
obs = venv.reset()


2023-03-29 08:32:27.202187: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-03-29 08:32:27.246494: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
import numpy as np

eval_episode_rewards = []  # episode returns reported through `info`
raw = []  # per-step rewards exactly as returned by `venv.step`

# Roll the policy out until one episode has completed.
while len(eval_episode_rewards) < 1:
    with torch.no_grad():
        _, action, _ = model.act(obs)

    obs, reward, _done, infos = venv.step(action)

    # An "episode" entry only shows up in `info` once an episode has finished.
    eval_episode_rewards.extend(info["episode"]["r"] for info in infos if "episode" in info)
    raw.append(reward)

print(eval_episode_rewards)
print(raw)
print(sum(raw))

# Mismatch between rewards from info (actually from VecMonitor) and rewards from step (normed and clipped in VecNormalize)
# Compare with an explicit numeric check instead of `list != ndarray`, which
# yields an element-wise array and only works as an assert for one element.
step_reward_total = sum(raw).cpu().squeeze(1).numpy()
assert not np.allclose(eval_episode_rewards, step_reward_total)

[13.0]
[tensor([[0.]]), tensor([[0.]]), tensor([[0.]]), tensor([[0.]]), tensor([[0.]]), tensor([[0.]]), tensor([[0.]]), tensor([[3.0236]]), tensor([[0.]]), tensor([[0.]]), tensor([[0.]]), tensor([[1.5771]]), tensor([[0.]]), tensor([[0.]]), tensor([[0.]]), tensor([[0.]]), tensor([[0.]]), tensor([[0.]]), tensor([[0.]]), tensor([[0.]]), tensor([[0.]]), tensor([[0.]]), tensor([[0.]]), tensor([[0.]]), tensor([[1.1763]]), tensor([[0.]]), tensor([[0.]]), tensor([[0.]]), tensor([[0.]]), tensor([[0.]]), tensor([[0.]]), tensor([[0.]]), tensor([[0.]]), tensor([[0.]]), tensor([[4.9094]])]
tensor([[10.6865]])


### 2. Normalized Observations and Unnormalized Rewards

In [4]:
import torch
from procgen import ProcgenEnv
from baselines.common.vec_env import VecExtractDictObs, VecMonitor, VecNormalize
from online.behavior_policies.envs import VecPyTorchProcgen

# Build the wrapper stack by hand, innermost layer first.
procgen_env = ProcgenEnv(num_envs=1, env_name="miner", num_levels=200, start_level=0, distribution_mode="easy")
rgb_env = VecExtractDictObs(procgen_env, "rgb")
# VecMonitor sits below VecNormalize, so it sees the rewards first.
monitored_env = VecMonitor(venv=rgb_env, filename=None, keep_buf=100)
# Both normalizers are switched off: no observation and no reward normalization here.
plain_env = VecNormalize(venv=monitored_env, ob=False, ret=False)
# Here we normalize observations
envs = VecPyTorchProcgen(plain_env, device="cuda:0", normalize=True)

print(envs.observation_space)
obs = envs.reset()


Box(3, 64, 64)


In [6]:
import numpy as np

eval_episode_rewards = []  # episode returns reported through `info`
raw = []  # per-step rewards exactly as returned by `envs.step`

# Roll the policy out until one episode has completed.
while len(eval_episode_rewards) < 1:
    with torch.no_grad():
        _, action, _ = model.act(obs)

    obs, reward, done, infos = envs.step(action)

    # An "episode" entry only shows up in `info` once an episode has finished.
    eval_episode_rewards.extend(info["episode"]["r"] for info in infos if "episode" in info)
    raw.append(reward)

print(eval_episode_rewards)
print(raw)
print(sum(raw))

# Sanity check that rewards are not normed nor clipped
# Compare with an explicit numeric check instead of `list == ndarray`, which
# yields an element-wise array and only works as an assert for one element.
step_reward_total = sum(raw).cpu().squeeze(1).numpy()
assert np.allclose(eval_episode_rewards, step_reward_total)

[13.0]
[tensor([[0.]]), tensor([[0.]]), tensor([[1.]]), tensor([[0.]]), tensor([[0.]]), tensor([[0.]]), tensor([[0.]]), tensor([[0.]]), tensor([[0.]]), tensor([[0.]]), tensor([[0.]]), tensor([[1.]]), tensor([[0.]]), tensor([[0.]]), tensor([[0.]]), tensor([[0.]]), tensor([[0.]]), tensor([[0.]]), tensor([[1.]]), tensor([[0.]]), tensor([[0.]]), tensor([[0.]]), tensor([[0.]]), tensor([[10.]])]
tensor([[13.]])


### 3. Unnormalized Observations and Unnormalized Rewards

In [7]:
import torch
from procgen import ProcgenEnv
from baselines.common.vec_env import VecExtractDictObs, VecMonitor, VecNormalize
from online.behavior_policies.envs import VecPyTorchProcgen

# Build the wrapper stack by hand, innermost layer first.
procgen_env = ProcgenEnv(num_envs=1, env_name="miner", num_levels=200, start_level=0, distribution_mode="easy")
rgb_env = VecExtractDictObs(procgen_env, "rgb")
# VecMonitor sits below VecNormalize, so it sees the rewards first.
monitored_env = VecMonitor(venv=rgb_env, filename=None, keep_buf=100)
# No rewards normalization
plain_env = VecNormalize(venv=monitored_env, ob=False, ret=False)
# No obs normalization
envs = VecPyTorchProcgen(plain_env, device="cuda:0", normalize=False)

print(envs.observation_space)
obs = envs.reset()


Box(3, 64, 64)


In [9]:
import numpy as np

eval_episode_rewards = []  # episode returns reported through `info`
raw = []  # per-step rewards exactly as returned by `envs.step`

# Roll the policy out until one episode has completed.
while len(eval_episode_rewards) < 1:
    with torch.no_grad():
        # since the model was trained with normed obs, we need to provide normed obs during inference
        _, action, _ = model.act(obs / 255.0)

    obs, reward, done, infos = envs.step(action)

    # An "episode" entry only shows up in `info` once an episode has finished.
    eval_episode_rewards.extend(info["episode"]["r"] for info in infos if "episode" in info)
    raw.append(reward)

print(eval_episode_rewards)
print(raw)
print(sum(raw))

# Sanity check that rewards are not normed nor clipped
# Compare with an explicit numeric check instead of `list == ndarray`, which
# yields an element-wise array and only works as an assert for one element.
step_reward_total = sum(raw).cpu().squeeze(1).numpy()
assert np.allclose(eval_episode_rewards, step_reward_total)

[13.0]
[tensor([[0.]]), tensor([[1.]]), tensor([[0.]]), tensor([[0.]]), tensor([[0.]]), tensor([[0.]]), tensor([[0.]]), tensor([[1.]]), tensor([[0.]]), tensor([[0.]]), tensor([[0.]]), tensor([[1.]]), tensor([[0.]]), tensor([[0.]]), tensor([[0.]]), tensor([[0.]]), tensor([[10.]])]
tensor([[13.]])
