In [None]:
import reinforceable
from reinforceable import agents 
from reinforceable import envs 
from reinforceable import layers
from reinforceable import utils

import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp
import keras

import gymnasium as gym

import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

## 1. Set up environment

In [None]:
# Gymnasium id of the Atari game to train on.
# NOTE(review): `id` shadows Python's builtin id(); the name is kept because
# make_env reads it as a global (and other cells may too).
id = 'BeamRiderNoFrameskip-v4'
# Forwarded unchanged to gym.make(..., full_action_space=...).
full_action_space = False
# Base seed; make_env receives this value (or an offset of it) from callers.
seed = 1


def make_env(seed, render_mode=None):
    """Create one wrapped and seeded instance of the environment `id`.

    Args:
        seed: Value passed to `env.seed` and to the `seed` methods of the
            action and observation spaces.
        render_mode: Forwarded to `gym.make`. Defaults to None.

    Returns:
        The environment after `AtariPreprocessing` and the four
        `envs.gym_wrappers` wrappers have been applied, in that order.
    """
    base_env = gym.make(
        id,
        render_mode=render_mode,
        full_action_space=full_action_space,
    )

    # Adding more preprocessing steps (env wrappers) will likely
    # improve learning of the agent significantly.
    # For instance, scaling of the reward between -1 and 1.
    preprocessing_options = dict(
        noop_max=0,
        frame_skip=8,
        screen_size=84,
        terminal_on_life_loss=False,
        grayscale_obs=True,
        grayscale_newaxis=True,
        scale_obs=False,
    )
    wrapped_env = gym.wrappers.AtariPreprocessing(base_env, **preprocessing_options)

    # Wrappers are applied innermost-first; the order below matches the
    # order in which the environment is wrapped.
    wrapper_chain = (
        envs.gym_wrappers.FloatingStateEnv,
        envs.gym_wrappers.EpisodicLifeEnv,
        envs.gym_wrappers.NoInfoEnv,
        envs.gym_wrappers.TimestepEnv,
    )
    for wrapper in wrapper_chain:
        wrapped_env = wrapper(wrapped_env)

    # NOTE(review): assumes the outermost wrapper exposes `seed()` -- confirm
    # against the installed gymnasium / reinforceable versions.
    wrapped_env.seed(seed)
    for space in (wrapped_env.action_space, wrapped_env.observation_space):
        space.seed(seed)

    return wrapped_env


In [None]:
# Number of environments stepped in parallel.
batch_shape = (32,)

# Build ONE throwaway environment to read the spaces, then release it
# (previously two environments were created here and never closed).
probe_env = make_env(seed)
state_shape = probe_env.observation_space.shape
num_actions = probe_env.action_space.n
probe_env.close()

# One constructor per environment, each with its own seed. The seed is bound
# through a default argument: a plain `lambda: make_env(seed + i)` would look
# up `i` only when called, i.e. after the comprehension has finished, so every
# environment would end up with the same seed (seed + batch_shape[0] - 1).
env_constructors = [
    lambda env_seed=seed + i: make_env(env_seed)
    for i in range(batch_shape[0])
]

# Specs of the batched timesteps emitted by the asynchronous environment.
output_signature = reinforceable.Timestep(
    state=tf.TensorSpec(batch_shape + state_shape, dtype=tf.float32),
    reward=tf.TensorSpec(batch_shape + (1,), dtype=tf.float32),
    step_type=tf.TensorSpec(batch_shape + (1,), dtype=tf.int32),
    info={},
)

env = envs.AsyncEnvironment(
    env_constructors,
    output_signature=output_signature)

# Visualize initial state of the first environment
plt.imshow(env.reset().state[0]);

## 2. Build networks

In [None]:
# Symbolic inputs: the batched states and a boolean mask that is passed to
# the StatefulRNN layer together with the encoded states.
inputs = keras.layers.Input(batch_shape + state_shape, dtype=tf.float32)
states_mask = keras.layers.Input(batch_shape + (1,), dtype=tf.bool)

# Note: TimeDistributed wrapper only really needed for Flatten() below, though good to be explicit.

# Build encoder network: three conv layers, a dense layer, then a stateful LSTM
x = inputs
x = keras.layers.TimeDistributed(keras.layers.Conv2D(32, 8, strides=4, activation='relu'))(x)
x = keras.layers.TimeDistributed(keras.layers.Conv2D(64, 4, strides=2, activation='relu'))(x)
x = keras.layers.TimeDistributed(keras.layers.Conv2D(64, 3, strides=1, activation='relu'))(x)
x = keras.layers.TimeDistributed(keras.layers.Flatten())(x)
x = keras.layers.TimeDistributed(keras.layers.Dense(512, activation='relu'))(x)
encodings = layers.StatefulRNN(keras.layers.LSTMCell(128))(x, states_mask)

encoder_network = keras.Model((inputs, states_mask), encodings, name='encoder_network')

# Build policy network (shares encoder network): maps encodings to a
# categorical distribution over the `num_actions` actions
x = keras.layers.TimeDistributed(keras.layers.Dense(512, activation='relu'))(encoder_network.output)
distribs = layers.DenseCategorical((num_actions,))(x)
policy_network = keras.Model(encoder_network.output, distribs, name='policy_network')

# Build value network (shares encoder network): maps encodings to one scalar
x = keras.layers.TimeDistributed(keras.layers.Dense(512, activation='relu'))(encoder_network.output)
values = keras.layers.TimeDistributed(keras.layers.Dense(1))(x)
value_network = keras.Model(encoder_network.output, values, name='value_network')

# Visualize networks. `summary()` prints the table itself and returns None,
# so it is called directly; wrapping it in print() emitted a stray "None".
encoder_network.summary()
policy_network.summary()
value_network.summary()

## 3. Create agent

In [None]:
# Learning-rate schedule: PolynomialDecay from 3e-4 down to 1e-5 over
# `decay_steps` steps.
learning_rate_schedule = keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=3e-4,
    decay_steps=500_000,
    end_learning_rate=1e-5,
)
optimizer = keras.optimizers.Adam(learning_rate=learning_rate_schedule)

# Hyperparameters forwarded as keyword arguments to RecurrentPPOAgent.
# The grouping below is by argument name only; see the agent's documentation
# for the exact meaning of each option.
agent_options = dict(
    optimizer=optimizer,
    # discounting / return estimation
    discount_factor=0.995,
    lambda_factor=0.95,
    use_gae=True,
    use_td_lambda_return=False,
    # loss term coefficients
    value_loss_coef=0.2,
    policy_loss_coef=1.0,
    entropy_loss_coef=0.01,
    # KL-related settings
    kl_cutoff_factor=2.0,
    kl_cutoff_coef=1000.0,
    kl_beta_initial=1.0,
    kl_target=0.01,
    kl_tolerance=0.3,
    # clipping
    gradient_clip=0.5,
    value_clip=0.1,
    importance_ratio_clip=0.1,
    # normalization switches
    reward_normalization=True,
    state_normalization=True,
    advantage_normalization=True,
    # path handed to the agent for its summaries (read back later via
    # `agent.summary_writer`)
    summary_writer='/tmp/mylogs/recurrent_ppo_agent',
)

agent = agents.RecurrentPPOAgent(
    encoder_network,
    policy_network,
    value_network,
    **agent_options,
)



In [None]:
iters = 10000   # total number of iterations (10k calls to driver.run and agent.train)
steps = 4096    # total steps (128 for each of the 32 environments)

# Metrics tracked by the driver; the keys are read back from driver.result().
observers = {
    'episode length': utils.observers.RollingAverageEpisodeLength(10),
    'episode return': utils.observers.RollingAverageEpisodeReturn(10),
    'steps': utils.observers.StepCounter(),
    'episodes': utils.observers.EpisodeCounter(),
}

driver = reinforceable.Driver(agent, env, observers)

# (summary tag, observers key) pairs written as scalars every iteration.
summary_scalars = (
    ('episode_return', 'episode return'),
    ('episode_length', 'episode length'),
)

pbar = tqdm(range(iters))
for i in pbar:

    # Collect `steps` steps of experience, then train the agent on them
    data = driver.run(steps=steps)
    loss = agent.train(data, batch_size=32, repeats=4)

    # Current observer values, keyed like `observers`
    result = driver.result()

    pbar.set_description(
        f'average return: {result["episode return"]:.2f}\t-\t'
        f'average length: {int(result["episode length"]):,}\t-\t'
        f'total steps: {int(result["steps"]):,}\t-\t'
        f'total episodes: {int(result["episodes"]):,}\t-\t'
    )

    # Use agent summary writer to add rolling average episode return and length.
    # To write summaries (to tensorboard), pass a path to summary_writer.
    # Then from terminal, run `tensorboard --logdir path`
    writer = agent.summary_writer
    if writer is not None:
        with writer.as_default():
            for tag, key in summary_scalars:
                tf.summary.scalar(tag, result[key], result['steps'])

pbar.close()

## 4. Debug 

In [None]:
# A single rendered environment for watching the agent play.
play_batch_shape = (1,)

# Same timestep layout as the training environment, with a batch of one.
play_signature = reinforceable.Timestep(
    state=tf.TensorSpec(play_batch_shape + state_shape, dtype=tf.float32),
    reward=tf.TensorSpec(play_batch_shape + (1,), dtype=tf.float32),
    step_type=tf.TensorSpec(play_batch_shape + (1,), dtype=tf.int32),
    info={},
)

play_env = envs.AsyncEnvironment(
    [lambda: make_env(seed, render_mode='human')],
    output_signature=play_signature,
)

In [None]:
# Play with the trained agent in the rendered environment.
# NOTE(review): `_play` is a private method of the agent; `pad` is set to the
# training batch size minus one -- presumably so the single play environment
# is padded up to the batch size the networks were built with. Confirm.
num_padding = batch_shape[0] - 1
episode_return, episode_length = agent._play(
    play_env,
    deterministic=False,
    pad=num_padding,
)