In [None]:
import reinforceable

import tensorflow as tf
import keras

import gymnasium as gym

import numpy as np

import matplotlib.pyplot as plt

from tqdm.notebook import tqdm

import os

from networks import EncoderNetwork
from networks import PolicyNetwork
from networks import ValueNetwork

from env import Chromatography 

# Best-effort reproducibility: set TensorFlow's determinism environment flags
# and seed the NumPy and TensorFlow global random generators with the same value.

os.environ.update(
    TF_DETERMINISTIC_OPS='1',
    TF_CUDNN_DETERMINISTIC='1',
)

np.random.seed(42)
tf.random.set_seed(42)

# Helper function to save agent during run:

def save_agent(agent, path, batch_shape=(None, None)):
    """Save ``agent`` to ``path`` with a ``Timestep`` spec describing its input.

    The spec is a ``reinforceable.Timestep`` of ``tf.TensorSpec`` objects; each
    shape is ``batch_shape`` followed by the per-example shape of that field.
    The per-example state shapes come from the notebook settings
    (``CHROMATOGRAM_SHAPE`` and ``PHI_SHAPE``), so the saved signature follows
    the configuration instead of a hard-coded copy of it.

    Args:
        agent: Object exposing ``save(path, timestep_spec)``.
        path: Destination forwarded unchanged to ``agent.save``.
        batch_shape: Leading dimensions shared by every field, as a list or
            tuple. ``None`` entries leave a dimension unspecified.
            NOTE(review): the two default dims are presumably the time and
            batch axes of the recurrent agent -- confirm the order.
    """
    # Normalise to a list so both tuples and lists can be concatenated below.
    leading_dims = list(batch_shape)

    def spec(example_shape, dtype):
        # One TensorSpec with the shared leading dims prepended.
        return tf.TensorSpec(shape=leading_dims + list(example_shape), dtype=dtype)

    agent.save(
        path,
        reinforceable.Timestep(
            state={
                'chromatogram': spec(CHROMATOGRAM_SHAPE, tf.float32),
                'phi_target': spec(PHI_SHAPE, tf.float32),
            },
            step_type=spec((1,), tf.int32),
            reward=spec((1,), tf.float32),
            info={},
        ),
    )

## Settings

In [None]:
# Main loop settings
ITERS = 3_500     # main loop: number of collect-then-train iterations
STEPS = 1024      # interaction loop: steps passed to the driver per iteration
BATCH_SIZE = 32   # training loop: batch size passed to agent.train
REPEATS = 4       # training loop: repeats passed to agent.train

# Environment settings
MAX_TIME = 20.0
PARALLEL_ENVS = 8                 # number of environments run side by side
NUM_COMPOUNDS = (10, 20)          # NOTE(review): presumably a (min, max) range -- confirm in env.Chromatography
NUM_EXPERIMENTS = 1               # Episode length
NUM_SEGMENTS = 3
NUM_ACTIONS = NUM_SEGMENTS + 1    # + 1 for initial phi
CHROMATOGRAM_SHAPE = (8192, 1)    # per-example shape of the 'chromatogram' state
PHI_SHAPE = (NUM_ACTIONS,)        # per-example shape of the 'phi_target' state

# Agent settings and PPO hyperparameters
LR_INITIAL = 1e-4
LR_END = 1e-6
LR_DECAY_STEPS = 50_000

DISCOUNT_FACTOR = 0.99
LAMBDA_FACTOR = 0.95
USE_GAE = True
USE_TD_LAMBDA_RETURN = False
VALUE_LOSS_COEF = 0.5
POLICY_LOSS_COEF = 1.0
ENTROPY_LOSS_COEF = 0.01
KL_CUTOFF_FACTOR = 2.0
KL_CUTOFF_COEF = 1000.0
KL_BETA_INITIAL = 1.0
KL_TARGET = 0.01
KL_TOLERANCE = 0.3
GRADIENT_CLIPPING = 0.5
VALUE_CLIPPING = 0.2
IMPORTANCE_RATIO_CLIPPING = 0.2
REWARD_NORMALIZATION = False
STATE_NORMALIZATION = False
ADVANTAGE_NORMALIZATION = True

# Run identifier: S<segments>E<experiments>, zero-padded to two digits.
# NUM_SEGMENTS equals NUM_ACTIONS - 1, so the name is unchanged.
AGENT_NAME = f'Agent-S{NUM_SEGMENTS:02d}E{NUM_EXPERIMENTS:02d}'
# Output locations derived from the run identifier.
SUMMARY_WRITER = f'./logs/{AGENT_NAME}'
AGENT_PATH = f'./saved_agents/{AGENT_NAME}'

## Build Env

In [None]:
def get_env(num_actions, num_experiments):
    """Build the asynchronous environment wrapping ``PARALLEL_ENVS`` simulators.

    Args:
        num_actions: Forwarded to each ``Chromatography`` as ``num_actions``.
        num_experiments: Forwarded to each ``Chromatography`` as
            ``num_experiments``.

    Returns:
        A ``reinforceable.envs.AsyncEnvironment`` built from one factory per
        environment. Each factory gets a distinct seed (``i * 1_000_000``).
    """
    # The arguments are used here rather than the NUM_ACTIONS / NUM_EXPERIMENTS
    # globals, so a caller passing different values actually gets them.
    # `i=i` binds the loop index at definition time; without it every factory
    # would see the final value of `i` and share one seed.
    env_factories = [
        lambda i=i: Chromatography(
            time=MAX_TIME,
            num_actions=num_actions,
            num_experiments=num_experiments,
            num_compounds=NUM_COMPOUNDS,
            seed=i * 1_000_000
        ) for i in range(PARALLEL_ENVS)
    ]
    return reinforceable.envs.AsyncEnvironment(env_factories)

async_env = get_env(NUM_ACTIONS, NUM_EXPERIMENTS)

## Build Agent

In [None]:
def get_agent():
    """Assemble the networks and optimizer into a ``RecurrentPPOAgent``.

    Every hyperparameter is read from the notebook's settings constants.

    Returns:
        The constructed ``reinforceable.agents.RecurrentPPOAgent``.
    """
    # The encoder's output is the input of both the policy and the value network.
    encoder = EncoderNetwork(PARALLEL_ENVS, CHROMATOGRAM_SHAPE, PHI_SHAPE)
    policy_head = PolicyNetwork(encoder.output, (NUM_ACTIONS,))
    value_head = ValueNetwork(encoder.output)

    # Learning rate decays polynomially (power 2) from LR_INITIAL to LR_END.
    lr_schedule = keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate=LR_INITIAL,
        decay_steps=LR_DECAY_STEPS,
        end_learning_rate=LR_END,
        power=2.0,
    )

    ppo_kwargs = dict(
        optimizer=keras.optimizers.Adam(lr_schedule),
        discount_factor=DISCOUNT_FACTOR,
        lambda_factor=LAMBDA_FACTOR,
        use_gae=USE_GAE,
        use_td_lambda_return=USE_TD_LAMBDA_RETURN,
        value_loss_coef=VALUE_LOSS_COEF,
        policy_loss_coef=POLICY_LOSS_COEF,
        entropy_loss_coef=ENTROPY_LOSS_COEF,
        kl_cutoff_factor=KL_CUTOFF_FACTOR,
        kl_cutoff_coef=KL_CUTOFF_COEF,
        kl_beta_initial=KL_BETA_INITIAL,
        kl_target=KL_TARGET,
        kl_tolerance=KL_TOLERANCE,
        gradient_clip=GRADIENT_CLIPPING,
        value_clip=VALUE_CLIPPING,
        importance_ratio_clip=IMPORTANCE_RATIO_CLIPPING,
        reward_normalization=REWARD_NORMALIZATION,
        state_normalization=STATE_NORMALIZATION,
        advantage_normalization=ADVANTAGE_NORMALIZATION,
        summary_writer=SUMMARY_WRITER,
    )

    return reinforceable.agents.RecurrentPPOAgent(
        encoder,
        policy_head,
        value_head,
        **ppo_kwargs,
    )

agent = get_agent()

## Run

> Monitor training progress by launching TensorBoard from the command line: `tensorboard --logdir ./logs`

In [None]:
# Metrics the driver tracks while stepping the environments.
observers = {
    'episode_return': reinforceable.utils.observers.RollingAverageEpisodeReturn(128),
    'steps': reinforceable.utils.observers.StepCounter(),
    'episodes': reinforceable.utils.observers.EpisodeCounter(),
}

driver = reinforceable.Driver(agent, async_env, observers)

best_return = float('-inf')

# No checkpoint is written until the iteration index exceeds this threshold.
# NOTE(review): presumably this lets the rolling average settle first -- confirm.
save_warmup_iters = ITERS / 400

# The context manager closes the progress bar even when the loop is
# interrupted or raises, which a trailing pbar.close() would not do.
with tqdm(range(ITERS)) as pbar:
    for i in pbar:

        # Collect STEPS steps of experience, then train on it.
        data = driver.run(steps=STEPS)

        # `loss` is not used below; kept so it can be inspected after the run.
        loss = agent.train(data, batch_size=BATCH_SIZE, repeats=REPEATS)

        result = driver.result()

        pbar.set_description(
            f'average return: {result["episode_return"]:.2f}\t-\t'
            f'total steps: {int(result["steps"]):,}\t-\t'
            f'total episodes: {int(result["episodes"]):,}\t-\t'
        )

        # Use agent's tf summary writer to log rolling average episode returns.
        if agent.summary_writer is not None:
            with agent.summary_writer.as_default():
                tf.summary.scalar('episode_return', result['episode_return'], result['steps'])

        # If agent is peak performing (and past the warm-up), save it.
        if i > save_warmup_iters and result['episode_return'] > best_return:
            best_return = result['episode_return']
            save_agent(agent, AGENT_PATH)

## Load agent

In [None]:
# Restore the agent that the training loop saved to AGENT_PATH.
loaded_agent = tf.saved_model.load(AGENT_PATH)


def _add_time_axis(tensor):
    """Insert a new axis at position 0 (the agent is recurrent and needs a time dimension)."""
    return tf.expand_dims(tensor, axis=0)


def _drop_time_axis(tensor):
    """Remove the axis at position 0."""
    return tf.squeeze(tensor, axis=0)


# Reset the environments to get the initial timestep (containing the initial
# state (separation)), then add the time dimension to every tensor in it.
timestep = tf.nest.map_structure(_add_time_axis, async_env.reset())
# Compute the action; the second element returned by the agent is not needed here.
action, _ = loaded_agent(timestep)
# Strip the time dimension again and perform the action.
action = tf.nest.map_structure(_drop_time_axis, action)
async_env.step(action)