In [None]:

# The algorithm/agent which should be used is chosen in the cell below.

import sys
import os

from enum import Enum, auto

import tensorflow as tf
import reverb

from tf_agents.agents.dqn import dqn_agent
from tf_agents.agents.ppo import ppo_clip_agent
from tf_agents.agents.reinforce import reinforce_agent
from tf_agents.environments import TFPyEnvironment
from tf_agents.networks import q_network
from tf_agents.networks import actor_distribution_network
from tf_agents.networks import mask_splitter_network
from tf_agents.networks import value_network
from tf_agents.metrics import py_metrics
from tf_agents.policies import py_tf_eager_policy
from tf_agents.policies import random_py_policy, random_tf_policy
from tf_agents.replay_buffers import reverb_replay_buffer
from tf_agents.replay_buffers import reverb_utils
from tf_agents.train.utils import spec_utils
from tf_agents.train.utils import train_utils
from tf_agents.train import actor
from tf_agents.train import learner
from tf_agents.train import ppo_learner
from tf_agents.train import triggers
from tf_agents.utils import common

# add parent dir of agents dir to path, so the dir of the envs can be accessed
module_path = os.path.abspath(os.path.join('./..'))

if module_path not in sys.path:
    sys.path.append(module_path)

from environments.mastermind_v2 import MastermindEnvironment
from environments.battleship_v2 import BattleshipEnvironment

In [None]:
class Algorithm(Enum):
    """Reinforcement-learning algorithms this notebook can build an agent for."""

    # Explicit values 1..4 are the same values `auto()` assigns in declaration order.
    DQN = 1
    DDQN = 2
    PPO = 3
    Reinforce = 4


# Select the algorithm to train with here.
algorithm = Algorithm.PPO

# Two independent environment instances: one for collecting experience, one for evaluation.
collect_env = MastermindEnvironment()
eval_env = MastermindEnvironment()

# Tensor specs of the collect environment; the class name is kept as a readable label.
observation_spec, action_spec, time_step_spec = spec_utils.get_tensor_specs(collect_env)
env_name = collect_env.__class__.__name__

In [None]:
# replay buffer
replay_buffer_capacity = 20_000
num_timesteps_per_train_call = 2

# training/evaluation
num_training_iterations = 10
initial_collect_steps = 1_000
num_eval_episodes = 50
batch_size = 1
num_epochs = 1

optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)

# summary
# Evaluate and log every 5% of the planned training iterations.
# int() alone truncates to 0 whenever num_training_iterations < 20, and the
# training loop skips evaluation and logging entirely for an interval of 0,
# so both intervals are clamped to at least 1.
eval_interval = max(1, int(num_training_iterations * 0.05))
log_interval = max(1, int(num_training_iterations * 0.05))
log_dir = "./logs/" + type(collect_env).__name__ + '/' + algorithm.name

# global step counter shared by the agent, the actors and the learner
train_step = train_utils.create_train_step()

In [None]:
# Initialize an agent for the selected algorithm.


def splitter_fn(obs):
    """Split a dict observation into (network input, mask of valid actions)."""
    return obs['observation'], obs['valid_actions']


use_action_mask = True

# Derive the network input spec from time_step_spec instead of overwriting
# observation_spec with a sub-entry of itself. The previous in-place narrowing
# made this cell non-idempotent: re-running it indexed the already-narrowed
# spec with "observation" a second time.
full_observation_spec = time_step_spec.observation
if use_action_mask:
    observation_spec = full_observation_spec["observation"]
else:
    observation_spec = full_observation_spec

# The Q-learning agents take the splitter directly; it is only passed when masking is on.
constraint_splitter = splitter_fn if use_action_mask else None

if algorithm in (Algorithm.DQN, Algorithm.DDQN):

    # DQN and DDQN use the same Q-network and settings; only the agent class differs.
    q_net = q_network.QNetwork(
        observation_spec,
        action_spec,
        fc_layer_params=(20, 100)
    )

    q_agent_class = dqn_agent.DdqnAgent if algorithm == Algorithm.DDQN else dqn_agent.DqnAgent

    agent = q_agent_class(
        time_step_spec,
        action_spec,
        q_net,
        optimizer,
        observation_and_action_constraint_splitter=constraint_splitter,
        train_step_counter=train_step,
        gamma=0.9
    )

elif algorithm == Algorithm.PPO:

    actor_net = actor_distribution_network.ActorDistributionNetwork(
        observation_spec,
        action_spec
    )
    value_net = value_network.ValueNetwork(
        observation_spec
    )

    # Honour use_action_mask here as the other branches do: the networks are
    # only wrapped when the observation actually carries a mask.
    if use_action_mask:
        # Wrapping the network with a MaskSplitterNetwork so the observation can be split.
        actor_net = mask_splitter_network.MaskSplitterNetwork(
            splitter_fn,
            actor_net,
            passthrough_mask=True
        )

        # NOTE(review): the mask is also passed through to the value network;
        # confirm ValueNetwork accepts a mask argument, otherwise this wrapper
        # needs passthrough_mask=False.
        value_net = mask_splitter_network.MaskSplitterNetwork(
            splitter_fn,
            value_net,
            passthrough_mask=True
        )

    agent = ppo_clip_agent.PPOClipAgent(
        time_step_spec,
        action_spec,
        optimizer,
        actor_net=actor_net,
        value_net=value_net,
        train_step_counter=train_step,
        update_normalizers_in_train=False,
        num_epochs=num_epochs,
        normalize_observations=False,  # drastically reduces training
        normalize_rewards=False,
        use_gae=True,
        entropy_regularization=0.001,
        importance_ratio_clipping=0.2,
        discount_factor=0.7,
    )

elif algorithm == Algorithm.Reinforce:

    actor_net = actor_distribution_network.ActorDistributionNetwork(
        observation_spec,
        action_spec,
        fc_layer_params=(150,)
    )
    if use_action_mask:
        actor_net = mask_splitter_network.MaskSplitterNetwork(
            splitter_fn,
            actor_net,
            passthrough_mask=True,
        )

    agent = reinforce_agent.ReinforceAgent(
        time_step_spec,
        action_spec,
        actor_network=actor_net,
        optimizer=optimizer,
        train_step_counter=train_step,
        gamma=0.9
    )

else:
    raise ValueError("Error: Unknown algorithm")

agent.initialize()

In [None]:
# Set up the Reverb table(s), replay buffer(s), observer and dataset(s) for experience collection.

training_table = 'training_table'
normalization_table = 'normalization_table'


def _make_table(name, **extra_options):
    """Create a uniformly sampled FIFO table holding up to `replay_buffer_capacity` items."""
    return reverb.Table(
        name=name,
        max_size=replay_buffer_capacity,
        sampler=reverb.selectors.Uniform(),
        remover=reverb.selectors.Fifo(),
        rate_limiter=reverb.rate_limiters.MinSize(1),
        **extra_options
    )


# the first table always holds the training experience
tables = [_make_table(training_table)]

if algorithm == Algorithm.PPO:
    # PPO uses a second table to collect experience for normalization;
    # its items can be sampled only once.
    # TODO: elaborate
    table = _make_table(normalization_table, max_times_sampled=1)
    tables.append(table)

reverb_server = reverb.Server(tables)

# Reinforce stores entire episodes, so it gets no fixed sequence length.
fixed_sequence_length = (
    None if algorithm == Algorithm.Reinforce else num_timesteps_per_train_call
)

# replay buffer holding the training experience
replay_buffer_train = reverb_replay_buffer.ReverbReplayBuffer(
    agent.collect_data_spec,
    table_name=training_table,
    sequence_length=fixed_sequence_length,
    local_server=reverb_server
)

if algorithm == Algorithm.PPO:
    # second replay buffer, reading the experience collected for normalization
    replay_buffer_normalization = reverb_replay_buffer.ReverbReplayBuffer(
        agent.collect_data_spec,
        table_name=normalization_table,
        sequence_length=num_timesteps_per_train_call,
        local_server=reverb_server
    )

    # Identical to ReverbAddTrajectoryObserver, but sequences are not cut when a
    # boundary trajectory is seen. Writes to both tables at once.
    rb_observer = reverb_utils.ReverbTrajectorySequenceObserver(
        replay_buffer_train.py_client,
        table_name=[training_table, normalization_table],
        sequence_length=num_timesteps_per_train_call,
    )

    normalization_dataset = replay_buffer_normalization.as_dataset(
        sample_batch_size=batch_size,
        num_steps=num_timesteps_per_train_call
    ).prefetch(4)

    def normalization_dataset_fn():
        """Return the dataset that reads from the normalization buffer."""
        return normalization_dataset

elif algorithm == Algorithm.Reinforce:
    # The Reinforce agent uses an observer that stores entire episodes.
    # Time steps are cached at every environment step and written at the end of an episode.
    rb_observer = reverb_utils.ReverbAddEpisodeObserver(
        replay_buffer_train.py_client,
        table_name=training_table,
        max_sequence_length=replay_buffer_capacity,
    )

else:
    # The DQN agents use an observer that stores trajectories.
    # Time steps are cached at every environment step and written once
    # 'sequence_length' time steps have been collected.
    rb_observer = reverb_utils.ReverbAddTrajectoryObserver(
        py_client=replay_buffer_train.py_client,
        table_name=training_table,
        sequence_length=num_timesteps_per_train_call,
    )

# dataset that samples entries from the training replay buffer
experience_dataset = replay_buffer_train.as_dataset(
    sample_batch_size=batch_size,
    num_steps=fixed_sequence_length
).prefetch(4)


def experience_dataset_fn():
    """Return the dataset that reads from the training buffer."""
    return experience_dataset

In [None]:
# Policies used for data collection and evaluation.


def _as_eager_py_policy(tf_policy):
    """Wrap a TF policy in a PyTFEagerPolicy (tf.function enabled) for faster execution."""
    return py_tf_eager_policy.PyTFEagerPolicy(tf_policy, use_tf_function=True)


random_policy = random_py_policy.RandomPyPolicy(time_step_spec, action_spec)

# the agent's own TF policies ...
tf_collect_policy = agent.collect_policy
tf_eval_policy = agent.policy

# ... and their eager Python wrappers, which the actors drive
collect_policy = _as_eager_py_policy(tf_collect_policy)
eval_policy = _as_eager_py_policy(tf_eval_policy)

In [None]:
# Actor for an initial data collection of 'initial_collect_steps' steps with the random policy.
initial_collect_actor = actor.Actor(
    env=collect_env,
    policy=random_policy,
    train_step=train_step,
    observers=[rb_observer],
    steps_per_run=initial_collect_steps,
)

# The initial collection run itself is currently disabled.
#initial_collect_actor.run()

In [None]:
# Actor that manages the interaction between the agent's collect policy and the
# environment; it gathers the experience used during training.
# Reinforce samples entire episodes, so its run length is counted in episodes;
# all other algorithms count it in steps.

env_step_metric = py_metrics.EnvironmentSteps()

if algorithm == Algorithm.Reinforce:
    collect_steps, collect_episodes = None, num_timesteps_per_train_call
else:
    collect_steps, collect_episodes = num_timesteps_per_train_call, None

collect_actor = actor.Actor(
    env=collect_env,
    policy=collect_policy,
    train_step=train_step,
    steps_per_run=collect_steps,
    episodes_per_run=collect_episodes,
    observers=[rb_observer, env_step_metric],
    metrics=actor.collect_metrics(10),
    summary_dir=os.path.join(log_dir, learner.TRAIN_DIR),
    summary_interval=log_interval,
)

In [None]:
# Actor that evaluates the agent's policy on the separate evaluation environment
# for 'num_eval_episodes' episodes per run.
eval_actor = actor.Actor(
    env=eval_env,
    policy=eval_policy,
    train_step=train_step,
    metrics=actor.eval_metrics(num_eval_episodes),
    episodes_per_run=num_eval_episodes,
    summary_interval=log_interval,
    summary_dir=os.path.join(log_dir, 'eval'),
)

In [None]:
# Set up the learner instance that manages the learning details while the agent is trained.
# It also simplifies logging of metrics and saving of results by creating checkpoints.

# trigger that saves the policy
saved_model_trigger = triggers.PolicySavedModelTrigger(
    os.path.join(log_dir, learner.POLICY_SAVED_MODEL_DIR),
    agent,
    train_step,
    log_interval
)

# trigger that logs the steps-per-second metric
steps_per_second_trigger = triggers.StepPerSecondLogTrigger(
    train_step,
    log_interval
)

learning_triggers = [saved_model_trigger, steps_per_second_trigger]

# arguments common to both learner implementations
shared_learner_kwargs = dict(
    root_dir=log_dir,
    train_step=train_step,
    agent=agent,
    experience_dataset_fn=experience_dataset_fn,
    summary_interval=log_interval,
    checkpoint_interval=log_interval,
    triggers=learning_triggers,
)

# PPO has its own learner implementation (presumably because of its second,
# normalization replay buffer -- open question from the original author).
if algorithm == Algorithm.PPO:
    agent_learner = ppo_learner.PPOLearner(
        normalization_dataset_fn=normalization_dataset_fn,
        num_samples=1,
        **shared_learner_kwargs
    )
else:
    agent_learner = learner.Learner(
        max_checkpoints_to_keep=5,
        **shared_learner_kwargs
    )

In [None]:
# Load the TensorBoard notebook extension. The inline `%tensorboard` launch below is
# left disabled; when enabled it points at the log directory one level above the
# algorithm-specific run directory.
%load_ext tensorboard
#%tensorboard --logdir {log_dir.replace("/" + algorithm.name, "")} --bind_all
# Alternatively, open TensorBoard in a separate browser tab by going to http://localhost:6006/

In [None]:
# Keep the reset below commented out when continuing training, so that TensorBoard
# continues the existing run (which is loaded automatically) instead of creating a new graph.
# agent.train_step_counter.assign(0)

# Run training for 'num_training_iterations' iterations.
# In every iteration the actor collects experience and the agent learns from it
# via the learner instance. PPO and Reinforce additionally clear their replay
# buffer(s) after each learner run; PPO also flushes the observer before it.

uses_ppo = algorithm == Algorithm.PPO
clears_buffer_each_iteration = algorithm in (Algorithm.PPO, Algorithm.Reinforce)

for _iteration in range(num_training_iterations):
    collect_actor.run()

    if uses_ppo:
        rb_observer.flush()

    total_loss = agent_learner.run()

    if clears_buffer_each_iteration:
        replay_buffer_train.clear()
        if uses_ppo:
            replay_buffer_normalization.clear()

    step = agent_learner.train_step_numpy

    # an interval of 0 disables the respective action
    if eval_interval and step % eval_interval == 0:
        eval_actor.run_and_log()

    if log_interval and step % log_interval == 0:
        average_return_results = (
            metric.result()
            for metric in eval_actor.metrics
            if metric.name == "AverageReturn"
        )
        avg_return = next(average_return_results)

        print(f"step {step}")
        print(f"\tloss = {total_loss.loss.numpy():.2f}")
        print(f"\tavg_reward = {avg_return}")

# release the observer and shut down the Reverb server once training is done
rb_observer.close()
reverb_server.stop()

In [None]:
# rendering the result of the training

def render(env, num_episodes=1, delay=.5, policy=None):
    """Play episodes with a policy and render the environment after every step.

    Args:
        env: Environment offering `reset()`, `step(action)` and `render(delay)`.
        num_episodes: Number of episodes to play.
        delay: Value handed to `env.render` after each step.
        policy: Object whose `action(time_step)` returns a step carrying an
            `.action` attribute. Defaults to the evaluation actor's policy
            (`eval_actor.policy`), which is what this function always used before.

    Prints the average number of steps per episode. With no episodes to play
    the average is reported as 0.0 instead of raising ZeroDivisionError.
    """
    if policy is None:
        # Fall back to the notebook-level evaluation actor only when no policy is given.
        policy = eval_actor.policy

    total_steps = 0

    for _ in range(num_episodes):
        time_step = env.reset()

        while not time_step.is_last():
            action_step = policy.action(time_step)
            time_step = env.step(action_step.action)
            env.render(delay)
            total_steps += 1

    # Guard the division: nothing to average when no episode was played.
    average_steps = total_steps / num_episodes if num_episodes > 0 else 0.0
    print(f"Avg run: {average_steps}")


# Play a single episode on the evaluation environment, rendering every step.
render(env=eval_env, num_episodes=1, delay=0.35)