 # **Soft Actor-Critic**
Implemented in TensorFlow 2.6 with TF-Agents.
The **Soft Actor-Critic** algorithm not only maximizes rewards, but also tries to maximize the entropy of its actions. In other words, it tries to be as unpredictable as possible while still collecting as much reward as possible. This encourages the agent to explore the environment, which speeds up training, and makes it less likely to repeatedly execute the same action when the critic (Q-network) produces imperfect estimates. This should lead to amazing sample efficiency.

[Soft Actor-Critic Algorithms and Applications](https://arxiv.org/abs/1812.05905)

# Imports

In [1]:
import gym
from offworld_gym.envs.common.channels import Channels
from offworld_gym.envs.common.enums import AlgorithmMode, LearningType

import silence_tensorflow.auto
import os
import logging
import tempfile
import numpy as np
import datetime
import certifi
import urllib3
import shutil

import tensorflow as tf
from tf_agents.agents.ddpg import critic_network
from tf_agents.agents.sac import sac_agent 
from tf_agents.agents.sac import tanh_normal_projection_network
from tf_agents.drivers import dynamic_episode_driver
from tf_agents.drivers import dynamic_step_driver
from tf_agents.environments import suite_gym
from tf_agents.environments import tf_py_environment
from tf_agents.environments import wrappers
from tf_agents.metrics import tf_metrics
from tf_agents.networks import actor_distribution_network
from tf_agents.policies import py_tf_eager_policy
from tf_agents.policies import random_tf_policy
from tf_agents.policies import greedy_policy
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.specs import tensor_spec
from tf_agents.train import actor
from tf_agents.train import learner
from tf_agents.train.utils import spec_utils
from tf_agents.train.utils import strategy_utils
from tf_agents.train.utils import train_utils
from tf_agents.utils import common

# Hyperparameters

### Offworld-Gym env parameters

In [2]:
# Experiment-wide switches. `real` selects the environment, the replay buffer
# folder and the log subfolder; `experiment_name` names the checkpoint and log folders.
real = False                             # True = Real environment | False = Simulated docker environment
experiment_name = 'Buf_1'                # For https://gym.offworld.ai/myexperiments experiment name
resume_experiment = True                 # Resume training? Only passed to the real environment
delete_checkpoints = False               # Delete model checkpoints for 'experiment_name' experiment

learning_type = LearningType.END_TO_END  # Description of training method
algorithm_mode = AlgorithmMode.TRAIN     # .TEST or .TRAIN  (TEST needed for Offworld leaderboard)
channel_type = Channels.DEPTH_ONLY       # Which sensors to use: .RGB_ONLY or .DEPTH_ONLY or .RGBD

# Access token for Real environment from https://gym.offworld.ai/account
import my_Gym_token # Token is kept in a separate local module; it can also be inserted as a string
os.environ['OFFWORLD_GYM_ACCESS_TOKEN'] = my_Gym_token.is_secret # 'insert_as_a_string'

# Project root folder. If path unknown, run in terminal: pwd
os.environ['OFFWORLD_GYM_ROOT'] = '/home/karlaru/PycharmProjects/offworld-gym'

# Python environment used for this project (eg miniconda env). If path unknown, run in terminal: which python
# NOTE(review): PYTHONPATH normally lists module search directories, but here it is
# set to the interpreter binary - confirm this is what offworld-gym expects
os.environ['PYTHONPATH'] = '/home/karlaru/miniconda3/envs/offworld-karl/bin/python'

# Pick the Gym environment id that matches the `real` switch
env_name = ('OffWorldMonolithContinuousReal-v0' if real
            else 'OffWorldDockerMonolithContinuousSim-v0')

### Logging

In [3]:
# Log messages of level INFO and above, formatted as "LEVEL:message"
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)

# Silence urllib3's InsecureRequestWarning so it does not clutter the output
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

### Model checkpoints

In [4]:
# Folder where this experiment's model checkpoints are stored
tempdir = 'models/' + experiment_name + '/'

# Remove old checkpoints when a clean training start is wanted
if delete_checkpoints:
    try:
        shutil.rmtree(tempdir)
    except FileNotFoundError:
        # Nothing to delete: the checkpoint folder does not exist (yet)
        logging.info(f'{tempdir} already empty')
    except OSError as err:
        # Deletion was attempted but failed (e.g. permissions). Report the real
        # reason instead of claiming the folder is empty, then carry on.
        logging.warning(f'Could not delete checkpoints in {tempdir}: {err}')
    else:
        logging.info(f'Successfully deleted checkpoints in {tempdir}')

### Replay buffer

In [5]:
# Number of environment steps to collect with the random policy to fill the
# buffer (when 0, random collection is skipped)
steps_random_training = 0

# Number of random-policy steps between two saves of the replay buffer to file
# (should be <= steps_random_training)
steps_save_random = 10000

# Max number of steps to keep in replay buffer
replay_max_length = 50000 # == 14.3GB file

# Batch size to sample from the buffer for one network training iteration
sample_batch_size = 64

# Update replay buffer file with new data while training model?
update_replay = True

# The replay buffer is kept inside the project folder so it is not deleted
# and can be reused; real and simulated data live in separate folders
rb_tempdir = 'data/real_buffer/' if real else 'data/sim_buffer/'

logging.info(f'Buffer tempdir: {rb_tempdir}')

INFO:Buffer tempdir: data/sim_buffer/


### Training

In [6]:
# Episodes to collect in one data-collection round (between two training rounds);
# also used as the averaging window of the return / episode-length metrics
episodes_in_patch = 25

# Total episodes to run
episodes_to_train = 100000

# Training iterations to run in each training round; also the checkpoint interval
training_iterations = 500

# Computation distribution strategy

Enables running computations on one or more devices in a way that model definition code can remain unchanged when running on different hardware.

In [7]:
# Number of GPUs visible to TensorFlow on the current machine
num_GPUs = len(tf.config.list_physical_devices('GPU'))

if num_GPUs == 0:
    # No GPU available: run everything on the CPU
    strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0")
    logging.info('No GPUs available. Using only CPU.')
elif num_GPUs == 1:
    # Exactly one GPU: place the computation on it
    strategy = tf.distribute.OneDeviceStrategy(device="/gpu:0")
    logging.info('Using 1 GPU')
else:
    # More than one GPU: mirror the data to compute in parallel
    strategy = tf.distribute.MirroredStrategy()
    logging.info(f'Using {num_GPUs} GPUs')

INFO:Using 1 GPU


# Environment

In Reinforcement Learning (RL), an environment represents the task or problem to be solved. TF-Agents has suites for loading environments such as the OpenAI Gym. OpenAI Gym is written in pure Python. This is converted to TensorFlow using the TFPyEnvironment wrapper. The original environment's API uses Numpy arrays. The TFPyEnvironment converts these to Tensors to make it compatible with Tensorflow agents and policies.

In [8]:
# Arguments shared by the simulated and the real environment
gym_kwargs = {'channel_type': channel_type}

# The real environment additionally takes the experiment settings
if real:
    gym_kwargs.update(
        experiment_name=experiment_name,
        resume_experiment=resume_experiment,
        learning_type=learning_type,
        algorithm_mode=algorithm_mode,
    )

env = suite_gym.wrap_env(gym_env=gym.make(env_name, **gym_kwargs))

# Wrap Gym env into TF env
tf_env = tf_py_environment.TFPyEnvironment(env)

INFO:container_id is 23e7072f6d057af2ec2967440aadd5b773f94a63bb268f4d08d74ea5d39d01a1
INFO:[32mFor visualization of simulation, visit gzweb server at http://127.0.1.1:49175[0m


In [9]:
# Get the observation, action and time-step tensor specs of the TF environment;
# they are used below to build the critic, the actor and the agent
observation_spec, action_spec, time_step_spec = spec_utils.get_tensor_specs(tf_env)

### Observation space

One observation is one frame from the depth camera sensor. Sensor resolution is 240x320 pixels. Values for each pixel range from 0 to 255.

In [10]:
# Display the observation spec as the cell output
observation_spec

BoundedTensorSpec(shape=(1, 240, 320, 1), dtype=tf.float32, name='observation', minimum=array(0., dtype=float32), maximum=array(255., dtype=float32))

### Action space

Robot movement command is defined by 2 element vector and has continuous values

In [11]:
# Display the action spec as the cell output
action_spec

BoundedTensorSpec(shape=(2,), dtype=tf.float32, name='action', minimum=array([-0.7, -2.5], dtype=float32), maximum=array([0.7, 2.5], dtype=float32))

### Time step

A TimeStep contains the data emitted by an environment at each step of interaction. A TimeStep holds a step_type, an observation (typically a NumPy array or a dict or list of arrays), and an associated reward and discount.

In [12]:
# Print every component spec of a TimeStep, one per line
for component_spec in (
    time_step_spec.discount,
    time_step_spec.observation,
    time_step_spec.reward,
    time_step_spec.step_type,
):
    print(component_spec)

BoundedTensorSpec(shape=(), dtype=tf.float32, name='discount', minimum=array(0., dtype=float32), maximum=array(1., dtype=float32))
BoundedTensorSpec(shape=(1, 240, 320, 1), dtype=tf.float32, name='observation', minimum=array(0., dtype=float32), maximum=array(255., dtype=float32))
TensorSpec(shape=(), dtype=tf.float32, name='reward')
TensorSpec(shape=(), dtype=tf.int32, name='step_type')


# Agent

### Critic
Gives value estimates for Q(s,a)

In [13]:
# Build the critic inside the distribution strategy scope.
# The critic takes an (observation, action) pair: the observation goes through
# conv + dense layers, the action through dense layers, and both are then
# processed by the joint dense layers.
with strategy.scope():
  critic_net = critic_network.CriticNetwork(
        (observation_spec, action_spec),
        # Conv layers as (filters, kernel_size, stride) tuples, per the TF-Agents convention
        observation_conv_layer_params=((64,20,3), (64,10,2), (32,5,1), (16,3,1)),
        observation_fc_layer_params=(512, 256, 128, 64, 32, 16, 8), 
        observation_dropout_layer_params=None,
        action_fc_layer_params=(512,256, 128, 64), 
        action_dropout_layer_params=None,
        joint_fc_layer_params=(512,256, 128, 64),
        joint_dropout_layer_params=None,
        activation_fn=tf.nn.relu, 
        # NOTE(review): a ReLU on the output layer restricts the Q-value
        # estimates to be >= 0 - confirm this is intended for this reward setup
        output_activation_fn=tf.keras.activations.relu, 
        name='CriticNetwork')

### Actor
Generates actions for given observation

In [14]:
# Build the actor inside the distribution strategy scope.
# It uses the same conv / dense layer sizes for the observation as the critic's
# observation branch.
with strategy.scope():
  actor_net = actor_distribution_network.ActorDistributionNetwork(
        observation_spec,
        action_spec,
        preprocessing_layers = None,
        preprocessing_combiner=None, 
        # Conv layers as (filters, kernel_size, stride) tuples, per the TF-Agents convention
        conv_layer_params=((64,20,3), (64,10,2), (32,5,1), (16,3,1)),
        fc_layer_params=(512, 256, 128, 64, 32, 16, 8),
        dropout_layer_params=None,
        kernel_initializer=None,
        activation_fn=tf.keras.activations.relu,
        # Projection network that turns the encoded observation into the
        # distribution over the continuous actions
        continuous_projection_net=tanh_normal_projection_network.TanhNormalProjectionNetwork,
        name='ActorDistributionNetwork')

### Agent


In [15]:
# Create and initialize the SAC agent inside the distribution strategy scope
with strategy.scope():
  # Counter of training iterations; also handed to the Learner
  train_step = train_utils.create_train_step()
  tf_agent = sac_agent.SacAgent(
        time_step_spec,
        action_spec,
        critic_network=critic_net,
        critic_optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=0.001),
        actor_network=actor_net,
        actor_optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=0.001),
        alpha_optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=0.001),
        actor_loss_weight = 1.0,
        critic_loss_weight = 0.5,
        alpha_loss_weight = 1.0,
        target_update_tau=0.01,
        target_update_period=1,
        td_errors_loss_fn=tf.math.squared_difference,
        gamma=0.99,
        reward_scale_factor=1.0,
        initial_log_alpha = 0.1,
        use_log_alpha_in_alpha_loss = False,
        target_entropy = -2, # -0.8 as good as random in 4k episodes
        gradient_clipping = None,
        debug_summaries = False,
        summarize_grads_and_vars = False,
        train_step_counter=train_step,
        name='Agent')

  # Initialize inside the strategy scope as well (as in the TF-Agents SAC
  # tutorial), so the agent is set up under the same strategy it was created in
  tf_agent.initialize()

### Critic net

In [16]:
# Print the layer-by-layer summary of the critic network
critic_net.summary()

Model: "CriticNetwork"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
observation_encoding/conv2d  multiple                  25664     
_________________________________________________________________
observation_encoding/conv2d  multiple                  409664    
_________________________________________________________________
observation_encoding/conv2d  multiple                  51232     
_________________________________________________________________
observation_encoding/conv2d  multiple                  4624      
_________________________________________________________________
flatten (Flatten)            multiple                  0         
_________________________________________________________________
observation_encoding/dense ( multiple                  8847872   
_________________________________________________________________
observation_encoding/dense ( multiple                

### Actor net

In [17]:
# Print the layer-by-layer summary of the actor network
actor_net.summary()

Model: "ActorDistributionNetwork"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
EncodingNetwork (EncodingNet multiple                  9514280   
_________________________________________________________________
TanhNormalProjectionNetwork  multiple                  36        
Total params: 9,514,316
Trainable params: 9,514,316
Non-trainable params: 0
_________________________________________________________________


# Metrics

### Tensorboard logs for random buffer

In [18]:
# Random-policy logs get their own folder, with real and simulated runs
# in different subfolders
log_dir = 'logs/RANDOM' + ('/REAL/' if real else '/SIM/')

### Observer (called after each step)

In [27]:
# Tensorflow Agents metrics, passed to the drivers as observers.
# The order matters: other cells access these metrics by position
# (0 = episodes, 1 = environment steps, 2 = average return, 3 = average episode length).
train_metrics = [
    tf_metrics.NumberOfEpisodes(),
    tf_metrics.EnvironmentSteps(),
    # Averages are taken over the last `episodes_in_patch` episodes
    tf_metrics.AverageReturnMetric(buffer_size=episodes_in_patch),
    tf_metrics.AverageEpisodeLengthMetric(buffer_size=episodes_in_patch)]

# Custom observer for Tensorboard logging and console progress output
class Observer:
    """Driver observer that logs averaged metrics to Tensorboard.

    An instance is passed to a driver in its ``observers`` list and is called
    with each trajectory. It reads the module-level ``train_metrics`` list by
    position (0 = episodes, 1 = environment steps, 2 = average return,
    3 = average episode length) and writes to the ``log_dir`` that is current
    when the instance is created.
    """

    def __init__(self):
        # Initialize in class to avoid empty event files
        self.summary_writer = tf.summary.create_file_writer(log_dir)
        # Episode count at which scalars were last written, so the same point
        # is not written again on every step of the following episode
        self._last_logged_episode = None

    def __call__(self, trajectory):
        """Log metrics once per `episodes_in_patch` episodes and print progress.

        Args:
            trajectory: The trajectory handed over by the driver (not used;
                all values are read from ``train_metrics``).
        """
        # Read each metric once per call
        episodes = train_metrics[0].result().numpy()
        total_steps = train_metrics[1].result().numpy()

        is_patch_boundary = episodes % episodes_in_patch == 0
        if is_patch_boundary and episodes != self._last_logged_episode:
            # Write to Tensorboard log folder, using the episode count as step
            with self.summary_writer.as_default(step=episodes):
                tf.summary.scalar('Avg Reward per episode', train_metrics[2].result().numpy())
                tf.summary.scalar('Avg Steps per episode', train_metrics[3].result().numpy())
            self._last_logged_episode = episodes

        # Single-line progress indicator, overwritten in place via '\r'
        print(f"\rTotal steps: {total_steps} in {episodes} episodes", end="")

# Replay buffer

Store data about previous training experiences

In [20]:
# Create the replay buffer holding the data the agent's collect policy
# produces (layout given by tf_agent.collect_data_spec)
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec = tf_agent.collect_data_spec,
    batch_size = 1,                 # Replays are stored one at a time
    max_length = replay_max_length, # Total buffer size
    scope='TFUniformReplayBuffer',
    device='cpu:0',                 # Use CPU for data storage and compute (GPU needed for network training)
    dataset_drop_remainder=True,    
    dataset_window_shift=None, 
    stateful_dataset=False)

In [21]:
# Try loading a previously saved replay buffer from file
try:
    tf.train.Checkpoint(replay_buffer=replay_buffer).restore(rb_tempdir + '-1')
    # num_frames() reports how many steps are stored without copying the whole
    # buffer into one tensor, which gather_all() would do
    logging.info(f"Loaded {replay_buffer.num_frames().numpy()} steps from {rb_tempdir}")
except Exception as err:
    # Best effort: training can start from an empty buffer. Unlike a bare
    # `except:` this lets KeyboardInterrupt / SystemExit through, and the
    # reason for the failure is logged as well.
    logging.info(f"No random policy replay buffer steps found in {rb_tempdir}")
    logging.info(f"Replay buffer restore failed with {type(err).__name__}")

INFO:Loaded 50000 steps from data/sim_buffer/


#### Reading data from buffer

In [22]:
# Build a dataset that samples batches of `sample_batch_size` items from the
# replay buffer, each item spanning num_steps=2 steps, with 50 batches prefetched
dataset = replay_buffer.as_dataset(
    sample_batch_size=sample_batch_size,
    num_parallel_calls=1,
    num_steps=2,
).prefetch(50)


def experience_dataset_fn():
    """Return the replay-buffer dataset handed to the Learner."""
    return dataset

# Tensorboard

In [23]:
%load_ext tensorboard
%tensorboard --logdir logs

# Random training

Random training helps the agent converge faster: it fills the replay buffer (database) with sample data collected by a random policy. The initial SAC policy could be worse than random!

In [24]:
# Optionally fill the replay buffer with steps collected by a random policy
if steps_random_training > 0:
    # Use random policy
    initial_collect_policy = random_tf_policy.RandomTFPolicy(
        action_spec=tf_env.action_spec(),
        time_step_spec=tf_env.time_step_spec())

    # Use step driver: each run() performs 'steps_save_random' steps
    inital_driver = dynamic_step_driver.DynamicStepDriver(
        tf_env,
        initial_collect_policy,
        observers=[replay_buffer.add_batch, Observer()] + train_metrics,
        num_steps=steps_save_random)

    for _ in range(int(steps_random_training / steps_save_random)):

        # Do 'steps_save_random' steps
        inital_driver.run()

        # Save replay buffer
        tf.train.Checkpoint(replay_buffer=replay_buffer).save(rb_tempdir)
        # num_frames() gives the stored step count without copying the whole
        # buffer into one tensor, which gather_all() would do after every save
        logging.info(f"Saved replay buffer at {replay_buffer.num_frames().numpy()} steps")
else:
    logging.info("Using replay from file. No new Steps")

INFO:Using replay from file. No new Steps


### Tensorboard logs for training

In [25]:
# Start the TF metrics from zero again after the random buffer filling
for metric in train_metrics:
    metric.reset()

# Training logs are grouped per experiment, with real and simulated runs
# in separate subfolders
log_dir = 'logs/' + experiment_name + ('/REAL/' if real else '/SIM/')

# Collect driver

Driver for running a policy in an environment. It keeps stepping until `num_episodes` episodes have been completed.

In [28]:
# Use episode driver (nr of steps per episode is limited by env).
# Each run() collects `episodes_in_patch` episodes with the agent's collect policy.
collect_actor = dynamic_episode_driver.DynamicEpisodeDriver(
    tf_env, 
    py_tf_eager_policy.PyTFEagerPolicy(tf_agent.collect_policy, use_tf_function=True),
    # Store every step in the replay buffer, log progress and update the metrics
    observers = [replay_buffer.add_batch, Observer()] + train_metrics,
    num_episodes = episodes_in_patch)

# Learner

Learner loads checkpoint from tempdir if available, so learning can be resumed from saved point.

In [29]:
# Learner that trains `tf_agent` on data from `experience_dataset_fn`,
# using `tempdir` as its root folder for checkpoints
agent_learner = learner.Learner(
    tempdir,
    train_step, 
    tf_agent, 
    experience_dataset_fn,
    checkpoint_interval=training_iterations,  # one checkpoint per training round
    summary_interval=100000, 
    max_checkpoints_to_keep=2,
    strategy=strategy,
    run_optimizer_variable_init=True)

# In the INFO log output: ckpt-xxxx <- the trailing number shows how many iterations have been done for the current experiment name

INFO:Checkpoint available: models/Buf_1/train/checkpoints/ckpt-98020


# Training

In [None]:
# One round = (optional) training on the replay buffer + collection of
# `episodes_in_patch` new episodes
num_rounds = int(episodes_to_train / episodes_in_patch)

for round_idx in range(num_rounds):

    # The policy is only updated on replay buffer data when mode = TRAIN
    if algorithm_mode == AlgorithmMode.TRAIN:
        agent_learner.run(iterations=training_iterations)

    # Collect new data
    collect_actor.run()

    # Every 20th round (but not the very first one) back up the replay buffer
    # into the data folder, if updating the buffer file is enabled
    backup_due = round_idx > 0 and round_idx % 20 == 0
    if update_replay and backup_due:
        tf.train.Checkpoint(replay_buffer=replay_buffer).save(rb_tempdir)
        logging.info("Updated replay buffer!")

INFO:Saved checkpoint: models/Buf_1/train/checkpoints/ckpt-98520


Total steps: 235 in 25 episodes

INFO:Saved checkpoint: models/Buf_1/train/checkpoints/ckpt-99020


Total steps: 490 in 50 episodes