### Imports

In [1]:
import collections
import gym
import numpy as np
import statistics
import tensorflow as tf
import tqdm

from matplotlib import pyplot as plt
from tensorflow.keras import layers
from typing import Any, List, Sequence, Tuple

2025-01-22 09:12:53.426279: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Build the CartPole-v1 environment that the agent is trained on
env = gym.make("CartPole-v1")

# Fix the NumPy and TensorFlow random seeds so that runs are repeatable
seed = 42
np.random.seed(seed)
tf.random.set_seed(seed)

# float32 machine epsilon as a Python float; added to denominators to keep
# divisions numerically stable
eps = float(np.finfo(np.float32).eps)

### The Model

The Actor and the Critic will be modeled using a single neural network that outputs the action probabilities and the Critic value, respectively.

During the forward pass, the model will take in the state as the input and will output both action probabilities and critic value. The goal is to train a model that chooses actions based on a policy that maximizes expected return.

In [3]:
class ActorCritic(tf.keras.Model):
    """Combined actor-critic network built on one shared hidden layer.

    A single ReLU dense layer feeds two linear heads: the actor head with
    `num_actions` outputs and the critic head with one output.
    """

    def __init__(self, num_actions: int, num_hidden_units: int):
        """Create the shared layer and the two output heads.

        Args:
            num_actions: Number of units in the actor head.
            num_hidden_units: Number of units in the shared hidden layer.
        """
        super().__init__()

        self.common = layers.Dense(units=num_hidden_units, activation="relu")
        self.actor = layers.Dense(units=num_actions)
        self.critic = layers.Dense(units=1)

    def call(self, inputs: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
        """Run the forward pass.

        Args:
            inputs: Input tensor passed to the shared layer.

        Returns:
            A tuple `(actor_output, critic_output)` holding the outputs of
            the actor head and the critic head for `inputs`.
        """
        shared_features = self.common(inputs)
        actor_output = self.actor(shared_features)
        critic_output = self.critic(shared_features)
        return actor_output, critic_output

In [4]:
# Network dimensions: 128 shared hidden units and one actor output per
# action in the environment's action space
num_hidden_units = 128
num_actions = env.action_space.n

# Instantiate the actor-critic model
model = ActorCritic(num_actions=num_actions, num_hidden_units=num_hidden_units)

### Agent Training

Training the agent will follow these steps:
- Run the agent on the environment to collect training data per episode
- Compute expected return at each time step
- Compute the loss for the combined Actor-Critic model
- Compute gradients and update network parameters
- Repeat until success criterion or max episodes has been reached

In [5]:
@tf.numpy_function(Tout=[tf.float32, tf.int32, tf.int32])
def env_step(action: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Apply `action` to the global `env` and return the step result.

    Wrapped with `tf.numpy_function` so that the NumPy-based environment can
    be called from TensorFlow code.

    Args:
        action: The action to pass to `env.step`.

    Returns:
        A tuple `(state, reward, done)` where `state` is the next observation
        as float32, `reward` is the step reward as int32, and `done` is an
        int32 flag that is 1 when the episode has ended.
    """
    state, reward, terminated, truncated, _ = env.step(action)
    # An episode is over both when the environment reaches a terminal state
    # and when it is cut short (e.g. by a time limit); report either as done
    # so callers stop stepping an environment that needs a reset.
    done = bool(terminated) or bool(truncated)
    return (state.astype(np.float32), np.array(reward, np.int32), np.array(done, np.int32))

In [6]:
def run_episode(initial_state: tf.Tensor, 
                model: tf.keras.Model,
                max_steps: int) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
    """Run a single episode to collect training data.

    Args:
        initial_state: Unbatched starting observation; its static shape is
            re-applied to every state returned by `env_step`.
        model: Callable that maps a batched state to a pair
            `(action_logits, value)`.
        max_steps: Maximum number of environment steps to take.

    Returns:
        A tuple `(action_probs, values, rewards)` of stacked per-step tensors:
        the float32 probability of each chosen action, the float32 critic
        value of each visited state, and the int32 reward of each step.
    """
    action_probs = tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True)
    values = tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True)
    rewards = tf.TensorArray(dtype=tf.int32, size=0, dynamic_size=True)

    initial_state_shape = initial_state.shape
    state = initial_state

    for t in tf.range(max_steps):
        # convert state into a batched tensor (batch size = 1)
        state = tf.expand_dims(state, 0)

        # run the model to get action logits and the critic value
        action_logits_t, value = model(state)

        # sample the next action from the action probability distribution
        action = tf.random.categorical(action_logits_t, 1)[0, 0]
        action_probs_t = tf.nn.softmax(action_logits_t)

        # store the critic value
        values = values.write(t, tf.squeeze(value))

        # store the probability of the chosen action (the logarithm is taken
        # later, when the loss is computed)
        action_probs = action_probs.write(t, action_probs_t[0, action])

        # apply the action to the environment to get the next state and reward
        state, reward, done = env_step(action)
        # restore the static shape so the loop variable keeps a fixed shape
        state.set_shape(initial_state_shape)

        # store the reward
        rewards = rewards.write(t, reward)

        if tf.cast(done, tf.bool):
            break

    # Stack each TensorArray into a regular tensor. The result must be bound
    # back to `action_probs`; otherwise the raw TensorArray would be returned.
    action_probs = action_probs.stack()
    values = values.stack()
    rewards = rewards.stack()

    return action_probs, values, rewards


### Compute Expected Returns

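The sequence of rewards for each timestep collected during one episode is converted into a sequence of expected returns, where each return is the reward at that step plus `gamma` times the return of the following step. With a discount factor `gamma` between 0 and 1, this discounting ensures that the sum of rewards converges, because rewards received now are weighted more heavily than rewards received later.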

In [7]:
def get_expected_return(rewards: tf.Tensor, gamma: float, standardize: bool = True) -> tf.Tensor:
    """Compute the discounted return for every timestep.

    The return at step `i` is `rewards[i] + gamma * return[i + 1]`, with the
    return after the last step taken to be zero.

    Args:
        rewards: Per-step rewards, indexed along the first axis.
        gamma: Discount factor applied to the return of the following step.
        standardize: If true, subtract the mean of the returns and divide by
            their standard deviation plus `eps`.

    Returns:
        A float32 tensor with one return per entry of `rewards`.
    """
    num_steps = tf.shape(rewards)[0]
    step_rewards = tf.cast(rewards, dtype=tf.float32)
    collected = tf.TensorArray(dtype=tf.float32, size=num_steps)

    # Walk backwards from the last step so that each return can reuse the
    # already accumulated return of the step after it.
    running_return = tf.constant(0.0)
    scalar_shape = running_return.shape
    for step in tf.range(num_steps - 1, -1, -1):
        running_return = step_rewards[step] + gamma * running_return
        # keep the loop variable's static shape fixed across iterations
        running_return.set_shape(scalar_shape)
        collected = collected.write(step, running_return)
    returns = collected.stack()

    if not standardize:
        return returns

    centered = returns - tf.math.reduce_mean(returns)
    spread = tf.math.reduce_std(returns) + eps
    return centered / spread

### Actor-Critic Loss

The hybrid Actor-Critic model uses a loss function that is a combination of Actor and Critic losses for training.

 The **Actor loss** is based on policy gradients with the Critic as a state-dependent baseline and is computed with single-sample (per-episode) estimates. The **Advantage** indicates how much better an action is, given a particular state, than a random action selected according to the policy for that state.
 
  The **Critic loss** involves training the value estimate V to be as close as possible to the return G. It can be set up as a regression problem using the Huber loss, which is less sensitive to outliers in the data than squared-error loss.

In [8]:
# Huber loss summed over all elements; used for the critic's regression term
huber_loss = tf.keras.losses.Huber(reduction=tf.keras.losses.Reduction.SUM)


def compute_loss(action_probs: tf.Tensor, values: tf.Tensor, returns: tf.Tensor) -> tf.Tensor:
    """Compute the combined actor-critic loss.

    Args:
        action_probs: Probabilities of the actions that were taken.
        values: Critic value estimates.
        returns: Target returns.

    Returns:
        The sum of the actor loss (negative sum of log-probability times
        advantage, where advantage is `returns - values`) and the critic
        loss (summed Huber loss between `values` and `returns`).
    """
    critic_loss = huber_loss(values, returns)

    log_probs = tf.math.log(action_probs)
    weighted_log_probs = log_probs * (returns - values)
    actor_loss = -tf.math.reduce_sum(weighted_log_probs)

    return actor_loss + critic_loss

### Train Step & Update Parameters

The steps above are combined into a training step that is run every episode. All steps leading up to the loss function are executed with the `tf.GradientTape` context to enable automatic differentiation. We use the Adam optimizer to apply the gradients to the model parameters. 

The `tf.function` context is applied to the `train_step` function so that it can be compiled into a callable TensorFlow graph, leading to a 10x speedup in training.

In [9]:
# Adam optimizer with a learning rate of 0.01
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)


@tf.function
def train_step(initial_state: tf.Tensor, model: tf.keras.Model, 
               optimizer: tf.keras.optimizers.Optimizer, gamma: float,
               max_steps_per_episode: int) -> tf.Tensor:
    """Run one episode and apply a single gradient update to `model`.

    Args:
        initial_state: Starting observation passed to `run_episode`.
        model: Model that is run during the episode and then updated.
        optimizer: Optimizer used to apply the gradients.
        gamma: Discount factor passed to `get_expected_return`.
        max_steps_per_episode: Step limit passed to `run_episode`.

    Returns:
        The sum of the rewards collected during the episode.
    """
    with tf.GradientTape() as tape:
        # collect one episode of training data while the tape is recording
        action_probs, values, rewards = run_episode(
            initial_state, model, max_steps_per_episode)

        # turn the rewards into per-step returns
        returns = get_expected_return(rewards, gamma)

        # add a trailing axis to each sequence before computing the loss
        probs_column = tf.expand_dims(action_probs, 1)
        values_column = tf.expand_dims(values, 1)
        returns_column = tf.expand_dims(returns, 1)

        loss = compute_loss(probs_column, values_column, returns_column)

    # differentiate the loss with respect to the model parameters and update
    trainable = model.trainable_variables
    gradients = tape.gradient(loss, trainable)
    optimizer.apply_gradients(zip(gradients, trainable))

    return tf.math.reduce_sum(rewards)

### Running The Training Loop

Training is executed by running the training step until either success criterion or maximum number of episodes is reached.

A running record of episode rewards is kept in a queue. Once 100 trials are reached, the oldest reward is removed from the left (tail) end of the queue and the newest one is added at the right (head) end. A running sum of the rewards is also maintained for computational efficiency.