### Imports

In [2]:
import collections
import gym
import numpy as np
import statistics
import tensorflow as tf
import tqdm

from matplotlib import pyplot as plt
from tensorflow.keras import layers
from typing import Any, List, Sequence, Tuple

Matplotlib is building the font cache; this may take a moment.


In [3]:
# Instantiate the CartPole-v1 environment the agent will interact with
env = gym.make("CartPole-v1")

# Fix the global NumPy and TensorFlow seeds so that runs are repeatable.
# NOTE(review): the environment itself is not seeded in this cell.
seed = 42
np.random.seed(seed)
tf.random.set_seed(seed)

# float32 machine epsilon as a plain Python float; used to stabilize
# division operations
eps = float(np.finfo(np.float32).eps)

### The Model

The Actor and Critic will be modeled using one neural network that generates the action probabilities and the critic value, respectively.

During the forward pass, the model will take in the state as the input and will output both action probabilities and critic value. The goal is to train a model that chooses actions based on a policy that maximizes expected return.

In [4]:
class ActorCritic(tf.keras.Model):
    """Combined actor-critic network built on one shared hidden layer.

    A single ReLU-activated dense layer feeds two dense heads: the actor
    head, with one output unit per action, and the critic head, with a
    single output unit.
    """

    def __init__(self, num_actions: int, num_hidden_units: int):
        """Create the shared layer and the two output heads.

        Args:
            num_actions: Number of output units of the actor head.
            num_hidden_units: Number of units in the shared hidden layer.
        """
        super().__init__()

        # Shared trunk applied to the input before the network branches
        self.common = layers.Dense(units=num_hidden_units, activation="relu")
        # Neither head is given an activation, so both emit linear outputs
        self.actor = layers.Dense(units=num_actions)
        self.critic = layers.Dense(units=1)

    def call(self, inputs: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
        """Run the forward pass.

        Args:
            inputs: Tensor passed to the shared hidden layer.

        Returns:
            A 2-tuple ``(actor_output, critic_output)``, both computed from
            the same hidden representation.
        """
        hidden = self.common(inputs)
        actor_output = self.actor(hidden)
        critic_output = self.critic(hidden)
        return actor_output, critic_output

In [5]:
# Width of the shared hidden layer
num_hidden_units = 128
# Number of actions reported by the environment's action space
num_actions = env.action_space.n

# Instantiate the actor-critic network
model = ActorCritic(
    num_actions=num_actions,
    num_hidden_units=num_hidden_units,
)