# Actor-Critic CartPole
https://www.tensorflow.org/tutorials/reinforcement_learning/actor_critic
https://keras.io/examples/rl/actor_critic_cartpole/

In [None]:
import datetime
import os

import gym
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Only show TensorFlow log messages at ERROR level.  The logging module is
# reached through tf.compat.v1 because the top-level tf.logging alias was
# removed in TensorFlow 2.x; the compat path is available on both lines.
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
tf.compat.v1.enable_eager_execution()

# Configuration parameters for the whole setup
seed = 42
gamma = 0.99  # Discount factor for past rewards
max_steps_per_episode = 10000
env = gym.make("CartPole-v0")  # Create the environment
# NOTE(review): only the environment is seeded here; actions are sampled with
# np.random elsewhere in this file, so runs are not fully reproducible unless
# NumPy's generator is seeded as well.
env.seed(seed)
# Smallest representable float32 increment; added to the standard deviation
# when normalising returns so the division can never be by zero.
eps = np.finfo(np.float32).eps.item()

num_inputs = 4  # size of the observation vector fed to the network
num_actions = 2  # number of discrete actions the policy chooses between
num_hidden = 128  # width of the shared hidden layer

In [None]:
def create_model():
    """Build the two-headed actor-critic network.

    A single ReLU hidden layer of ``num_hidden`` units is shared by both
    heads.  The actor head is a softmax over ``num_actions`` outputs and the
    critic head is a single linear unit.

    Returns:
        A ``keras.Model`` taking a ``(batch, num_inputs)`` input and
        producing the list ``[action_probabilities, critic_value]``.
    """
    observation = layers.Input(shape=(num_inputs,))
    shared = layers.Dense(num_hidden, activation="relu")(observation)
    # Layers are created actor-first, critic-second so that auto-generated
    # layer names and the weight order stay stable between builds.
    policy_head = layers.Dense(num_actions, activation="softmax")(shared)
    value_head = layers.Dense(1)(shared)
    return keras.Model(inputs=observation, outputs=[policy_head, value_head])

In [None]:
model = create_model()

optimizer = keras.optimizers.Adam(learning_rate=0.01)
huber_loss = keras.losses.Huber()

# Per-episode buffers: filled during the rollout, emptied after each update.
action_probs_history = []  # log-probability of every sampled action
critic_value_history = []  # critic's value estimate for every visited state
rewards_history = []  # reward returned by every env.step call
running_reward = 0  # exponential moving average of episode rewards
episode_count = 0

while True:  # Run until solved
    state = env.reset()
    episode_reward = 0

    # Everything the loss depends on (forward passes and the loss arithmetic)
    # must happen while the tape is recording.
    with tf.GradientTape() as tape:
        for timestep in range(1, max_steps_per_episode):

            # Convert explicitly to float32 so the observation dtype matches
            # what the evaluation loop feeds the model, instead of relying
            # on whatever dtype the environment happens to return.
            state = tf.convert_to_tensor(state, dtype=tf.float32)
            state = tf.expand_dims(state, 0)  # add the batch dimension

            # predict action probabilities and estimated
            # future rewards from environment state
            action_probs, critic_value = model(state)
            critic_value_history.append(critic_value[0, 0])

            # sample action from action probability distribution
            action = np.random.choice(num_actions, p=np.squeeze(action_probs))
            action_probs_history.append(tf.math.log(action_probs[0, action]))

            # apply the sampled action in our environment
            state, reward, done, _ = env.step(action)
            rewards_history.append(reward)
            episode_reward += reward

            if done:
                break

        # update running reward to check condition for solving
        running_reward = episode_reward*.05 + running_reward*.95

        # Discounted return for every step of the episode, accumulated from
        # the last reward backwards.  Appending and reversing once keeps this
        # linear in episode length (inserting at index 0 is quadratic).
        returns = []
        discounted_sum = 0
        for r in reversed(rewards_history):
            discounted_sum = r + gamma * discounted_sum
            returns.append(discounted_sum)
        returns.reverse()

        # Normalise the returns to zero mean and unit variance; eps guards
        # against a zero standard deviation.
        returns = np.array(returns)
        returns = (returns - np.mean(returns)) / (np.std(returns) + eps)
        returns = returns.tolist()

        # computing loss values
        history = zip(action_probs_history, critic_value_history, returns)
        actor_losses = []
        critic_losses = []
        for log_prob, value, ret in history:
            # Advantage: how much better the observed (normalised) return
            # was than the critic's estimate for that state.
            adv = ret - value
            actor_losses.append(-log_prob * adv)  # actor loss

            # The critic is trained to predict the observed return; the
            # Huber loss measures how far its estimate was from it.
            critic_losses.append(
                huber_loss(tf.expand_dims(value, 0), tf.expand_dims(ret, 0))
            )

        loss_value = sum(actor_losses) + sum(critic_losses)

    # Backpropagation: the gradient step itself does not need to be recorded,
    # so it runs after the tape context has been left.
    grads = tape.gradient(loss_value, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

    # Clear the loss and reward history
    action_probs_history.clear()
    critic_value_history.clear()
    rewards_history.clear()

    # Log details
    episode_count += 1
    if episode_count % 10 == 0:
        template = "running reward: {:.2f} at episode {}"
        print(template.format(running_reward, episode_count))

    if running_reward > 195:  # Condition to consider the task solved
        print("Solved at episode {}!".format(episode_count))
        break

In [None]:
# Save the trained weights under a timestamped file name so that repeated
# runs do not overwrite each other.
now = datetime.datetime.now()
# exist_ok=True creates the directory only when it is missing, without a
# separate existence check.
os.makedirs('save_model', exist_ok=True)
weights_path = f"./save_model/model{now.strftime('%Y%m%d_%H%M%S')}.h5"
model.save_weights(weights_path)

# Rebuild a fresh network and restore the weights that were just written.
# The same path variable is used for saving and loading so the two can never
# point at different files.
model = create_model()
model.load_weights(weights_path)

# Cast the first observation to float32, matching the dtype used for every
# subsequent observation inside the loop.
state = tf.convert_to_tensor(env.reset(), dtype=tf.float32)

# Play one rendered episode, always taking the most probable action.
while True:
    env.render()

    state = tf.expand_dims(state, 0)  # add the batch dimension
    policy, _ = model(state)  # the critic output is not needed here
    action = np.argmax(np.squeeze(policy))

    state, _, done, _ = env.step(action)
    state = tf.constant(state, dtype=tf.float32)

    if done:
        break