# Actor Critic CartPole
Adapted from the TensorFlow tutorial "Playing CartPole with the Actor-Critic method": https://www.tensorflow.org/tutorials/reinforcement_learning/actor_critic

In [None]:
import os
import gym
import tqdm
import pickle
import datetime
import numpy as np
import matplotlib.pyplot as plt
from scipy.special import softmax

import tensorflow as tf
tf.logging.set_verbosity(tf.logging.ERROR)
tf.compat.v1.enable_eager_execution()
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.layers import Input, Dense, Lambda, Add, Conv2D, Flatten

# --- Reproducibility ---------------------------------------------------------
# One seed is shared by the environment, TensorFlow and NumPy.
seed = 42
env = gym.make("CartPole-v0")
env.seed(seed)
tf.random.set_random_seed(seed)
np.random.seed(seed)

# Smallest float32 increment; added to denominators to avoid dividing by zero.
eps = float(np.finfo(np.float32).eps)

# --- Training schedule -------------------------------------------------------
n_episodes = 10000
max_steps_per_episode = 1000
# Cartpole-v0 is considered solved if average reward
# is >= 195 over 100 consecutive trials
reward_threshold = 195
running_reward = 0
gamma = 0.99  # discount factor applied to future rewards

# --- Network dimensions ------------------------------------------------------
state_space = env.observation_space.shape[0]  # 4
action_space = env.action_space.n  # 2
hidden_space = 128

In [None]:
def create_model(state_space, hidden_space, action_space):
    """Build the actor-critic network with a shared hidden layer.

    Args:
        state_space: Size of the observation vector, either as an integer
            or as a shape tuple that excludes the batch dimension.
        hidden_space: Number of units in the shared ReLU layer.
        action_space: Number of discrete actions (width of the policy head).

    Returns:
        A Keras ``Model`` named ``'actor-critic'`` mapping a batch of states
        to ``[action, value]``: the softmax action probabilities and a
        single-unit state-value estimate.
    """
    # Keras documents ``shape`` as a tuple without the batch axis; a bare
    # integer is not accepted by every Keras version, so normalise it here.
    if isinstance(state_space, (int, np.integer)):
        input_shape = (int(state_space),)
    else:
        input_shape = tuple(state_space)

    x = Input(shape=input_shape, name='input_state')
    common = Dense(hidden_space, activation="relu", name='common')(x)
    # Both heads read the same hidden representation.
    action = Dense(action_space, activation="softmax", name='policy')(common)
    value = Dense(1, name='value')(common)
    model = Model(inputs=x, outputs=[action, value], name='actor-critic')
    return model

def next_action(policy):
    """Sample one action from the policy and return it with its probability.

    Args:
        policy: Tensor of action probabilities indexed as ``[batch, action]``;
            only row 0 is used.

    Returns:
        A tuple ``(action, action_prob)``: the sampled action index and the
        probability the policy assigned to that action.
    """
    # tf.random.categorical interprets its input as unnormalised
    # log-probabilities. ``policy`` holds probabilities, so take the log
    # first; feeding the probabilities directly would sample from
    # softmax(policy) rather than from the policy itself.
    logits = tf.math.log(policy)
    action = tf.random.categorical(logits, 1)[0, 0]
    action_prob = policy[0, action]
    return action, action_prob

def env_step(action):
    """Advance the global ``env`` by one step and return typed NumPy results.

    Args:
        action: Action forwarded unchanged to ``env.step``.

    Returns:
        A tuple ``(state, reward, done)`` where ``state`` is cast to float32
        and ``reward`` and ``done`` are int32 arrays. The info dict returned
        by the environment is discarded.
    """
    observation, raw_reward, finished, _info = env.step(action)
    typed_state = observation.astype(np.float32)
    typed_reward = np.array(raw_reward, np.int32)
    typed_done = np.array(finished, np.int32)
    return (typed_state, typed_reward, typed_done)

def tf_env_step(action):
    """Run ``env_step`` through ``tf.numpy_function``.

    Args:
        action: Action passed to ``env_step`` as its single input.

    Returns:
        The ``(state, reward, done)`` outputs of ``env_step`` as tensors of
        dtype float32, int32 and int32 respectively.
    """
    output_dtypes = [tf.float32, tf.int32, tf.int32]
    return tf.numpy_function(func=env_step, inp=[action], Tout=output_dtypes)

def run_episode(model, initial_state, max_steps=1000):
    """Play one episode with ``model`` and record the per-step training data.

    Args:
        model: Callable returning ``(policy, value)`` for a batched state.
        initial_state: Batched starting state fed to the model on step 0.
        max_steps: Upper bound on the number of environment steps.

    Returns:
        A tuple ``(values, action_probs, rewards)`` of stacked per-step
        tensors: the critic's value estimates, the probabilities of the
        actions that were taken, and the rewards received.
    """
    def _new_buffer(dtype):
        # Growable buffer, since the episode length is not known up front.
        return tf.TensorArray(dtype=dtype, size=0, dynamic_size=True)

    value_buf = _new_buffer(tf.float32)
    prob_buf = _new_buffer(tf.float32)
    reward_buf = _new_buffer(tf.int32)

    observation = initial_state
    for step in range(max_steps):
        policy, value = model(observation, training=True)
        action, action_prob = next_action(policy)

        next_observation, reward, done = tf_env_step(action)
        # Restore the leading batch axis before the next forward pass.
        observation = tf.expand_dims(next_observation, 0)

        value_buf = value_buf.write(step, tf.squeeze(value))
        prob_buf = prob_buf.write(step, tf.squeeze(action_prob))
        reward_buf = reward_buf.write(step, tf.squeeze(reward))

        if tf.cast(done, tf.bool):
            break

    return value_buf.stack(), prob_buf.stack(), reward_buf.stack()

def get_expected_returns(rewards, gamma, standardize=True):
    """Compute the discounted return for every step of an episode.

    Args:
        rewards: 1-D tensor of per-step rewards in chronological order.
        gamma: Discount factor applied to each subsequent step.
        standardize: When true, shift and scale the returns by their mean
            and standard deviation (``eps`` is added to the denominator).

    Returns:
        A float32 tensor with one discounted return per input reward.
    """
    num_steps = tf.shape(rewards)[0]
    buffer = tf.TensorArray(dtype=tf.float32, size=num_steps)

    # Accumulate from the final step backwards so each return reuses the
    # running sum of everything that follows it.
    reversed_rewards = tf.cast(rewards[::-1], dtype=tf.float32)
    running_sum = tf.constant(0.0)
    scalar_shape = running_sum.shape
    for idx in tf.range(num_steps):
        running_sum = reversed_rewards[idx] + gamma * running_sum
        running_sum.set_shape(scalar_shape)
        buffer = buffer.write(idx, running_sum)

    # Flip back to chronological order.
    returns = buffer.stack()[::-1]
    if not standardize:
        return returns

    mean = tf.math.reduce_mean(returns)
    std = tf.math.reduce_std(returns)
    return ((returns - mean) / (std + eps))

def compute_loss(action_probs, values, returns):
    """Compute the combined actor-critic loss for one episode.

    Args:
        action_probs: Probabilities of the actions taken at each step.
        values: Critic value estimates for each step.
        returns: Discounted returns for each step.

    Returns:
        The sum of the actor loss (negative log-probability weighted by the
        advantage ``returns - values``) and the critic loss (mean squared
        error between ``values`` and ``returns``).
    """
    advantage = returns - values
    # The log must be a TensorFlow op: np.log converts the tensor to a NumPy
    # array, which the gradient tape cannot trace, so the policy head would
    # receive no gradient from the log-probability term.
    action_log_probs = tf.math.log(action_probs)
    actor_loss = -tf.math.reduce_sum(action_log_probs * advantage)
    # MSE critic loss; a Huber loss is a common alternative here.
    critic_loss = tf.keras.losses.MeanSquaredError(reduction="auto")(values, returns)
    return actor_loss + critic_loss

def train_step(initial_state, model, optimizer, gamma, max_steps_per_episode=1000):
    """Run one episode and apply a single gradient update to ``model``.

    Args:
        initial_state: Batched starting state for the episode.
        model: Actor-critic model whose trainable variables are updated.
        optimizer: Optimizer used to apply the gradients.
        gamma: Discount factor for the expected returns.
        max_steps_per_episode: Step limit passed to ``run_episode``.

    Returns:
        The sum of the rewards collected during the episode.
    """
    # Only the forward pass and loss need to be recorded on the tape.
    with tf.GradientTape() as tape:
        values, action_probs, rewards = run_episode(
            model, initial_state, max_steps_per_episode)
        returns = get_expected_returns(rewards, gamma)
        loss = compute_loss(action_probs, values, returns)

    trainable = model.trainable_variables
    gradients = tape.gradient(loss, trainable)
    optimizer.apply_gradients(zip(gradients, trainable))

    return tf.math.reduce_sum(rewards)

# Smoke test: run a single training episode on a freshly built model and
# print the reward it collected.
initial_state = tf.expand_dims(tf.constant(env.reset(), dtype=tf.float32), 0)

model = create_model(state_space, hidden_space, action_space)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)

episode_reward = train_step(
    initial_state=initial_state,
    model=model,
    optimizer=optimizer,
    gamma=gamma,
)
print("episode_reward", episode_reward)

In [None]:
%%time

# Train a fresh model until the running reward exceeds ``reward_threshold``
# or ``n_episodes`` episodes have been played, then save the weights.
model = create_model(state_space, hidden_space, action_space)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)

# Reset the running statistic so re-running this cell with a new model does
# not inherit the value left behind by a previous run.
running_reward = 0
solved = False

with tqdm.trange(n_episodes) as t:
    for i in t:
        initial_state = tf.constant(env.reset(), dtype=tf.float32)
        initial_state = tf.expand_dims(initial_state, 0)
        episode_reward = int(train_step(initial_state, model, optimizer, gamma, max_steps_per_episode))

        # Exponential moving average of the episode reward.
        running_reward = episode_reward*0.01 + running_reward*.99

        t.set_description(f'Episode {i}')
        t.set_postfix(episode_reward=episode_reward, running_reward=running_reward)

        if running_reward > reward_threshold:
            solved = True
            break

# Only claim success when the threshold was actually reached; the loop can
# also end because the episode budget ran out.
if solved:
    print(f'\nSolved at episode {i}: average reward: {running_reward:.2f}!')
else:
    print(f'\nNot solved after {n_episodes} episodes: average reward: {running_reward:.2f}')


### Save model weights

# exist_ok avoids the separate existence check and its race window.
os.makedirs('save_model', exist_ok=True)
model.save_weights("./save_model/model.h5")

In [None]:
# Rebuild the network, restore the saved weights and replay one rendered
# episode, always taking the highest-probability action.
model = create_model(state_space, hidden_space, action_space)
model.load_weights(f'./save_model/model.h5')

state = tf.constant(env.reset(), dtype=tf.float32)
for i in range(1, 1000):
    env.render()

    # The model expects a leading batch axis.
    policy, _ = model(tf.expand_dims(state, 0))
    action = np.squeeze(policy).argmax()

    observation, _, done, _ = env.step(action)
    state = tf.constant(observation, dtype=tf.float32)

    if done:
        break