In [None]:
import warnings

import gym
import numpy as np
import tensorflow as tf

import utils

keras = tf.keras

from keras.layers import Dense, Input
from keras.optimizers import Adam

warnings.filterwarnings("ignore", category=DeprecationWarning)

In [None]:
# Run configuration.
# Label for this experiment; passed as the last argument to
# utils.save_progress in the training loop.
GAME = "mountain_car_a2c"
# NOTE(review): not referenced in any visible cell — presumably consumed
# elsewhere; confirm it is still needed (and that a string is intended).
VERBOSITY = "0"
# Passed to utils.save_progress alongside the 1-based episode count;
# presumably the number of episodes between saves — confirm against utils.
SAVE_FREQUENCY = 25

In [None]:
# Hyperparameters
# Episode budget; the training loop iterates over range(EPISODES + 1).
EPISODES = 500
# Step size handed to the Adam optimizer.
LEARNING_RATE = 0.001
# Discount factor applied to the critic's next-state value in update_weights.
GAMMA = 0.99

In [None]:
def instantiate_model(env):
    """Build the actor and critic networks for ``env``.

    Both models read from a single shared ``Input`` layer whose shape is
    taken from ``env.observation_space.shape``; their ``Dense`` layers are
    separate, so no trainable weights are shared between them.

    Args:
        env: Environment exposing ``observation_space.shape`` and
            ``action_space.n``.

    Returns:
        A ``(actor, critic)`` tuple of ``keras.Model`` objects. The actor
        emits ``env.action_space.n`` un-activated outputs (used as logits by
        the training loop); the critic emits a single un-activated value.
    """
    observation = Input(shape=(env.observation_space.shape))

    # Actor head: 32 -> 64 ReLU units, then one linear output per action.
    actor_hidden = Dense(32, activation="relu")(observation)
    actor_hidden = Dense(64, activation="relu")(actor_hidden)
    action_logits = Dense(env.action_space.n)(actor_hidden)
    actor = keras.Model(inputs=observation, outputs=action_logits)

    # Critic head: one 32-unit ReLU layer, then a single linear output.
    critic_hidden = Dense(32, activation="relu")(observation)
    state_value = Dense(1)(critic_hidden)
    critic = keras.Model(inputs=observation, outputs=state_value)

    return actor, critic


def take_action(env, action):
    """Advance ``env`` by one step and discard the trailing info entry.

    Args:
        env: Object whose ``step(action)`` returns exactly four values.
        action: Action forwarded unchanged to ``env.step``.

    Returns:
        The first three values from ``env.step`` as a tuple:
        ``(next_state, reward, done)``.
    """
    observation, step_reward, finished, _info = env.step(action)
    return observation, step_reward, finished


def shape_reward(state, next_state, reward, speed_bonus_scale=300):
    """Return ``reward`` plus a bonus for an increase in ``|state[1]|``.

    The bonus is ``speed_bonus_scale`` times the change in the absolute value
    of observation component 1 between ``state`` and ``next_state``; it is
    negative when that magnitude shrinks.

    Args:
        state: Indexable observation before the step.
        next_state: Indexable observation after the step.
        reward: Raw reward returned by the environment for the step.
        speed_bonus_scale: Multiplier for the bonus term. Defaults to 300,
            the value previously hard-coded here.

    Returns:
        The shaped reward.
    """
    # Component 1 is presumably the car's velocity in MountainCar-v0 —
    # confirm against the environment's observation layout.
    speed_gain = abs(next_state[1]) - abs(state[1])
    return reward + speed_bonus_scale * speed_gain


def update_weights(actor, critic, optimizer, tape, state, next_state, reward,
                   done, action_log_prob):
    """Apply a one-step actor-critic update to the critic, then the actor.

    The critic is regressed towards the bootstrapped target
    ``reward + (1 - done) * GAMMA * critic(next_state)``. The target is
    wrapped in ``tf.stop_gradient`` so that only ``critic(state)`` receives
    gradient (semi-gradient TD(0)); without it the squared-error gradient
    also flows through the next-state estimate. The actor is updated with
    ``-action_log_prob * advantage``, with the advantage held constant.

    Args:
        actor: Policy model whose ``trainable_variables`` are updated.
        critic: Value model; called on ``state`` and ``next_state`` with a
            leading batch axis of size 1 added.
        optimizer: Optimizer used for both updates.
        tape: ``tf.GradientTape`` that recorded the computation of
            ``action_log_prob`` and is recording while this function runs
            (the critic forward passes happen here). It must be persistent
            because ``gradient`` is called on it twice.
        state: Observation before the step.
        next_state: Observation after the step.
        reward: Reward for the step.
        done: Episode-end flag; when true the bootstrap term is zeroed.
        action_log_prob: Log-probability of the action taken.

    Returns:
        None. Both models are modified in place through ``optimizer``.
    """
    # NOTE(review): a single optimizer instance updates two disjoint variable
    # sets; some newer Keras optimizers may reject variables they did not see
    # on the first apply_gradients call — confirm for the installed version.
    state_value = critic(np.expand_dims(state, axis=0))
    next_state_value = critic(np.expand_dims(next_state, axis=0))

    # Treat the bootstrapped target as a constant label for the regression.
    td_target = tf.stop_gradient(
        reward + (1 - done) * GAMMA * next_state_value)
    advantage = td_target - state_value

    critic_loss = tf.math.pow(advantage, 2)
    grads = tape.gradient(critic_loss, critic.trainable_variables)
    optimizer.apply_gradients(zip(grads, critic.trainable_variables))

    # The advantage only weights the policy gradient; keep it detached.
    actor_loss = -action_log_prob * tf.stop_gradient(advantage)
    grads = tape.gradient(actor_loss, actor.trainable_variables)
    optimizer.apply_gradients(zip(grads, actor.trainable_variables))

In [None]:
env = gym.make("MountainCar-v0", new_step_api=False)

actor, critic = instantiate_model(env)

optimizer = Adam(learning_rate=LEARNING_RATE)

reward_history = []

# Training
# NOTE(review): range(EPISODES + 1) runs EPISODES + 1 episodes — confirm the
# extra episode is intended.
for episode in range(EPISODES + 1):
    state = env.reset()
    episode_reward = 0
    done = False

    # Episode loop
    while not done:
        # Use a fresh tape for every step. Each update only needs the ops of
        # its own step, whereas one persistent tape spanning the episode keeps
        # every forward pass (and the gradient/optimizer ops executed inside
        # its context) alive until the episode ends. The tape stays persistent
        # because update_weights calls gradient() twice.
        with tf.GradientTape(persistent=True) as tape:
            action_logits = actor(np.expand_dims(state, axis=0))
            action = tf.random.categorical(action_logits, 1)[0, 0]

            next_state, reward, done = take_action(env, int(action))
            reward = shape_reward(state, next_state, reward)
            episode_reward += reward

            # log_softmax is the numerically stable form of log(softmax(x)).
            action_log_prob = tf.nn.log_softmax(action_logits)[0, action]

            # Must run inside the tape context: the critic forward passes are
            # computed in update_weights and have to be recorded.
            update_weights(actor, critic, optimizer, tape, state, next_state,
                           reward, done, action_log_prob)

        # Release the persistent tape's resources before the next step.
        del tape

        state = next_state

    reward_history.append(episode_reward)

    utils.save_progress(actor, reward_history, episode + 1, SAVE_FREQUENCY,
                        GAME)

    utils.log(episode, episode_reward)