In [None]:
import collections
import warnings

import gym
import numpy as np
import tensorflow as tf

import utils

keras = tf.keras

from keras.layers import Conv2D, Dense, Flatten, Input, MaxPooling2D
from keras.optimizers import Adam

warnings.filterwarnings("ignore", category=DeprecationWarning)

In [None]:
# Run identifier handed to utils.save_progress after every episode
# (presumably used to name the saved artifacts -- confirm in utils).
GAME = "car_racing_a2c"
# NOTE(review): not referenced in any of the visible cells -- confirm it is
# still needed.
VERBOSITY = "0"
# Passed to utils.save_progress together with the 1-based episode number;
# presumably the number of episodes between checkpoints -- confirm in utils.
SAVE_FREQUENCY = 25

In [None]:
# --- Optimisation ---
EPISODES = 500        # the training loop iterates over range(EPISODES + 1)
LEARNING_RATE = 1e-5  # step size given to the Adam optimizer
GAMMA = 0.95          # discount applied to the critic's next-state value

# --- Environment interaction ---
FRAME_SKIP = 2               # every action is repeated FRAME_SKIP + 1 times
FRAME_STACK_SIZE = 3         # frames per observation (model input channels)
NEGATIVE_REWARD_BREAK = 100  # consecutive negative-reward steps tolerated

# --- Loss weighting ---
ENTROPY_COEFFICIENT = 0.1    # weight of the entropy term in the actor loss

In [None]:
def instantiate_model(env):
    """Build the actor and critic networks on top of one shared conv trunk.

    Both models read the same ``(84, 84, FRAME_STACK_SIZE)`` input and reuse
    the same convolution/pooling layers; only their dense heads differ.

    Args:
        env: Environment whose ``action_space.n`` sets the width of the
            actor's output layer.

    Returns:
        ``(actor, critic)`` -- two ``keras.Model`` instances. The actor emits
        one linear (unnormalised) output per action, the critic a single
        linear output.
    """
    # NOTE(review): every kernel carries an 'l2' regularizer, but the visible
    # update_weights never adds model.losses to its objectives -- confirm
    # whether the regularizers are meant to take effect.
    relu_l2 = {"activation": "relu", "kernel_regularizer": "l2"}
    linear_l2 = {"activation": "linear", "kernel_regularizer": "l2"}

    frames = Input(shape=(84, 84, FRAME_STACK_SIZE))

    # Shared feature extractor: conv -> pool -> conv -> pool -> flatten.
    trunk = Conv2D(filters=8, kernel_size=(7, 7), strides=4, **relu_l2)(frames)
    trunk = MaxPooling2D(pool_size=(2, 2), strides=2)(trunk)
    trunk = Conv2D(filters=16, kernel_size=(3, 3), **relu_l2)(trunk)
    trunk = MaxPooling2D(pool_size=(2, 2), strides=2)(trunk)
    features = Flatten()(trunk)

    # Policy head (created before the value head, as in the original order).
    policy_hidden = Dense(512, **relu_l2)(features)
    policy_out = Dense(env.action_space.n, **linear_l2)(policy_hidden)
    actor = keras.Model(inputs=frames, outputs=policy_out)

    # Value head.
    value_hidden = Dense(512, **relu_l2)(features)
    value_out = Dense(1, **linear_l2)(value_hidden)
    critic = keras.Model(inputs=frames, outputs=value_out)

    return actor, critic


def take_action(env, action, frame_skip=None):
    """Apply ``action`` for several consecutive frames and sum the rewards.

    Args:
        env: Environment whose ``step(action)`` returns the 4-tuple
            ``(next_state, reward, done, info)``.
        action: Action forwarded unchanged to every ``env.step`` call.
        frame_skip: Number of extra frames the action is repeated for; the
            environment is stepped at most ``frame_skip + 1`` times. Defaults
            to the module-level ``FRAME_SKIP`` when omitted.

    Returns:
        ``(next_state, reward, done)`` where ``next_state`` and ``done`` come
        from the last step taken and ``reward`` is the sum of the per-frame
        rewards of all steps taken.

    Raises:
        ValueError: If ``frame_skip`` is negative (no step could be taken, so
            there would be nothing to return).
    """
    if frame_skip is None:
        frame_skip = FRAME_SKIP
    if frame_skip < 0:
        raise ValueError(
            "frame_skip must be non-negative, got {}".format(frame_skip))

    total_reward = 0
    for _ in range(frame_skip + 1):
        # The info dict is unused; give it its own name instead of
        # overwriting the loop variable.
        next_state, frame_reward, done, _info = env.step(action)
        total_reward += frame_reward
        if done:
            # Stop repeating the action once the episode has ended.
            break
    return next_state, total_reward, done


def update_weights(actor, critic, optimizer, tape, state, next_state, reward,
                   done, action_prob, action_log_prob):
    """Apply one one-step actor-critic update for a single transition.

    The critic forward passes are executed inside this function, so it must be
    called while ``tape`` is still recording. ``tape.gradient`` is called
    twice, so ``tape`` must have been created with ``persistent=True``.

    Args:
        actor: Policy model; its trainable variables receive the actor update.
        critic: Value model; its trainable variables receive the critic update.
        optimizer: Optimizer used for both updates.
        tape: Recording, persistent ``tf.GradientTape`` that also recorded the
            computation of ``action_prob`` and ``action_log_prob``.
        state: Observation before the action, without a batch axis (one is
            added here).
        next_state: Observation after the action, without a batch axis.
        reward: Scalar reward collected for the transition.
        done: Truthy when the episode ended on this transition; the bootstrap
            term is dropped in that case.
        action_prob: Probability the actor assigned to the action taken.
        action_log_prob: Log-probability of the action taken.

    Returns:
        None. Both models are updated in place through ``optimizer``.
    """
    next_value = critic(np.expand_dims(next_state, axis=0))
    value = critic(np.expand_dims(state, axis=0))

    # Semi-gradient TD: the bootstrapped target is a constant, so the critic
    # is regressed towards it instead of also moving the target itself.
    td_target = tf.stop_gradient(reward +
                                 (1.0 - float(done)) * GAMMA * next_value)
    advantage = td_target - value

    critic_loss = tf.math.pow(advantage, 2)
    grads = tape.gradient(critic_loss, critic.trainable_variables)
    optimizer.apply_gradients(zip(grads, critic.trainable_variables))

    # The advantage is only a weight for the policy gradient. Without
    # stop_gradient, and with layers shared between actor and critic (as
    # instantiate_model builds them), the actor update would also
    # back-propagate through the critic into the shared weights.
    entropy_term = -(action_log_prob * action_prob)
    actor_loss = (-action_log_prob * tf.stop_gradient(advantage) -
                  ENTROPY_COEFFICIENT * entropy_term)
    grads = tape.gradient(actor_loss, actor.trainable_variables)
    optimizer.apply_gradients(zip(grads, actor.trainable_variables))

In [None]:
# Discrete-action CarRacing using the old step API (4-tuple from env.step),
# which is what take_action unpacks.
env = gym.make("CarRacing-v2", new_step_api=False, continuous=False)

actor, critic = instantiate_model(env)

optimizer = Adam(learning_rate=LEARNING_RATE)

reward_history = []

# Training
# NOTE(review): range(EPISODES + 1) runs EPISODES + 1 episodes -- confirm the
# extra episode is intended.
for episode in range(EPISODES + 1):
    state = utils.CarRacing.process_state(env.reset())
    episode_reward = 0
    done = False

    # Start with the first frame repeated so the stack is full from step one.
    frame_stack = collections.deque([state] * FRAME_STACK_SIZE,
                                    maxlen=FRAME_STACK_SIZE)
    negative_reward_count = 0

    # Episode loop
    while not done:
        curr_frame_stack = utils.CarRacing.transpose_frame_stack(frame_stack)

        # One tape per step. Each update only differentiates the forward
        # passes of its own transition, so a single tape spanning the whole
        # episode would just keep every earlier step (and the gradient
        # computations made inside its context) alive until the episode ends.
        # It stays persistent because update_weights calls gradient() twice.
        with tf.GradientTape(persistent=True) as tape:
            action_logits = actor(np.expand_dims(curr_frame_stack, axis=0))
            action = tf.random.categorical(action_logits, 1)[0, 0]

            action_prob = tf.nn.softmax(action_logits)[0, action]
            # log_softmax avoids log(0) = -inf when a probability underflows.
            action_log_prob = tf.nn.log_softmax(action_logits)[0, action]

            next_state, reward, done = take_action(env, int(action))
            episode_reward += reward
            # Count consecutive negative-reward steps; any other reward
            # resets the streak.
            negative_reward_count = (negative_reward_count + 1
                                     if reward < 0 else 0)

            frame_stack.append(utils.CarRacing.process_state(next_state))
            next_frame_stack = utils.CarRacing.transpose_frame_stack(
                frame_stack)

            # Abandon the episode after too many consecutive negative-reward
            # steps; as before, that last transition is not trained on.
            give_up = negative_reward_count > NEGATIVE_REWARD_BREAK
            if not give_up:
                # Called inside the context because update_weights runs the
                # critic forward passes that the tape has to record.
                update_weights(actor, critic, optimizer, tape,
                               curr_frame_stack, next_frame_stack, reward,
                               done, action_prob, action_log_prob)

        # Drop the reference so the persistent tape's resources are released.
        del tape

        if give_up:
            break

    reward_history.append(episode_reward)

    utils.save_progress(actor, reward_history, episode + 1, SAVE_FREQUENCY,
                        GAME)

    utils.log(episode, episode_reward)