In [None]:
import collections
import random
import warnings

import gym
import numpy as np
import tensorflow as tf

import utils

keras = tf.keras

from keras.layers import Conv2D, Dense, Flatten, Input, MaxPooling2D
from keras.optimizers import Adam

warnings.filterwarnings("ignore", category=DeprecationWarning)

In [None]:
# Run configuration.
GAME = "car_racing_dqn"  # identifier handed to utils.save_progress
# Verbosity forwarded to Keras `model.fit`. Keras expects "auto", 0, 1 or 2;
# the previous string "0" compared unequal to the integer 0, so a progress-bar
# callback was still attached on every fit call. Integer 0 means silent.
VERBOSITY = 0
SAVE_FREQUENCY = 25  # forwarded to utils.save_progress together with the episode number

In [None]:
# Hyperparameters

# -- Optimisation ------------------------------------------------------------
EPISODES = 500
LEARNING_RATE = 0.001  # Adam learning rate used when compiling the model
GAMMA = 0.95  # discount applied to the bootstrapped next-state Q-value

# -- Environment interaction -------------------------------------------------
FRAME_SKIP = 2  # every chosen action is stepped FRAME_SKIP + 1 times
FRAME_STACK_SIZE = 3  # frames kept in the stack; also the model's channel count
NEGATIVE_REWARD_BREAK = 100  # episode is cut once more than this many consecutive steps had negative reward

# -- Exploration (epsilon-greedy) --------------------------------------------
EPSILON = 1
EPSILON_DECAY = 0.99  # multiplied into epsilon once per episode
EPSILON_MINIMUM = 0.01  # floor for the decayed epsilon
# Sampling weights for exploratory actions, indexed by discrete action id.
ACTION_PROBABILITIES = [
    0.0,  # nothing
    0.2,  # left
    0.2,  # right
    0.5,  # gas
    0.1,  # brake
]

# -- Experience replay -------------------------------------------------------
BATCH_SIZE = 32
REPLAY_BUFFER_SIZE = 5000

In [None]:
def instantiate_model(env):
    """Build and compile the convolutional Q-network.

    The network takes an 84x84 image with ``FRAME_STACK_SIZE`` channels and
    applies, in order: an 8-filter 7x7 stride-4 convolution, 2x2 max pooling,
    a 16-filter 3x3 convolution, 2x2 max pooling, a 512-unit dense layer and
    a linear output layer with one unit per discrete action. Every trainable
    layer uses L2 kernel regularisation.

    Args:
        env: Environment whose ``action_space.n`` gives the number of outputs.

    Returns:
        A ``keras.Model`` compiled with MSE loss and an Adam optimizer using
        ``LEARNING_RATE``.
    """
    # Keyword arguments shared by every hidden (ReLU) layer.
    hidden_kwargs = dict(activation="relu", kernel_regularizer='l2')

    frames = Input(shape=(84, 84, FRAME_STACK_SIZE))

    # Convolutional feature extractor.
    features = Conv2D(filters=8, kernel_size=(7, 7), strides=4,
                      **hidden_kwargs)(frames)
    features = MaxPooling2D(pool_size=(2, 2), strides=2)(features)
    features = Conv2D(filters=16, kernel_size=(3, 3),
                      **hidden_kwargs)(features)
    features = MaxPooling2D(pool_size=(2, 2), strides=2)(features)

    # Fully connected head producing one Q-value per action.
    features = Flatten()(features)
    features = Dense(512, **hidden_kwargs)(features)
    q_values = Dense(env.action_space.n,
                     activation="linear",
                     kernel_regularizer='l2')(features)

    network = keras.Model(inputs=frames, outputs=q_values)
    network.compile(loss="mse", optimizer=Adam(learning_rate=LEARNING_RATE))
    return network


def take_action(env, action, frame_skip=None):
    """Repeat ``action`` for several frames and accumulate the reward.

    Args:
        env: Environment whose ``step(action)`` returns the 4-tuple
            ``(next_state, reward, done, info)``.
        action: Action passed unchanged to every ``env.step`` call.
        frame_skip: Number of additional frames the action is repeated for,
            i.e. the environment is stepped at most ``frame_skip + 1`` times.
            Defaults to the module-level ``FRAME_SKIP``.

    Returns:
        Tuple ``(next_state, reward, done)``: the last observation returned
        by the environment, the sum of the rewards of all frames stepped, and
        the last ``done`` flag.

    Raises:
        ValueError: If ``frame_skip`` is negative (no frame would be stepped,
            so there would be nothing to return).
    """
    if frame_skip is None:
        # Looked up at call time so re-running the config cell takes effect.
        frame_skip = FRAME_SKIP
    if frame_skip < 0:
        raise ValueError(
            f"frame_skip must be non-negative, got {frame_skip}")

    reward = 0
    for _frame in range(frame_skip + 1):
        next_state, frame_reward, done, _info = env.step(action)
        reward += frame_reward
        if done:
            # Stop repeating the action once the episode has ended.
            break
    return next_state, reward, done


def train_on_batch(model,
                   replay_buffer,
                   batch_size=None,
                   gamma=None,
                   verbose=None):
    """Run one Q-learning update on a random minibatch of stored transitions.

    Each transition is a tuple ``(state, action, reward, next_state, done)``.
    For every sampled transition the target of the taken action is
    ``reward + gamma * max_a Q(next_state, a) * (1 - done)``; the targets of
    all other actions are the model's current predictions, so they contribute
    no prediction error to the loss.

    Args:
        model: Object providing ``predict_on_batch`` and ``fit`` (the compiled
            Keras model in this notebook).
        replay_buffer: Sequence of transitions to sample from.
        batch_size: Number of transitions to sample. Defaults to the
            module-level ``BATCH_SIZE``.
        gamma: Discount factor. Defaults to the module-level ``GAMMA``.
        verbose: Verbosity forwarded to ``model.fit``. Defaults to the
            module-level ``VERBOSITY``.

    Raises:
        ValueError: Raised by ``random.sample`` when the buffer holds fewer
            than ``batch_size`` transitions.
    """
    # Resolve defaults at call time so edits to the config cell are honoured.
    if batch_size is None:
        batch_size = BATCH_SIZE
    if gamma is None:
        gamma = GAMMA
    if verbose is None:
        verbose = VERBOSITY

    batch = random.sample(replay_buffer, batch_size)
    n_samples = len(batch)

    states = np.array([transition[0] for transition in batch])
    actions = np.array([transition[1] for transition in batch])
    rewards = np.array([transition[2] for transition in batch])
    next_states = np.array([transition[3] for transition in batch])
    dones = np.array([transition[4] for transition in batch])

    # No np.squeeze here: squeezing would drop the batch axis when the batch
    # holds a single transition and break the axis=1 reduction below.
    next_q_values = np.asarray(model.predict_on_batch(next_states))
    not_done = 1.0 - dones.astype(np.float64)
    targets = rewards + gamma * np.amax(next_q_values, axis=1) * not_done

    # Copy the predictions so the in-place target assignment is always legal.
    targets_full = np.array(model.predict_on_batch(states))
    targets_full[np.arange(n_samples), actions] = targets

    # Pass the batch size explicitly: Keras defaults to 32, which would split
    # a larger minibatch into several gradient steps.
    model.fit(states,
              targets_full,
              batch_size=n_samples,
              epochs=1,
              verbose=verbose)

In [None]:
# Environment, model, replay memory and reward log are recreated every time
# this cell runs, so a re-run starts training from scratch.
env = gym.make("CarRacing-v2", new_step_api=False, continuous=False)

model = instantiate_model(env)

replay_buffer = collections.deque(maxlen=REPLAY_BUFFER_SIZE)

reward_history = []

# Decay a local copy of the exploration rate. Mutating the module-level
# EPSILON made this cell non-idempotent: a re-run rebuilt the model but kept
# the already-decayed epsilon, so the fresh model barely explored.
epsilon = EPSILON

# Training
# NOTE(review): range(EPISODES + 1) runs EPISODES + 1 episodes while
# save_progress receives episode + 1 — confirm the extra episode is intended.
for episode in range(EPISODES + 1):
    state = utils.CarRacing.process_state(env.reset())
    episode_reward = 0
    done = False

    # Start the stack with the first processed frame repeated.
    frame_stack = collections.deque([state] * FRAME_STACK_SIZE,
                                    maxlen=FRAME_STACK_SIZE)
    negative_reward_count = 0

    # Episode loop
    while not done:
        curr_frame_stack = utils.CarRacing.transpose_frame_stack(frame_stack)

        # Epsilon-greedy action selection.
        if np.random.uniform(0, 1) < epsilon:
            # Size the choice from the weights so the two cannot disagree.
            action = np.random.choice(len(ACTION_PROBABILITIES),
                                      p=ACTION_PROBABILITIES)
        else:
            action = np.argmax(model(np.expand_dims(curr_frame_stack, axis=0)))

        next_state, reward, done = take_action(env, action)
        episode_reward += reward
        # Count consecutive negative-reward steps; any other reward resets it.
        negative_reward_count = negative_reward_count + 1 if reward < 0 else 0

        frame_stack.append(utils.CarRacing.process_state(next_state))
        next_frame_stack = utils.CarRacing.transpose_frame_stack(frame_stack)

        # Store transition in replay buffer
        replay_buffer.append(
            (curr_frame_stack, action, reward, next_frame_stack, done))

        # Abort the episode after too many consecutive negative-reward steps.
        if negative_reward_count > NEGATIVE_REWARD_BREAK:
            break

        # Sample batch and update model
        if len(replay_buffer) >= BATCH_SIZE:
            train_on_batch(model, replay_buffer)

    # Decay exploration once per episode, never below the configured floor.
    epsilon = max(EPSILON_MINIMUM, epsilon * EPSILON_DECAY)

    reward_history.append(episode_reward)

    utils.save_progress(model, reward_history, episode + 1, SAVE_FREQUENCY,
                        GAME)

    utils.log(episode, episode_reward, epsilon)