In [8]:
import json
from tqdm import tqdm
import time
from utils import play_game2
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.layers import Input, Conv2D, Flatten, Dense
from tensorflow.keras import Model
from agents.agent import Agent
from agents.agent import mean_huber_loss
from game_environment import SnakeNumpy

In [9]:
# Experiment configuration, copied from the v17.1 config file.
version = 'v17.1'

# Board / environment settings.
board_size, frames = 10, 2
n_actions = 4
obstacles = False
max_time_limit = 998
frame_mode = True

# Agent settings.
buffer_size = 80_000
supervised = False

In [10]:
class DeepQLearningAgent(Agent):
    """Deep Q-learning agent backed by a small convolutional Keras network.

    The online network (``self._model``) is the one that is trained and used
    to pick moves. When ``use_target_net`` is true, a second, independently
    built network (``self._target_net``) provides the bootstrap values in
    ``train_agent`` and is only refreshed by ``update_target_net``.
    """

    def __init__(self, board_size, frames, buffer_size, n_actions, version, use_target_net=True, gamma=0.99):
        """Initialise the base agent, then build the online and target networks.

        Parameters are forwarded to ``Agent.__init__`` (note the different
        argument order expected by the base class).
        """
        super().__init__(board_size, frames, buffer_size, gamma, n_actions, use_target_net, version)
        self._model = self.model()
        # The target network must be its own model object. Aliasing it to
        # self._model would make update_target_net() a no-op and the training
        # targets would always come from the live network.
        # Without a target net we keep the alias so the attribute still exists.
        self._target_net = self.model() if use_target_net else self._model
        self.update_target_net()

    # Manually implementing the same model as in the v17.1 setup
    def model(self):
        """Build and compile a fresh network; every call returns a new model.

        The input shape is fixed to (10, 10, 2) and the output to 4 action
        values, matching the v17.1 setup rather than the constructor
        arguments.
        """
        input_board = Input((10, 10, 2,), name='input')
        model = Sequential()
        model.add(input_board)
        # No input_shape kwargs on the conv layers: the explicit Input above
        # already fixes the shape and each layer infers its own from the
        # previous output.
        model.add(Conv2D(filters=16, kernel_size=[3, 3],
                         activation='relu', data_format='channels_last',
                         padding='same', ))
        model.add(Conv2D(filters=32, kernel_size=[3, 3],
                         activation='relu', data_format='channels_last', ))
        model.add(Conv2D(filters=64, kernel_size=[5, 5],
                         activation='relu', data_format='channels_last', ))
        model.add(Flatten())
        model.add(Dense(units=64, activation='relu', name='action_prev_dense'))
        model.add(Dense(units=4, activation='linear', name='action_values'))
        model.compile(optimizer=RMSprop(0.0005), loss=mean_huber_loss)
        return model

    def update_target_net(self):
        """Copy the online network's current weights into the target network."""
        self._target_net.set_weights(self._model.get_weights())

    def train_agent(self, batch_size=32, num_games=1, reward_clip=False):
        """Run one training step on a batch sampled from the replay buffer.

        Parameters
        ----------
        batch_size : int
            Number of transitions requested from ``self._buffer.sample``.
        num_games : int
            Accepted for interface compatibility; not used in this method.
        reward_clip : bool
            If true, rewards are replaced by their sign before use.

        Returns
        -------
        The value returned by ``train_on_batch`` on the online network.
        """
        s, a, r, next_s, done, legal_moves = self._buffer.sample(batch_size)
        if reward_clip:
            r = np.sign(r)
        # bootstrap from the target network when enabled, else the online one
        current_model = self._target_net if self._use_target_net else self._model
        next_model_outputs = self._get_model_outputs(next_s, current_model)
        # our estimate of expected future discounted reward; moves that are not
        # legal are masked with -inf so they can never be the maximum
        # NOTE(review): a row with no legal move at all would yield -inf/nan
        # here - presumably the buffer never contains such rows; confirm.
        discounted_reward = r + (self._gamma * np.max(
            np.where(legal_moves == 1, next_model_outputs, -np.inf),
            axis=1).reshape(-1, 1)) * (1 - done)
        # create the target variable, only the column with action has different value
        target = self._get_model_outputs(s)
        # we bother only with the difference in reward estimate at the selected action
        target = (1 - a) * target + a * discounted_reward
        # fit
        loss = self._model.train_on_batch(self._normalize_board(s), target)
        return loss

    def _get_model_outputs(self, board, model=None):
        """Return ``model``'s predictions for ``board`` (online network by default)."""
        # to correct dimensions and normalize
        board = self._prepare_input(board)
        # the default model to use
        if model is None:
            model = self._model
        model_outputs = model.predict_on_batch(board)
        return model_outputs

    def _prepare_input(self, board):
        """Add a leading batch axis to a single 3-D board and normalise a copy."""
        if (board.ndim == 3):
            board = board.reshape((1,) + self._input_shape)
        board = self._normalize_board(board.copy())
        return board.copy()

    def _normalize_board(self, board):
        """Cast the board to float32 and divide by 4.0."""
        return board.astype(np.float32) / 4.0

    def save_model(self, file_path='', iteration=None):
        """Save the online (and, if enabled, target) network weights as .h5 files.

        Parameters
        ----------
        file_path : str
            Directory prefix for the files; created if it does not exist.
        iteration : int or None
            Number embedded in the file names; ``None`` is treated as 0.

        Raises
        ------
        AssertionError
            If ``iteration`` is given but is not an ``int``.
        """
        import os

        if iteration is not None:
            assert isinstance(iteration, int), "iteration should be an integer"
        else:
            iteration = 0
        # Create the output directory up front so a long run does not crash
        # at its first checkpoint because the folder is missing.
        if file_path:
            os.makedirs(file_path, exist_ok=True)
        self._model.save_weights("{}/model_{:04d}.h5".format(file_path, iteration))
        if self._use_target_net:
            self._target_net.save_weights("{}/model_{:04d}_target.h5".format(file_path, iteration))

    def move(self, board, legal_moves, value=None):
        """Return, per board, the index of the highest-valued legal action.

        ``value`` is accepted but not used.
        """
        # use the agent model to make the predictions
        model_outputs = self._get_model_outputs(board, self._model)
        return np.argmax(np.where(legal_moves == 1, model_outputs, -np.inf), axis=1)



In [17]:
# Instantiating DeepQLearningAgent builds and compiles its networks inside
# __init__, using the parameters defined above.
agent = DeepQLearningAgent(
    board_size=board_size, frames=frames, buffer_size=buffer_size, n_actions=n_actions,
    version=version)
# Summarise the network the agent actually holds. Calling agent.model() here
# would build and compile a brand-new, untrained copy just to print it.
agent._model.summary()

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_27 (Conv2D)           (None, 10, 10, 16)        304       
_________________________________________________________________
conv2d_28 (Conv2D)           (None, 8, 8, 32)          4640      
_________________________________________________________________
conv2d_29 (Conv2D)           (None, 4, 4, 64)          51264     
_________________________________________________________________
flatten_7 (Flatten)          (None, 1024)              0         
_________________________________________________________________
action_prev_dense (Dense)    (None, 64)                65600     
_________________________________________________________________
action_values (Dense)        (None, 4)                 260       
Total params: 122,068
Trainable params: 122,068
Non-trainable params: 0
________________________________________________

In [12]:
# Training hyper-parameters.

# Exploration rate: starts at `epsilon`, is multiplied by `decay` at every
# logging step and never drops below `epsilon_end`.
epsilon = 1
epsilon_end = 0.01
decay = 0.97

# Options forwarded to play_game2 during training.
reward_type = 'current'
sample_actions = False

# Number of games in the training and evaluation environments.
n_games_training = 8 * 16
games_eval = 8

# Loop length and how often to sync the target net / save a checkpoint.
episodes = 10 ** 3
log_frequency = 250

In [13]:
# play some games to fill buffer
games = 512
# Frame budget for the warm-up; computed once so the call and the log line
# below cannot drift apart.
warmup_frames = games * 64
env = SnakeNumpy(board_size=board_size, frames=frames,
                 max_time_limit=max_time_limit, games=games,
                 frame_mode=True, obstacles=obstacles, version=version)
# perf_counter is monotonic, so the elapsed time is not affected by
# system clock adjustments the way time.time() is.
ct = time.perf_counter()
_ = play_game2(env, agent, n_actions, n_games=games, record=True,
               epsilon=epsilon, verbose=True, reset_seed=False,
               frame_mode=True, total_frames=warmup_frames)
print('Playing {:d} frames took {:.2f}s'.format(warmup_frames, time.perf_counter() - ct))

Playing 32768 frames took 0.78s


In [14]:
# Build the training environment (env) and the evaluation environment (env2).
# They share every setting except the number of parallel games.
env, env2 = (
    SnakeNumpy(board_size=board_size, frames=frames,
               max_time_limit=max_time_limit, games=n_parallel_games,
               frame_mode=True, obstacles=obstacles, version=version)
    for n_parallel_games in (n_games_training, games_eval)
)

In [15]:
# Training loop: play a few frames, take one gradient step, and every
# `log_frequency` iterations sync the target net, checkpoint and decay epsilon.
# NOTE(review): model_logs is created but nothing in this loop appends to it,
# and `loss` is not recorded anywhere - confirm whether logging was intended.
model_logs = {key: [] for key in
              ('iteration', 'reward_mean', 'length_mean', 'games', 'loss')}

for index in tqdm(range(episodes)):
    # Add a small number of fresh transitions to the buffer ...
    _, _, _ = play_game2(
        env, agent, n_actions,
        epsilon=epsilon,
        n_games=n_games_training,
        record=True,
        sample_actions=sample_actions,
        reward_type=reward_type,
        frame_mode=True,
        total_frames=n_games_training,
        stateful=True,
    )

    # ... then train on one sampled batch.
    loss = agent.train_agent(batch_size=64,
                             num_games=n_games_training,
                             reward_clip=True)

    # Everything below only happens on logging iterations.
    if (index + 1) % log_frequency != 0:
        continue

    agent.update_target_net()
    agent.save_model(file_path='models/{:s}'.format(version), iteration=(index + 1))

    # keep some epsilon alive for training
    epsilon = max(epsilon * decay, epsilon_end)


100%|██████████| 1000/1000 [00:18<00:00, 53.08it/s]
