In [1]:
import json
from tqdm import tqdm
import time
from utils import play_game2
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.layers import Input, Conv2D, Flatten, Dense
from tensorflow.keras import Model
from agents.DeepQLearningAgent import DeepQLearningAgent
from agents.agent import mean_huber_loss
from game_environment import SnakeNumpy

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Run configuration; the values are taken from the v17.1 config file.
version = 'v17.1'

# Integer settings forwarded to the agent and the environment constructors.
board_size, n_actions, frames = 10, 4, 2
max_time_limit, buffer_size = 998, 80000

# Boolean switches.
supervised = obstacles = False
frame_mode = True

In [11]:
# Instantiate the agent from the configuration values defined earlier in the
# notebook.
agent = DeepQLearningAgent(
    board_size=board_size,
    frames=frames,
    buffer_size=buffer_size,
    n_actions=n_actions,
    version=version,
)

# Show the summary of the model returned by the agent's private helper.
# NOTE(review): it is not visible here whether _agent_model() returns the
# agent's live network or builds a fresh one - confirm in DeepQLearningAgent.
agent._agent_model().summary()

In [8]:
# Training hyper-parameters.

# epsilon is multiplied by `decay` during training and never drops below
# `epsilon_end` (see the training loop).
epsilon = 1
epsilon_end = 0.01
decay = 0.97

# Options forwarded to play_game2 while collecting training experience.
reward_type = 'current'
sample_actions = False

# Number of games for the training environment and the evaluation environment.
n_games_training = 16 * 8
games_eval = 8

# Number of training iterations, and how often (in iterations) the periodic
# block of the training loop runs.
episodes = 10 ** 3
log_frequency = 250

In [9]:
# Play some games up front to fill the agent's buffer before training starts.
# NOTE(review): record=True presumably makes play_game2 store the played
# frames in the agent's buffer - confirm against utils.play_game2.
games = 512

# Single source of truth for the number of frames to play, so the call and the
# log message below can never disagree.
prefill_frames = games * 64

env = SnakeNumpy(board_size=board_size, frames=frames,
                 max_time_limit=max_time_limit, games=games,
                 frame_mode=True, obstacles=obstacles, version=version)

# perf_counter() is a monotonic high-resolution clock meant for measuring
# elapsed time; time.time() can jump when the system clock is adjusted.
prefill_start = time.perf_counter()
# The return value is not needed here; only the side effect of playing matters.
play_game2(env, agent, n_actions, n_games=games, record=True,
           epsilon=epsilon, verbose=True, reset_seed=False,
           frame_mode=True, total_frames=prefill_frames)
prefill_elapsed = time.perf_counter() - prefill_start
print('Playing {:d} frames took {:.2f}s'.format(prefill_frames, prefill_elapsed))

Playing 32768 frames took 0.80s


In [10]:
def _make_env(n_parallel_games):
    """Return a SnakeNumpy environment that differs only in its `games` count.

    Every other constructor argument comes from the notebook-level
    configuration values.
    """
    return SnakeNumpy(
        board_size=board_size,
        frames=frames,
        max_time_limit=max_time_limit,
        games=n_parallel_games,
        frame_mode=True,
        obstacles=obstacles,
        version=version,
    )


# `env` is sized for training, `env2` for evaluation; the training environment
# is constructed first, as before.
env = _make_env(n_games_training)
env2 = _make_env(games_eval)

In [82]:
# Training loop.
#
# Each iteration plays `n_games_training` frames on `env` with record=True and
# then runs one agent.train_agent() step. Every `log_frequency` iterations the
# loop records the loss, calls agent.update_target_net(), saves the model, and
# decays epsilon.
#
# NOTE(review): only the 'iteration' and 'loss' entries of model_logs are
# filled here. 'reward_mean', 'length_mean' and 'games' stay empty because no
# evaluation games are played inside this loop.
# NOTE(review): `epsilon` is rebound at notebook scope, so re-running this cell
# continues from the already decayed value rather than the configured one.
model_logs = {'iteration': [], 'reward_mean': [],
              'length_mean': [], 'games': [], 'loss': []}

for index in tqdm(range(episodes)):
    # 1-based iteration number used for scheduling, logging and checkpoints.
    iteration = index + 1

    # make small changes to the buffer and slowly train
    play_game2(env, agent, n_actions, epsilon=epsilon,
               n_games=n_games_training, record=True,
               sample_actions=sample_actions, reward_type=reward_type,
               frame_mode=True, total_frames=n_games_training,
               stateful=True)

    loss = agent.train_agent(batch_size=64,
                             num_games=n_games_training, reward_clip=True)

    if iteration % log_frequency == 0:
        # Keep the most recent training loss instead of discarding it, so
        # model_logs actually reflects the run.
        model_logs['iteration'].append(iteration)
        model_logs['loss'].append(loss)

        agent.update_target_net()
        agent.save_model(file_path='models/{:s}'.format(version), iteration=iteration)

        # keep some epsilon alive for training
        epsilon = max(epsilon * decay, epsilon_end)


100%|██████████| 1000/1000 [00:21<00:00, 46.18it/s]
