In [13]:
import json
from tqdm import tqdm
import time
from utils import play_game2
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.layers import Input, Conv2D, Flatten, Dense
from tensorflow.keras import Model
from agents.DeepQLearningAgent import DeepQLearningAgent
from agents.agent import mean_huber_loss
from game_environment import SnakeNumpy

In [14]:
# Hyperparameters copied from the v17.1 config file.
version = 'v17.1'

# Dimensions: board_size and frames line up with the (10, 10, 2) model input,
# and n_actions with the 4 `action_values` outputs, in the summary printed below.
board_size, frames, n_actions = 10, 2, 4

# Switches.
# - supervised: not read by any cell visible in this notebook section.
# - obstacles: forwarded to every SnakeNumpy environment.
# - frame_mode: NOTE(review) the play_game2 / SnakeNumpy calls below pass a
#   literal frame_mode=True instead of this variable — confirm that is intended.
supervised, obstacles, frame_mode = False, False, True

# Forwarded to SnakeNumpy (max_time_limit) and DeepQLearningAgent (buffer_size).
max_time_limit = 998
buffer_size = 80000

In [15]:
# Build the Deep Q-Learning agent from the configuration cell above.
agent = DeepQLearningAgent(
    board_size=board_size,
    frames=frames,
    n_actions=n_actions,
    buffer_size=buffer_size,
    version=version,
)

# Print the layer-by-layer summary of the network returned by the agent's
# private builder.
# NOTE(review): presumably _agent_model() constructs a new model on every call
# rather than returning the one the agent trains — confirm.
agent._agent_model().summary()

Model: "model_16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           [(None, 10, 10, 2)]       0         
_________________________________________________________________
conv2d_48 (Conv2D)           (None, 10, 10, 16)        304       
_________________________________________________________________
conv2d_49 (Conv2D)           (None, 8, 8, 32)          4640      
_________________________________________________________________
conv2d_50 (Conv2D)           (None, 4, 4, 64)          51264     
_________________________________________________________________
flatten_16 (Flatten)         (None, 1024)              0         
_________________________________________________________________
action_prev_dense (Dense)    (None, 64)                65600     
_________________________________________________________________
action_values (Dense)        (None, 4)                 260

In [16]:
# Training-schedule settings used by the buffer pre-fill and the training loop.

# Exploration rate handed to play_game2. The training loop multiplies it by
# `decay` at each checkpoint and never lets it drop below `epsilon_end`.
epsilon = 1
epsilon_end = 0.01
decay = 0.97

# Forwarded unchanged to play_game2 during training.
reward_type = 'current'
sample_actions = False

# Number of parallel games in the training environment (8 * 16) and in the
# second environment (env2).
n_games_training = 128
games_eval = 8

# Loop length (10 ** 3 iterations) and checkpoint interval in iterations.
episodes = 1000
log_frequency = 250

In [17]:
# Play some games up front so the agent's buffer is not empty when training
# starts (record=True asks play_game2 to record the transitions).
games = 512

# Total frame budget for the pre-fill. Computed once so the value passed to
# play_game2 and the value reported in the log line can never drift apart.
# NOTE(review): the factor 64 is taken as-is from the original cell —
# presumably "frames per parallel game"; confirm.
prefill_frames = games * 64

env = SnakeNumpy(board_size=board_size, frames=frames,
                 max_time_limit=max_time_limit, games=games,
                 frame_mode=True, obstacles=obstacles, version=version)

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short intervals; time.time() follows the wall clock and can jump if the
# system time is adjusted while the games are running.
start = time.perf_counter()
_ = play_game2(env, agent, n_actions, n_games=games, record=True,
               epsilon=epsilon, verbose=True, reset_seed=False,
               frame_mode=True, total_frames=prefill_frames)
elapsed = time.perf_counter() - start
print('Playing {:d} frames took {:.2f}s'.format(prefill_frames, elapsed))

Playing 32768 frames took 0.83s


In [18]:
# Both environments share every setting except the number of parallel games.
shared_env_args = dict(
    board_size=board_size,
    frames=frames,
    max_time_limit=max_time_limit,
    frame_mode=True,
    obstacles=obstacles,
    version=version,
)

# `env` drives the training loop below (n_games_training parallel games).
env = SnakeNumpy(games=n_games_training, **shared_env_args)

# `env2` runs games_eval parallel games.
# NOTE(review): presumably meant for periodic evaluation; it is not used by
# any cell visible in this section — confirm.
env2 = SnakeNumpy(games=games_eval, **shared_env_args)

In [22]:
# Training loop.
# Each iteration plays a few frames with the current agent and then calls
# agent.train_agent once. The agent methods used here (train_agent,
# update_target_net, save_model) have to work as intended for this cell to run.

# Log container: one empty list per tracked quantity.
# NOTE(review): nothing in this cell appends to model_logs, and `loss`, `env2`
# and `games_eval` are never read here — presumably a periodic evaluation /
# logging step is still missing; confirm before relying on these logs.
model_logs = {key: [] for key in
              ('iteration', 'reward_mean', 'length_mean', 'games', 'loss')}

for index in tqdm(range(episodes)):  # tqdm only draws the progress bar

    # Make small changes to the buffer: play n_games_training frames across
    # the parallel games of `env`, with recording switched on.
    # stateful=True is forwarded as in the original call
    # (presumably: continue from the environment's current state — confirm).
    _, _, _ = play_game2(
        env, agent, n_actions,
        epsilon=epsilon, n_games=n_games_training, record=True,
        sample_actions=sample_actions, reward_type=reward_type,
        frame_mode=True, total_frames=n_games_training, stateful=True,
    )

    # One training call per iteration; the returned loss is kept in `loss`.
    loss = agent.train_agent(batch_size=64, num_games=n_games_training,
                             reward_clip=True)

    # Everything below runs only on every log_frequency-th iteration.
    iteration = index + 1
    if iteration % log_frequency != 0:
        continue

    # Checkpoint: refresh the target network and save the model.
    agent.update_target_net()
    agent.save_model(file_path='models/{:s}'.format(version),
                     iteration=iteration)

    # Keep some epsilon alive for training: decay it here (so only once per
    # checkpoint), but never below epsilon_end.
    epsilon = max(epsilon_end, epsilon * decay)


  0%|          | 0/1000 [00:04<?, ?it/s]


KeyboardInterrupt: 