In [None]:
from snake_game import Snake
from pg import PG
from dqn import DQN
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt

## Policy Gradients

In [None]:
# Policy-gradient agent configuration.
# NOTE: the Deep Q-Learning section further down rebinds `optimizer`,
# `discount_factor`, `file` and `agent`. Re-run this cell before the
# policy-gradient training cell if that section has already been executed.

# neural network parameters
# (presumably: input size, then one (units, activation) pair per layer --
#  confirm against pg.py)
layers = [4, (32, 'elu'), (32, 'elu'), (3, 'softmax')]
loss = keras.losses.categorical_crossentropy
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and is rejected by newer Keras releases.
optimizer = keras.optimizers.Adam(learning_rate=0.01)
# agent parameter
discount_factor = 0.95
# import from file path, None for freshly initialized model
file = None

agent = PG(layers, loss, optimizer, discount_factor, file)

In [None]:
# --- Run configuration ---------------------------------------------------
# Number of iterations, and number of episodes played within each iteration
iterations = 500
episodes = 25
# An episode is cut off once this many steps were taken at an unchanged score
max_steps_per_score = 128
# True: print progress, save the best model and apply gradients (GUI disabled).
# False: only play the episodes, with the GUI enabled.
train = True

# --- Training loop -------------------------------------------------------
best = None
mean_rewards = []
for iteration in range(iterations):
    # Per-iteration buffers, one entry per episode
    all_rewards, all_grads, all_points = [], [], []
    for episode in range(episodes):
        current_rewards, current_grads = [], []
        # Start a fresh game and fetch the initial observation
        env = Snake(gui=not train)
        obs, invalid = env.obs_and_invalid()
        # Maps a score to the number of steps taken while holding that score
        steps_per_score = {}
        done = False
        # Play until the game is over or the score has stalled for too long
        while not done and steps_per_score.get(env.points, 0) < max_steps_per_score:
            # Ask the agent for an action and the gradient that goes with it
            action, grads = agent.run_policy(obs, invalid)
            # Advance the game by one step
            obs, reward, done, invalid = env.step(action)
            # Record what this step produced
            current_rewards.append(reward)
            current_grads.append(grads)
            # Count this step against the score reached after it
            score = env.points
            steps_per_score[score] = steps_per_score.get(score, 0) + 1
        # Hand the finished episode over to the per-iteration buffers
        all_rewards.append(current_rewards)
        all_grads.append(current_grads)
        all_points.append(env.points)
    # Sum of every reward collected in this iteration, averaged over the episodes
    flat_rewards = [r for episode_rewards in all_rewards for r in episode_rewards]
    mean_reward = np.sum(flat_rewards) / episodes
    mean_rewards.append(mean_reward)
    if train:
        progress = 'Iteration {0}/{1} - mean reward, score: {2}, {3}'
        print(progress.format(iteration + 1, iterations, mean_reward, np.mean(all_points)))
        # Persist the model first, so the saved weights are the ones that
        # actually produced this (best so far) mean reward
        is_best = best is None or mean_reward >= best
        if is_best:
            agent.save('snake_pg')
            best = mean_reward
            print('Model saved.')
        # Then update the agent from the collected rewards and gradients
        agent.apply_grads(all_rewards, all_grads)

# --- Plot mean reward per iteration --------------------------------------
plt.plot(range(iterations), mean_rewards)
plt.xlabel('Iteration')
plt.ylabel('Mean reward')

## Deep Q-Learning

In [None]:
# Deep Q-Learning agent configuration.
# NOTE: this cell rebinds `optimizer`, `discount_factor`, `file` and `agent`,
# which the Policy Gradients section above also defines. After running it,
# `agent` refers to the DQN agent.

# Snake has 4 input values and 3 actions
n_obs = 4
n_actions = 3
# neural network parameters: one (units, activation) pair per hidden layer
hidden_layers = [(96, 'elu'), (96, 'elu')]
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and is rejected by newer Keras releases.
optimizer = keras.optimizers.Adam(learning_rate=1e-3)
# agent parameters
discount_factor = 0.95
# presumably the capacity of the experience buffer -- confirm against dqn.py
buffer_size = 50000
# import from file path, None for freshly initialized model
file = None

agent = DQN(n_obs, hidden_layers, n_actions, optimizer, discount_factor, buffer_size, file)

In [None]:
# set number of episodes
episodes = 25000
# set maximum number of steps performed without the score changing
max_steps_per_score = 128
# set the first number of episodes in which the agent is not trained
n_pretrain = 100
# update target model every ... episodes
update_target = 200
batch_size = 128
# linear exploration rate decay: (start value, end value, number of episodes
# it takes to get from start to end)
epsilon_decay = 1, 0.01, 15000
# define whether to train the agent or not
train = True

best = None
total_rewards = []
scores = []
eps_start, eps_end, eps_episodes = epsilon_decay
for e in range(episodes):
    total_reward = 0
    # The exploration rate depends only on the episode index, so compute it
    # once per episode. The slope is scaled by (start - end) so that the end
    # value is reached after exactly `eps_episodes` episodes, whatever the
    # start value is. No exploration when the agent is only being evaluated.
    if train:
        epsilon = max(eps_start - (eps_start - eps_end) * e / eps_episodes, eps_end)
    else:
        epsilon = 0
    # Initialize environment and get initial state
    env = Snake(gui=not train)
    state, invalid = env.obs_and_invalid()
    # Store steps per score in dictionary
    steps_per_score = {}
    # Play until the game is over or the score has stalled for too long
    while steps_per_score.get(env.points, 0) < max_steps_per_score:
        # Get agent's action
        action = agent.play_one_step(state, epsilon, invalid)
        # Let environment perform action and update current state
        next_state, reward, done, invalid = env.step(action)
        agent.add_experience(state, action, reward, next_state, done, invalid)
        state = next_state
        total_reward += reward
        # Increase steps of current score by one
        steps_per_score[env.points] = steps_per_score.get(env.points, 0) + 1
        # Exit loop if game over
        if done:
            break
    # Save and print game data
    total_rewards.append(total_reward)
    scores.append(env.points)
    if train:
        print('Episode {0}/{1} - total reward, score: {2}, {3}'.format(e + 1, episodes, total_reward, env.points))
        # Save model if the highest reward has been collected
        if best is None or total_reward >= best:
            agent.save('snake_dqn')
            best = total_reward
            print('Model saved.')
        # Perform training step (skipped during the first n_pretrain episodes)
        if e >= n_pretrain:
            agent.training_step(batch_size)
            # Refresh the target model every `update_target` episodes
            if e % update_target == 0:
                agent.update_target_model()
# Plot total reward per episode (`scores` is collected above but not plotted here)
plt.plot(range(episodes), total_rewards)
plt.xlabel('Episode')
plt.ylabel('Reward')