In [1]:
from unityagents import UnityEnvironment
from dqn_agent import Agent
import time
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from collections import OrderedDict
from collections import namedtuple, deque

import matplotlib.pyplot as plt
%matplotlib inline


plt.ion()

In [2]:
def get_state(state_now, prev_states, prev_actions):
    """Build the flat agent input from the current frame plus recent history.

    The result is laid out as: the current observation, then each remembered
    observation in iteration order, then the remembered actions as one
    trailing array.

    Params
    ======
        state_now (array-like): observation for the current time step
        prev_states (iterable of array-like): previously seen observations
        prev_actions (iterable): actions taken alongside ``prev_states``
    """
    pieces = [state_now]
    pieces.extend(prev_states)
    # The actions are packed into a single array so they form one trailing segment.
    pieces.append(np.array(prev_actions))
    return np.concatenate(pieces)

def dqn(env, agent, scores, n_episodes=1500, max_t=1000, eps_start=1.0, eps_end=0.01,
        eps_decay=0.997, lr_decay_episode=0.997, min_lr=1e-5, train_mode=False):
    """Deep Q-Learning.

    Runs up to ``n_episodes`` episodes, appending each episode's total reward
    to ``scores`` in place. Every 100 episodes the main Q-network weights are
    written to ``checkpoints/checkpoint_<episode>.pth``. Training stops early,
    writing ``checkpoints/final_checkpoint.pth``, once the mean of the last
    100 episode scores reaches 13.0.

    Params
    ======
        env: Unity environment; its first brain (``env.brain_names[0]``) is used
        agent: agent exposing ``act``, ``step``, ``optimizer_main``, ``lr0``,
            ``num_frames``, ``action_size`` and ``QNetwork_main``
        scores (list): output list, extended with one score per episode
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
        lr_decay_episode (float): Learning rate decay multiplier (per episode)
        min_lr (float): Minimum learning rate (capped at the bottom at this value)
        train_mode (bool): forwarded to ``env.reset``. The default keeps the
            previous hard-coded value. NOTE(review): True presumably runs the
            simulator at accelerated training speed - confirm against the
            unityagents documentation.
    """
    solved_score = 13.0                # 100-episode average that counts as solved
    scores_window = deque(maxlen=100)  # last 100 scores
    eps = eps_start                    # initialize epsilon
    brain_name = env.brain_names[0]
    hundred_episodes_start_time = time.time()
    for i_episode in range(1, n_episodes+1):
        # adjust learning rate: exponential decay per episode, floored at min_lr
        lr = max(agent.lr0 * lr_decay_episode**i_episode, min_lr)
        for param_group in agent.optimizer_main.param_groups:
            param_group['lr'] = lr
        episode_start_time = time.time()
        env_info = env.reset(train_mode=train_mode)[brain_name]  # reset the environment
        state_now = env_info.vector_observations[0]              # get the current state
        prev_states = deque(maxlen=agent.num_frames)
        prev_actions = deque(maxlen=agent.num_frames)
        score = 0
        for t in range(max_t):
            if len(prev_states) == agent.num_frames:
                state = get_state(state_now, prev_states, prev_actions)
                action = agent.act(state, eps)
                have_enough_frames = True
            else:
                state = None
                # take random actions until have enough frames
                action = np.random.randint(agent.action_size)
                have_enough_frames = False
            env_info = env.step(action)[brain_name]            # send the action to the environment
            next_state_now = env_info.vector_observations[0]   # get the next state
            reward = env_info.rewards[0]                       # get the reward
            done = env_info.local_done[0]                      # see if episode has finished
            score += reward                                    # update the score
            prev_states.append(state_now)
            prev_actions.append(action)
            if have_enough_frames:
                # prev_states / prev_actions now include the current state and
                # action, so this is the stacked input for the next time step
                next_state = get_state(next_state_now, prev_states, prev_actions)
                agent.step(state, action, reward, next_state, done)
            state_now = next_state_now                         # roll over the state to next time step
            if done:                                           # exit loop if episode finished
                break
        scores_window.append(score)       # save most recent score
        scores.append(score)              # save most recent score
        eps = max(eps_end, eps_decay*eps) # decrease epsilon
        avg_score = np.mean(scores_window)
        episode_end_time = time.time()
        print('\rEpisode {}\tAverage Score: {:.2f}, episode took {:.2f} seconds'.format(i_episode, avg_score,
              episode_end_time - episode_start_time), end="")
        if i_episode % 100 == 0:
            hundred_episodes_end_time = time.time()
            print('\rEpisode {}\tAverage Score: {:.2f}, 100 episodes took {:.2f} seconds'.format(i_episode,\
                avg_score, hundred_episodes_end_time - hundred_episodes_start_time))
            _save_q_network(agent, 'checkpoints/checkpoint_{}.pth'.format(i_episode))
            hundred_episodes_start_time = time.time()
        # Only a full 100-episode window counts; otherwise one lucky early
        # episode would end training and report a negative episode count.
        window_is_full = len(scores_window) == scores_window.maxlen
        if window_is_full and avg_score >= solved_score:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode-100,\
                avg_score))
            _save_q_network(agent, 'checkpoints/final_checkpoint.pth')
            break


def _save_q_network(agent, path):
    """Save the agent's main Q-network weights to ``path``, creating its folder if missing."""
    import os  # local import: keeps this block independent of the notebook's import cell

    directory = os.path.dirname(path)
    if directory:
        os.makedirs(directory, exist_ok=True)
    torch.save(agent.QNetwork_main.state_dict(), path)

# Launch the Banana environment and grab its first (default) brain.
env = UnityEnvironment(file_name="Banana.app")
env_info = env.reset(train_mode=False)[env.brain_names[0]]
brain = env.brains[env.brain_names[0]]

# Size the agent from the environment: one input per observation component,
# one output per available action. With num_frames=0 the training loop keeps
# no history of earlier frames or actions.
agent = Agent(
    state_size=len(env_info.vector_observations[0]),
    action_size=brain.vector_action_space_size,
    hidden_sizes_list=[64, 32, 16],
    num_frames=0,
    lr0=5e-4,
    gamma=0.99,
    num_iters_learn=3,
    seed=0,
)

# dqn() appends one entry per episode to this list in place, so the scores
# collected so far survive an interrupted run.
scores = list()
dqn(env, agent, scores)


UnityTimeOutException: The Unity environment took too long to respond. Make sure that :
	 The environment does not need user interaction to launch
	 The Academy and the External Brain(s) are attached to objects in the Scene
	 The environment and the Python interface have compatible versions.

In [None]:
# Plot the raw score of every training episode.
fig, ax = plt.subplots()
ax.plot(np.arange(len(scores)), scores)
ax.set_ylabel('Score')
ax.set_xlabel('Episode #')
plt.show()

In [None]:
from scipy import signal

# Smoothed learning curve (Savitzky-Golay filter, cubic polynomial).
# savgol_filter needs a window no longer than the data and longer than the
# polynomial order, so the window is clamped for short runs and smoothing is
# skipped when too few episodes were recorded.
SMOOTH_WINDOW = 53
SMOOTH_POLYORDER = 3

smooth_window = min(SMOOTH_WINDOW, len(scores))
if smooth_window % 2 == 0:
    smooth_window -= 1  # keep the window length odd
if smooth_window > SMOOTH_POLYORDER:
    smoothed_scores = signal.savgol_filter(scores, smooth_window, SMOOTH_POLYORDER)
else:
    smoothed_scores = np.asarray(scores)  # too few points to smooth; plot them as-is

plt.plot(np.arange(len(scores)), smoothed_scores)
plt.ylabel('Score')
plt.xlabel('Episode #')
plt.show()

# Visualize the trained agent

In [None]:
# Pick the checkpoint whose preceding 100 episodes had the highest mean score.
# dqn() writes checkpoints/checkpoint_<N>.pth only when N is a multiple of 100,
# so only complete 100-episode windows are considered; a trailing partial
# window has no checkpoint file to load.
best_model_index = None
best_score = -np.inf  # start below any possible mean so non-positive averages can still win
for i in range(100, len(scores) + 1, 100):
    score = np.mean(scores[i-100:i])
    if score > best_score:
        best_score = score
        best_model_index = i
print(best_model_index, best_score)
if best_model_index is None:
    raise ValueError('No periodic checkpoint available: fewer than 100 episodes were recorded.')
checkpoint = torch.load('checkpoints/checkpoint_{}.pth'.format(best_model_index))
agent.QNetwork_main.load_state_dict(checkpoint)

In [None]:
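# Optional: uncomment to load a specific checkpoint by hand instead of the
# automatically selected one.
# best_model_index = 1300
# checkpoint = torch.load('checkpoints/checkpoint_{}.pth'.format(best_model_index))
# agent.QNetwork_main.load_state_dict(checkpoint)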

In [None]:
# Roll out one greedy (eps=0) episode with the loaded weights, at normal speed.
env = UnityEnvironment(file_name="Banana.app")
brain_name = env.brain_names[0]
try:
    env_info = env.reset(train_mode=False)[brain_name]     # reset the environment
    state_now = env_info.vector_observations[0]            # get the current state
    prev_states = deque(maxlen=agent.num_frames)
    prev_actions = deque(maxlen=agent.num_frames)
    score = 0
    while True:
        if len(prev_states) == agent.num_frames:
            state = get_state(state_now, prev_states, prev_actions)
            action = agent.act(state, eps=0.0)
        else:
            # take random actions until have enough frames
            action = np.random.randint(agent.action_size)
        env_info = env.step(action)[brain_name]            # send the action to the environment
        reward = env_info.rewards[0]                       # get the reward
        done = env_info.local_done[0]                      # see if episode has finished
        score += reward                                    # update the score
        prev_states.append(state_now)
        prev_actions.append(action)
        # agent.step() is deliberately NOT called here: it is the hook the
        # training loop uses to hand transitions to the agent, so calling it
        # would presumably keep changing the weights that were just loaded.
        state_now = env_info.vector_observations[0]        # roll over the state to next time step
        if done:                                           # exit loop if episode finished
            break
    print('Score: {}'.format(score))
finally:
    # Always shut the simulator down so re-running this cell can launch a new one.
    env.close()