# Unity Banana Navigation

## Import packages

In [None]:
import random
import torch
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
from unityagents import UnityEnvironment
import numpy as np
from dqn_agent import Agent
from datetime import datetime as dt

## Create Unity environment

The Unity environment will open in a new window, where you can watch the agent run.

In [None]:
# Start the "Banana" Unity build and pick the first brain it exposes.
env = UnityEnvironment(file_name="Banana")
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# Reset once (in training mode) to read the problem dimensions.
env_info = env.reset(train_mode=True)[brain_name]
state = env_info.vector_observations[0]   # observation of the first agent
state_size = len(state)
action_size = brain.vector_action_space_size

## DQN Function

This function runs or trains the agent. When training, once the agent reaches an average score of at least 13.0 over 100 consecutive episodes, the learned weights are saved to a timestamped checkpoint file whose name starts with `navigate_`. You can load these weights when initializing a new agent in order to run the trained agent.

In [None]:
def navigate(agent, n_episodes=1000, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.999, train=True):
    """Deep Q-Learning.

    Plays ``n_episodes`` episodes in the module-level Unity ``env`` (using the
    module-level ``brain_name``). When ``train`` is true the agent learns from
    every transition, and its local Q-network weights are written to a
    ``navigate_<score>_<timestamp>.pth`` file each time the average score over
    a full 100-episode window is at least 13.0 and matches or beats the best
    average seen so far in this call.

    Args
        agent: object providing ``act(state, eps)``,
            ``step(state, action, reward, next_state, done)`` and a
            ``qnetwork_local`` attribute with a ``state_dict()`` method
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
        train (bool): flag deciding if the agent will train or just play through the episode

    Returns
        list: the score of every episode played, in order
    """
    scores = []                        # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    max_avg_score = 0                  # stores max recorded average score over a 100 episode window
    eps = eps_start                    # initialize epsilon
    for i_episode in range(1, n_episodes+1):
        env_info = env.reset(train_mode=train)[brain_name]
        state = env_info.vector_observations[0]
        score = 0
        for _ in range(max_t):
            # act greedily (epsilon = 0) when only playing
            action = agent.act(state, eps if train else 0.0)
            env_info = env.step(action)[brain_name]
            next_state = env_info.vector_observations[0]   # get the next state
            reward = env_info.rewards[0]                   # get the reward
            done = env_info.local_done[0]                  # see if episode has finished
            if train:
                agent.step(state, action, reward, next_state, done)
            score += reward                                # update the score
            state = next_state                             # roll over the state to next time step
            if done:                                       # exit loop if episode finished
                break
        scores_window.append(score)       # save most recent score
        scores.append(score)              # save most recent score
        eps = max(eps_end, eps_decay*eps) # decrease epsilon

        average_score = np.mean(scores_window)
        progress = '\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, average_score)
        print(progress, end="")
        if i_episode % 100 == 0:
            print(progress)               # keep one line per 100 episodes

        # Only a complete 100-episode window counts towards "solved"; a lucky
        # first episode must not lock in an unbeatable max_avg_score.
        window_full = len(scores_window) == scores_window.maxlen
        if train and window_full and average_score >= 13.0 and average_score >= max_avg_score:
            if max_avg_score == 0:
                print('\nEnvironment solved in {:d} episodes!\n\tAverage Score: {:.2f}'.format(i_episode-100, average_score))
            print('\nSaving current network state.')
            # Strip the decimal point from the score only, so the '.pth'
            # extension survives (e.g. 13.05 -> 'navigate_1305_<timestamp>.pth').
            score_tag = '{:.2f}'.format(average_score).replace('.', '')
            timestamp = dt.now().strftime('%y%b%d%H%M%S')
            torch.save(agent.qnetwork_local.state_dict(), 'navigate_{}_{}.pth'.format(score_tag, timestamp))
            max_avg_score = average_score
    return scores

## Train the agent

In [None]:
agent = Agent(state_size=state_size, action_size=action_size, seed=0)
n_episodes = 1000
# Sweep epsilon decay factors 0.999, 0.998, ..., 0.991.
# NOTE(review): the same agent object is passed to every navigate() call, so
# each run presumably continues from what the previous one learned - confirm
# this is intended rather than a fresh agent per eps_decay.
for eps_decay in [float(j)/1000 for j in range(999,990,-1)]:
    scores = navigate(agent=agent, n_episodes=n_episodes, eps_decay=eps_decay)

    # plot the per-episode scores of this run and keep a copy on disk
    # (assumes the images/ directory already exists)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    episode_index = np.arange(len(scores))
    ax.plot(episode_index, scores)
    ax.set_ylabel('Score')
    ax.set_xlabel('Episode #')
    fig.savefig('images/average_scores_plot_{}_{}'.format(n_episodes,eps_decay))
    plt.show()

## Run trained agent

In [None]:
# NOTE(review): navigate() saves checkpoints under names starting with
# 'navigate_'; make sure 'checkpoint_dqn.pth' exists (e.g. copy or rename one
# of those files) before running this cell.
agent = Agent(
    state_size=state_size,
    action_size=action_size,
    seed=0,
    filename='checkpoint_dqn.pth',
)
# Play 100 episodes without learning.
scores = navigate(agent=agent, n_episodes=100, train=False)

## Close the environment

In [None]:
# Close the Unity environment once all runs are finished.
env.close()