# Banana collector

---

In this notebook, we will train a DQN agent to pick up yellow bananas while avoiding blue ones in a Unity ML-Agents environment.

### 1. Start the Environment

We begin by importing some necessary packages.  If the code cell below returns an error, please revisit the project instructions to double-check that you have installed [Unity ML-Agents](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Installation.md) and [NumPy](http://www.numpy.org/) or any other necessary packages.

In [None]:
from unityagents import UnityEnvironment

import matplotlib.pyplot as plt
import numpy as np
import sys
sys.path.append('../')
import torch

In [None]:
import os.path

# Build the path to the Unity player binary underneath the current user's
# home directory.
home_dir = os.path.expanduser("~")
project_dir = os.path.join(
    os.path.sep,
    home_dir,
    "Documents",
    "MyPrograms",
    "rl",
    "BananaCollector",
)
file_name = "Banana_Linux/Banana.x86"
full_path = os.path.join(os.path.sep, project_dir, file_name)

# Show the resolved location before launching so a wrong path is easy to spot.
print("Launching Unity from: %s" % (full_path,))

# Create the environment handle that every later cell talks to.
env = UnityEnvironment(file_name=full_path)


In [None]:
# Use the first brain the environment exposes as the default one.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# Start a fresh episode in training mode and keep the info for that brain.
env_info = env.reset(train_mode=True)[brain_name]

# Sizes the agent needs: how many actions exist and how long one observation is.
action_size = brain.vector_action_space_size
state = env_info.vector_observations[0]
state_size = len(state)

# Report what was found.
print('Number of agents:', len(env_info.agents))
print('Number of actions:', action_size)
print('States look like:', state)
print('States have length:', state_size)

### 2. Train the DQN agent!

Now it's your turn to train your own agent to solve the environment!  When training the agent, set `train_mode=True`, so that the line for resetting the environment looks like the following:
```python
env_info = env.reset(train_mode=True)[brain_name]
```

In [None]:
from agents.dqn_agent import Agent
from collections import deque
import config 

# Hyperparameters exposed by the project's `config` module.
param = config.HYPERPARAMS

# Agent sized to this environment's observation and action spaces, with a
# fixed seed.
agent = Agent(
    state_size=state_size,
    action_size=action_size,
    seed=0,
)


In [None]:
def dqn(n_episodes=2000, max_t=2000, eps_start=1.0, eps_end=0.01, eps_decay=0.995,
        target_score=13.0, checkpoint_path='models/checkpoint.pth'):
    """Deep Q-Learning.

    Trains the notebook-level ``agent`` on the notebook-level ``env`` (using
    ``brain_name`` to select the brain) with an epsilon-greedy policy.
    Training stops early once the mean score over a full window of the last
    100 episodes reaches ``target_score``; at that point the weights of
    ``agent.qnetwork_local`` are written to ``checkpoint_path``.

    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
        target_score (float): mean score over the last 100 episodes at which
            the environment counts as solved
        checkpoint_path (str): file the local Q-network weights are saved to
            once the environment is solved

    Returns
    =======
        list: total reward of every episode that was run, in order
    """
    # Local import so the function does not rely on another cell having
    # imported `os`.
    import os

    scores = []                        # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    eps = eps_start                    # initialize epsilon
    for i_episode in range(1, n_episodes+1):
        # Reset environment and observe the initial state
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations[0]
        score = 0

        # Step until the environment reports the episode as done, but never
        # more than max_t times, so an environment that does not terminate
        # cannot stall training.
        for _ in range(max_t):
            # Select action with e-greedy policy and apply it
            action = agent.act(state, eps)
            env_info = env.step(action)[brain_name]

            # Observe the outcome of the action
            next_state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]

            # Hand the transition to the agent
            agent.step(state, action, reward, next_state, done)

            # State transition and score bookkeeping
            state = next_state
            score += reward

            if done:
                break

        # Save most recent score
        scores_window.append(score)
        scores.append(score)

        # Decay epsilon
        eps = max(eps_end, eps_decay*eps)

        # Print learning progress (the mean is computed once per episode)
        avg_score = np.mean(scores_window)
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, avg_score), end="")
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, avg_score))

        # Only a full window counts: otherwise a single lucky early episode
        # would end training and report a negative episode count below.
        window_full = len(scores_window) == scores_window.maxlen
        if window_full and avg_score >= target_score:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(
                i_episode - scores_window.maxlen, avg_score))
            # Make sure the target directory exists so the result of a long
            # training run is not lost to a missing folder.
            checkpoint_dir = os.path.dirname(checkpoint_path)
            if checkpoint_dir:
                os.makedirs(checkpoint_dir, exist_ok=True)
            torch.save(agent.qnetwork_local.state_dict(), checkpoint_path)
            break

    return scores

# Train with the default settings; `scores` receives the list returned by
# dqn(), i.e. one total reward per episode that was run.
scores = dqn()

In [None]:
# plot the scores
fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(np.arange(len(scores)), scores)
plt.ylabel('Score')
plt.xlabel('Episode #')
plt.show()

In [None]:
# Close the Unity environment now that training and plotting are finished.
env.close()