# Project 1: Navigation

### Imports

In [1]:
from unityagents import UnityEnvironment
import numpy as np

import random
from collections import namedtuple, deque
import matplotlib.pyplot as plt

import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F

### Setup
Next, we instantiate the Unity Environment for Project 1 (vector observation version of 'Banana'), retrieve the Unity brain name and extract this environment's state size (```s_sz```) and action size (```a_sz```) which we will use later.

In [2]:
# start Unity environment
env = UnityEnvironment(file_name="C:/Users/Andre/Desktop/Udacity DRL/deep-reinforcement-learning/p1_navigation/Banana_Windows_x86_64/Banana.exe")

# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# get state size and action size
#env_info = env.reset(train_mode=False)[brain_name]
s_sz = brain.vector_observation_space_size# len(env_info.vector_observations[0])
a_sz = brain.vector_action_space_size

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: BananaBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 37
        Number of stacked Vector Observation: 1
        Vector Action space type: discrete
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


### QNetwork, Agent and ReplayBuffer classes, their attributes and methods
We are now ready to build an Agent for playing Banana, along with other classes, methods and attributes that will be necessary for implementing a Deep Q Network. Our DQN will use *Fixed Q Targets* and *Experience Replay*.   

We start by defining the **QNetwork** class, which will create the fully-connected neural network layers our agent will rely on for learning to approximate Q(s,a) values. **QNetwork** takes arguments for state size, action size and hidden layer width. State size determines the network's input size, and action size determines the size of the network's output.  

For this assignment, given the relatively small input space (37-dimensional observation space vector), we will use a 2-hidden layer neural network, with each hidden layer containing 64 units.

In [28]:
class QNetwork(nn.Module):
    
    def __init__(self, state_size, action_size, fc1_units=64, fc2_units=64):
        '''Creates a 2-hidden layer neural network with input and output size defined by
        state_size and output_size respectively. Hidden layer width is configured by fc1_units and fc2_units.
        
        Arguments
        =========
            state_size (int): size of state space
            action_size (int): size of action space
            '''
        super(QNetwork, self).__init__()
        #self.seed = torch.manual_seed(seed) # Sets the seed for generating random numbers
        self.bn1 = nn.BatchNorm1d(num_features=state_size)
        self.fc1 = nn.Linear(state_size, fc1_units)
        self.fc2 = nn.Linear(fc1_units, fc2_units)
        self.fc3 = nn.Linear(fc2_units, action_size)
        
    def forward(self, state):
        '''Performs forward propagation on the QNetwork given a state input, to return action values.'''
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

Then we define the **Agent** class.  
Agent's key attributes are:
- **2x instances of QNetwork**: we instantiate two neural networks in order to implement Fixed Q Targets. ```self.qnetwork_local``` is the neural network that learns and stores the weight parameters for approximating the true state-value function. ```self.qnetwork_target``` presents fixed parameter targets for the local network to train up to in each cycle of learning;
- A **Memory** store: used to store the agent's experiences and enable offline learning by Experience Replay. This is instantiated by a separate class ```ReplayBuffer``` which we will discuss in more detail below.
- A **time step counter**: keeps track of how many time steps have passed in the current experience cycle, enabling the agent to know when to do an update cycle on its local and target QNetworks.  

Agent's methods are:
- **Step**: saves the current experience to the replay memory, checks if enough time steps have passed for updating QNetworks and if so, calls the functions necessary to learn by Experience Replay (```memory.sample``` and ```self.learn```).
- **Act**: given a state, propagates it through the local QNetwork to obtain state-action values for every possible action, then selects an action based on an epsilon-greedy policy.
- **Learn**: 

In [29]:
class Agent():
    
    def __init__(self, state_size, action_size):#, seed):
        self.state_size = state_size
        self.action_size = action_size
        #self.seed = random.seed(seed)
        
        # 2x Q-networks to enable Fixed Q Targets
        self.qnetwork_local = QNetwork(state_size, action_size).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size).to(device)
        
        # Optimizer - notice only used on local qnetwork
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr)
        
        # Replay memory to enable Experience Replay
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size)
        
        # Timestep counter
        self.t_step = 0
        
    def step(self, state, action, reward, next_state, done):
        # Save the current experience in replay memory
        self.memory.add(state, action, reward, next_state, done)
        
        # Learn after every 'update_every' steps
        self.t_step = (self.t_step + 1) % update_every
        
        # if 'update_every' time steps have been reached (modulo division above),
        if self.t_step == 0:
                # and if there are enough samples in memory:
            if len(self.memory) > batch_size:
                experiences = self.memory.sample()
                self.learn(experiences, gamma)
                
    def act(self, state, eps=0.):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval() # pytorch method for setting a network to evaluation (stop dropout and batchnorm)
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()
        
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))
    
    def learn(self, experiences, gamma):
        states, actions, rewards, next_states, dones = experiences
        
        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        
        Q_expected = self.qnetwork_local(states).gather(1, actions)
        
        loss = F.mse_loss(Q_expected, Q_targets)
        
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        
        self.soft_update(self.qnetwork_local, self.qnetwork_target, tau)
    
    def soft_update(self, local_model, target_model, tau):
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)

In [30]:
class ReplayBuffer:
    
    def __init__(self, action_size, buffer_size, batch_size):#, seed):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple("Experience", field_names=['state', 'action', 'reward', 'next_state', 'done'])
        #self.seed = random.seed(seed)
    
    def add(self, state, action, reward, next_state, done):
        e = self.experience(state, action, reward, next_state, done)
        self.memory.append(e)
    
    def sample(self):
        experiences = random.sample(self.memory, k = self.batch_size)
        
        states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).long().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(device)
  
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        return len(self.memory)

In [31]:
def dqn(n_episodes = 2000, max_t=300, eps_start=1.0, eps_decay=0.995, eps_end=0.005,
        brain_name=brain_name, output='checkpoint'):
    scores = []
    scores_window = deque(maxlen=100)
    eps = eps_start
    
    for i_episode in range(1, n_episodes+1):
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations[0] 
        score = 0
        
        for t in range(max_t):
        
            action = int(agent.act(state, eps=eps))
            env_info = env.step(action)[brain_name] 
            next_state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]
            agent.step(state, action, reward, next_state, done)

            state = next_state
            score += reward
            if done:
                break
    
        scores_window.append(score)
        scores.append(score)
        eps = max(eps_end, eps_end*eps)

        print('\rEpisode {}\tAverage Score {:.2f}'.format(i_episode, np.mean(scores_window)), end='')
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)))

    torch.save(agent.qnetwork_local.state_dict(), '{}.pth'.format(output))

    return scores

In [43]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

buffer_size = int(1e6)
batch_size = 32
gamma = 0.99
tau = 1e-3
lr = 5e-4
update_every = 8

In [44]:
agent = Agent(state_size=s_sz, action_size=a_sz)
scores = dqn(n_episodes = 500)

Episode 100	Average Score: 0.63
Episode 200	Average Score: 2.72
Episode 300	Average Score: 7.35
Episode 400	Average Score: 8.69
Episode 500	Average Score: 11.30


In [None]:
fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(np.arange(len(scores)), scores)
plt.ylabel('Score')
plt.xlabel('Episode #')
plt.show()