# Continuous Control

---

In this notebook, you will learn how to use the Unity ML-Agents environment for the second project of the [Deep Reinforcement Learning Nanodegree](https://www.udacity.com/course/deep-reinforcement-learning-nanodegree--nd893) program.

### 1. Start the Environment

We begin by importing the necessary packages.  If the code cell below returns an error, please revisit the project instructions to double-check that you have installed [Unity ML-Agents](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Installation.md) and [NumPy](http://www.numpy.org/).

In [None]:
# Roll out a single episode with random actions and report the mean score.
# NOTE(review): this cell relies on `env`, `brain_name`, `num_agents` and
# `action_size` already existing in the kernel; `num_agents` is not assigned
# in any cell visible here -- confirm it is defined before running.
env_info = env.reset(train_mode=False)[brain_name]     # reset the environment (train_mode=False)
states = env_info.vector_observations                  # get the current state (for each agent)
scores = np.zeros(num_agents)                          # initialize the score (for each agent)
while True:
    actions = np.random.randn(num_agents, action_size) # sample a standard-normal action (for each agent)
    actions = np.clip(actions, -1, 1)                  # clip all actions to the range [-1, 1]
    env_info = env.step(actions)[brain_name]           # send all actions to the environment
    next_states = env_info.vector_observations         # get next state (for each agent)
    rewards = env_info.rewards                         # get reward (for each agent); not read again below
    dones = env_info.local_done                        # see if episode finished
    scores += env_info.rewards                         # update the score (for each agent)
    states = next_states                               # roll over states to next time step
    if np.any(dones):                                  # exit loop as soon as any agent reports done
        break
print('Total score (averaged over agents) this episode: {}'.format(np.mean(scores)))

### 4. It's Your Turn!

Now it's your turn to train your own agent to solve the environment!  When training your agent, set `train_mode=True`, so that the line for resetting the environment looks like the following:
```python
env_info = env.reset(train_mode=True)[brain_name]
```

In [1]:
import torch
import random
import numpy as np
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import torch.nn.functional as F
from unityagents import UnityEnvironment
from collections import namedtuple, deque

# Launch the Unity Reacher build; the path is relative to the working
# directory and points at a Windows executable.
env = UnityEnvironment(file_name='Reacher_Windows_x86_64/Reacher.exe')

# Use the first brain exposed by the environment.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
# Reset once in training mode so the observation array can be inspected.
env_info = env.reset(train_mode=True)[brain_name]
# Sizes used to build the actor/critic networks.
action_size = brain.vector_action_space_size
# Second dimension of the observation array -- presumably the per-agent
# observation length (rows being agents); confirm against the env log.
state_size = env_info.vector_observations.shape[1]

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_speed -> 1.0
		goal_size -> 5.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


### Actor and Critic Architectures

In [44]:
def weight_init(layer):
    """Return the symmetric uniform bounds used to initialise a layer's weights.

    The bound is ``3 / sqrt(fan_in)``, where ``fan_in`` is the number of
    input features feeding ``layer``.

    Args:
        layer: A module exposing a 2-D ``weight`` tensor laid out as
            ``(out_features, in_features)``, as ``nn.Linear`` does.

    Returns:
        tuple: ``(-lim, lim)``, ready to be unpacked into ``Tensor.uniform_``.
    """
    # nn.Linear stores its weight as (out_features, in_features): the fan-in
    # is dimension 1, while dimension 0 is the number of output units.
    fan_in = layer.weight.data.size()[1]
    lim = 3. / np.sqrt(fan_in)
    return (-lim, lim)

class Actor(nn.Module):
    """Policy network mapping a state to an action with components in [-1, 1].

    The network is three ReLU hidden layers followed by a tanh-squashed
    linear output layer.

    Args:
        state_size: Number of features in a state vector.
        action_size: Number of action components produced.
        hu: Widths of the three hidden layers, in order.
    """

    def __init__(self, state_size, action_size, hu=(128, 256, 128)):
        super().__init__()

        self.input_layer = nn.Linear(state_size, hu[0])
        self.hl1 = nn.Linear(hu[0], hu[1])
        self.hl2 = nn.Linear(hu[1], hu[2])
        self.output_layer = nn.Linear(hu[2], action_size)
        self.reset_parameters()

    def reset_parameters(self):
        """Re-draw all layer weights from uniform distributions (biases untouched)."""
        self.input_layer.weight.data.uniform_(*weight_init(self.input_layer))
        self.hl1.weight.data.uniform_(*weight_init(self.hl1))
        self.hl2.weight.data.uniform_(*weight_init(self.hl2))
        # Small output weights keep the initial pre-tanh activations near zero.
        self.output_layer.weight.data.uniform_(-3e-3, 3e-3)

    def forward(self, state):
        """Return the action for ``state``; the last dimension must be ``state_size``."""
        x = F.relu(self.input_layer(state))
        x = F.relu(self.hl1(x))
        x = F.relu(self.hl2(x))
        # torch.tanh is used instead of F.tanh, which emits a deprecation
        # warning on many PyTorch releases.
        return torch.tanh(self.output_layer(x))
    
class Critic(nn.Module):
    """Value network mapping a (state, action) pair to a single scalar.

    The state first passes through ``input_layer``; the action is then
    concatenated to those features before the remaining hidden layers.

    Args:
        state_size: Number of features in a state vector.
        action_size: Number of components in an action vector.
        hu: Widths of the three hidden layers, in order.
    """

    def __init__(self, state_size, action_size, hu=(128, 256, 128)):
        super().__init__()

        self.input_layer = nn.Linear(state_size, hu[0])
        # The first hidden layer also receives the action components.
        self.hl1 = nn.Linear(hu[0]+action_size, hu[1])
        self.hl2 = nn.Linear(hu[1], hu[2])
        self.output_layer = nn.Linear(hu[2], 1)
        self.reset_parameters()

    def reset_parameters(self):
        """Re-draw all layer weights from uniform distributions (biases untouched)."""
        self.input_layer.weight.data.uniform_(*weight_init(self.input_layer))
        self.hl1.weight.data.uniform_(*weight_init(self.hl1))
        self.hl2.weight.data.uniform_(*weight_init(self.hl2))
        self.output_layer.weight.data.uniform_(-3e-3, 3e-3)

    def forward(self, state, action):
        """Return the value estimate for ``state`` and ``action``.

        ``state`` and ``action`` must share all leading (batch) dimensions;
        unbatched 1-D inputs are accepted as well. The result has a trailing
        dimension of size 1.
        """
        x = F.relu(self.input_layer(state))
        # Join along the feature axis. The default dim=0 would stack along
        # the batch axis and fail for batched inputs, because the feature
        # widths differ; dim=-1 is identical to dim=0 for 1-D inputs.
        x = torch.cat((x, action), dim=-1)
        x = F.relu(self.hl1(x))
        x = F.relu(self.hl2(x))
        return self.output_layer(x)

### Default hyperparameters

In [50]:
# Tunable constants read by the Agent class.
BUFFER_SIZE = int(1e6)  # replay buffer size (passed to ReplayBuffer)
BATCH_SIZE = 128        # minibatch size; learning starts once the buffer holds more than this
GAMMA = 0.99            # discount factor (passed to Agent.learn)
TAU = 1e-3              # for soft update of target parameters
LR_ACTOR = 1e-4         # Adam learning rate of the online actor
LR_CRITIC = 3e-4        # Adam learning rate of the online critic
WEIGHT_DECAY = 0.0001   # L2 weight decay (applied to the critic optimizer only)

# Run on the first CUDA GPU when one is available, otherwise on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
class Agent():
    def __init__(self, state_size, action_size):
        """Build the networks, optimizers, exploration noise and replay memory.

        Args:
            state_size: Number of features in a state vector.
            action_size: Number of components in an action vector.
        """
        self.state_size = state_size
        self.action_size = action_size

        ### Actor online and target networks
        self.actor_online = Actor(state_size, action_size).to(device)
        self.actor_target = Actor(state_size, action_size).to(device)
        # DDPG initialises each target network as an exact copy of its online
        # counterpart; without this the two start from unrelated random
        # weights and the first bootstrapped targets are meaningless.
        self.actor_target.load_state_dict(self.actor_online.state_dict())
        self.actor_optimizer = optim.Adam(self.actor_online.parameters(), lr=LR_ACTOR)

        ### Critic online and target networks
        self.critic_online = Critic(state_size, action_size).to(device)
        self.critic_target = Critic(state_size, action_size).to(device)
        self.critic_target.load_state_dict(self.critic_online.state_dict())
        # Weight decay is applied to the critic optimizer only.
        self.critic_optimizer = optim.Adam(self.critic_online.parameters(), lr=LR_CRITIC, weight_decay = WEIGHT_DECAY)

        ### Noise generator for exploration
        # NOTE(review): OUnoise is defined outside this section -- confirm
        # its constructor signature.
        self.noise = OUnoise(action_size)

        ### Replay buffer
        # NOTE(review): ReplayBuffer is defined outside this section --
        # confirm the argument order (action_size, buffer size, batch size).
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE)
        
    def step(self, state, action, reward, next_state, done):
        """Record one transition and, once enough are stored, run a learning update.

        The transition is always added to the replay memory. A minibatch is
        sampled and passed to ``learn`` only when the memory holds strictly
        more than ``BATCH_SIZE`` entries.
        """
        self.memory.add(state, action, reward, next_state, done)

        # Not enough experience collected yet: skip the optimisation step.
        if len(self.memory) <= BATCH_SIZE:
            return

        self.learn(self.memory.sample(), GAMMA)
            
    def act(self, state, add_noise=True):
        """Return the online actor's action for ``state``, clipped to [-1, 1].

        Args:
            state: NumPy array holding the observation(s) to act on.
            add_noise: When true, a sample from ``self.noise`` is added to
                the action before clipping.

        Returns:
            NumPy array of actions with every component in [-1, 1].
        """
        state_tensor = torch.from_numpy(state).float().to(device)

        # Query the policy in eval mode without tracking gradients, then put
        # the network back into training mode.
        self.actor_online.eval()
        with torch.no_grad():
            policy_out = self.actor_online(state_tensor)
        self.actor_online.train()

        action = policy_out.cpu().data.numpy()
        if add_noise:
            # In-place add keeps the array dtype produced by the network.
            action += self.noise.sample()
        return np.clip(action, -1, 1)