# Collaboration and Competition

---

In this notebook, you will learn how to use the Unity ML-Agents environment for the third project of the [Deep Reinforcement Learning Nanodegree](https://www.udacity.com/course/deep-reinforcement-learning-nanodegree--nd893) program.

### 1. Start the Environment

Run the next code cell to install a few packages.

In [None]:
!pip -q install ./python

We begin by importing the necessary packages.  If the code cell below returns an error, please revisit the project instructions to double-check that you have installed [Unity ML-Agents](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Installation.md) and [NumPy](http://www.numpy.org/).

In [None]:
from unityagents import UnityEnvironment
import random
from collections import namedtuple, deque
import numpy as np
from pathlib import Path
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
import copy
import os
%matplotlib inline

### 2. Create an Environment Wrapper Class

Environments contain **_brains_** which are responsible for deciding the actions of their associated agents. Here we check for the first brain available, and set it as the default brain we will be controlling from Python.

The following classes will create an environment wrapper whose methods will factor in the concept of "brains" that exists in the Unity environment.

In [None]:
class RLEnv:
    """Bare-bones environment interface.

    Holds the action/state sizes and provides do-nothing ``reset`` and
    ``step`` methods that a concrete environment wrapper can override.
    """

    def __init__(self):
        # Both sizes start out unset; this class never assigns real values.
        self.n_states: int = None
        self.n_actions: int = None

    def reset(self):
        """Placeholder: performs no work and returns ``None``."""
        return None

    def step(self, action) -> None:
        """Placeholder: ignores ``action`` and returns ``None``."""
        return None
    

class TennisEnvironment(RLEnv):
    """Wrapper around a ``UnityEnvironment`` that is driven through its first brain.

    On construction the Unity environment is launched, the first brain name
    is selected, and ``n_actions`` / ``n_states`` are filled in from the
    brain's action space size and the length of the first observation vector.
    """

    def __init__(self, fp:str):
        """Launch the Unity environment and record its dimensions.

        Params
        ======
            fp (str): path passed to ``UnityEnvironment`` as ``file_name``
        """
        super().__init__()
        self.unity = UnityEnvironment(file_name=fp)

        # Get the default brain (the first one the environment reports)
        self.brain_name = self.unity.brain_names[0]
        self.brain = self.unity.brains[self.brain_name]

        # Get action space
        self.n_actions = self.brain.vector_action_space_size

        # Get state space: a reset is needed to obtain an observation to measure
        env_info = self.unity.reset(train_mode=True)[self.brain_name]
        self.n_states = len(env_info.vector_observations[0])

    def reset(self, train_mode: bool = True):
        """Reset the environment and return the initial observations.

        Params
        ======
            train_mode (bool): forwarded to ``UnityEnvironment.reset``; the
                default ``True`` matches the previous hard-coded behaviour

        Returns
        =======
            ``env_info.vector_observations`` for the selected brain
        """
        env_info = self.unity.reset(train_mode=train_mode)[self.brain_name]
        return env_info.vector_observations

    def step(self, action):
        """Send ``action`` to the environment and report the outcome.

        Returns
        =======
            tuple ``(next_state, reward, done)`` taken from the brain info's
            ``vector_observations``, ``rewards`` and ``local_done`` fields
        """
        env_info = self.unity.step(action)[self.brain_name]
        next_state = env_info.vector_observations
        reward = env_info.rewards
        done = env_info.local_done
        return next_state, reward, done

    def close(self):
        """Shut down the underlying Unity environment."""
        self.unity.close()

### 3. Create the Actor Architecture

We will create the actor model architecture. This will be a simple feed-forward neural network that outputs an action given a state.

In [None]:
def hidden_init(layer):
    """Return the symmetric uniform-init bounds ``(-1/sqrt(fan_in), 1/sqrt(fan_in))``.

    Params
    ======
        layer: module whose ``weight`` tensor is laid out as
            ``(out_features, in_features)``, as ``nn.Linear`` stores it

    Returns
    =======
        tuple ``(low, high)`` suitable for ``tensor.uniform_(low, high)``
    """
    # The number of inputs is dimension 1 of the weight matrix; dimension 0
    # is the number of outputs, so reading it would scale by fan-out instead.
    fan_in = layer.weight.data.size()[1]
    lim = 1. / np.sqrt(fan_in)
    return (-lim, lim)

In [None]:
class Actor(nn.Module):
    """Actor (Policy) Model: five linear layers mapping a state to an action in [-1, 1]."""

    def __init__(self, state_size, action_size, seed):
        """Initialize parameters and build model.
        Params
        ======
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            seed (int): Random seed

        Raises
        ======
            KeyError: if one of the ACTOR_FC*_UNITS environment variables is unset
            ValueError: if one of them is not an integer string
        """
        super().__init__()
        self.seed = torch.manual_seed(seed)

        # Hyperparameters from environment variables. Indexing os.environ
        # (rather than os.getenv) makes a missing variable fail with a
        # KeyError that names it instead of an opaque int(None) TypeError.
        fc1_units = int(os.environ["ACTOR_FC1_UNITS"])
        fc2_units = int(os.environ["ACTOR_FC2_UNITS"])
        fc3_units = int(os.environ["ACTOR_FC3_UNITS"])
        fc4_units = int(os.environ["ACTOR_FC4_UNITS"])

        # Network components
        self.fc1 = nn.Linear(state_size, fc1_units)
        self.fc2 = nn.Linear(fc1_units, fc2_units)
        self.fc3 = nn.Linear(fc2_units, fc3_units)
        self.fc4 = nn.Linear(fc3_units, fc4_units)
        self.fc5 = nn.Linear(fc4_units, action_size)
        self.reset_parameters()

    def reset_parameters(self):
        """Re-draw the layer weights (biases are left untouched)."""
        self.fc1.weight.data.uniform_(*hidden_init(self.fc1))
        self.fc2.weight.data.uniform_(*hidden_init(self.fc2))
        self.fc3.weight.data.uniform_(*hidden_init(self.fc3))
        self.fc4.weight.data.uniform_(*hidden_init(self.fc4))
        # The output layer gets a fixed, small range
        self.fc5.weight.data.uniform_(-3e-3, 3e-3)

    def forward(self, state):
        """Build an actor (policy) network that maps states -> actions."""
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = F.relu(self.fc4(x))
        # torch.tanh replaces the deprecated torch.nn.functional.tanh
        return torch.tanh(self.fc5(x))

### 4. Create the Critic Architecture

We will create the critic model architecture. This will be a simple feed-forward neural network that outputs expected rewards given a state and an action (like a Q-network).

In [None]:
class Critic(nn.Module):
    """Critic (Value) Model: scores a (state, action) pair with a single value."""

    def __init__(self, state_size, action_size, seed):
        """Initialize parameters and build model.
        Params
        ======
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            seed (int): Random seed

        Raises
        ======
            KeyError: if one of the CRITIC_FC*_UNITS environment variables is unset
            ValueError: if one of them is not an integer string
        """
        super().__init__()
        self.seed = torch.manual_seed(seed)

        # Hyperparameters from environment variables. Indexing os.environ
        # (rather than os.getenv) makes a missing variable fail with a
        # KeyError that names it instead of an opaque int(None) TypeError.
        fc1_units = int(os.environ["CRITIC_FC1_UNITS"])
        fc2_units = int(os.environ["CRITIC_FC2_UNITS"])
        fc3_units = int(os.environ["CRITIC_FC3_UNITS"])
        fc4_units = int(os.environ["CRITIC_FC4_UNITS"])

        # Network components. fc2 is wider on the input side because the
        # action is concatenated to fc1's output in forward().
        self.fc1 = nn.Linear(state_size, fc1_units)
        self.fc2 = nn.Linear(fc1_units + action_size, fc2_units)
        self.fc3 = nn.Linear(fc2_units, fc3_units)
        self.fc4 = nn.Linear(fc3_units, fc4_units)
        self.fc5 = nn.Linear(fc4_units, 1)
        self.reset_parameters()

    def reset_parameters(self):
        """Re-draw the layer weights (biases are left untouched)."""
        self.fc1.weight.data.uniform_(*hidden_init(self.fc1))
        self.fc2.weight.data.uniform_(*hidden_init(self.fc2))
        self.fc3.weight.data.uniform_(*hidden_init(self.fc3))
        self.fc4.weight.data.uniform_(*hidden_init(self.fc4))
        # The output layer gets a fixed, small range
        self.fc5.weight.data.uniform_(-3e-3, 3e-3)

    def forward(self, state, action):
        """Build a critic (value) network that maps (state, action) pairs -> Q-values."""
        xs = F.relu(self.fc1(state))
        # Join the state features and the action along the feature axis
        x = torch.cat((xs, action), dim=1)
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = F.relu(self.fc4(x))
        # No activation on the output: the value is unbounded
        return self.fc5(x)

Before we train the neural networks, we select the device to run on: the GPU if one is available, otherwise the CPU.

In [None]:
# Use the first CUDA device when PyTorch reports one, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

### 5. Create the Replay Buffer

We will next create the replay buffer class.

In [None]:
class ReplayBuffer:
    """Bounded store of experience tuples with uniform random sampling."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        """Set up an empty buffer.

        Params
        ======
            action_size (int): dimension of each action (stored, not otherwise used here)
            buffer_size (int): maximum number of experiences kept; older ones are dropped
            batch_size (int): number of experiences returned by ``sample``
            seed (int): value used to seed the ``random`` module
        """
        self.action_size = action_size
        self.batch_size = batch_size
        self.memory = deque(maxlen=buffer_size)
        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"],
        )
        # random.seed() seeds the module-level generator and returns None,
        # so this attribute ends up holding None.
        self.seed = random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        """Store one experience per entry of ``state``.

        The five arguments are indexed in parallel: entry ``k`` of each one
        forms the ``k``-th stored experience.
        """
        for idx, single_state in enumerate(state):
            record = self.experience(
                single_state, action[idx], reward[idx], next_state[idx], done[idx]
            )
            self.memory.append(record)

    def sample(self):
        """Draw ``batch_size`` experiences at random and return them as tensors.

        Returns
        =======
            tuple ``(states, actions, rewards, next_states, dones)`` of float
            tensors on ``device``, each stacked with one row per experience
        """
        drawn = random.sample(self.memory, k=self.batch_size)
        kept = [exp for exp in drawn if exp is not None]

        def _stack(field, as_uint8=False):
            # One row per experience for the requested field.
            rows = np.vstack([getattr(exp, field) for exp in kept])
            if as_uint8:
                rows = rows.astype(np.uint8)
            return torch.from_numpy(rows).float().to(device)

        states = _stack("state")
        actions = _stack("action")
        rewards = _stack("reward")
        next_states = _stack("next_state")
        # Booleans are converted to 0/1 before becoming floats
        dones = _stack("done", as_uint8=True)

        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        """Number of experiences currently held."""
        return len(self.memory)

### 6. Create Class for OU Noise Generation

We use an Ornstein-Uhlenbeck process to generate the noise that the agent adds to its actions during training.

In [None]:
class OUNoise:
    """Ornstein-Uhlenbeck process.

    Each call to ``sample`` pulls the internal state back toward ``mu`` by a
    factor ``theta`` and perturbs it with zero-mean Gaussian noise scaled by
    ``sigma``.
    """

    def __init__(self, size, seed, mu=0., theta=0.15, sigma=0.2):
        """Initialize parameters and noise process.

        Params
        ======
            size (int): number of noise components
            seed (int): value used to seed the ``random`` module
            mu (float): mean the process reverts to
            theta (float): strength of the pull toward ``mu``
            sigma (float): scale of the random perturbation
        """
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        # random.seed() seeds the module-level generator and returns None
        self.seed = random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal state (= noise) to mean (mu)."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Update internal state and return it as a noise sample."""
        x = self.state
        # The perturbation must be zero-mean. Uniform [0, 1) draws would be
        # strictly non-negative and bias every component upward.
        perturbation = np.array([random.gauss(0.0, 1.0) for _ in range(len(x))])
        dx = self.theta * (self.mu - x) + self.sigma * perturbation
        self.state = x + dx
        return self.state

### 7. Create the DDPG Agent Class

We will be setting up an agent that hosts an actor-critic model (DDPG) and learns from input experiences.

In [None]:
class Agent:
    """Interacts with and learns from the environment using DDPG.

    Holds local/target actor and critic networks, a replay buffer and an
    exploration-noise process, and provides the training loop.
    """

    def __init__(self, env:RLEnv, seed, lr_actor:float, lr_critic:float, buffer_size:int,
                 batch_size:int, gamma:float, tau:float, weight_decay:float=0):
        """Initialize an Agent object.
        
        Params
        ======
            env (RLEnv): the environment
            seed (int): random seed
            lr_actor (float): learning rate of the actor model
            lr_critic (float): learning rate of the critic model
            buffer_size (int): replay buffer size
            batch_size (int): minibatch size for model training
            gamma (float): reward discount factor
            tau (float): constant for soft update of target parameters
            weight_decay (float): L2 weight decay factor applied by the critic optimizer
        """
        self.env = env
        # random.seed() seeds the module-level generator and returns None
        self.seed = random.seed(seed)

        # Hyperparameters
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.weight_decay = weight_decay
        self.gamma = gamma
        self.tau = tau

        # Actor network (local and target)
        self.actor_local = Actor(self.env.n_states, self.env.n_actions, seed).to(device)
        self.actor_target = Actor(self.env.n_states, self.env.n_actions, seed).to(device)
        self.actor_optimizer = torch.optim.Adam(self.actor_local.parameters(), lr=self.lr_actor)

        # Critic network (local and target)
        self.critic_local = Critic(self.env.n_states, self.env.n_actions, seed).to(device)
        self.critic_target = Critic(self.env.n_states, self.env.n_actions, seed).to(device)
        # weight_decay was previously accepted but never handed to the
        # optimizer; the default of 0 keeps the old behaviour.
        self.critic_optimizer = torch.optim.Adam(self.critic_local.parameters(),
                                                 lr=self.lr_critic,
                                                 weight_decay=self.weight_decay)

        # Replay memory
        self.memory = ReplayBuffer(action_size=self.env.n_actions,
                                   buffer_size=self.buffer_size,
                                   batch_size=self.batch_size,
                                   seed=seed)

        # OU Noise creation
        self.noise = OUNoise(size=self.env.n_actions, seed=seed, mu=0,
                             theta=float(os.getenv("NOISE_THETA", "0.15")),
                             sigma=float(os.getenv("NOISE_SIGMA", "0.2")))

    def step(self, state, action, reward, next_state, done):
        """Store the transition(s) and run one learning update when possible."""
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # If enough samples are available in memory, get random subset and learn
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (numpy.ndarray): observation(s) to act on
            add_noise (bool): whether to add a sample of the noise process

        Returns
        =======
            numpy array of actions clipped to [-1, 1]
        """
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        """Reset the exploration-noise process."""
        self.noise.reset()

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tensors
        """
        states, actions, rewards, next_states, dones = experiences

        # Step 1: Update CRITIC
        # The TD target is a fixed regression target, so it is built without
        # tracking gradients through the target networks.
        with torch.no_grad():
            actions_next = self.actor_target(next_states)
            Q_targets_next = self.critic_target(next_states, actions_next)
            # (1 - dones) removes the bootstrap term for terminal transitions
            Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))

        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize critic loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # Step 2: Update ACTOR
        # The actor is trained to maximise the critic's value of its own actions
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        # Minimize actor loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Soft update for target networks for both actor and critic
        self.soft_update(self.critic_local, self.critic_target)
        self.soft_update(self.actor_local, self.actor_target)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        The interpolation parameter τ is read from ``self.tau``.

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(self.tau * local_param.data + (1.0 - self.tau) * target_param.data)

    def train_ddpg(self, n_episodes:int, max_t:int=300, solve_score:float=0, plot=True) -> None:
        """DDPG Learning.
    
        Params
        ======
            n_episodes (int): maximum number of training episodes
            max_t (int): maximum number of timesteps per episode
            solve_score (float): the minimum average score over 100 episodes that is required
                in order for the problem to be considered solved
            plot (bool): whether to plot the per-episode scores at the end
        """
        scores_deque = deque(maxlen=100)
        scores = []

        # Iterate through episodes
        for i_episode in range(1, n_episodes+1):
            state = self.env.reset()
            self.reset()
            # Running reward total for each agent in this episode
            agent_totals = 0.0

            # Iterate through timesteps until we reach a "done" state
            for t in range(max_t):
                action = self.act(state)
                next_state, reward, done = self.env.step(action)
                self.step(state, action, reward, next_state, done)
                state = next_state
                agent_totals = agent_totals + np.asarray(reward, dtype=float)

                # Marks the end of an episode
                if np.any(done):
                    break

            # The episode score is the best agent's total. Summing the
            # per-step maximum instead would overstate it whenever the
            # agents are rewarded on different steps.
            score = float(np.max(agent_totals))

            # Append to score window
            scores_deque.append(score)
            scores.append(score)

            # Print episode statistics
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)), end="")
            if i_episode % 100 == 0:
                print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))

            # Did we solve the environment? Only judge once the window holds
            # a full 100 episodes; otherwise a single early episode could
            # trigger it and the reported episode count would be negative.
            window_full = len(scores_deque) == scores_deque.maxlen
            if window_full and np.mean(scores_deque) >= solve_score:
                print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode-100, np.mean(scores_deque)))
                self.save_actor_model("checkpoint_actor.pth")
                self.save_critic_model("checkpoint_critic.pth")
                break

        # Plot scores if needed
        if plot:
            fig = plt.figure()
            ax = fig.add_subplot(111)
            ax.plot(np.arange(len(scores)), scores)
            ax.set_ylabel('Score')
            ax.set_xlabel('Episode #')
            plt.show()

    def save_actor_model(self, fp:str) -> None:
        """Write the local actor's state dict to ``fp``."""
        torch.save(self.actor_local.state_dict(), fp)

    def save_critic_model(self, fp:str) -> None:
        """Write the local critic's state dict to ``fp``."""
        torch.save(self.critic_local.state_dict(), fp)

    def load_actor_model(self, fp:str) -> None:
        """Load the local actor's weights from ``fp``."""
        # map_location lets a checkpoint saved on a GPU load on a CPU-only machine
        self.actor_local.load_state_dict(torch.load(fp, map_location=device))

    def load_critic_model(self, fp:str) -> None:
        """Load the local critic's weights from ``fp``."""
        # map_location lets a checkpoint saved on a GPU load on a CPU-only machine
        self.critic_local.load_state_dict(torch.load(fp, map_location=device))

### 8. Train DDPG on the Continuous Environment

We will now initialize a DDPG agent and run it on the Tennis environment. Results will be plotted after the "train_ddpg" method is called.

In [None]:
# Create environment
env = TennisEnvironment(fp="/data/Tennis_Linux_NoVis/Tennis")

# Everything after the environment is launched runs under try/finally so the
# Unity process is closed even if training raises or is interrupted.
try:
    # Actor/critic model hyperparameters (read by Actor/Critic at construction)
    os.environ["ACTOR_FC1_UNITS"] = "128"
    os.environ["ACTOR_FC2_UNITS"] = "64"
    os.environ["ACTOR_FC3_UNITS"] = "32"
    os.environ["ACTOR_FC4_UNITS"] = "16"
    os.environ["CRITIC_FC1_UNITS"] = "128"
    os.environ["CRITIC_FC2_UNITS"] = "64"
    os.environ["CRITIC_FC3_UNITS"] = "32"
    os.environ["CRITIC_FC4_UNITS"] = "16"

    # OU Noise hyperparameters (read by Agent at construction)
    os.environ["NOISE_THETA"] = "0.15"
    os.environ["NOISE_SIGMA"] = "0.05"

    # Create agent
    agent = Agent(env=env,
                  seed=0,
                  lr_actor=7e-4,
                  lr_critic=8e-4,
                  buffer_size=100000,
                  batch_size=32,
                  gamma=0.95,
                  tau=1e-3,
                  weight_decay=0)

    # Train agent. The Tennis task counts as solved at an average score of
    # +0.5 over 100 consecutive episodes; with a threshold of 30 training
    # would never stop early and the checkpoints would never be written.
    agent.train_ddpg(n_episodes=10000,
                     max_t=1000,
                     solve_score=0.5)
finally:
    env.close()