# Continuous Control

---

Congratulations for completing the second project of the [Deep Reinforcement Learning Nanodegree](https://www.udacity.com/course/deep-reinforcement-learning-nanodegree--nd893) program!  In this notebook, you will learn how to control an agent in a more challenging environment, where the goal is to train a creature with four arms to walk forward.  **Note that this exercise is optional!**

### 1. Start the Environment

We begin by importing the necessary packages.  If the code cell below returns an error, please revisit the project instructions to double-check that you have installed [Unity ML-Agents](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Installation.md) and [NumPy](http://www.numpy.org/).

In [1]:
from unityagents import UnityEnvironment
import numpy as np
from collections import deque
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import torch.nn.functional as F
import random

env = UnityEnvironment(file_name='Crawler.x86_64')

# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# reset the environment
env_info = env.reset(train_mode=True)[brain_name]

# number of agents
num_agents = len(env_info.agents)

# size of each action
action_size = brain.vector_action_space_size

# examine the state space 
states = env_info.vector_observations
state_size = states.shape[1]

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print('Using device {}'.format(device))
    
def hidden_init(layer):
    fan_in = layer.weight.data.size()[0]
    lim = 1. / np.sqrt(fan_in)
    return (-lim, lim)

class Actor(nn.Module):
    """Actor (Policy) Model."""

    def __init__(self, state_size, action_size, seed, fc1_units, fc2_units):
        """Initialize parameters and build model.
        Params
        ======
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            seed (int): Random seed
            fc1_units (int): Number of nodes in first hidden layer
            fc2_units (int): Number of nodes in second hidden layer
        """
        super(Actor, self).__init__()
        self.seed = torch.manual_seed(seed)
        self.fc1_bn = nn.BatchNorm1d(state_size)
        self.fc1 = nn.Linear(state_size, fc1_units)
        self.fc2_bn = nn.BatchNorm1d(fc1_units)
        self.fc2 = nn.Linear(fc1_units, fc2_units)

        self.fc2a_bn = nn.BatchNorm1d(fc2_units)
        self.fc2a = nn.Linear(fc2_units, fc2_units)

        self.fc3_bn = nn.BatchNorm1d(fc2_units)
        self.fc3 = nn.Linear(fc2_units, action_size)
        self.reset_parameters()

    def reset_parameters(self):
        self.fc1.weight.data.uniform_(*hidden_init(self.fc1))
        self.fc2.weight.data.uniform_(*hidden_init(self.fc2))

        self.fc2a.weight.data.uniform_(*hidden_init(self.fc2a))

        self.fc3.weight.data.uniform_(-3e-3, 3e-3)

    def forward(self, state):
        """Build an actor (policy) network that maps states -> actions."""
        x = F.relu(self.fc1(self.fc1_bn(state)))
        x = F.relu(self.fc2(self.fc2_bn(x)))

        x = F.relu(self.fc2a(self.fc2a_bn(x)))

        return F.tanh(self.fc3(self.fc3_bn(x)))


class Critic(nn.Module):
    """Critic (Value) Model."""

    def __init__(self, state_size, seed, fc1_units, fc2_units):
        """Initialize parameters and build model.
        Params
        ======
            state_size (int): Dimension of each state
            seed (int): Random seed
            fcs1_units (int): Number of nodes in the first hidden layer
            fc2_units (int): Number of nodes in the second hidden layer
        """
        super(Critic, self).__init__()
        self.seed = torch.manual_seed(seed)
        self.fc1_bn = nn.BatchNorm1d(state_size)
        self.fc1 = nn.Linear(state_size, fc1_units)
        self.fc2_bn = nn.BatchNorm1d(fc1_units)
        self.fc2 = nn.Linear(fc1_units, fc2_units)

        self.fc2a_bn = nn.BatchNorm1d(fc2_units)
        self.fc2a = nn.Linear(fc2_units, fc2_units)

        self.fc3_bn = nn.BatchNorm1d(fc2_units)
        self.fc3 = nn.Linear(fc2_units, 1)
        self.reset_parameters()

    def reset_parameters(self):
        self.fc1.weight.data.uniform_(*hidden_init(self.fc1))
        self.fc2.weight.data.uniform_(*hidden_init(self.fc2))

        self.fc2a.weight.data.uniform_(*hidden_init(self.fc2a))

        self.fc3.weight.data.uniform_(-3e-3, 3e-3)

    def forward(self, state):
        """Build an critic (value) network that maps states -> value."""
        x = F.relu(self.fc1(self.fc1_bn(state)))
        x = F.relu(self.fc2(self.fc2_bn(x)))

        x = F.relu(self.fc2a(self.fc2a_bn(x)))

        return self.fc3(self.fc3_bn(x))

class PPO_Actor_Critic(nn.Module):
    
    def __init__(self, state_size, action_size, seed, fc1_units=1024, fc2_units=1024):
        super(PPO_Actor_Critic, self).__init__()
        self.actor = Actor(state_size, action_size, seed, fc1_units, fc2_units)
        self.critic = Critic(state_size, seed, fc1_units, fc2_units)  
        self.std = nn.Parameter(torch.ones(1, action_size)*0.15)

    def forward(self, state, action=None, scale=1.):
        """Build Policy.
        
        Returns
        ======
            action (Tensor): predicted action or inputed action
            log_prob (Tensor): log probability of current action distribution
            ent (Tensor): entropy of current action distribution
            value (Tensor): estimate value function
        """
        action_mean = self.actor(state)
        value = self.critic(state)
        
        dist = torch.distributions.Normal(action_mean, F.hardtanh(self.std, min_val=0.06*scale, max_val=0.6*scale))

        if action is None:
            action = dist.sample()
        log_prob = dist.log_prob(action)
        log_prob = torch.sum(log_prob, dim=1, keepdim=True)

        ent = dist.entropy().mean()
        
        return action, log_prob, ent, value

class PPO_Agent:
    def __init__(self, env, lr=0.0001, beta=0, learning_time=5, eps=0.2, tau=0.95, batch_size=128, constraint = 1.0):
        
        #Initialize environment
        brain_name = env.brain_names[0]
        brain = env.brains[brain_name]
        env_info = env.reset(train_mode=True)[brain_name]
        num_agents = len(env_info.agents)
        action_size = brain.vector_action_space_size
        states = env_info.vector_observations
        state_size = states.shape[1]
        self.env = env
        random_seed = random.randint(1, 100)
        
        #Initialize some hyper parameters of agent
        self.lr = lr
        self.learning_time = learning_time
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.gamma = 0.99
        self.batch_size = batch_size
        self.beta = beta#parameter for entropy panelty
        self.eps = eps#parameter for clip
        self.tau = tau#parameter for GAE
        
        #Networks and optimizers
        self.seed = random.seed(random_seed)
        self.network = PPO_Actor_Critic(state_size, action_size, random_seed, fc1_units=1024, fc2_units=1024).to(device)
        self.optimizer = optim.Adam(self.network.parameters(),lr=self.lr, eps=1e-8, weight_decay=1e-4)
        
        self.best = -100 # This saves the best average score over 10 episodes in past agents.
        
        self.constraint = constraint
        
    def act(self, states):
        self.network.eval()
        with torch.no_grad():
            states = torch.tensor(states, dtype=torch.float).to(device)
            actions, log_probs, _, v = self.network(states)
            actions = actions.detach().cpu().numpy()
            log_probs = log_probs.detach().cpu().numpy()
            v = v.detach().cpu().numpy()
        self.network.train()
        return actions, log_probs, v
    
    def check(self):
        brain_name = self.env.brain_names[0]
        self.network.eval()
        total_rewards = 0
        reward_flag = False
        for _ in range(20):
            scores = np.zeros(self.num_agents)
            env_info = self.env.reset(train_mode=True)[brain_name]
            for t in range(1500):
                states = env_info.vector_observations
                action, _, _ = self.act(states)
                env_info = self.env.step(action)[brain_name]
                rewards = np.array(env_info.rewards)
                if any(np.isnan(rewards.reshape(-1))):
                    rewards[np.isnan(rewards)] = -5
                    reward_flag = True
                scores += rewards
            total_rewards += np.mean(scores)
        if reward_flag:
            print('\n NaN in rewards during testing!')
        self.network.train()
        self.network.cpu()
        if total_rewards/20 > self.best:
            torch.save(self.network.state_dict(), 'Crawler_Checkpoint.pth')
            self.best = total_rewards/20
            print('  Current network average: {:.4f}. Network Updated. Current best score {:.4f}'.format(total_rewards/20, self.best))
        else:
            self.network.load_state_dict(torch.load('Crawler_Checkpoint.pth'))
            print('  Current network average: {:.4f}. Network Reload. Current best score {:.4f}'.format(total_rewards/20, self.best))
        self.network.to(device)    
    
    def learn(self, states, actions, log_probs, dones, Advantages, Returns):
        '''
        This functions calculates the clipped surrogate function and do one step update to the action network
        And then the critic network will be updated
        The inputs are all lists of tensors. They are already sent to device before into the list
        '''
        
        #Generate dataset for getting small batches
        mydata = TensorDataset(states, actions, log_probs, dones, Advantages, Returns)
        Loader = DataLoader(mydata, batch_size = min(self.batch_size, len(mydata)//64), shuffle = True)
        
        for i in range(self.learning_time):
            for sampled_states, sampled_actions, sampled_log_probs, sampled_dones, sampled_advantages, sampled_returns in iter(Loader):
                _, new_log_probs, entropy, V = self.network(sampled_states, sampled_actions)
                ratio = (new_log_probs - sampled_log_probs).exp()
                
                Actor_Loss = -torch.min(input=ratio*sampled_advantages, other=torch.clamp(ratio, 1-self.eps, 1+self.eps)*sampled_advantages).mean()
                Entropy_Loss = -self.beta * entropy.mean()
                Critic_Loss = 0.5*(V-sampled_returns).pow(2).mean()
                Loss = Actor_Loss+Critic_Loss+Entropy_Loss
                self.optimizer.zero_grad()
                Loss.backward()
                torch.nn.utils.clip_grad_norm_(self.network.parameters(), 1)
                self.optimizer.step()
            
    def train(self, n_episode, max_t=1500, standardlize='row', method='MC', load=False):
        '''
        This function do the training part of the agent. The procedure is like:
            1. initialize environment
            2. Go through a whole episode, recode all states, actions, log_probs, rewards and dones information
            3. call learn function to update the networks
            4. repeat 2-3 for n_episode times.
        '''
        
        if load:
            self.network.cpu()
            self.network.load_state_dict(torch.load('Crawler_Checkpoint.pth'))
            self.network.to(device)
            
        all_rewards = []
        total_window = deque(maxlen=100)
        brain_name = self.env.brain_names[0]
        score_window = deque(maxlen=100)
        
        method = method.upper()
        if method not in ['MC', 'TD']:
            print('method can be only TD or MC. Input not supported! Use TD by default')
            method = 'TD'
        standardlize = standardlize.lower()
        if standardlize not in ['row', 'whole', 'none']:
            print('Standarlization in row or as a whole or none. Input not supported. Use row instead')
            standardlize = 'row'
            
        self.check()
            
        states_history = []
        actions_history = []
        rewards_history = []
        log_probs_history = []
        dones_history = []
        values_history = []
        
        for i in range(1, n_episode+1):
            #initialize environment
            env_info = self.env.reset(train_mode=True)[brain_name]
            
            total = np.zeros(self.num_agents)#Saves every reward signal
            scores_recorder = []#Saves the total reward whenever an agent is 'done'
            episodic_scores = np.zeros(self.num_agents)#Saves the current cumulated reward. Reset to 0 when agent is 'done'
            
            states = env_info.vector_observations
            actions, log_probs, v = self.act(states)
            
            reward_flag = False
            
            for _ in range(max_t):
                states_history.append(torch.tensor(states, dtype=torch.float).to(device))
                actions_history.append(torch.tensor(actions, dtype=torch.float).to(device))
                values_history.append(torch.tensor(v, dtype=torch.float).to(device))
                log_probs_history.append(torch.tensor(log_probs, dtype=torch.float).to(device))#Save as columns
                if any(np.isnan(actions.reshape(-1))):
                    print('\nCurrent episode {}. NaN in action!'.format(i))
                    self.network.cpu()
                    torch.save(self.network.state_dict(), 'Crawler_Checkpoint_NaN_Action.pth')
                    return None
                env_info = self.env.step(actions)[brain_name]
                next_states = env_info.vector_observations
                dones = torch.tensor(env_info.local_done, dtype=torch.float).view(-1,1).to(device)
                rewards = np.array(env_info.rewards)
                if any(np.isnan(rewards.reshape(-1))):
                    rewards[np.isnan(rewards)] = -5
                    reward_flag = True
                #Save reward info before turned into tensor
                total += rewards
                episodic_scores += rewards 
                
                rewards = torch.tensor(rewards, dtype=torch.float).view(-1,1).to(device)
                rewards_history.append(rewards)
                dones_history.append(dones)
                
                states = next_states
                actions, log_probs, v = self.act(states)
                
                for k in range(self.num_agents):
                    if env_info.local_done[k]:
                        scores_recorder.append(episodic_scores[k])
                        episodic_scores[k] = 0
                        
            if reward_flag:
                print('\n Current Episode {}! NaN in rewards!'.format(i))
            
            scores_recorder = np.array(scores_recorder)
            states_history.append(torch.tensor(states, dtype=torch.float).to(device))
            values_history.append(torch.tensor(v, dtype=torch.float).to(device))
            
            Advantages = []
            advantage = 0
            Returns = []
            returns = 0
            #Calculate advantages
            for j in reversed(range(len(states_history)-1)):
                if method == 'MC':
                    returns = rewards_history[j] + (1-dones_history[j])*returns*self.gamma
                else:
                    returns = rewards_history[j] + (1-dones_history[j])*values_history[j+1].detach()*self.gamma
                Returns.append(returns.view(-1))
                delta = rewards_history[j] + (1-dones_history[j])*self.gamma*values_history[j+1].detach() - values_history[j].detach()
                advantage = advantage*self.gamma*self.tau*(1-dones_history[j]) + delta
                Advantages.append(advantage.view(-1))
            Advantages.reverse()
            Advantages = torch.stack(Advantages).detach().to(device)
            if standardlize == 'row':
                Advantages = (Advantages - Advantages.mean(dim=1 ,keepdim=True))/Advantages.std(dim=1, keepdim=True)
            elif standardlize == 'whole':
                Advantages = (Advantages - Advantages.mean())/Advantages.std()
            Advantages = Advantages.view(-1,1)
            Returns.reverse()
            Returns = torch.stack(Returns).detach().to(device)
            Returns = Returns.view(-1,1)
            
            states_history = torch.cat(states_history[:-1], 0)
            actions_history = torch.cat(actions_history, 0)
            log_probs_history = torch.cat(log_probs_history, 0)
            dones_history = torch.cat(dones_history, 0)
            
            self.learn(states_history, actions_history, log_probs_history, dones_history, Advantages, Returns)
            
            states_history = []
            actions_history = []
            rewards_history = []
            log_probs_history = []
            dones_history = []
            values_history = []
            
            score_window.append(np.nanmean(scores_recorder))
            total_window.append(np.mean(total))
            all_rewards.append(np.nanmean(scores_recorder))
            print('\rEpisode {}. Total score {:.4f}, average score {:.4f}, past total average {:.4f}, past average {:.4f}, best {:.4f}, worst {:.4f}'.format(i, np.mean(total), np.mean(scores_recorder), np.mean(total_window),np.mean(score_window), np.max(scores_recorder), np.min(scores_recorder)),end='')
            
            if i % 50 == 0:
                self.check()
                
        np.save('PPO_rewards.npy',np.array(all_rewards))
        return all_rewards
    
agent = PPO_Agent(env=env, beta=0, learning_time=2, batch_size=1024,  constraint = 1.5)
#agent.train(n_episode=1000, max_t=3000, standardlize='row', method='TD', load=False)

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: CrawlerBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 129
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 20
        Vector Action descriptions: , , , , , , , , , , , , , , , , , , , 


Using device cpu


In [2]:
agent.network.load_state_dict(torch.load('Crawler_Checkpoint.pth'))
agent.network.eval()

PPO_Actor_Critic(
  (actor): Actor(
    (fc1_bn): BatchNorm1d(129, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (fc1): Linear(in_features=129, out_features=1024, bias=True)
    (fc2_bn): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (fc2): Linear(in_features=1024, out_features=1024, bias=True)
    (fc2a_bn): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (fc2a): Linear(in_features=1024, out_features=1024, bias=True)
    (fc3_bn): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (fc3): Linear(in_features=1024, out_features=20, bias=True)
  )
  (critic): Critic(
    (fc1_bn): BatchNorm1d(129, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (fc1): Linear(in_features=129, out_features=1024, bias=True)
    (fc2_bn): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (fc2): Linear(in_features=1024, ou

In [3]:
env_info = env.reset(train_mode=False)[brain_name]     # reset the environment    
states = env_info.vector_observations                  # get the current state (for each agent)
scores = np.zeros(num_agents)                          # initialize the score (for each agent)
t = 0
while t<1500:
    t += 1
    states = torch.tensor(states, dtype=torch.float32)
    actions = agent.network.actor(states)              # select an action (for each agent)
    actions = actions.detach().cpu().numpy()
    env_info = env.step(actions)[brain_name]           # send all actions to tne environment
    next_states = env_info.vector_observations         # get next state (for each agent)
    rewards = env_info.rewards                         # get reward (for each agent)
    rewards += np.array(env_info.rewards)                         # update the score (for each agent)
    states = next_states                               # roll over states to next time step
    if any(np.isnan(np.array(rewards.reshape(-1)))):                                  # exit loop if episode finished
        rewards[np.isnan(rewards)] = -5
        print('NaN appeared')
    scores += rewards
print('Total score (averaged over agents) this episode: {}'.format(np.mean(scores)))

Total score (averaged over agents) this episode: 5212.492941924681


In [4]:
env_info = env.reset(train_mode=False)[brain_name]     # reset the environment    
states = env_info.vector_observations                  # get the current state (for each agent)
scores = np.zeros(num_agents)                          # initialize the score (for each agent)
t = 0
finished = np.zeros(num_agents)
while not all(finished):
    t += 1
    states = torch.tensor(states, dtype=torch.float32)
    actions = agent.network.actor(states)              # select an action (for each agent)
    actions = actions.detach().cpu().numpy()
    env_info = env.step(actions)[brain_name]           # send all actions to tne environment
    next_states = env_info.vector_observations         # get next state (for each agent)
    rewards = np.array(env_info.rewards)                         # update the score (for each agent)
    states = next_states                               # roll over states to next time step
    if any(np.isnan(np.array(rewards.reshape(-1)))):                                  # exit loop if episode finished
        rewards[np.isnan(rewards)] = -5
        print('NaN appeared')
    scores += (1-finished)*rewards
    finished[env_info.local_done] = 1
print('Total score (averaged over agents) this episode: {}'.format(np.mean(scores)))

Total score (averaged over agents) this episode: 1781.7002083716216


In [5]:
scores

array([1834.05256114, 1828.13262673, 1637.52418523, 1763.39746659,
       1900.30259768, 1708.84459253, 1894.97552009, 1823.04778559,
       1840.52246955, 1622.97052601, 1611.58258032, 1915.04958901])

In [6]:
env.close()