In [None]:
import math
import random
import numpy as np
from collections import deque

from unityagents import UnityEnvironment

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.nn.utils
from torch.distributions import Normal

import csv
import time
import datetime

In [None]:
from IPython.display import clear_output
import matplotlib.pyplot as plt
%matplotlib inline

<h1>CUDA</h1>

In [None]:
# Run on the GPU whenever PyTorch can see one; otherwise everything stays on the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")

<h1>Ornstein-Uhlenbeck Process</h1>
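Adds time-correlated exploration noise to the actions chosen by the deterministic policy, so that consecutive perturbations stay close to each other instead of jittering independently.<br>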
<a href="https://en.wikipedia.org/wiki/Ornstein%E2%80%93Uhlenbeck_process">wiki</a>

In [None]:
class OUNoise(object):
    """Ornstein-Uhlenbeck noise source used to perturb actions.

    The internal state is pulled towards ``mu`` with strength ``theta`` and
    kicked by Gaussian noise scaled by ``sigma``.  ``sigma`` is interpolated
    linearly from ``max_sigma`` to ``min_sigma`` over ``decay_period`` steps
    (with the default arguments both bounds are equal, so it stays constant).
    """

    def __init__(self, action_space_size, mu=0.0, theta=0.15, max_sigma=0.3, min_sigma=0.3, decay_period=100000):
        # parameters of the process itself
        self.mu, self.theta = mu, theta

        # noise scale and the schedule it follows
        self.max_sigma, self.min_sigma = max_sigma, min_sigma
        self.sigma = max_sigma
        self.decay_period = decay_period

        # size of the noise vector and the bounds the noisy action is clipped to
        self.action_dim = action_space_size
        self.low, self.high = -1, 1

        self.reset()

    def reset(self):
        """Put the process back at its mean ``mu``."""
        self.state = self.mu * np.ones(self.action_dim)

    def evolve_state(self):
        """Advance the process by one step and return the new state."""
        drift = self.theta * (self.mu - self.state)
        diffusion = self.sigma * np.random.randn(self.action_dim)

        # keep the (drift + diffusion) grouping so rounding matches x + dx
        self.state = self.state + (drift + diffusion)
        return self.state

    def get_action(self, actions, t=0):
        """Return ``actions`` plus fresh noise, clipped to ``[low, high]``.

        ``t`` only drives the sigma schedule; the value computed here is the
        one used by the *next* call, because the state is evolved first.
        """
        noise = self.evolve_state()

        progress = min(1.0, t / self.decay_period)
        self.sigma = self.max_sigma - (self.max_sigma - self.min_sigma) * progress

        return np.clip(actions + noise, self.low, self.high)

<h1>Actor Network</h1>

In [None]:
#
# Actor
#
# receives a state and returns a value for each possible action in that state (allows for continuous action spaces)
#
class ActorNetwork(nn.Module):
    """Deterministic policy: maps a state to one value in [-1, 1] per action component.

    Architecture: two ReLU hidden layers of ``hidden_size`` units followed by
    a tanh output layer of ``num_actions`` units.

    Args:
        num_inputs:  size of a single state vector.
        num_actions: number of continuous action components.
        hidden_size: width of both hidden layers.
        init_w:      the output layer's weights and bias are drawn uniformly
                     from ``[-init_w, init_w]``.
    """

    def __init__(self, num_inputs, num_actions, hidden_size, init_w=3e-3):
        super(ActorNetwork, self).__init__()

        self.linear1 = nn.Linear(num_inputs, hidden_size)

        self.linear2 = nn.Linear(hidden_size, hidden_size)

        self.linear3 = nn.Linear(hidden_size, num_actions)

        # small uniform initialisation of the last layer's weights and bias
        self.linear3.weight.data.uniform_(-init_w, init_w)
        self.linear3.bias.data.uniform_(-init_w, init_w)

    def forward(self, state):
        """Return the tanh-squashed action tensor for a batch of states."""
        x = F.relu(self.linear1(state))
        x = F.relu(self.linear2(x))

        # tanh bounds every action component to [-1, 1].
        # torch.tanh is used because F.tanh has been flagged as deprecated
        # by several PyTorch releases.
        return torch.tanh(self.linear3(x))

    def get_actions(self, states):
        """Run the policy for inference and return the actions as a NumPy array.

        Args:
            states: array-like of states; a leading dimension of size one is
                    added before the forward pass, so the result has shape
                    ``(1, *states.shape[:-1], num_actions)``.

        Returns:
            ``numpy.ndarray`` living in CPU memory, detached from autograd.
        """
        # place the input on whatever device this network's parameters live on,
        # so the method keeps working if the module is moved after construction
        own_device = next(self.parameters()).device
        states = torch.FloatTensor(states).unsqueeze(0).to(own_device)

        # pure inference: no need to record an autograd graph
        with torch.no_grad():
            actions = self.forward(states)

        return actions.detach().cpu().numpy()
    

<h1>Critic Network</h1>

In [None]:
#
# Critic
#
# receives a state and actions for that state from the actor, and returns its evaluation of it
#
class CriticNetwork(nn.Module):
    """Q-function approximator: scores a (state, action) pair with a single value.

    States and actions are concatenated along dimension 1, passed through two
    ReLU hidden layers of ``hidden_size`` units and reduced to one unbounded
    output.  The output layer's weights and bias start uniformly in
    ``[-init_w, init_w]``.
    """

    def __init__(self, num_inputs, num_actions, hidden_size, init_w=3e-3):
        super().__init__()

        joint_size = num_inputs + num_actions

        self.linear1 = nn.Linear(joint_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, hidden_size)
        self.linear3 = nn.Linear(hidden_size, 1)

        # weight first, then bias: same draw order as the layers' definition
        for tensor in (self.linear3.weight, self.linear3.bias):
            tensor.data.uniform_(-init_w, init_w)

    def forward(self, states, actions):
        """Return the critic's value for each (state, action) row, shape ``(batch, 1)``."""
        hidden = torch.cat((states, actions), dim=1)

        for layer in (self.linear1, self.linear2):
            hidden = F.relu(layer(hidden))

        # no activation on the output: the value estimate is unbounded
        return self.linear3(hidden)

<h1>Replay Buffer</h1>

In [None]:
# 
# ReplayBuffer
#
# Stores experiences from the 20-agent environments which are used during the learning process
#
class ReplayBuffer:
    """Fixed-capacity ring buffer of ``(state, action, reward, next_state, done)`` tuples.

    Once ``capacity`` entries are stored, each new entry overwrites the
    oldest one.
    """

    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []
        # index of the slot the next push will write to
        self.position = 0

    def push(self, state, action, reward, next_state, done):
        """Store one transition, evicting the oldest when the buffer is full."""
        transition = (state, action, reward, next_state, done)

        if len(self.buffer) < self.capacity:
            # still growing: the write slot is exactly the end of the list
            self.buffer.append(transition)
        else:
            self.buffer[self.position] = transition

        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        Returns five NumPy arrays (states, actions, rewards, next_states,
        dones), each stacked along a new leading batch axis.
        """
        picked = random.sample(self.buffer, batch_size)

        # zip(*picked) transposes the list of 5-tuples into 5 per-field tuples;
        # np.stack then turns every field into one array with a batch axis
        states, actions, rewards, next_states, dones = (
            np.stack(field) for field in zip(*picked)
        )

        return states, actions, rewards, next_states, dones

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.buffer)

<h1>Agents Handler</h1>

In [None]:
#
# AgentsHandler
#
# Manages the learning process of the 20-agent environment instances
#
class AgentsHandler:
    """Shared DDPG-style learner that drives every agent of the environment.

    A single actor/critic pair (plus one target copy of each), a single
    replay buffer and a single OU noise process are shared by all agents:
    every agent's transition is pushed into the same buffer and every
    agent's action comes from the same actor network.
    """

    def __init__(self, num_agents, state_dim, action_dim, replay_buffer_size=1000000, batch_size=128, hidden_dim=256, critic_lr=1e-3, actor_lr=1e-4):
        """Build the networks, optimisers, replay buffer and noise process.

        Args:
            num_agents:         number of agents acting in parallel.
            state_dim:          size of one agent's state vector.
            action_dim:         number of continuous action components.
            replay_buffer_size: capacity of the shared replay buffer.
            batch_size:         transitions sampled per learning step.
            hidden_dim:         width of the hidden layers of every network.
            critic_lr:          Adam learning rate of the critic.
            actor_lr:           Adam learning rate of the actor.
        """
        # stores the number of agents
        self.num_agents = num_agents

        # shared replay buffer
        self.replay_buffer_size = replay_buffer_size
        self.replay_buffer = ReplayBuffer(self.replay_buffer_size)

        # size of the batch of experiences used for each learning step
        self.batch_size = batch_size

        # learn only when `step % steps_to_learning == 0`
        self.steps_to_learning = 1

        # local networks (the ones being optimised)
        self.critic_net = CriticNetwork(state_dim, action_dim, hidden_dim).to(device)
        self.actor_net = ActorNetwork(state_dim, action_dim, hidden_dim).to(device)

        # target networks (slow-moving copies used to build the TD target)
        self.critic_target_net = CriticNetwork(state_dim, action_dim, hidden_dim).to(device)
        self.actor_target_net = ActorNetwork(state_dim, action_dim, hidden_dim).to(device)

        # the targets start as exact copies of the local networks
        self.critic_target_net.load_state_dict(self.critic_net.state_dict())
        self.actor_target_net.load_state_dict(self.actor_net.state_dict())

        # learning rates
        self.critic_lr = critic_lr
        self.actor_lr = actor_lr

        # optimisers
        self.critic_optimizer = optim.Adam(self.critic_net.parameters(), lr=self.critic_lr)
        self.actor_optimizer = optim.Adam(self.actor_net.parameters(), lr=self.actor_lr)

        # Huber loss for the critic (nn.MSELoss() is the plain alternative)
        self.critic_loss = nn.SmoothL1Loss()

        # the targets are soft-updated only when `step` is a multiple of these
        self.actor_target_network_parameter_update_steps = 30
        self.critic_target_network_parameter_update_steps = 30

        # noise process for exploration; its state has `action_dim` entries,
        # so the same noise vector is broadcast over all agents' actions
        self.ou_noise = OUNoise(action_dim)

    @staticmethod
    def _soft_update(target_net, source_net, tau):
        """Move ``target_net`` a fraction ``tau`` of the way towards ``source_net``."""
        for target_param, param in zip(target_net.parameters(), source_net.parameters()):
            target_param.data.copy_(
                target_param.data * (1.0 - tau) + param.data * tau
            )

    def reset_noise(self):
        """Reset the exploration noise process to its mean."""
        self.ou_noise.reset()

    def act(self, states, step, add_noise=None):
        """Choose an action for every agent.

        Args:
            states:    the agents' current states (one row per agent).
            step:      current step, forwarded to the noise process' schedule.
            add_noise: ``True``/``False`` forces exploration noise on/off.
                       ``None`` (default) keeps the historical behaviour of
                       following the notebook-level ``training_mode`` flag.

        Returns:
            Array of actions with the batch dimension added by the actor removed.
        """
        if add_noise is None:
            # legacy behaviour: obey the module-level switch
            add_noise = training_mode == True

        # gets the actions from the actor's network in evaluation mode
        self.actor_net.eval()
        actions = self.actor_net.get_actions(states)
        self.actor_net.train()

        # exploration: perturb the greedy actions (result is clipped to [-1, 1])
        if add_noise:
            actions = self.ou_noise.get_action(actions, step)

        # removes the leading dimension added by get_actions
        return np.squeeze(actions, axis=0)

    def step(self, step, states, actions, rewards, next_states, dones):
        """Record one transition per agent in the shared buffer, then learn."""
        for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones):
            self.replay_buffer.push(state, action, reward, next_state, done)

        # learn from the accumulated experiences in the replay buffer
        self.learn(step)

    def learn(self, step, gamma=0.99, expected_values_min_clamp=-np.inf, expected_values_max_clamp=np.inf, tau=1e-2):
        """Run one actor update and one critic update on a sampled batch.

        Nothing happens until the buffer holds more than ``batch_size``
        transitions, nor on steps that are not a multiple of
        ``steps_to_learning``.

        Args:
            step:  current step; gates learning and the target soft-updates.
            gamma: discount factor of the TD target.
            expected_values_min_clamp / expected_values_max_clamp:
                   bounds applied to the TD target (unbounded by default).
            tau:   interpolation factor of the target soft-update.
        """
        # not enough experiences yet
        if len(self.replay_buffer) <= self.batch_size:
            return

        # only learn every n steps
        if step % self.steps_to_learning != 0:
            return

        # collects a batch of experiences to train the actor and critic
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(self.batch_size)

        # numpy -> tensors on the training device; rewards and dones become
        # column vectors so they line up with the critic's (batch, 1) output
        states      = torch.FloatTensor(states).to(device)
        next_states = torch.FloatTensor(next_states).to(device)
        actions     = torch.FloatTensor(actions).to(device)
        rewards     = torch.FloatTensor(rewards).unsqueeze(1).to(device)
        dones       = torch.FloatTensor(np.float32(dones)).unsqueeze(1).to(device)

        #
        # Actor's loss: maximise the critic's opinion of the actor's own actions
        #
        actor_actions = self.actor_net(states)
        critic_evaluations = self.critic_net(states, actor_actions)
        actor_loss = -critic_evaluations.mean()

        #
        # TD target, built from the target networks only. It is a constant
        # for the optimisation, so no autograd graph is recorded for it.
        #
        with torch.no_grad():
            next_actions = self.actor_target_net(next_states)
            target_values = self.critic_target_net(next_states, next_actions)

            # terminal transitions (done == 1) do not bootstrap
            expected_values = rewards + (1.0 - dones) * gamma * target_values
            expected_values = torch.clamp(expected_values, expected_values_min_clamp, expected_values_max_clamp)

        #
        # Critic's loss: distance between Q(s, a) for the stored actions and the TD target
        #
        critic_state_action_values = self.critic_net(states, actions)
        critic_loss = self.critic_loss(critic_state_action_values, expected_values)

        #
        # Optimisation of both networks (gradient norm clipped to 1)
        #

        # actor step; this backward pass also leaves gradients on the critic's
        # parameters, which the critic's zero_grad() below discards
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.actor_net.parameters(), 1)
        self.actor_optimizer.step()

        # critic step
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_net.parameters(), 1)
        self.critic_optimizer.step()

        #
        # Target networks soft-update (every C steps)
        #
        if step % self.critic_target_network_parameter_update_steps == 0:
            self._soft_update(self.critic_target_net, self.critic_net, tau)

        if step % self.actor_target_network_parameter_update_steps == 0:
            self._soft_update(self.actor_target_net, self.actor_net, tau)

    def save_agent_model(self, model_path):
        """Save the actor's parameters (state dict) to ``model_path``."""
        torch.save(self.actor_net.state_dict(), model_path)

    def load_agent_model(self, model_path):
        """Load actor parameters saved by ``save_agent_model`` for inference.

        ``map_location`` remaps the stored tensors onto the current device, so
        a checkpoint written on a GPU machine also loads on a CPU-only one.
        """
        self.actor_net.load_state_dict(torch.load(model_path, map_location=device))
        

<h1>Auxiliary Functions</h1>

In [None]:
def plot_scores(i_episode, scores):
    """Replace the cell's previous output with an up-to-date score curve.

    Args:
        i_episode: number of the episode just finished (shown in the title).
        scores:    per-episode average scores so far; must be non-empty
                   because the title shows the last entry.
    """
    # drop the previous plot so the cell shows a single, refreshed figure
    clear_output(True)

    figure = plt.figure(figsize=(30, 5))

    # the curve occupies the first cell of a 1x3 grid
    axes = figure.add_subplot(131)

    axes.set_title('Episode %s - Last episode average score: %s' % (i_episode, scores[-1]))
    axes.set_ylabel('Score')
    axes.set_xlabel('Episode #')
    axes.grid(True)

    # episodes are numbered from 1 on the x axis
    episode_numbers = np.arange(1, len(scores) + 1)
    axes.plot(episode_numbers, scores)

    plt.show()

<h1>Training Routine</h1>

In [None]:
def train(number_of_episodes=1000, max_steps_per_episode=1000, average_score_episodes=100, print_every_episodes=10, print_every_steps=100, target_score=30):
    """Train a fresh AgentsHandler on the notebook's Unity environment.

    Relies on the notebook-level names ``env``, ``brain_name``, ``num_agents``,
    ``state_size`` and ``action_size`` created by the control cell.

    Args:
        number_of_episodes:     maximum number of episodes to run.
        max_steps_per_episode:  step cap per episode.
        average_score_episodes: length of the rolling window used for the
                                "solved" criterion.
        print_every_episodes:   refresh the score plot every this many episodes.
        print_every_steps:      print the running score every this many steps.
        target_score:           the environment counts as solved once the mean
                                of a *full* rolling window exceeds this value.

    Side effects:
        Saves the actor to ``agent_model_trained_<timestamp>.pth`` when the
        loop ends normally, and always closes ``env`` on the way out, even if
        training raises or is interrupted.
    """
    # scores bookkeeping
    last_100_episodes_scores = deque(maxlen = average_score_episodes)
    scores_log = []

    # instantiates the agent
    agents_handler = AgentsHandler(num_agents, state_size, action_size)

    # starts counter in seconds
    time_start = time.time()

    try:
        for i_episode in range(1, number_of_episodes + 1):

            # reset the environment
            env_info = env.reset(train_mode = True)[brain_name]

            # get the current state of the environment (for each agent)
            states = env_info.vector_observations

            # resets the noise function
            agents_handler.reset_noise()

            # initialize the score (for each agent)
            episode_total_scores = np.zeros(num_agents)

            for step in range(1, max_steps_per_episode + 1):

                # gets an action from the actor's network
                actions = agents_handler.act(states, step)

                # takes a step in the environment
                env_info = env.step(actions)[brain_name]

                # gets rewards, next_states and dones (one entry per agent)
                rewards = env_info.rewards
                next_states = env_info.vector_observations
                dones = env_info.local_done

                # stores the transitions and runs a learning step
                agents_handler.step(step, states, actions, rewards, next_states, dones)

                # sets the next state
                states = next_states

                # updates the current episode's score of every agent
                episode_total_scores += rewards

                # echoes the episode's current total reward
                if step % print_every_steps == 0:
                    print("\nEpisode {} step {} average reward so far -> {}".format(i_episode, step, np.mean(episode_total_scores)))

                # the episode ends as soon as any agent reports done
                if np.any(dones):
                    break

            # calculates the episode's average score across all agents
            episode_average_score = np.mean(episode_total_scores)

            # adds this episode's average score to the full log and to the rolling window
            scores_log.append(episode_average_score)
            last_100_episodes_scores.append(episode_average_score)

            # refreshes the score plot
            if i_episode % print_every_episodes == 0:
                plot_scores(i_episode, scores_log)

            # Solved only when the rolling window is full: with a partial
            # window a single lucky episode could end training early.
            window_is_full = len(last_100_episodes_scores) == average_score_episodes
            if window_is_full and np.mean(last_100_episodes_scores) > target_score:
                print("Environment resolved in {} episodes!".format(i_episode))
                break

        # stop the timer
        time_end = time.time()

        # prints the duration in minutes
        print((time_end-time_start) / 60)

        # builds a timestamped model name
        now = datetime.datetime.now()
        model_path = 'agent_model_trained_' + now.strftime("%Y%m%d_%H%M%S") + '.pth'

        # saves the learned model
        agents_handler.save_agent_model(model_path)

        # echoes model path
        print("Model saved at: ", model_path)

    finally:
        # always release the Unity environment, even on error or interrupt
        env.close()

<h1>Testing Routine</h1>

In [None]:
def test(number_of_episodes=10, max_steps_per_episode=1000, average_score_episodes=100, print_every_episodes=1, print_every_steps=100, model_path="agent_model_trained_20181019_123626.pth"):
    """Run a saved actor in the notebook's Unity environment and plot its scores.

    Relies on the notebook-level names ``env``, ``brain_name``, ``num_agents``,
    ``state_size`` and ``action_size`` created by the control cell.

    Args:
        number_of_episodes:     episodes to play.
        max_steps_per_episode:  steps played per episode (always run in full).
        average_score_episodes: accepted for signature parity with ``train``;
                                not used here.
        print_every_episodes:   refresh the score plot every this many episodes.
        print_every_steps:      print the running score every this many steps.
        model_path:             checkpoint written by
                                ``AgentsHandler.save_agent_model``.

    Note:
        ``AgentsHandler.act`` adds exploration noise whenever the notebook-level
        ``training_mode`` flag is True, so keep that flag False when evaluating.

    Side effects:
        Always closes ``env`` on the way out, even if evaluation raises.
    """
    # scores bookkeeping
    scores_log = []

    # instantiates the agent
    agents_handler = AgentsHandler(num_agents, state_size, action_size)

    try:
        # loads the saved parameters into the actor network
        agents_handler.load_agent_model(model_path)

        for i_episode in range(1, number_of_episodes + 1):

            # reset the environment
            env_info = env.reset(train_mode = False)[brain_name]

            # get the current state of the environment (for each agent)
            states = env_info.vector_observations

            # initialize the score (for each agent)
            episode_total_scores = np.zeros(num_agents)

            for step in range(1, max_steps_per_episode + 1):

                # gets an action from the actor's network
                actions = agents_handler.act(states, step)

                # takes a step in the environment
                env_info = env.step(actions)[brain_name]

                # gets rewards and next_states (one entry per agent)
                rewards = env_info.rewards
                next_states = env_info.vector_observations

                # sets the next state
                states = next_states

                # updates the current episode's score of every agent
                episode_total_scores += rewards

                # echoes the episode's current total reward
                if step % print_every_steps == 0:
                    print("\nEpisode {} step {} average reward so far -> {}".format(i_episode, step, np.mean(episode_total_scores)))

            # calculates the episode's average score across all agents
            episode_average_score = np.mean(episode_total_scores)

            # adds this episode's average score to the complete scores log
            scores_log.append(episode_average_score)

            # refreshes the score plot
            if i_episode % print_every_episodes == 0:
                plot_scores(i_episode, scores_log)

    finally:
        # always release the Unity environment, even on error
        env.close()

<h1>Control Routines for Training and Testing</h1>

In [None]:
# Master switch for the training run. AgentsHandler.act() reads this same
# module-level flag to decide whether exploration noise is added to actions.
training_mode = False

if training_mode == True:
    
    # echoes current task
    print("Training started... :-)")
    
    # launches the Reacher20.app Unity build without graphics, with a fixed seed
    env = UnityEnvironment(file_name='Reacher20.app', no_graphics=True, seed=0)

    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # resets the environment once (train_mode=False) only to inspect its dimensions;
    # train() resets it again with train_mode=True at the start of every episode
    env_info = env.reset(train_mode=False)[brain_name]

    # number of agents
    num_agents = len(env_info.agents)
    print('Number of agents:', num_agents)

    # size of each action
    action_size = brain.vector_action_space_size
    print('Size of each action:', action_size)

    # examine the state space: one row of observations per agent
    states = env_info.vector_observations
    state_size = states.shape[1]

    print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
    print('The state for the first agent looks like:', states[0])
    
    # starts the training run; train() reads env, brain_name, num_agents,
    # state_size and action_size from this cell and closes env when it finishes
    train()

In [None]:
# Master switch for the evaluation run.
# NOTE: test() calls AgentsHandler.act(), which adds exploration noise whenever
# the module-level `training_mode` flag is True, so keep that flag False here.
testing_mode = True

if testing_mode == True:
    
    # echoes current task
    print("Testing started... :-)")
    
    # launches the Reacher20.app Unity build with graphics enabled, with a fixed seed
    env = UnityEnvironment(file_name='Reacher20.app', no_graphics=False, seed=10)

    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # resets the environment once to inspect its dimensions;
    # test() resets it again at the start of every episode
    env_info = env.reset(train_mode=False)[brain_name]

    # number of agents
    num_agents = len(env_info.agents)
    print('Number of agents:', num_agents)

    # size of each action
    action_size = brain.vector_action_space_size
    print('Size of each action:', action_size)

    # examine the state space: one row of observations per agent
    states = env_info.vector_observations
    state_size = states.shape[1]

    print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
    print('The state for the first agent looks like:', states[0])
    
    # starts the evaluation run; test() reads env, brain_name, num_agents,
    # state_size and action_size from this cell and closes env when it finishes
    test()