In [1]:
from unityagents import UnityEnvironment
import numpy as np
import random
from collections import namedtuple, deque, OrderedDict
import torch
import time
import torch.nn.functional as F
from torch import nn
import torch.optim as optim
from scipy import signal

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

def np_to_torch(a):
    return torch.from_numpy(a).float().to(device)


def scale_weights(model, scale):
    for param in model.parameters():
        param.data.copy_(param.data * scale)


class ActorNetwork(nn.Module):
    """
    An actor network which simultaneously prescribes an action to take as a fucntion of state.
    """
    
    def __init__(self, state_size, action_size, seed, hidden_sizes_list):
        """Initialize parameters and build model.
        
        Inputs:
            state_size (int): Dimension of each state.
            action_size (int): Dimension of each action.
            seed (int): Random seed.
            hidden_sizes_list (list): The sizes of hidden layers in the network.
        """
        super(ActorNetwork, self).__init__()
        self.seed = torch.manual_seed(seed)
        model = OrderedDict([
            ['bn_1', nn.BatchNorm1d(state_size)],
            ['fc_1', nn.Linear(state_size, hidden_sizes_list[0])],
            ['relu_1', nn.ReLU()]
        ])
        for i in range(1, len(hidden_sizes_list)):
            model['bn_{}'.format(i + 1)] = nn.BatchNorm1d(hidden_sizes_list[i - 1])
            model['fc_{}'.format(i + 1)] = nn.Linear(hidden_sizes_list[i - 1], hidden_sizes_list[i])
            model['relu_{}'.format(i + 1)] = nn.ReLU()
        model['bn_output'] = nn.BatchNorm1d(hidden_sizes_list[-1])
        model['fc_output'] = nn.Linear(hidden_sizes_list[-1], action_size)
        self.model = nn.Sequential(model)
        self.tanh = nn.Tanh()

    def forward(self, state):
        """ Forward propagation through the network."""
        return self.tanh(self.model(state))
#         return self.model(state)


class CriticNetwork(nn.Module):
    """
    A critic network which predicts the action value for a state.
    Action is inserted as an input to the last hidden layer.
    """
    
    def __init__(self, state_size, action_size, seed, hidden_sizes_list):
        """Initialize parameters and build model.
        
        Inputs:
            state_size (int): Dimension of each state.
            action_size (int): Dimension of each action.
            seed (int): Random seed.
            hidden_sizes_list (list): The sizes of hidden layers in the network.
        """
        super(CriticNetwork, self).__init__()
        self.seed = torch.manual_seed(seed)
        model_up_to_last_hidden = OrderedDict([
            ['bn_1', nn.BatchNorm1d(state_size)],
            ['fc_1', nn.Linear(state_size, hidden_sizes_list[0])],
            ['relu_1', nn.ReLU()]
        ])
        for i in range(1, len(hidden_sizes_list) - 1):
            model_up_to_last_hidden['bn_{}'.format(i + 1)] = nn.BatchNorm1d(hidden_sizes_list[i - 1])
            model_up_to_last_hidden['fc_{}'.format(i + 1)] = nn.Linear(hidden_sizes_list[i - 1], hidden_sizes_list[i])
            model_up_to_last_hidden['relu_{}'.format(i + 1)] = nn.ReLU()
        self.model_up_to_last_hidden = nn.Sequential(model_up_to_last_hidden)
        self.last_hidden_bn = nn.BatchNorm1d(hidden_sizes_list[-2])
        self.last_hidden_layer = nn.Linear(hidden_sizes_list[-2], hidden_sizes_list[-1])
        self.last_bn = nn.BatchNorm1d(hidden_sizes_list[-1] + action_size)
        self.output_layer = nn.Linear(hidden_sizes_list[-1] + action_size, 1)
        self.relu = nn.ReLU()

    def forward(self, state, action):
        """ Forward propagation through the network."""
        x = self.model_up_to_last_hidden(state)
        x = self.relu(self.last_hidden_layer(self.last_hidden_bn(x)))
        x = torch.cat([x, action], 1)
        x = self.last_bn(x)
        x = self.output_layer(x)
        return x

In [3]:
def weighted_mse_loss(pred, target, weights):
    return torch.sum(weights * (pred - target) ** 2)


class ReplayBuffer:
    """Fixed-size buffer to store experience tuples."""

    def __init__(self, action_size, buffer_size, batch_size, seed, use_prioritized_replay):
        """Initialize a ReplayBuffer object.

        Inputs:
            action_size (int): dimension of each action
            buffer_size (int): maximum size of buffer
            batch_size (int): size of each training batch
            seed (int): random seed
            use_prioritized_replay (boolean)
        """
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.weights = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
        self.seed = random.seed(seed)
        self.use_prioritized_replay = use_prioritized_replay
    
    def add(self, states, actions, rewards, next_states, dones):
        """Add a set of new experiences to memory.
        
        Inputs:
            states: Current state
            actions: Action taken in the current state.
            rewards: Reward received at the current step.
            next_states: A state into which the transition occurs.
            dones: An indicator of the transition learding into a terminal state.
        """
        if states.ndim == 1:
            es = [self.experience(states, actions, rewards, next_states, dones)]
        else:
#             print('states')
#             print(states)
#             print('actions')
#             print(actions)
#             print('rewards')
#             print(rewards)
#             print('next_states')
#             print(next_states)
#             print('dones')
#             print(dones)
            es = [self.experience(states[i], actions[i], rewards[i], next_states[i], dones[i]) for i in range(states.shape[0])]
        for e in es:
            self.memory.append(e)
            if self.use_prioritized_replay:
                self.weights.append(10)
            else:
                self.weights.append(1)
    
    def sample(self):
        """Randomly sample a batch of experiences from memory."""
#         if self.use_prioritized_replay:
        indices = random.choices(range(len(self.memory)), weights=self.weights, k=self.batch_size)
        experiences = [self.memory[i] for i in indices]
        weights = [self.weights[i] for i in indices]
#         else:
#             experiences = random.sample(self.memory, k=self.batch_size)
#             indices = None
#             weights = None

        states = np_to_torch(np.vstack([e.state for e in experiences if e is not None]))
        actions = np_to_torch(np.vstack([e.action for e in experiences if e is not None]))
        rewards = np_to_torch(np.vstack([e.reward for e in experiences if e is not None]))
        next_states = np_to_torch(np.vstack([e.next_state for e in experiences if e is not None]))
        dones = np_to_torch(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8))
  
        return (states, actions, rewards, next_states, dones, indices, weights)

    def __len__(self):
        """Return the current size of internal memory."""
        return len(self.memory)
    
    def update_weights(self, indices, weights):
        assert len(indices) == len(weights)
        for i,w in zip(indices, weights):
            self.weights[i] = w

In [4]:
class OUProcess():
    """
    Orstein-Uhlenbeck process.
    """
    
    def __init__(self, theta, sigma, dims):
        self.theta = theta
        self.sigma = sigma
        self.dims = dims
        self.values = np.zeros(dims)
        
    def sample(self):
        self.values += -self.theta * self.values + self.sigma * np.random.normal(size=self.dims)
        return self.values

class Agent():
    """
    An RL agent which can interact with an environment and learn from replayed experiences.
    Uses an actor-critic method for continuous actions (DDPG).
    """
    
    def __init__(self, num_instances, state_size, action_size, seed, actor_hidden_sizes_list, critic_hidden_sizes_list,
                 num_iters_learn=1, update_every=4, batch_size=128, gamma=0.99, buffer_size=int(1e6),
                 update_target_network_every=1, actor_lr0=1e-4, critic_lr0=1e-3, use_prioritized_replay=True,
                 noise_theta=0.15, noise_sigma=0.2, weight_decay=0.0, actor_reg_loss_weight=0.0):
        """ Initialize an Agent object.
        
        Inputs:
            num_instances (int): Number of actors running in parallel.
            state_size (int): The dimensionality of the state space.
            action_size (int): Number of possible actions an agent can take.
            seed (int): Randomization seed.
            actor_hidden_sizes_list (list): The sizes of hidden layers in the Q network.
            critic_hidden_sizes_list (list): The sizes of hidden layers in the Q network.
            num_iters_learn (int, optional): Number of iterations to take at each step towards the targets.
            update_every (int, optional): How often the main networks are updated (default 1).
            batch_size (int, optional): Batch size for each upate (default 64).
            gamma (float, optional): Temporal discount coefficient (default 0.99).
            buffer_size (int, optional): Maximum capacity of the replay buffer (default 1e6).
            update_target_network_every (int, optional): How often to update the target network (default 1).
            actor_lr0 (float, optional): Initial learning rate for the actor (default 1e-4).
            critic_lr0 (float, optional): Initial learning rate for the critic (default 1e-3).
            use_prioritized_replay (boolean, optional): Whether to use prioritized (instead of regular) experience replay (default True).
            noise_theta (float): Parameter of Ornstein-Uhlenbeck noise process.
            noise_sigma (float): Parameter of Ornstein-Uhlenbeck noise process.
            weight_decay (float): Weight decay parameter for both actor and critic networks.
            actor_reg_loss_weight (float): Weight placed on L2 norm of actions.
        """
        self.num_instances = num_instances
        self.action_size = action_size
        self.state_size = state_size
        self.seed = seed
        self.num_iters_learn = num_iters_learn
        self.update_every = update_every
        self.batch_size = batch_size
        self.gamma = gamma
        self.actor_lr0 = actor_lr0
        self.critic_lr0 = critic_lr0
        self.use_prioritized_replay = use_prioritized_replay
        self.update_target_network_every = update_target_network_every
        self.actor_network_main = ActorNetwork(state_size, action_size, seed, actor_hidden_sizes_list).to(device)
        self.actor_network_target = ActorNetwork(state_size, action_size, seed, actor_hidden_sizes_list).to(device)
        scale_weights(self.actor_network_main, 0.001)
        self.soft_update(self.actor_network_main, self.actor_network_target, tau=1.0)
        self.actor_network_target.eval()
        self.critic_network_main = CriticNetwork(state_size, action_size, seed, actor_hidden_sizes_list).to(device)
        self.critic_network_target = CriticNetwork(state_size, action_size, seed, actor_hidden_sizes_list).to(device)
        scale_weights(self.critic_network_main, 0.001)
        self.soft_update(self.critic_network_main, self.critic_network_target, tau=1.0)
        self.critic_network_target.eval()
        self.actor_optimizer = optim.Adam(self.actor_network_main.parameters(), lr=actor_lr0, weight_decay=weight_decay)
        self.critic_optimizer = optim.Adam(self.critic_network_main.parameters(), lr=critic_lr0, weight_decay=weight_decay)
        self.actor_reg_loss_weight = actor_reg_loss_weight
        self.actor_reg_loss_fn = nn.MSELoss()

        # Replay memory
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, seed, use_prioritized_replay)
        # Initialize time step (for updating every "update_every" steps)
        self.t_step = 0
        # Initialize OU noise
        self.noise = OUProcess(theta=noise_theta, sigma=noise_sigma, dims=(num_instances, action_size))

    def step(self, states, actions, rewards, next_states, dones):       
        """
        Process a vector of state changes.
        Periodically learn (update network) if enough experiences are available in the replay buffer.
        
        Inputs:
            states: Current state
            actions: Action taken in the current state.
            rewards: Reward received at the current step.
            next_states: A state into which the transition occurs.
            dones: An indicator of the transition learding into a terminal state.
        """
        self.t_step += 1
        if self.t_step % self.update_every == 0: # Learn every "update_every" time steps.
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences, self.gamma)
        
        # Save experience in replay memory
        self.memory.add(states, actions, rewards, next_states, dones)
                
    def act(self, states):
        """Returns actions for given states as per current policy.
        
        Inputs:
            states (array_like): current states
            eps (float): epsilon, for epsilon-greedy action selection
        """
        states = torch.from_numpy(states).float().to(device)
        self.actor_network_main.eval()
        with torch.no_grad():
            actions = self.actor_network_main(states)
        self.actor_network_main.train()
        return np.tanh(np.arctanh(actions.numpy()) + self.noise.sample())
                
    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Inputs:
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones, indices, weights = experiences
        probs = np.array(weights) / sum(weights)
        is_weights = np.power(1.0 / (len(self.memory) * probs), 0.5)
        is_weights = is_weights / max(is_weights)

        # get targets for the critic network
        with torch.no_grad():
            actor_target_next_actions = self.actor_network_target(next_states)
            critic_targets = (rewards + gamma *\
                self.critic_network_target(next_states, actor_target_next_actions) * (1 - dones)).squeeze()

        
        for _ in range(self.num_iters_learn):
            critic_predictions = self.critic_network_main(states, actions).squeeze()
            critic_errors = (critic_predictions - critic_targets).squeeze().squeeze()
            critic_loss = weighted_mse_loss(critic_predictions, critic_targets, torch.from_numpy(is_weights).float())
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()
            
            # objective function for the actor
            actor_proposed_actions = self.actor_network_main(states)
            actor_objective = self.critic_network_target(states, actor_proposed_actions).mean()
            actor_loss = -actor_objective + self.actor_reg_loss_weight * self.actor_reg_loss_fn(actor_proposed_actions,
                                                           torch.zeros_like(actor_proposed_actions))
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()
        if self.use_prioritized_replay:
            self.memory.update_weights(indices, np.power(0.05 + critic_errors.abs().data.numpy(), 0.5))
        
        if self.t_step % self.update_target_network_every == 0:
            self.soft_update(self.critic_network_main, self.critic_network_target, 1e-3)
            self.soft_update(self.actor_network_main, self.actor_network_target, 1e-3)

    def soft_update(self, main_model, target_model, tau=1e-3):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Inputs:
            main_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), main_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)


In [5]:
env = UnityEnvironment(file_name='Reacher_1.app')
# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
env_info = env.reset(train_mode=True)[brain_name]
num_agents = len(env_info.agents)
action_size = brain.vector_action_space_size
states = env_info.vector_observations
state_size = states.shape[1]


agent = Agent(num_instances=num_agents,
              state_size=state_size,
              action_size=action_size,
              seed=0,
              actor_hidden_sizes_list=[128, 128],
              critic_hidden_sizes_list=[128, 128],
              gamma=0.99,
              num_iters_learn=3,
              actor_lr0=3e-5,
              critic_lr0=3e-4,
              weight_decay=0.00,
              actor_reg_loss_weight=0.0,
              update_every=3,
              noise_sigma=0.05,
              buffer_size=int(1e5),
              batch_size=128,
              use_prioritized_replay=False)
n_episodes=1000
max_t=10000
actor_min_lr = 3e-7
critic_min_lr = 3e-6
lr_decay_episode = 0.996
scores = []

# def dqn(env, agent, scores, n_episodes=1500, max_t=1000, eps_start=0.1, eps_end=0.001,
#         eps_decay=0.993, lr_decay_episode=0.996, min_lr=1e-6):
#     """Deep Q-Learning.
    
#     Params
#     ======
#         n_episodes (int): maximum number of training episodes
#         max_t (int): maximum number of timesteps per episode
#         eps_start (float): starting value of epsilon, for epsilon-greedy action selection
#         eps_end (float): minimum value of epsilon
#         eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
#         lr_decay_episode (float): Learning rate decay multiplier (per episode)
#         min_lr (float): Minimum learning rate (capped at the bottom at this value)
#     """
scores_window = deque(maxlen=100)  # last 100 scores
hundred_episodes_start_time = time.time()
solved = False
for i_episode in range(1, n_episodes+1):
    for param_group in agent.actor_optimizer.param_groups:
        # adjust learning rate
        param_group['lr'] = max(agent.actor_lr0 * lr_decay_episode**i_episode, actor_min_lr)
    for param_group in agent.critic_optimizer.param_groups:
        # adjust learning rate
        param_group['lr'] = max(agent.critic_lr0 * lr_decay_episode**i_episode, critic_min_lr)
    episode_start_time = time.time()
    env_info = env.reset(train_mode=True)[env.brain_names[0]] # reset the environment
    states = env_info.vector_observations            # get the current state
    score = 0
    for t in range(max_t):
        actions = agent.act(states)
        env_info = env.step(actions)[env.brain_names[0]]        # send the actions to the environment
        next_states = env_info.vector_observations   # get the next states
        rewards = env_info.rewards                   # get the reward
        dones = env_info.local_done                  # see if episode has finished
        score += np.mean(rewards)                                # update the score
        agent.step(states, actions, rewards, next_states, dones)
        states = next_states                             # roll over the state to next time step
        if np.any(dones):                                       # exit loop if episode finished
            break
#     print('actions')
#     print(actions)
#     print('-' * 70)
    scores_window.append(score)       # save most recent score
    scores.append(score)              # save most recent score
    episode_end_time = time.time()
    print('\rEpisode {}\tAverage Score: {:.2f}, episode took {:.2f} seconds'.format(i_episode, np.mean(scores_window),
          episode_end_time - episode_start_time), end="")
    if i_episode % 100 == 0:
        hundred_episodes_end_time = time.time()
        print('\rEpisode {}\tAverage Score: {:.2f}, 100 episodes took {:.2f} seconds'.format(i_episode,\
            np.mean(scores_window), hundred_episodes_end_time - hundred_episodes_start_time))
        torch.save(agent.actor_network_main.state_dict(), 'checkpoints/checkpoint_actor_{}.pth'.format(i_episode))
        torch.save(agent.critic_network_main.state_dict(), 'checkpoints/checkpoint_critic_{}.pth'.format(i_episode))
        hundred_episodes_start_time = time.time()
    if (np.mean(scores_window) >= 30.0) & ~solved:
        solved = True
        print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode-100,\
            np.mean(scores_window)))
        torch.save(agent.actor_network_main.state_dict(), 'checkpoints/final_checkpoint_actor.pth')
        torch.save(agent.critic_network_main.state_dict(), 'checkpoints/final_checkpoint_critic.pth')
#         break

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_speed -> 1.0
		goal_size -> 5.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


Episode 49	Average Score: 0.50, episode took 21.80 seconds



Episode 100	Average Score: 0.69, 100 episodes took 4577.58 seconds
Episode 200	Average Score: 0.88, 100 episodes took 2629.98 seconds
Episode 300	Average Score: 0.87, 100 episodes took 2363.56 seconds
Episode 400	Average Score: 0.90, 100 episodes took 2481.15 seconds
Episode 500	Average Score: 0.88, 100 episodes took 2554.07 seconds
Episode 600	Average Score: 0.88, 100 episodes took 8241.52 seconds
Episode 700	Average Score: 0.97, 100 episodes took 2465.34 seconds
Episode 800	Average Score: 0.85, 100 episodes took 2331.99 seconds
Episode 900	Average Score: 0.86, 100 episodes took 2345.59 seconds
Episode 1000	Average Score: 0.94, 100 episodes took 2382.06 seconds


In [6]:
# plot the scores
fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(np.arange(len(scores)), scores)
plt.ylabel('Score')
plt.xlabel('Episode #')
plt.show()

NameError: name 'plt' is not defined

In [None]:
plt.plot(np.arange(len(scores)), signal.savgol_filter(scores, 53, 3))
plt.ylabel('Score')
plt.xlabel('Episode #')
plt.show()