In [None]:
#Code Adapted from https://github.com/TianhongDai/hindsight-experience-replay
#and https://github.com/openai/baselines/tree/master/baselines/her

In [1]:
%%file Networks.py

import torch
import torch.nn as nn

class Actor(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(Actor, self).__init__()
        self.fc1 = nn.Linear(state_dim, 256)
        self.fc2 = nn.Linear(256, 256)
        self.fc3 = nn.Linear(256, 256)
        self.fc4 = nn.Linear(256,action_dim)

        self.relu = nn.ReLU()
        self.tanh = nn.Tanh()
        
    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.relu(self.fc3(x))
        x = self.tanh(self.fc4(x))
        return x

class Critic(nn.Module):
    def __init__(self, state_dim,action_dim):
        super(Critic, self).__init__()
        self.fc1 = nn.Linear(state_dim + action_dim, 256)
        self.fc2 = nn.Linear(256, 256)
        self.fc3 = nn.Linear(256, 256)
        self.fc4 = nn.Linear(256,1)
        
        self.relu = nn.ReLU()

    def forward(self, x, actions):
        x = torch.cat([x, actions], dim=1)
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.relu(self.fc3(x))
        q = self.fc4(x)
        return q

Writing Networks.py


In [2]:
%%file HER_ReplayBuffer.py
import threading
import numpy as np

class HER_ReplayBuffer:
    #Replay Buffer code adapted from https://github.com/openai/baselines/blob/master/baselines/her/replay_buffer.py
    #Sampler code adapted from https://github.com/openai/baselines/blob/master/baselines/her/her_sampler.py
    def __init__(self, buffer_size, state_dim, goal_dim, action_dim,max_steps,reward_function = None, strategy = 'Method3'):
        self.buffer_size = buffer_size
        self.state_dim = state_dim
        self.goal_dim = goal_dim
        self.action_dim = action_dim
        self.max_steps = max_steps
        self.max_ep_in_buff = int(self.buffer_size / self.max_steps)
        self.reward_function = reward_function
        
        self.current_size = 0
        self.buffer = {'state': np.empty([self.max_ep_in_buff, self.max_steps + 1, self.state_dim]),
                       'achieved_goal':np.empty([self.max_ep_in_buff,self.max_steps+1, self.goal_dim]),
                       'desired_goal':np.empty([self.max_ep_in_buff,self.max_steps,self.goal_dim]),
                       'action':np.empty([self.max_ep_in_buff,self.max_steps,self.action_dim])
                      }
        self.strategy = strategy
        self.epsilon = 0.3
        
        self.lock = threading.Lock()
        
    def add_replay(self,episode):
        roll = False
        if self.current_size + 1 > self.buffer_size:
            roll = True
        else:
            self.current_size += 1
        for ind,key in enumerate(self.buffer.keys()):
            #Roll each buffer entry to the left, insert new entry at end if buffer is full
            if roll:
                self.buffer[key] = np.roll(self.buffer[key],-1,axis = 0)
            self.buffer[key][self.current_size - 1] = episode[ind]
    
    def sample(self,batch_size,temp_buffer = None,k = 4):
        her_ratio = 1 - (1/(1+k))
        if temp_buffer:
            current_size = temp_buffer['actions'].shape[0]
            max_steps = temp_buffer['actions'].shape[1]
        else:
            temp_buffer = {}
            for key in self.buffer.keys():
                temp_buffer[key] = self.buffer[key][:self.current_size]
            temp_buffer['next_state'] = temp_buffer['state'][:,1:,:]
            temp_buffer['next_agoal'] = temp_buffer['achieved_goal'][:,1:,:]
            current_size = self.current_size 
            max_steps = self.max_steps
        #Sampling Using HER 'future' strategy
        episode_inds = np.random.randint(0,current_size,size = batch_size)
        transition_inds = np.random.randint(max_steps,size = batch_size)
        transitions = {key: temp_buffer[key][episode_inds, transition_inds]\
                       for key in temp_buffer.keys()}
        #Sampling k HER transitions for every non HER transition. 
        her_inds = np.where(np.random.uniform(size=batch_size) < her_ratio)
        
        if (self.strategy == 'Vanilla'):
            new_goal_inds = np.random.randint(transition_inds[her_inds]+1,max_steps+1)
        else:
            #Sample new desired goal inds weighted by distance to actual goal
            agoal_inds = np.array([np.arange(start+1,max_steps+1) for start in transition_inds[her_inds]],dtype = object)
            agoals = np.array([temp_buffer['achieved_goal'][e_ind,g_ind] for e_ind,g_ind in zip(episode_inds,agoal_inds)],dtype=object)
            dgoals = temp_buffer['desired_goal'][episode_inds,0]

            dists = np.array([np.sqrt(np.sum(np.square(ag-dg),axis = 1)) for ag,dg in zip(agoals,dgoals)],dtype = object)
            if (self.strategy == 'Method1'):
                new_goal_inds = np.array([ag_inds[np.argmax(d)] for ag_inds,d in zip(agoal_inds,dists)])       
            elif (self.strategy == 'Method2'):
                new_goal_inds = np.array([ag_inds[np.argmax(d)] \
                                          if np.random.rand() > self.epsilon else ag_inds[np.random.randint(0,len(ag_inds))] \
                                          for ag_inds,d in zip(agoal_inds,dists)])     
            elif (self.strategy == 'Method3'):
                weights = np.array([d/sum(d) for d in dists],dtype = object)
                new_goal_inds = np.array([np.random.choice(ag_inds,p=w) for ag_inds,w in zip(agoal_inds,weights)])     
            else:
                raise ValueError('Strategy {} does not exist'.format(self.strategy))
        
        transitions['desired_goal'][her_inds] = temp_buffer['achieved_goal'][episode_inds[her_inds],new_goal_inds]
        
        transitions['reward'] = self.reward_function(transitions['next_agoal'],transitions['desired_goal'],None)
        transitions['reward'] = np.expand_dims(transitions['reward'],1)
        # transitions = {k: transitions[k].reshape(batch_size, *transitions[k].shape[1:]) for k in transitions.keys()}
        return transitions

Writing HER_ReplayBuffer.py


In [3]:
%%file Normalizer.py
import threading
import numpy as np
from mpi4py import MPI

class Normalizer:
    def __init__(self,dim_size,eps=1e-2,clip_range = np.inf):
        self.dim_size = dim_size #Dimensional size of the input
        self.eps = eps
        self.clip_range = clip_range
        
        #Local sum of each process
        self.local_sum = np.zeros(self.dim_size, np.float32)
        self.local_sumsq = np.zeros(self.dim_size, np.float32)
        self.local_count = np.zeros(1, np.float32)

        self.total_sum = np.zeros(self.dim_size, np.float32)
        self.total_sumsq = np.zeros(self.dim_size, np.float32)
        self.total_count = np.ones(1, np.float32)
        
        # get the mean and std
        self.mean = np.zeros(self.dim_size, np.float32)
        self.std = np.ones(self.dim_size, np.float32)
        
        self.lock = threading.Lock()
        
    def update_local(self,input_arr):
        input_arr = input_arr.reshape(-1, self.dim_size)

        with self.lock:
            self.local_sum += input_arr.sum(axis=0)
            self.local_sumsq += (np.square(input_arr)).sum(axis=0)
            self.local_count[0] += input_arr.shape[0]
    
    def sync(self, local_sum, local_sumsq, local_count):
        local_sum[...] = self.mpi_sum(local_sum)
        local_sumsq[...] = self.mpi_sum(local_sumsq)
        local_count[...] = self.mpi_sum(local_count)
        return local_sum, local_sumsq, local_count

    def mpi_sum(self, x):
        buf = np.zeros_like(x)
        MPI.COMM_WORLD.Allreduce(x, buf, op=MPI.SUM)
        buf /= MPI.COMM_WORLD.Get_size()
        return buf
    
    def update_all(self):
        sync_sum, sync_sumsq, sync_count = self.sync(self.local_sum,\
                                                     self.local_sumsq, \
                                                     self.local_count)

        with self.lock:
            self.local_sum = self.local_sum * 0
            self.local_sumsq = self.local_sumsq * 0
            self.local_count = self.local_count * 0

        self.total_sum += sync_sum
        self.total_sumsq += sync_sumsq
        self.total_count += sync_count
        
        self.mean = self.total_sum/self.total_count
        self.std = np.sqrt(np.maximum(np.square(self.eps), \
                    (self.total_sumsq / self.total_count) - \
                    np.square(self.total_sum / self.total_count)))
        
    def normalize(self,input_arr):
        return np.clip((input_arr-self.mean)/self.std, -self.clip_range, self.clip_range)

Writing Normalizer.py


In [4]:
%%file mpi_utils.py
from mpi4py import MPI
import numpy as np
import torch

# sync_networks across the different cores
def sync_networks(network):
    """
    netowrk is the network you want to sync
    """
    comm = MPI.COMM_WORLD
    flat_params = _get_flat_params_or_grads(network, mode='params')
    comm.Bcast(flat_params, root=0)
    # set the flat params back to the network
    _set_flat_params_or_grads(network, flat_params, mode='params')

def sync_grads(network):
    flat_grads = _get_flat_params_or_grads(network, mode='grads')
    comm = MPI.COMM_WORLD
    global_grads = np.zeros_like(flat_grads)
    comm.Allreduce(flat_grads, global_grads, op=MPI.SUM)
    _set_flat_params_or_grads(network, global_grads, mode='grads')

# get the flat grads or params
def _get_flat_params_or_grads(network, mode='params'):
    """
    include two kinds: grads and params
    """
    attr = 'data' if mode == 'params' else 'grad'
    return np.concatenate([getattr(param, attr).cpu().numpy().flatten() for param in network.parameters()])

def _set_flat_params_or_grads(network, flat_params, mode='params'):
    """
    include two kinds: grads and params
    """
    attr = 'data' if mode == 'params' else 'grad'
    # the pointer
    pointer = 0
    for param in network.parameters():
        getattr(param, attr).copy_(torch.tensor(flat_params[pointer:pointer + param.data.numel()]).view_as(param.data))
        pointer += param.data.numel()

Writing mpi_utils.py


In [27]:
%%file DDPG_HER.py

import torch
import numpy as np
import copy 
from mpi4py import MPI
from mpi_utils import sync_networks, sync_grads
from HER_ReplayBuffer import HER_ReplayBuffer
from Networks import Actor, Critic
from Normalizer import Normalizer

class DDPG_HER:
    def __init__(self,env,num_epochs = 50,strategy='vanilla',use_cuda = False):
        self.num_epochs = num_epochs
        self.num_cycles = 50
        self.num_episodes = 2
        self.num_updates = 40
        self.prob_rand_action = 0.2
        self.buffer_size = int(1e6)
        self.batch_size = 256
        self.gamma = 0.98
        self.lr = 0.001
        self.tau = 0.05 #(1-decay_rate, decay_rate = 0.95)
        self.observation_limit = 200
        self.clip_range = 5
        self.max_steps = env._max_episode_steps
        self.noise_eps = 0.2 #Noise std
        self.random_eps = 0.3 #Probability of choosing a random action
        self.action_max = env.action_space.high[0]
        self.l2_lambda = 1
        self.env = env
        
        self.state_dim = env.observation_space['observation'].shape[0]
        self.goal_dim = env.observation_space['achieved_goal'].shape[0]
        self.action_dim = env.action_space.shape[0]
        
        self.device = 'cuda' if use_cuda else 'cpu'
        
        self.actor = Actor(self.state_dim+self.goal_dim,self.action_dim).to(self.device)
        self.actor_target = copy.deepcopy(self.actor).to(self.device)
        
        self.critic = Critic(self.state_dim+self.goal_dim,self.action_dim).to(self.device)
        self.critic_target = copy.deepcopy(self.critic).to(self.device)        
        
        sync_networks(self.actor)
        sync_networks(self.critic)
        
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr = self.lr)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr = self.lr)
        
        self.replay_buffer = HER_ReplayBuffer(self.buffer_size, self.state_dim, self.goal_dim, \
                                              self.action_dim,self.max_steps,self.env.compute_reward)
        
        self.state_normalizer = Normalizer(self.state_dim,clip_range = self.clip_range)
        self.goal_normalizer = Normalizer(self.goal_dim,clip_range = self.clip_range)
        
        
        self.actor_loss_arr = []
        self.critic_loss_arr = []
        self.success_rate_arr = []
        
    def update_target_networks(self):
        for target_param, source_param in zip(self.actor_target.parameters(), self.actor.parameters()):
            target_param.data.copy_((self.tau * source_param.data) + ((1-self.tau) * target_param.data))
        for target_param, source_param in zip(self.critic_target.parameters(), self.critic.parameters()):
            target_param.data.copy_((self.tau * source_param.data) + ((1-self.tau) * target_param.data))
    
    def preprocess_inputs(self,state,goal):
        state_norm = self.state_normalizer.normalize(state)
        goal_norm = self.goal_normalizer.normalize(goal)
        processed_input = torch.tensor(np.concatenate([state_norm,goal_norm], axis = -1),\
                                       dtype = torch.float32).to(self.device)#.unsqueeze(0)
        return processed_input
    
    def select_action(self,state_input):
        #Epsilon-Greedy Strategy
        if np.random.rand() <= self.random_eps:
            action = self.env.action_space.sample()
        else:
            action = self.actor(state_input).cpu().numpy().squeeze()
        #add noise
        noisy_action = action + np.random.normal(0,self.noise_eps,size = self.action_dim)
        noisy_action = np.clip(action,-self.action_max,self.action_max)
        return noisy_action
    
    def unpack_observation(self,observation):
        state = observation['observation']
        a_goal = observation['achieved_goal']
        d_goal = observation['desired_goal']
        return state,a_goal,d_goal
    
    def update_normalizer(self, episode):
        state_arr, a_goal_arr, d_goal_arr, action_arr = episode
        next_state_arr = state_arr[:, 1:, :]
        next_agoal_arr = a_goal_arr[:, 1:, :]
        # get the number of normalization transitions
        num_transitions = action_arr.shape[1]
        # create the new buffer to store them
        temp_buffer = {'state': state_arr, 
                       'achieved_goal': a_goal_arr,
                       'desired_goal': d_goal_arr, 
                       'actions': action_arr, 
                       'state_next': next_state_arr,
                       'next_agoal': next_agoal_arr,
                       }
        transitions = self.replay_buffer.sample(num_transitions, \
                                                temp_buffer = temp_buffer)
        state, g = transitions['state'], transitions['desired_goal']
        # pre process the obs and g
        clipped_curr_state_arr = np.clip(transitions['state'],-self.observation_limit, self.observation_limit)
        clipped_d_goal_arr = np.clip(transitions['desired_goal'],-self.observation_limit, self.observation_limit)
        # update
        self.state_normalizer.update_local(clipped_curr_state_arr)
        self.goal_normalizer.update_local(clipped_d_goal_arr)
        # recompute the stats
        self.state_normalizer.update_all()
        self.goal_normalizer.update_all()
        
    def update_network(self):
        transitions = self.replay_buffer.sample(self.batch_size)
        curr_state_arr = np.clip(transitions['state'],-self.observation_limit, self.observation_limit)
        next_state_arr = np.clip(transitions['next_state'],-self.observation_limit, self.observation_limit)
        d_goal_arr = np.clip(transitions['desired_goal'],-self.observation_limit, self.observation_limit)
        processed_curr_states = self.preprocess_inputs(curr_state_arr,d_goal_arr)
        processed_next_states = self.preprocess_inputs(next_state_arr,d_goal_arr)
        action_arr = torch.tensor(transitions['action'],\
                                 dtype = torch.float32).to(self.device)
        reward_arr = torch.tensor(transitions['reward'],\
                                 dtype = torch.float32).to(self.device)
        with torch.no_grad():
            next_action_arr = self.actor_target(processed_next_states)
            y = reward_arr + self.gamma * self.critic_target(processed_next_states,next_action_arr).detach()
            y.detach()
            #Clip Q value
            y = torch.clamp(y,-1/(1-self.gamma),0)
            
        critic_loss = torch.mean((y - self.critic(processed_curr_states,action_arr))**2)
        
        actions = self.actor(processed_curr_states)
        actor_loss = -torch.mean(self.critic(processed_curr_states,actions))
        #L2 Regularlization
        actor_loss += self.l2_lambda * torch.mean((actions/self.action_max)**2)
        
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        sync_grads(self.actor)
        self.actor_optimizer.step()
        
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        sync_grads(self.critic)
        self.critic_optimizer.step()
        
        self.critic_loss_arr.append(float(critic_loss))
        self.actor_loss_arr.append(float(actor_loss))
    
    def train(self):
        for epoch in range(self.num_epochs):
            for cycle in range(self.num_cycles):
                state_arr, a_goal_arr, d_goal_arr, action_arr = ([[] for _ in range(self.num_episodes)] for _ in range(4))
                for episode in range(self.num_episodes):
                    observation = self.env.reset()
                    curr_state, a_goal, d_goal = self.unpack_observation(observation)
                    for step in range(self.max_steps):
                        with torch.no_grad():
                            processed_state = self.preprocess_inputs(curr_state,d_goal)
                            action = self.select_action(processed_state)
                            # print('p_state',processed_state,'action',action)
                            # return
                        next_observation,_,_,info = self.env.step(action)
                        next_state,next_a_goal, _ = self.unpack_observation(next_observation)

                        state_arr[episode].append(curr_state)
                        a_goal_arr[episode].append(a_goal)
                        d_goal_arr[episode].append(d_goal)
                        action_arr[episode].append(action)

                        curr_state = next_state
                        a_goal = next_a_goal

                    state_arr[episode].append(curr_state)
                    a_goal_arr[episode].append(a_goal)
                
                state_arr = np.array(state_arr)
                a_goal_arr = np.array(a_goal_arr)
                d_goal_arr = np.array(d_goal_arr)
                action_arr = np.array(action_arr)
                
                for states,agoals,dgoals,actions in zip(state_arr,a_goal_arr,d_goal_arr,action_arr):
                    self.replay_buffer.add_replay([states, agoals, dgoals, actions])
 
                self.update_normalizer([state_arr,a_goal_arr,d_goal_arr,action_arr])
                    
                for batch in range(self.num_updates):
                    self.update_network()
                self.update_target_networks()
            self.success_rate_arr.append(self.test())
            if MPI.COMM_WORLD.Get_rank() == 0:
                print('Epoch:',epoch,\
                      'Avg Critic Loss:',np.mean(self.critic_loss_arr[-self.num_cycles*self.num_updates:]),\
                      'Avg Actor Loss:',np.mean(self.actor_loss_arr[-self.num_cycles*self.num_updates:]),\
                      'Success Rate:',np.mean(self.success_rate_arr[-self.num_cycles:]))

    def test(self):
        num_tests = 10
        success_rate = 0
        for i in range(num_tests):
            observation = self.env.reset()
            curr_state,_,d_goal = self.unpack_observation(observation)
            done = False
            while not done:
                with torch.no_grad():
                    processed_state = self.preprocess_inputs(curr_state,d_goal)
                    action = self.actor(processed_state).detach().cpu().numpy().squeeze()
                next_observation, _, done, info = self.env.step(action)
                next_state,_,_ = self.unpack_observation(next_observation)
                curr_state = next_state
            success_rate += info['is_success']
        local_success_rate = success_rate/num_tests
        global_success_rate = MPI.COMM_WORLD.allreduce(local_success_rate, op=MPI.SUM)
        return global_success_rate/MPI.COMM_WORLD.Get_size()

Overwriting DDPG_HER.py


In [28]:
%%file main.py
import numpy as np
import gym
import os, sys
from mpi4py import MPI
from DDPG_HER import DDPG_HER
import random
import torch
import argparse
import time

if __name__ == '__main__':
    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['MKL_NUM_THREADS'] = '1'
    os.environ['IN_MPI'] = '1'
    
    parser = argparse.ArgumentParser()
    parser.add_argument('--env-name', type=str, default='FetchReach-v1', help='the environment name')
    parser.add_argument('--strategy', type=str, default='vanilla', help='sampling method')
    parser.add_argument('--n-epochs', type=int, default=50, help='the number of epochs to train the agent')
    parser.add_argument('--cuda', action='store_true', help='if use gpu do the acceleration')
    parser.add_argument('--filename', type=str, default='ddpg', help='pickle file name to store object')

    args = parser.parse_args()

    env = gym.make(args.env_name)
    seed = 123
    np.random.seed(seed + MPI.COMM_WORLD.Get_rank())
    torch.manual_seed(seed + MPI.COMM_WORLD.Get_rank())
    env.seed(seed + MPI.COMM_WORLD.Get_rank())
    random.seed(seed + MPI.COMM_WORLD.Get_rank())
    torch.cuda.manual_seed(seed + MPI.COMM_WORLD.Get_rank())
    
    if MPI.COMM_WORLD.Get_rank() == 0:
        print('Current Environment:',args.env_name)
        print('Current Method:',args.strategy)
    env.reset()

    filename = args.filename+str(MPI.COMM_WORLD.Get_rank())+'.pickle'

    ddpg_her_model = DDPG_HER(env,num_epochs = args.n_epochs,strategy = args.strategy,\
                             use_cuda = args.cuda)
    start = time.time()
    ddpg_her_model.train()
    print('Training took',time.time()-start,'seconds')
    with open(filename, 'wb') as file:
        pickle.dump(ddpg_her_model,file)

Overwriting main.py


In [None]:
!mpirun -n 8 python -u main.py --env-name 'FetchPush-v1' --filename='vanilla_her_push'

Current Environment: FetchPush-v1
Current Method: vanilla
Epoch: 0 Avg Critic Loss: 0.005189810669689905 Avg Actor Loss: 0.31690679892105983 Success Rate: 0.0625
Epoch: 1 Avg Critic Loss: 0.0034313689566333777 Avg Actor Loss: 0.6614896542578935 Success Rate: 0.0625
Epoch: 2 Avg Critic Loss: 0.003829998292538221 Avg Actor Loss: 1.0463304397761821 Success Rate: 0.06666666666666667
