## TODO
#### 2. add rollouts periodically for network performance evaluation
#### 5. high dimension observation space
#### 6. fix BN bug (FIXED?)

# Assignment 3: Policy Gradient (REINFORCE) and Actor Critic (DDPG)

Name:

ID:

This exercise requires you to solve various continuous control problems in OpenAI Gym. We will first apply Policy Gradient, and later extend this to an Actor Critic method. 

Specifically, you will implement REINFORCE with batch updates and Deep Deterministic Policy Gradient (DDPG) for low- and high-dimensional observation spaces.

You should test on 'InvertedPendulum-v1', 'Pendulum-v0', 'HalfCheetah-v1', 'Ant-v1', and 'Humanoid-v1'.

REINFORCE is an on-policy method, so new samples must be collected for each update; parallelizing this sample collection will increase speed. In contrast, DDPG is off-policy and allows reuse of samples through a Replay Buffer, as in DQN.

REINFORCE: http://www-anw.cs.umass.edu/~barto/courses/cs687/williams92simple.pdf

DDPG: https://arxiv.org/pdf/1509.02971.pdf

## Imports

In [2]:
#set GPU to use
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   
os.environ["CUDA_VISIBLE_DEVICES"]="1"

#useful libraries
from tqdm import tqdm #gives progress bars
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from collections import namedtuple
from copy import deepcopy
import random
import time
from collections import namedtuple
import numpy as np

#environment functions
import gym
from gym import wrappers
#import roboschool


#pytorch
import torch
from torch.autograd import Variable
import torch.nn.utils as utils
import torch.nn as nn
import torch
from torch import nn, optim
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.backends.cudnn as cudnn
from torch.distributions import Normal
from torch.distributions import Categorical



## Accessory Functions

These are useful functions in general.

In [3]:
#----------------------------------------------------
#exponential moving average
#----------------------------------------------------
def numpy_ewma_vectorized_v2(data, window):
    """Return the exponentially weighted moving average of a 1-D sequence.

    The smoothing factor is ``alpha = 2 / (window + 1)`` and the result
    follows the recurrence ``out[0] = data[0]``,
    ``out[i] = alpha * data[i] + (1 - alpha) * out[i - 1]``.

    The earlier closed-form version divided by ``(1 - alpha) ** i``. That
    power underflows to zero for long inputs (a few thousand samples with
    ``window=20``), which filled the output with inf/NaN. Evaluating the
    recurrence directly gives the same values and stays finite for any length.

    Args:
        data: 1-D array-like of numbers.
        window: span of the average; larger values smooth more.

    Returns:
        A float ``numpy.ndarray`` with the same length as ``data``. Empty
        input yields an empty array.
    """
    values = np.asarray(data, dtype=float)
    alpha = 2 / (window + 1.0)
    alpha_rev = 1 - alpha

    out = np.empty(values.shape[0], dtype=float)
    if out.size == 0:
        return out

    # the series is seeded with the first sample
    smoothed = values[0]
    out[0] = smoothed
    for i in range(1, values.shape[0]):
        smoothed = alpha * values[i] + alpha_rev * smoothed
        out[i] = smoothed
    return out

#----------------------------------------------------
#timing function
#----------------------------------------------------
class Timer(object):
    """Context manager that prints the wall-clock time spent inside its block.

    Usage::

        with Timer('rollout') as t:
            ...
        # t.elapsed now holds the duration in seconds

    Args:
        name: optional label printed in front of the elapsed time.
    """

    def __init__(self, name=None):
        self.name = name
        self.tstart = None
        self.elapsed = None

    def __enter__(self):
        self.tstart = time.time()
        # returning self makes `with Timer() as t` usable
        return self

    def __exit__(self, exc_type, exc_value, exc_tb):
        self.elapsed = time.time() - self.tstart
        if self.name:
            # end=' ' keeps the label on the same line as the timing; the old
            # trailing comma was a Python 2 idiom that does nothing in Python 3
            print('[%s]' % self.name, end=' ')
        print('Elapsed: %s' % self.elapsed)
        # implicit None: exceptions raised inside the block still propagate

#----------------------------------------------------
#flip a pytorch vector
#----------------------------------------------------
def flip(x, dim):
    """Return `x` with the order of its entries reversed along `dim`.

    Negative `dim` counts from the last dimension.
    """
    axis = dim if dim >= 0 else x.dim() + dim

    selector = []
    for i in range(x.dim()):
        if i == axis:
            # index tensor size-1, ..., 1, 0 picks the entries back to front
            backwards = torch.arange(x.size(i)-1, -1, -1).tolist()
            selector.append(x.new(backwards).long())
        else:
            # every other dimension is taken whole
            selector.append(slice(None, None))
    return x[tuple(selector)]

#----------------------------------------------------
#normalize actions
#----------------------------------------------------
#https://github.com/openai/gym/blob/78c416ef7bc829ce55b404b6604641ba0cf47d10/gym/core.py

class NormalizeAction(gym.ActionWrapper):
    """Rescale actions between a [-1, 1] policy output and the env's action bounds."""

    def action(self, action):
        # affine map: -1 -> action_space.low, +1 -> action_space.high
        low = self.action_space.low
        high = self.action_space.high
        unit = (action + 1) / 2
        return unit * (high - low) + low

    def reverse_action(self, action):
        # inverse map: action_space.low -> -1, action_space.high -> +1
        low = self.action_space.low
        high = self.action_space.high
        return (action - low) / (high - low) * 2 - 1

## Set up environment
### Constants

In [4]:
# Shared run configuration for the environment-setup and DDPG cells.
# Whether to wrap the env in a video Monitor and render episodes
VISUALIZE = False
# Number of training episodes per DDPG run
NUM_EPISODES=12000
# Intended cap on steps per episode (assigned onto the env after gym.make)
MAX_PATH_LENGTH = 500
# Discount factor; default `gamma` of the DDPG constructor
GAMMA=0.99
# Replay minibatch size; DDPG training starts once the buffer holds this many transitions
BATCH_SIZE = 128
# Seed passed to torch and numpy
SEED = 0
# Timesteps per batch; reassigned (same value) in the REINFORCE cell
min_timesteps_per_batch=2000

# A video is recorded every `logging_interval` episodes when VISUALIZE is on
logging_interval = 100
# An episode is rendered every `animate_interval` episodes when VISUALIZE is on
animate_interval = logging_interval * 5
# NOTE(review): not referenced in the visible code; the multi-run cell hard-codes 4 agents
num_runs = 5
# Output directory for Monitor recordings
logdir='./tmp/'

# Environment id passed to gym.make; alternatives kept below for quick switching
env_name = 'HalfCheetah-v2' 
#env_name = 'FetchReach-v1'
#env_name='InvertedPendulum-v1'
#env_name = 'Pendulum-v0'
#env_name = "RoboschoolHalfCheetah-v1"

### Set random seed
### Wrap environment, log videos, setup cuda variables
### Record action and observation space dimensions

In [5]:
# Set random seeds: torch, numpy, and the stdlib RNG that ReplayMemory.sample draws from
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

# Tensor type aliases that resolve to the CUDA variants when a GPU is available
use_cuda = torch.cuda.is_available()
FloatTensor = torch.cuda.FloatTensor if use_cuda else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if use_cuda else torch.LongTensor
ByteTensor = torch.cuda.ByteTensor if use_cuda else torch.ByteTensor
Tensor = FloatTensor

# Gym things
env = gym.make(env_name)
# gym's time-limit wrapper reads `_max_episode_steps`; the previous spelling
# (`_max_episodes_steps`) only created an unused attribute. Set it on the object
# returned by gym.make, before Monitor wraps it, so the limit lands on the right wrapper.
env._max_episode_steps = MAX_PATH_LENGTH
if VISUALIZE:
    os.makedirs(logdir, exist_ok=True)
    env = wrappers.Monitor(env, logdir, force=True, video_callable=lambda episode_id: episode_id%logging_interval==0)

# Record action and observation space dimensions
discrete = isinstance(env.action_space, gym.spaces.Discrete)
obs_dim = env.observation_space.shape[0]
act_dim = env.action_space.n if discrete else env.action_space.shape[0]

if discrete:
    print("This is a discrete action space, probably not the right algorithm to use")

DependencyNotInstalled: No module named mujoco_py. (HINT: you need to install mujoco_py, and also perform the setup instructions here: https://github.com/openai/mujoco-py/.)

# REINFORCE

In this section you will implement REINFORCE, with modifications for batch training. It is intended for use on both discrete and continuous action spaces.

In [None]:
#env_name='CartPole-v0'
#env_name='MountainCar-v0'
#env_name='Pendulum-v0'
# Environment used for the REINFORCE run (overrides the earlier env_name)
env_name='InvertedPendulum-v2'
#env_name = 'HalfCheetah-v1'

# Rendering / video recording switch; `animate` is what the rollout loop checks
visualize = False
animate=visualize

# Number of policy updates (outer loop iterations)
n_iter=12000
# NOTE(review): defined here but not read by update_policy in the visible code
gamma=0.99
# Adam step size for the policy network
learning_rate=1e-3

# Episodes are collected until a batch holds more than this many timesteps
min_timesteps_per_batch=2000
# None -> replaced by env.spec.max_episode_steps during setup
max_path_length=None

#saving parameters
logdir='./tmp/'
# Print the running reward every 1% of the iterations
logging_interval = int(n_iter / 100)
# Render / record every 5% of the iterations when visualising
animate_interval = int(n_iter / 20)
seed=0

#tricks?
# NOTE(review): reward_to_go, normalize_advantages and nn_baseline are not read
# in the visible code; only `cumsum` is passed on to update_policy
reward_to_go=True
normalize_advantages=True
nn_baseline=False
# True -> per-step reward-to-go weights (normalised); False -> whole-episode return
cumsum = True

# network arguments
# hidden layer count and hidden layer width handed to mlp(...)
n_layers=2
size=32

class mlp(nn.Module):
    """Fully connected policy network.

    The stack is `n_layers` blocks of Linear(size) followed by `activation`,
    then a final Linear(output_size). When `output_activation` is 'softmax'
    the forward pass ends with a softmax over dim 1; any other value leaves
    the raw linear output untouched.
    """

    def __init__(self,  
            input_size, 
            output_size,
            n_layers=1, 
            size=64, 
            activation='relu',
            output_activation='softmax'):
        
        super(mlp, self).__init__()
        
        self.output_activation = output_activation
        
        # layer spec: a (width, activation) pair per hidden layer, then the output width
        self.architecture = [size, activation] * n_layers + [output_size]
        
        print('Network architecture is : {}'.format(self.architecture))
        
        # turn the spec into modules; ints become Linear layers whose input
        # width is the previous Linear's output width
        fan_in = input_size
        self.layers = []
        for entry in self.architecture:
            if entry is None:
                continue
            if entry == 'tanh':
                self.layers.append(nn.Tanh())
                continue
            if entry == 'relu':
                self.layers.append(nn.ReLU(inplace=True))
                continue
            self.layers.append(nn.Linear(fan_in, entry))
            fan_in = entry
        
        self.fc1 = nn.Sequential(*self.layers)
        
        if self.output_activation == 'softmax':
            self.softmax = nn.Softmax(dim = 1)
        self._initialize_weights()

    def _initialize_weights(self):
        """Draw Linear weights from N(0, 0.1) and zero the biases."""
        for module in self.modules():
            if not isinstance(module, nn.Linear):
                continue
            module.weight.data.normal_(0, 0.1)
            module.bias.data.zero_()
                
    def forward(self, x):
        """Run the stack on a (batch, input_size) input."""
        out = self.fc1(x)
        if self.output_activation == 'softmax':
            return self.softmax(out)
        return out

def pathlength(path):
    """Return the number of timesteps in a rollout, i.e. the length of its reward sequence."""
    rewards = path["reward"]
    return len(rewards)

def sample_action(logit, discrete):
    """Sample an action from the policy output and return its log-probability.

    Args:
        logit: policy network output. For a discrete policy it is passed to
            `Categorical` as the probability vector. For a continuous policy
            it must be 2-D, (batch, 2 * act_dim): the first half of the
            columns is the Gaussian mean, the second half is turned into the
            standard deviation with a softplus.
        discrete: True for a categorical policy, False for a Gaussian one.

    Returns:
        Tuple `(action, log_prob)`. For the Gaussian policy `log_prob` has
        one entry per action dimension.
    """
    if discrete:
        m = Categorical(logit)
    else:
        shape = logit.shape[1] // 2
        mu = logit[:, :shape]
        # softplus underflows to exactly 0 for very negative inputs, which is
        # not a valid scale and produces NaN/inf log-probs; keep it above zero
        sigma = F.softplus(logit[:, shape:]) + 1e-5
        m = Normal(mu, sigma)

    action = m.sample()
    log_odds_action = m.log_prob(action)
    return action, log_odds_action
    
def update_policy(paths, net, cumsum = 1, gamma = 1.0, opt = None):
    """Take one REINFORCE gradient step on a batch of rollouts.

    Args:
        paths: non-empty list of rollout dicts. Each needs "reward" (sequence
            of per-step rewards) and "log_odds" (list with one log-probability
            tensor per step, still attached to the policy graph).
        net: policy network. Not used here; kept so existing calls keep working.
        cumsum: truthy -> each step is weighted by its reward-to-go and the
            weights are standardised over the batch; falsy -> every step of an
            episode is weighted by that episode's total return, unnormalised.
        gamma: discount applied when accumulating rewards. The default 1.0
            reproduces the plain (undiscounted) sums.
        opt: optimizer to step. Defaults to the module-level `optimizer`.

    Raises:
        ValueError: if `paths` is empty.
    """
    if not paths:
        raise ValueError('update_policy needs at least one path')
    if opt is None:
        opt = optimizer  # module-level optimizer created next to the network

    num_paths = len(paths)

    rew_cums = []
    log_odds = []
    for path in paths:
        rew = np.asarray(path['reward'], dtype=np.float64)

        # reward-to-go, accumulated from the last step: G_t = r_t + gamma * G_{t+1}
        togo = np.zeros(len(rew))
        running = 0.0
        for t in range(len(rew) - 1, -1, -1):
            running = rew[t] + gamma * running
            togo[t] = running

        if not cumsum and len(rew) > 0:
            # raw episode return, repeated for every step
            togo = np.full(len(rew), togo[0])

        rew_cums.append(torch.from_numpy(togo).float())
        log_odds.extend(path['log_odds'])

    rew_cums = torch.cat(rew_cums)
    if cumsum:
        # simple batch-wide baseline. std() of a single value is NaN, which
        # would poison the weights, so treat that case as zero spread
        spread = rew_cums.std() if rew_cums.numel() > 1 else 0.0
        rew_cums = (rew_cums - rew_cums.mean()) / (spread + 1e-5)
    rew_cums = Variable(rew_cums)

    # one row per step; summing the columns joins the per-dimension
    # log-probabilities of a multi-dimensional Gaussian action
    log_odds = torch.cat(log_odds)
    log_odds = log_odds.view(log_odds.shape[0], -1).sum(1)

    policy_loss = torch.sum(-1 * (rew_cums * log_odds)) / num_paths

    #take optimizer step
    opt.zero_grad()
    policy_loss.backward()
    opt.step()


# Set random seeds
torch.manual_seed(seed)
np.random.seed(seed)

# Make the gym environment
env = gym.make(env_name)

# Maximum length for episodes: an explicit setting wins, otherwise the env's registered limit
max_path_length = max_path_length or env.spec.max_episode_steps
if max_path_length is not None:
    # gym's time-limit wrapper reads `_max_episode_steps`; the previous spelling
    # (`_max_episodes_steps`) only created an unused attribute. Set it before
    # Monitor wraps the env so the limit lands on the object returned by gym.make.
    env._max_episode_steps = max_path_length

if visualize:
    os.makedirs(logdir, exist_ok=True)
    env = wrappers.Monitor(env, logdir, force=True, video_callable=lambda episode_id: episode_id%animate_interval==0)


# Is this env continuous, or discrete?
discrete = isinstance(env.action_space, gym.spaces.Discrete)

#Get observation and action space dimensions
obs_dim = env.observation_space.shape[0]
act_dim = env.action_space.n if discrete else env.action_space.shape[0]

#Make network object
if not discrete:
    # Gaussian policy: the network emits a mean and a pre-softplus std per
    # action dimension, so the output layer is twice as wide
    output_activation = 'none'
    act_dim *= 2
else:
    output_activation = 'softmax'
net = mlp(activation = 'relu', n_layers = n_layers, input_size = obs_dim, size = size, output_size = act_dim, output_activation = output_activation)
print(net)

#Make optimizer
optimizer = torch.optim.Adam(net.parameters(), lr = learning_rate)


# Exponentially smoothed return of the last episode of each batch
avg_reward = 0
avg_rewards = []
episodes = 0


for itr in tqdm(range(n_iter)): #one policy update per iteration
    paths = []
    steps = 0
    
    while True: #collect whole episodes until the batch holds more than min_timesteps_per_batch steps
        ob = env.reset()
        obs, acs, rewards, log_odds = [], [], [], []
        # `animate` is tested first so the modulo is skipped (and cannot divide
        # by a zero animate_interval) when rendering is off
        animate_this_episode=(animate and len(paths)==0 and (itr % animate_interval == 0))
        
        
        while True:
            if animate_this_episode:
                env.render()
                time.sleep(0.05)
            
            ob_th = torch.from_numpy(ob).float().unsqueeze(0)
            ob_th = Variable(ob_th)
            obs.append(ob_th)
            
            output = net(ob_th)

            ac, log_odd = sample_action(output, discrete)
            # strip the batch dimension and hand gym a numpy value instead of a
            # torch tensor: an integer scalar for discrete actions, a 1-D array otherwise
            ac = ac.data.cpu().numpy()[0]
            
            acs.append(ac)
            log_odds.append(log_odd)
            
            ob, rew, done, _ = env.step(ac)
            rewards.append(rew)
            
            steps += 1
            
            # also stop at max_path_length in case the env itself does not enforce it
            hit_limit = max_path_length is not None and len(rewards) >= max_path_length
            if done or hit_limit:
                episodes = episodes + 1
                break
                
        path = {"observation" : obs, 
                "reward" : np.array(rewards), 
                "action" : (acs),
                "log_odds" : log_odds}
        
        paths.append(path)
        
        if steps > min_timesteps_per_batch:
            break
        
    update_policy(paths, net, cumsum)
    
    # `path` is the last episode of the batch
    if itr == 0:
        avg_reward = path['reward'].sum()
    else:
        avg_reward = avg_reward * 0.9 + 0.1 * path['reward'].sum()
    
    # early stop once the smoothed return passes 900
    if avg_reward > 900:
        break
        
    avg_rewards.append(avg_reward)
    # logging_interval is int(n_iter / 100), which is 0 for short runs
    if logging_interval > 0 and itr % logging_interval == 0:
        print('Average reward: {}'.format(avg_reward))
    #if avg_reward > env.spec.reward_threshold:
    #    print("Took: {} steps to solve with reward: {}".format(itr, avg_reward))
    #    break
        
        
env.close()

In [None]:
# Smoothed return recorded once per policy update during the REINFORCE run
plt.plot(avg_rewards)

# DDPG

## DDPG accessory functions

In [None]:
#----------------------------------------------------
#sync weights between training and target networks
#----------------------------------------------------
def weightSync(target_model, source_model, tau = 0.001):
    """Soft-update the target's parameters: target <- (1 - tau) * target + tau * source.

    Parameters are paired in the order `parameters()` yields them and the
    target is modified in place; nothing is returned.
    """
    pairs = zip(target_model.parameters(), source_model.parameters())
    for dst, src in pairs:
        blended = (1 - tau) * dst.data + tau * src.data
        dst.data.copy_(blended)


#----------------------------------------------------
#replay memory
#----------------------------------------------------
#from http://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html#replay-memory
# One stored environment step
Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward', 'done'))


class ReplayMemory(object):
    """Fixed-capacity ring buffer of `Transition` tuples.

    Once `capacity` transitions are stored, each new one overwrites the oldest.

    Args:
        capacity: maximum number of transitions kept. Accepts floats such as
            the default `1e6`; the value is stored as an int.

    Raises:
        ValueError: if `capacity` is smaller than 1.
    """

    def __init__(self, capacity = 1e6):
        # Kept as a plain attribute. The class used to define a `capacity()`
        # method as well, but this attribute shadowed it on every instance, so
        # the method could never be called and has been dropped.
        self.capacity = int(capacity)
        if self.capacity < 1:
            raise ValueError('capacity must be at least 1, got {}'.format(capacity))
        self.memory = []
        self.position = 0

    def push(self, *args):
        """Saves a transition."""
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = Transition(*args)
        # wrap around so the next write replaces the oldest entry
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        """Return `batch_size` distinct stored transitions chosen at random.

        If fewer than `batch_size` transitions are stored, a message is
        printed and -1 is returned instead of a list.
        """
        if batch_size <= len(self.memory):
            return random.sample(self.memory, batch_size)
        print('Tried to sample more samples than are in buffer')
        return -1

    def __len__(self):
        return len(self.memory)
    
    
#----------------------------------------------------
#noise from paper
#----------------------------------------------------
class OrnsteinUhlenbeckProcess(object):
    """Discretised Ornstein-Uhlenbeck noise source.

    Each `step()` moves the state towards `mu` at rate `theta` and adds
    Gaussian noise scaled by `sigma * sqrt(dt)`. `num_steps` and `counter`
    are stored but not used by `step()` or `reset()`.
    """

    def __init__(self, dimension, num_steps, theta=0.15, mu=0, sigma=0.2, dt=0.01):
        self.dimension = dimension
        self.num_steps = num_steps
        self.theta, self.mu, self.sigma = theta, mu, sigma
        self.dt = dt
        self.counter = 0
        self.reset()

    def step(self):
        """Advance the process one step and return the new state (a fresh array)."""
        drift = self.theta * (self.mu - self.x) * self.dt
        diffusion = self.sigma * np.sqrt(self.dt) * np.random.randn(self.dimension)
        self.x = self.x + drift + diffusion
        return self.x

    def reset(self):
        """Put the state back to the zero vector."""
        self.x = np.zeros(self.dimension)


## Deep Deterministic Policy Gradient
###  1. Define actor and critic networks

In [None]:
#----------------------------------------------------
#actor model, MLP
#----------------------------------------------------
#2 hidden layers, 400 units per layer, tanh output from paper to bound outputs between -1 and 1

class actor(nn.Module):
    """Deterministic policy network for DDPG.

    Layout: `n_layers` blocks of Linear(size) -> BatchNorm1d -> `activation`,
    followed by Linear(output_size) -> `output_activation`. With the default
    tanh output every action component lies in (-1, 1).

    Args:
        input_size: observation dimension.
        output_size: action dimension.
        n_layers: number of hidden blocks.
        size: width of each hidden layer.
        activation: 'relu', 'tanh', 'softmax' or None, applied after each hidden layer.
        output_activation: same choices, applied after the output layer.
        learning_rate: stored and returned by `lr()`; not used by the network itself.

    Raises:
        ValueError: for an activation name that is not recognised.
    """

    def __init__(self,  
            input_size, 
            output_size,
            n_layers=2, 
            size=400, 
            activation='relu',
            output_activation='tanh',
            learning_rate = 1e-4):
        
        super(actor, self).__init__()
        
        self.learning_rate = learning_rate
        # layer spec: (width, activation) per hidden layer, then output width and activation
        self.architecture = []
        for _ in range(n_layers):
            self.architecture.append(size)
            self.architecture.append(activation)
        self.architecture.append(output_size)
        self.architecture.append(output_activation)
        
        # spec entries below this index belong to hidden layers; only those
        # get BatchNorm (the old index test was only right for n_layers <= 2)
        hidden_entries = 2 * n_layers
        
        #construct network
        input_channels = input_size 
        self.layers = []
        for idx, layer in enumerate(self.architecture):
            if layer is None:
                continue
            if isinstance(layer, str):
                self.layers.append(self._make_activation(layer))
                continue
            self.layers.append(nn.Linear(input_channels, layer))
            if idx < hidden_entries:
                self.layers.append(nn.BatchNorm1d(layer))
            input_channels = layer
        
        self.fc1 = nn.Sequential(*self.layers)
        self._initialize_weights()

    @staticmethod
    def _make_activation(name):
        """Build the activation module for a spec name."""
        if name == 'tanh':
            return nn.Tanh()
        if name == 'relu':
            return nn.ReLU(inplace=True)
        if name == 'softmax':
            # previously mapped to ReLU by a copy-paste slip
            return nn.Softmax(dim=1)
        raise ValueError('unknown activation: {!r}'.format(name))
        
    def _initialize_weights(self):
        """Xavier-initialise every Linear layer, then shrink the output layer."""
        # newer torch names the in-place initialiser with a trailing
        # underscore; fall back to the old name on versions that lack it
        xavier = getattr(nn.init, 'xavier_uniform_', None) or nn.init.xavier_uniform
        linears = [m for m in self.modules() if isinstance(m, nn.Linear)]
        for m in linears:
            xavier(m.weight.data) #paper used straight fanin, slight difference
            m.bias.data.zero_()
        # output layer: uniform in [-3e-3, 3e-3] as in the DDPG paper. The
        # earlier normal_(-3e-3, 3e-3) call used those numbers as mean and std.
        final = linears[-1]
        final.weight.data.uniform_(-3e-3, 3e-3)
        final.bias.data.uniform_(-3e-3, 3e-3)
                
    def forward(self, x):
        """Map a (batch, input_size) observation batch to actions."""
        return self.fc1(x)
    
    def lr(self):
        """Return the learning rate given at construction."""
        return self.learning_rate
    
#----------------------------------------------------
#critic model, MLP
#----------------------------------------------------
#2 hidden layers, 300 units per layer, ouputs rewards therefore unbounded

class critic(nn.Module):
    """Action-value network for DDPG.

    The state alone passes through `fc1` (Linear -> BatchNorm1d ->
    `activation`). The action is then concatenated onto that feature vector
    and the result passes through `fc2`, which holds the remaining hidden
    layers and the output layer.

    Args:
        state_size: observation dimension.
        action_size: action dimension.
        output_size: width of the output layer.
        n_layers: number of hidden layers (at least 1).
        size: width of each hidden layer.
        activation: 'relu', 'tanh', 'softmax' or None, applied after each hidden layer.
        output_activation: same choices, applied after the output layer.
        learning_rate: stored and returned by `lr()`; not used by the network itself.

    Raises:
        ValueError: if `n_layers` is below 1 or an activation name is not recognised.
    """

    def __init__(self,  
            state_size,
            action_size,
            output_size,
            n_layers=2, 
            size=300, 
            activation='relu',
            output_activation= None,
            learning_rate = 1e-3):
        
        super(critic, self).__init__()
        
        if n_layers < 1:
            raise ValueError('critic needs at least one hidden layer, got n_layers={}'.format(n_layers))
        
        self.learning_rate = learning_rate
        # layer spec: (width, activation) per hidden layer, then output width and activation
        self.architecture = []
        for _ in range(n_layers):
            self.architecture.append(size)
            self.architecture.append(activation)
        self.architecture.append(output_size)
        self.architecture.append(output_activation)
        
        # state-only branch: the first (width, activation) pair of the spec
        state_layers = [nn.Linear(state_size, size), nn.BatchNorm1d(size)]
        if activation is not None:
            state_layers.append(self._make_activation(activation))
        self.fc1 = nn.Sequential(*state_layers)
        
        # joint branch: the rest of the spec. Its first Linear sees the fc1
        # features plus the action. The width is taken from the previous layer;
        # using the layer's own output width broke n_layers=1.
        input_channels = size + action_size
        self.layers = []
        for layer in self.architecture[2:]:
            if layer is None:
                continue
            if isinstance(layer, str):
                self.layers.append(self._make_activation(layer))
                continue
            self.layers.append(nn.Linear(input_channels, layer))
            input_channels = layer
                
        self.fc2 = nn.Sequential(*self.layers)
        self._initialize_weights()

    @staticmethod
    def _make_activation(name):
        """Build the activation module for a spec name."""
        if name == 'tanh':
            return nn.Tanh()
        if name == 'relu':
            return nn.ReLU(inplace=True)
        if name == 'softmax':
            # previously mapped to ReLU by a copy-paste slip
            return nn.Softmax(dim=1)
        raise ValueError('unknown activation: {!r}'.format(name))
        
    def _initialize_weights(self):
        """Xavier-initialise every Linear layer, then shrink the output layer."""
        # newer torch names the in-place initialiser with a trailing
        # underscore; fall back to the old name on versions that lack it
        xavier = getattr(nn.init, 'xavier_uniform_', None) or nn.init.xavier_uniform
        linears = [m for m in self.modules() if isinstance(m, nn.Linear)]
        for m in linears:
            xavier(m.weight.data) #paper used straight fanin, not xavier
            m.bias.data.zero_()
        # fc1 is registered before fc2, so the last Linear is the output layer
        final = linears[-1]
        final.weight.data.uniform_(-3e-4, 3e-4) #from paper
        final.bias.data.uniform_(-3e-4, 3e-4)  #from paper
                
    def forward(self, x):
        """Return the value estimate for `x = (state_batch, action_batch)`.

        Both batches are 2-D with matching first dimension; the action is
        concatenated to the state features along dim 1.
        """
        state, action = x
        features = self.fc1(state)
        joint = torch.cat((features, action), dim=1)
        return self.fc2(joint)
    
    def lr(self):
        """Return the learning rate given at construction."""
        return self.learning_rate




### 2. Define DDPG object to encapsulate definition, rollouts, and training

In [None]:
from collections import namedtuple
class DDPG(object):
    """Bundle of actor, critic, their target copies, optimizers, replay buffer and noise.

    Args:
        obs_dim: observation dimension.
        act_dim: action dimension.
        critic_lr: Adam learning rate for the critic.
        actor_lr: Adam learning rate for the actor.
        gamma: discount factor used in the critic target.
        batch_size: number of transitions sampled per `train()` call.
    """

    def __init__(self, obs_dim, act_dim, critic_lr = 1e-3, actor_lr = 1e-4, gamma = GAMMA, batch_size = BATCH_SIZE):
        
        super(DDPG, self).__init__()
        
        # use the constructor arguments; they used to be ignored in favour of
        # the module constants and hard-coded learning rates
        self.gamma = gamma
        self.batch_size = batch_size

        #actor and its target, started from identical weights
        self.actor = actor(input_size = obs_dim, output_size = act_dim, learning_rate = actor_lr).type(FloatTensor)
        self.actor_target = actor(input_size = obs_dim, output_size = act_dim, learning_rate = actor_lr).type(FloatTensor)
        self.actor_target.load_state_dict(self.actor.state_dict())

        #critic and its target, started from identical weights
        self.critic = critic(state_size = obs_dim, action_size = act_dim, output_size = 1, learning_rate = critic_lr).type(FloatTensor)
        self.critic_target = critic(state_size = obs_dim, action_size = act_dim, output_size = 1, learning_rate = critic_lr).type(FloatTensor)
        self.critic_target.load_state_dict(self.critic.state_dict())

        #optimizers
        self.optimizer_actor = torch.optim.Adam(self.actor.parameters(), lr = actor_lr)
        self.optimizer_critic = torch.optim.Adam(self.critic.parameters(), lr = critic_lr, weight_decay=1e-2)
        
        #critic loss
        self.critic_loss = nn.MSELoss()
        
        #replay buffer
        self.replayBuffer = ReplayMemory()
        
        #exploration noise, one component per action dimension
        self.noise = OrnsteinUhlenbeckProcess(dimension = act_dim, num_steps = NUM_EPISODES)
        
        #record type used to regroup a sampled batch field by field
        self.Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward', 'done'))
        
    def train(self):
        """Run one critic update, one actor update and a soft target sync.

        Returns without doing anything while the replay buffer holds fewer
        than `batch_size` transitions.
        """
        # NOTE(review): the target networks are never switched to eval mode and
        # weightSync copies parameters only, so their BatchNorm layers run on
        # batch statistics here - confirm this is intended.
        if len(self.replayBuffer) < self.batch_size:
            return

        transitions = self.replayBuffer.sample(self.batch_size)
        # list of Transition -> one Transition whose fields are tuples of tensors
        batch = self.Transition(*zip(*transitions))

        state_batch = Variable(torch.cat(batch.state)).type(FloatTensor)
        action_batch = Variable(torch.cat(batch.action)).type(FloatTensor)
        reward_batch = Variable(torch.cat(batch.reward)).type(FloatTensor)
        next_state_batch = Variable(torch.cat(batch.next_state)).type(FloatTensor)
        # 1 where the episode continued, 0 where `done` was set
        not_done_batch = Variable(1 - torch.cat(batch.done)).type(FloatTensor)

        #update critic: target is r + gamma * Q'(s', mu'(s')), with the
        #bootstrap term dropped on terminal transitions
        predicted_action_batch = self.actor_target(next_state_batch)
        next_q = self.critic_target([next_state_batch, predicted_action_batch]).detach() #prevents errant backprop
        y_batch = reward_batch + self.gamma * next_q * not_done_batch

        #critic optimizer step
        loss_critic = self.critic_loss(self.critic([state_batch, action_batch]), y_batch)
        self.optimizer_critic.zero_grad()
        loss_critic.backward()
        self.optimizer_critic.step()

        #update actor: ascend the critic's value of the actor's own actions
        loss_actor = -1 * (self.critic([state_batch, self.actor(state_batch)])).mean()

        #actor optimizer step
        self.optimizer_actor.zero_grad()
        loss_actor.backward()
        self.optimizer_actor.step()

        #synchronize target networks with the fast moving ones
        weightSync(self.critic_target, self.critic)
        weightSync(self.actor_target, self.actor)

### 3.1 Create an instance of your DDPG object
### 3.2 Print network architectures, confirm they are correct

In [None]:
# Build one agent for the env dimensions recorded during setup
ddpg = DDPG(obs_dim = obs_dim, act_dim = act_dim)

# Module summaries of both networks
print(ddpg.actor)
print(ddpg.critic)

# Note: `parameters` is not called here, so this prints the bound method itself
print(ddpg.actor.parameters)

    
# Shape of every trainable tensor in the actor
for parameter in ddpg.actor.parameters():
    print(parameter.shape)

Trying multithread from:
    http://chriskiehl.com/article/parallelism-in-one-line/
    https://stackoverflow.com/questions/2846653/how-to-use-threading-in-python

In [None]:
# One independent DDPG agent per training run.
# NOTE(review): the run count is hard-coded to 4 here while `num_runs` is 5 - confirm which is intended.
ddpgObjects = []
for i in range(4):
    ddpgObjects.append(DDPG(obs_dim = obs_dim, act_dim = act_dim))


def training_function(ddpg):
    """Train one DDPG agent on its own copy of `env_name`.

    Runs up to NUM_EPISODES episodes and stops early once the exponentially
    smoothed episode return exceeds 900. The smoothed return is printed
    after every episode.

    Args:
        ddpg: agent exposing `actor`, `noise`, `replayBuffer` and `train()`.

    Returns:
        List with the undiscounted return of every completed episode.
    """
    env = gym.make(env_name)
    # gym's time-limit wrapper reads `_max_episode_steps`; the previous spelling
    # (`_max_episodes_steps`) only created an unused attribute. Set it before
    # the env is wrapped so it lands on the object returned by gym.make.
    env._max_episode_steps = MAX_PATH_LENGTH
    if VISUALIZE:
        os.makedirs(logdir, exist_ok=True)
        env = wrappers.Monitor(env, logdir, force=True, video_callable=lambda episode_id: episode_id%logging_interval==0)
    env = NormalizeAction(env)

    num_steps = 0
    avg_val = 0
    running_rewards = []
    running_steps = []
    try:
        for itr in range(NUM_EPISODES):
            ob = env.reset()
            # restart the noise once per episode. Resetting it on every step
            # (as before) zeroed its state each time, so the noise was never
            # correlated across steps.
            ddpg.noise.reset()
            rewards_batch = []

            animate_this_episode=(itr % animate_interval == 0) and VISUALIZE

            while True:
                if animate_this_episode:
                    env.render()
                    time.sleep(0.05)

                #choose move: deterministic action plus exploration noise
                ob_th = Variable(torch.from_numpy(ob).unsqueeze(0)).type(FloatTensor)

                # eval mode so BatchNorm accepts a batch of one observation
                ddpg.actor.eval()
                ac = ddpg.actor(ob_th).cpu().data.numpy().squeeze() + ddpg.noise.step()
                ddpg.actor.train()
                # NormalizeAction maps [-1, 1] onto the env bounds; the added
                # noise can push the action outside that range
                ac = np.clip(ac, -1.0, 1.0)

                old_ob = ob
                ob, rew, done, _ = env.step(ac)

                #(state, action, next_state, reward, done)
                ddpg.replayBuffer.push(torch.from_numpy(old_ob).unsqueeze(0), torch.from_numpy(ac).unsqueeze(0), torch.from_numpy(ob).unsqueeze(0), torch.Tensor([rew]).unsqueeze(0), torch.Tensor([done]).unsqueeze(0))

                num_steps += 1
                rewards_batch.append(rew)

                # start learning once a full minibatch is available
                if len(ddpg.replayBuffer) >= BATCH_SIZE:
                    ddpg.train()

                if done:
                    break

            if avg_val > 900:
                break

            running_steps.append(num_steps)
            running_rewards.append(np.sum(rewards_batch))
            avg_val = avg_val * 0.95 + 0.05*running_rewards[-1]
            print(avg_val)
    finally:
        # release the simulator (and let Monitor finish its files) even on error
        env.close()

    return running_rewards

In [None]:
from multiprocessing.dummy import Pool as ThreadPool 
# One worker thread per agent; map blocks until every run has returned and
# keeps the results in the same order as ddpgObjects.
pool = ThreadPool(len(ddpgObjects)) 
try:
    running_rewards_looped = pool.map(training_function, ddpgObjects)
finally:
    # release the worker threads even if one of the runs raised
    pool.close()
    pool.join()

## Plot rewards over multiple training runs

In [None]:
# Smooth each run's episode returns with a span-20 moving average and plot them.
outs = []
legend = []
plt.figure()
for itr, trial in enumerate(running_rewards_looped):
    out = numpy_ewma_vectorized_v2(np.array(trial),20)
    # only runs shorter than 5000 episodes are drawn; the legend label is the run index
    if len(out) < 5000:
        plt.plot(out)
        legend.append(itr)

    # every smoothed curve is kept, plotted or not
    outs.append(out)
    

plt.title('Training reward over multiple runs')
plt.xlabel('Epoch')
plt.ylabel('Cumulative reward')
plt.legend(legend)
plt.show()