# DDPG for Pendulum environment

In [None]:
!pip uninstall torch
!pip install torch==1.4.0

Found existing installation: torch 1.10.0+cu111
Uninstalling torch-1.10.0+cu111:
  Would remove:
    /usr/local/bin/convert-caffe2-to-onnx
    /usr/local/bin/convert-onnx-to-caffe2
    /usr/local/bin/torchrun
    /usr/local/lib/python3.7/dist-packages/caffe2/*
    /usr/local/lib/python3.7/dist-packages/torch-1.10.0+cu111.dist-info/*
    /usr/local/lib/python3.7/dist-packages/torch/*
Proceed (y/n)? y

[31mERROR: Operation cancelled by user[0m
Collecting torch==1.4.0
  Downloading torch-1.4.0-cp37-cp37m-manylinux1_x86_64.whl (753.4 MB)
[K     |████████████████████████████████| 753.4 MB 6.7 kB/s 
[?25hInstalling collected packages: torch
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchvision 0.11.1+cu111 requires torch==1.10.0, but you have torch 1.4.0 which is incompatible.
torchtext 0.11.0 requires torch==1.10.0, but you have torch 1.4.0 which is i

In [None]:
# importing the dependencies
import gym
import numpy as np

import plotly.graph_objs as go
import plotly.offline as py

# Pytorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.multiprocessing as mp
from torch.distributions import Normal

import math
import random

## Creating the replay buffer and networks

In [None]:
class ReplayBuffer:
    """Ring buffer of (state, action, reward, next_state, done) transitions.

    Once `capacity` transitions have been stored, each new push overwrites the
    oldest slot. Sampling is uniform without replacement via `random.sample`.
    """

    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []
        self.position = 0

    def push(self, state, action, reward, next_state, done):
        """Store one transition at the current write position and advance it."""
        slot = self.position
        transition = (state, action, reward, next_state, done)
        if len(self.buffer) < self.capacity:
            # Still filling up: grow the list by one slot before writing into it.
            self.buffer.append(None)
        self.buffer[slot] = transition
        # Wrap around at capacity; int() keeps the index usable when capacity is a float.
        self.position = int((slot + 1) % self.capacity)

    def sample(self, batch_size):
        """Draw `batch_size` distinct transitions and return them field by field.

        Returns five arrays (state, action, reward, next_state, done), each built
        by stacking that field across the sampled transitions.
        Raises ValueError (from `random.sample`) if fewer than `batch_size`
        transitions are stored.
        """
        picked = random.sample(self.buffer, batch_size)
        # zip(*picked) regroups the list of 5-tuples into 5 per-field tuples.
        per_field = zip(*picked)
        state, action, reward, next_state, done = (np.stack(field) for field in per_field)
        return state, action, reward, next_state, done

    def __len__(self):
        """Number of transitions currently stored (never more than capacity)."""
        return len(self.buffer)

In [None]:
class ActorNetwork(nn.Module):
    """Deterministic policy: a 3-layer MLP mapping a state to an action vector.

    Args:
        input_dim: size of the state vector.
        output_dim: size of the action vector.
        hidden_dim: width of the two hidden layers.
        init_w: the output layer's weights and bias are drawn from
            U(-init_w, init_w).

    The output is a plain linear layer (no tanh), so actions are unbounded here.
    """

    def __init__(self, input_dim, output_dim, hidden_dim, init_w=3e-3):
        super(ActorNetwork, self).__init__()
        self.action_dim = output_dim

        self.linear1 = nn.Linear(input_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, hidden_dim)
        self.linear3 = nn.Linear(hidden_dim, output_dim)  # output dim = dim of action

        # weights initialization
        self.linear3.weight.data.uniform_(-init_w, init_w)
        self.linear3.bias.data.uniform_(-init_w, init_w)

    def _device(self):
        """Device the network's parameters live on.

        Used instead of a notebook-level `device` variable so the class works
        regardless of cell order or where the module has been moved.
        """
        return next(self.parameters()).device

    def forward(self, state):
        """Return the (unbounded) action for a batch of states of shape (N, input_dim)."""
        x = F.relu(self.linear1(state))
        x = F.relu(self.linear2(x))
        return self.linear3(x)

    def select_action(self, state, noise_scale=1.0):
        """Pick a noisy action for one state while interacting with the environment.

        Args:
            state: a single state (array-like of length input_dim).
            noise_scale: multiplier on the N(0, 1) exploration noise.

        Returns:
            numpy array of shape (action_dim,), on the CPU, detached from autograd.
        """
        dev = self._device()
        state = torch.FloatTensor(state).unsqueeze(0).to(dev)  # (1, input_dim)
        # No gradients are needed when acting, so skip building the autograd graph.
        with torch.no_grad():
            action = self.forward(state)
            noise = noise_scale * Normal(0, 1).sample(action.shape).to(dev)
            action = action + noise
        return action.cpu().numpy()[0]

    def sample_action(self, action_range=1.):
        """Return a random action that ignores the network: action_range * N(0, 1).

        Note that the sample is Gaussian, so `action_range` scales the standard
        deviation; the result is not bounded by it.
        """
        normal = Normal(0, 1)
        random_action = action_range * normal.sample((self.action_dim,))

        return random_action.cpu().numpy()

    def evaluate_action(self, state, noise_scale=0.0):
        """Compute actions for a batch of states with gradients flowing through.

        Args:
            state: tensor of shape (N, input_dim), already on the network's device.
            noise_scale: multiplier on N(0, 1) noise added to the action
                (0.0 by default, i.e. the deterministic policy output).

        Returns:
            tensor of shape (N, action_dim) attached to the autograd graph.
        """
        action = self.forward(state)
        noise = noise_scale * Normal(0, 1).sample(action.shape).to(action.device)
        # Out-of-place add: avoids mutating a tensor that is part of the autograd graph.
        return action + noise

In [None]:
class QNetwork(nn.Module):
    """Critic: a 3-layer MLP that scores a (state, action) pair with one Q-value.

    Args:
        input_dim: combined size of the state and action vectors.
        hidden_dim: width of the two hidden layers.
        init_w: the output layer's weights and bias are drawn from
            U(-init_w, init_w).
    """

    def __init__(self, input_dim, hidden_dim, init_w=3e-3):
        super().__init__()

        self.linear1 = nn.Linear(input_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, hidden_dim)
        self.linear3 = nn.Linear(hidden_dim, 1)

        # Small uniform init for the output layer (weight first, then bias).
        for tensor in (self.linear3.weight, self.linear3.bias):
            tensor.data.uniform_(-init_w, init_w)

    def forward(self, state, action):
        """Return Q(state, action) with shape (N, 1) for batches of N samples."""
        # Concatenate along the feature axis; axis 0 indexes the samples.
        hidden = torch.cat([state, action], 1)
        for layer in (self.linear1, self.linear2):
            hidden = F.relu(layer(hidden))
        return self.linear3(hidden)

In [None]:
class DDPG():
    """DDPG agent: online/target critic and online/target actor plus their optimizers.

    Args:
        replay_buffer: object exposing `sample(batch_size)` that returns
            (state, action, reward, next_state, done) arrays.
        state_dim: size of the state vector.
        action_dim: size of the action vector.
        hidden_dim: hidden width used for all four networks.

    All networks are placed on the notebook-level `device`.
    """

    def __init__(self, replay_buffer, state_dim, action_dim, hidden_dim):
        self.replay_buffer = replay_buffer
        self.qnet = QNetwork(state_dim+action_dim, hidden_dim).to(device)
        self.target_qnet = QNetwork(state_dim+action_dim, hidden_dim).to(device)
        self.policy_net = ActorNetwork(state_dim, action_dim, hidden_dim).to(device)
        self.target_policy_net = ActorNetwork(state_dim, action_dim, hidden_dim).to(device)

        print('Q network: ', self.qnet)
        print('Policy network: ', self.policy_net)

        # Start BOTH target networks as exact copies of their online counterparts.
        # (Copying only the critic would leave the target actor with unrelated
        # random weights, so early TD targets would use a different policy.)
        self.target_qnet.load_state_dict(self.qnet.state_dict())
        self.target_policy_net.load_state_dict(self.policy_net.state_dict())

        self.q_criterion = nn.MSELoss()
        q_lr = 8e-4
        policy_lr = 8e-4
        self.update_cnt = 0  # number of update() calls so far; drives delayed target updates

        self.q_optimizer = optim.Adam(self.qnet.parameters(), lr=q_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr)

    def target_soft_update(self, net, target_net, soft_tau):
        """Polyak-average `net` into `target_net` in place and return `target_net`.

        Each target parameter becomes (1 - soft_tau) * target + soft_tau * online.
        """
        for target_param, param in zip(target_net.parameters(), net.parameters()):
            target_param.data.copy_(  # copy data value into target parameters
                target_param.data * (1.0 - soft_tau) + param.data * soft_tau
            )

        return target_net

    def update(self, batch_size, reward_scale=10.0, gamma=0.99, soft_tau=1e-2, policy_up_itr=10, target_update_delay=3, warmup=True):
        """Run one critic step and one actor step on a sampled mini-batch.

        Args:
            batch_size: number of transitions to sample from the replay buffer.
            gamma: discount factor used in the TD target.
            soft_tau: interpolation factor for the target-network soft update.
            target_update_delay: target networks are soft-updated once every
                this many calls.
            reward_scale, policy_up_itr, warmup: accepted for interface
                compatibility; not used by this method.

        Returns:
            (q_loss, policy_loss) as 0-d numpy arrays.
        """
        self.update_cnt += 1
        state, action, reward, next_state, done = self.replay_buffer.sample(batch_size)

        state      = torch.FloatTensor(state).to(device)
        next_state = torch.FloatTensor(next_state).to(device)
        action     = torch.FloatTensor(action).to(device)
        # reward/done are sampled as 1-D arrays; add a column axis to match the critic's (N, 1) output.
        reward     = torch.FloatTensor(reward).unsqueeze(1).to(device)
        done       = torch.FloatTensor(np.float32(done)).unsqueeze(1).to(device)

        # ---- critic update ----
        # The TD target is a constant w.r.t. the critic parameters, so build it without a graph.
        with torch.no_grad():
            new_next_action = self.target_policy_net.evaluate_action(next_state)
            target_q = reward + (1 - done) * gamma * self.target_qnet(next_state, new_next_action)

        predict_q = self.qnet(state, action)
        q_loss = self.q_criterion(predict_q, target_q)
        self.q_optimizer.zero_grad()
        q_loss.backward()
        self.q_optimizer.step()

        # ---- actor update ----
        # Build the actor loss only AFTER the critic step. Building it before and
        # back-propagating after would differentiate through critic weights that
        # the optimizer has since modified in place: stale gradients, and an
        # in-place-modification error on PyTorch versions that check for it.
        new_action = self.policy_net.evaluate_action(state)
        policy_loss = -torch.mean(self.qnet(state, new_action))
        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()

        # ---- delayed soft update of both target networks ----
        if self.update_cnt % target_update_delay == 0:
            self.target_qnet = self.target_soft_update(self.qnet, self.target_qnet, soft_tau)
            self.target_policy_net = self.target_soft_update(self.policy_net, self.target_policy_net, soft_tau)

        return q_loss.detach().cpu().numpy(), policy_loss.detach().cpu().numpy()


In [None]:
class NormalizedActions(gym.ActionWrapper):  # gym env wrapper
    """Lets the agent act in [-1, 1] while the wrapped action space uses [low, high]."""

    def action(self, action):
        """Affinely map an action from [-1, 1] to [low, high], then clip to those bounds."""
        low  = self.action_space.low
        high = self.action_space.high

        action = low + (action + 1.0) * 0.5 * (high - low)
        # Agent outputs may fall outside [-1, 1]; clip so the result stays in bounds.
        action = np.clip(action, low, high)

        return action

    def _reverse_action(self, action):
        """Inverse of `action`: map from [low, high] back to the normalised [-1, 1] range."""
        low  = self.action_space.low
        high = self.action_space.high

        action = 2 * (action - low) / (high - low) - 1
        # The result lives in normalised space, so clip to [-1, 1], not to the env bounds.
        action = np.clip(action, -1.0, 1.0)

        return action

In [None]:
# Initialization

# Seed every RNG the experiment draws from so a fresh Run All is repeatable:
# `random` (replay sampling), numpy, torch (weight init + exploration noise) and the env.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

env = NormalizedActions(gym.make("Pendulum-v0"))
env.seed(SEED)
action_dim = env.action_space.shape[0]
state_dim  = env.observation_space.shape[0]

# Device settings: first CUDA device when available, otherwise CPU
device_idx = 0
device = torch.device("cuda:" + str(device_idx) if torch.cuda.is_available() else "cpu")

hidden_dim = 512
explore_steps = 0  # for random exploration
batch_size = 64

# Capacity is a count of transitions, so keep it an int rather than the float 1e6.
replay_buffer_size = int(1e6)
replay_buffer = ReplayBuffer(replay_buffer_size)

# Debugging aid: makes autograd report the forward op behind a failing backward.
# It adds overhead to every backward pass; consider disabling it for long runs.
torch.autograd.set_detect_anomaly(True)
alg = DDPG(replay_buffer, state_dim, action_dim, hidden_dim)

Q network:  QNetwork(
  (linear1): Linear(in_features=4, out_features=512, bias=True)
  (linear2): Linear(in_features=512, out_features=512, bias=True)
  (linear3): Linear(in_features=512, out_features=1, bias=True)
)
Policy network:  ActorNetwork(
  (linear1): Linear(in_features=3, out_features=512, bias=True)
  (linear2): Linear(in_features=512, out_features=512, bias=True)
  (linear3): Linear(in_features=512, out_features=1, bias=True)
)


In [None]:
# Per-episode mean losses, filled in by the training loop (one entry per episode).
q_loss_list, policy_loss_list = [], []

In [None]:
# hyper-parameters
max_episodes = 1000
max_steps = 100
frame_idx = 0  # total environment steps taken across all episodes
rewards = []


def _mean_or_nan(values):
    """Mean of `values`, or NaN for an empty list (avoids numpy's empty-slice warning)."""
    return np.mean(values) if len(values) else np.nan


# NOTE: q_loss_list / policy_loss_list are created in the previous cell and are only
# appended to here, so re-running this cell alone extends the existing histories.
for i_episode in range(max_episodes):
    episode_q_loss = []
    episode_policy_loss = []
    state = env.reset()
    episode_reward = 0

    for step in range(max_steps):
        # Random actions for the first `explore_steps` frames, the noisy policy afterwards.
        if frame_idx > explore_steps:
            action = alg.policy_net.select_action(state)
        else:
            action = alg.policy_net.sample_action(action_range=1.)
        next_state, reward, done, _ = env.step(action)
        replay_buffer.push(state, action, reward, next_state, done)

        state = next_state
        episode_reward += reward
        frame_idx += 1

        # Start learning once the buffer holds more than one batch of transitions.
        if len(replay_buffer) > batch_size:
            q_loss, policy_loss = alg.update(batch_size)
            episode_q_loss.append(float(q_loss))
            episode_policy_loss.append(float(policy_loss))

        if done:
            break

    # Record this episode BEFORE reporting, so the running averages below are never
    # taken over an empty list and always include the episode being printed.
    rewards.append(episode_reward)
    q_loss_list.append(_mean_or_nan(episode_q_loss))
    policy_loss_list.append(_mean_or_nan(episode_policy_loss))

    if i_episode % 20 == 0:
        print('Eps: ', i_episode, '| Reward: ', episode_reward, '| Loss: ', np.average(q_loss_list), np.average(policy_loss_list))


Mean of empty slice.


invalid value encountered in double_scalars



Eps:  0 | Reward:  -707.7538796139933 | Loss:  nan nan
Eps:  20 | Reward:  -559.5220802985062 | Loss:  0.8921281498346687 22.587422432278668
Eps:  40 | Reward:  -261.32276963485845 | Loss:  0.6622356796251714 37.86487090032055
Eps:  60 | Reward:  -263.24580053241584 | Loss:  0.7016236407400488 46.84928929085701
Eps:  80 | Reward:  -127.54346012515659 | Loss:  0.8901108610735328 50.90110911759116
Eps:  100 | Reward:  -0.7628016124101397 | Loss:  1.2144002501780582 52.55816235491735
Eps:  120 | Reward:  -222.30829755230494 | Loss:  1.527118604366547 52.92114783578698
Eps:  140 | Reward:  -118.91284353997135 | Loss:  1.7790868839064375 52.36241415219293
Eps:  160 | Reward:  -238.5387408929167 | Loss:  1.9661517090738312 51.35658767203677
Eps:  180 | Reward:  -245.80925148187225 | Loss:  2.122614698328618 50.2009689713954
Eps:  200 | Reward:  -235.31657452914013 | Loss:  2.222916174696127 49.037285120663554
Eps:  220 | Reward:  -244.6662705770475 | Loss:  2.31529401069856 47.84279689332348

In [None]:
# Sanity check: number of recorded episode rewards (one entry is appended per episode)
len(rewards)

1000

In [None]:
# Total reward collected in each training episode, plotted against the episode index.
episodes = np.arange(len(rewards))
reward_trace = go.Scatter(x=episodes, y=rewards)
data = [reward_trace]
layout = go.Layout(
    title='Rewards for DDPG agent on Pendulum-v0',
    xaxis_title='No. of episodes',
    yaxis_title='Rewards',
    width=1200,
    height=600,
)
figure = go.Figure(data=data, layout=layout)
py.iplot(figure)

In [None]:
from plotly.subplots import make_subplots

# Side-by-side curves of the per-episode mean critic (Q) loss and actor (policy) loss.
xaxis1 = np.arange(len(q_loss_list))
xaxis2 = np.arange(len(policy_loss_list))

fig = make_subplots(rows=1, cols=2, subplot_titles=('Q Loss', 'Policy Loss'))

qloss_curve = go.Scatter(x=xaxis1, y=q_loss_list, mode='lines')
policy_loss_curve = go.Scatter(x=xaxis2, y=policy_loss_list, mode='lines')

# Q loss goes in the left subplot (col 1), policy loss in the right one (col 2).
fig.add_traces([qloss_curve, policy_loss_curve], rows=[1, 1], cols=[1, 2])
fig.update_xaxes(title_text='Episodes', row=1, col=1)
fig.update_xaxes(title_text='Episodes', row=1, col=2)

# Label the y-axis of BOTH subplots (the second call must target col=2, not col=1 again).
fig.update_yaxes(title_text='Loss', row=1, col=1)
fig.update_yaxes(title_text='Loss', row=1, col=2)

fig.update_layout(title='Loss curves for DDPG Agent on Pendulum-v0')
py.iplot(fig)

In [None]:
# Persist the per-episode training curves with np.save:
# the rewards first, then the two loss histories.
for filename, series in (
    ('Rewards_DDPG', rewards),
    ('QLoss_DDPG', q_loss_list),
    ('PolicyLoss_DDPG', policy_loss_list),
):
    np.save(filename, series)