In [0]:
!pip install pybullet

Collecting pybullet
  Downloading https://files.pythonhosted.org/packages/d8/ac/a422ab8d1c57ab3f43e573b5a5f532e6afd348d81308fe66a1ecb691548e/pybullet-2.7.1-cp36-cp36m-manylinux1_x86_64.whl (95.0MB)
     |████████████████████████████████| 95.0MB 55kB/s 
Installing collected packages: pybullet
Successfully installed pybullet-2.7.1


In [0]:
import os
import math 
import numpy as np
import matplotlib.pyplot as plt
import torch
import gym
import pybullet_envs
import torch.nn as nn
from gym import wrappers
import torch.nn.functional as F
from collections import deque
from torch.autograd import Variable

## Step 1: Experience replay buffer

In [0]:
class ReplayBuffer(object):
  """Fixed-capacity store of environment transitions.

  Each transition is a 5-tuple ``(state, next_state, action, reward, done)``.
  Once ``max_size`` transitions are held, each new one overwrites the oldest.
  """

  def __init__(self,max_size=1e6):
    """Create an empty buffer.

    Args:
      max_size: Maximum number of transitions to keep. A float such as
        ``1e6`` is accepted and truncated to an int.

    Raises:
      ValueError: If ``max_size`` is smaller than 1.
    """
    # Keep the capacity integral so the write pointer can index the list directly.
    self.max_size=int(max_size)
    if self.max_size<1:
      raise ValueError("max_size must be at least 1")
    self.storage=[]
    self.ptr=0

  def add(self,transition):
    """Store one transition, overwriting the oldest entry when the buffer is full.

    Args:
      transition: Tuple ``(state, next_state, action, reward, done)``.
    """
    if len(self.storage)>=self.max_size:
      self.storage[self.ptr]=transition
      self.ptr=(self.ptr+1) % self.max_size
    else:
      self.storage.append(transition)

  def sample(self,batch_size):
    """Draw ``batch_size`` transitions uniformly at random, with replacement.

    Args:
      batch_size: Number of transitions to draw.

    Returns:
      Tuple of NumPy arrays ``(states, next_states, actions, rewards, dones)``.
      ``rewards`` and ``dones`` are reshaped to ``(batch_size, 1)``.

    Raises:
      ValueError: If the buffer holds no transitions.
    """
    if not self.storage:
      raise ValueError("cannot sample from an empty replay buffer")
    ind=np.random.randint(0,len(self.storage),batch_size)
    batch_state,batch_next_state,batch_action,batch_reward,batch_dones=[],[],[],[],[]
    for i in ind:
      state,next_state,action,reward,done=self.storage[i]
      # np.asarray converts without copying when the value is already an array.
      batch_state.append(np.asarray(state))
      batch_next_state.append(np.asarray(next_state))
      batch_action.append(np.asarray(action))
      batch_reward.append(np.asarray(reward))
      batch_dones.append(np.asarray(done))
    return (
      np.array(batch_state),
      np.array(batch_next_state),
      np.array(batch_action),
      np.array(batch_reward).reshape(-1,1),
      np.array(batch_dones).reshape(-1,1),
    )



## Step 2: Actor network (policy)

In [0]:
class Actor(nn.Module):
  """Policy network that maps a state to a bounded action.

  Two ReLU hidden layers (400 and 300 units) feed an output layer whose
  tanh activation is multiplied by ``max_action``.
  """

  def __init__(self,state_dims,action_dims,max_action):
    """Build the three linear layers.

    Args:
      state_dims: Size of the state vector fed to the first layer.
      action_dims: Size of the action vector produced by the last layer.
      max_action: Factor applied to the tanh output.
    """
    super().__init__()
    self.layer1=nn.Linear(state_dims,400)
    self.layer2=nn.Linear(400,300)
    self.layer3=nn.Linear(300,action_dims)
    self.max_action=max_action

  def forward(self,x):
    """Return the action for the batch of states ``x``."""
    activation=x
    for hidden_layer in (self.layer1,self.layer2):
      activation=F.relu(hidden_layer(activation))
    squashed=torch.tanh(self.layer3(activation))
    return self.max_action*squashed

## Step 3: Twin critic networks

In [0]:
class Critic(nn.Module):
  """Pair of independent Q-networks evaluated on the same (state, action) input.

  Each network has two ReLU hidden layers (400 and 300 units) and a linear
  output producing one Q-value per sample. ``layer1``-``layer3`` form the
  first critic and ``layer4``-``layer6`` the second.
  """

  def __init__(self,state_dims,action_dims):
    """Build both critics.

    Args:
      state_dims: Size of the state vector.
      action_dims: Size of the action vector.
    """
    super(Critic,self).__init__()
    # First critic.
    self.layer1=nn.Linear(state_dims+action_dims,400)
    self.layer2=nn.Linear(400,300)
    # A Q-value is a scalar per sample, so the output width is 1.
    self.layer3=nn.Linear(300,1)

    # Second critic.
    self.layer4=nn.Linear(state_dims+action_dims,400)
    self.layer5=nn.Linear(400,300)
    self.layer6=nn.Linear(300,1)

  def _q1(self,xu):
    """Run the first critic on an already concatenated state-action batch."""
    x1=F.relu(self.layer1(xu))
    x1=F.relu(self.layer2(x1))
    return self.layer3(x1)

  def _q2(self,xu):
    """Run the second critic on an already concatenated state-action batch."""
    x2=F.relu(self.layer4(xu))
    x2=F.relu(self.layer5(x2))
    return self.layer6(x2)

  def forward(self,x,u):
    """Return the Q-value estimates of both critics.

    Args:
      x: Batch of states, concatenated with ``u`` along dimension 1.
      u: Batch of actions.

    Returns:
      Tuple ``(q1, q2)``, each with one column.
    """
    # Both critics must see the same untouched state-action input.
    xu=torch.cat([x,u],1)
    return self._q1(xu),self._q2(xu)

  def Q1(self,x,u):
    """Return only the first critic's Q-value estimate for ``(x, u)``."""
    xu=torch.cat([x,u],1)
    return self._q1(xu)

In [0]:

# Run on the GPU when one is available, otherwise on the CPU.
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
class T3D(object):
  """Actor-critic agent with twin critics, target networks and delayed policy updates.

  Holds an actor and a critic, a target copy of each, and one Adam optimizer
  per trained network.
  """

  def __init__(self,state_dims,action_dims,max_action):
    """Build the networks, their target copies and the optimizers.

    Args:
      state_dims: Size of the state vector.
      action_dims: Size of the action vector.
      max_action: Bound used by the actor and for clamping target actions.
    """
    self.actor=Actor(state_dims,action_dims,max_action).to(device)
    self.actor_target=Actor(state_dims,action_dims,max_action).to(device)
    # state_dict is a method; it has to be called to obtain the weights.
    self.actor_target.load_state_dict(self.actor.state_dict())
    self.actor_optimizer=torch.optim.Adam(self.actor.parameters())

    self.critic=Critic(state_dims,action_dims).to(device)
    self.critic_target=Critic(state_dims,action_dims).to(device)
    self.critic_target.load_state_dict(self.critic.state_dict())
    self.critic_optimizer=torch.optim.Adam(self.critic.parameters())
    self.max_action=max_action

  def select_action(self,state):
    """Return the actor's action for a single state as a flat NumPy array.

    Args:
      state: Array-like object exposing ``reshape``; it is reshaped to one row.
    """
    state=torch.Tensor(state.reshape(1,-1)).to(device)
    return self.actor(state).cpu().data.numpy().flatten()

  def _soft_update(self,source,target,tau):
    """Move each parameter of ``target`` a fraction ``tau`` towards ``source``."""
    for param,target_param in zip(source.parameters(),target.parameters()):
      target_param.data.copy_(tau*param.data+(1-tau)*target_param.data)

  def train(self,replay_buffer,iterations,batch_size=100,discount=0.99,tau=0.005,policy_noise=0.2,noise_clip=0.5,policy_freq=2):
    """Run ``iterations`` gradient updates on batches drawn from ``replay_buffer``.

    Args:
      replay_buffer: Object whose ``sample(batch_size)`` returns
        ``(states, next_states, actions, rewards, dones)`` arrays.
      iterations: Number of update steps to perform.
      batch_size: Number of transitions sampled per step.
      discount: Factor applied to the bootstrapped target value.
      tau: Interpolation factor for the target-network updates.
      policy_noise: Standard deviation of the noise added to target actions.
      noise_clip: Absolute bound applied to that noise.
      policy_freq: The actor and the target networks are updated once every
        ``policy_freq`` steps.
    """
    for it in range(iterations):
      # STEP 4: sample a batch of transitions and move it to the device.
      batch_state,batch_next_state,batch_action,batch_reward,batch_dones=replay_buffer.sample(batch_size)
      state=torch.Tensor(batch_state).to(device)
      next_state=torch.Tensor(batch_next_state).to(device)
      action=torch.Tensor(batch_action).to(device)
      reward=torch.Tensor(batch_reward).to(device)
      done=torch.Tensor(batch_dones).to(device)

      # STEP 5: the target actor proposes the next action.
      next_action=self.actor_target.forward(next_state)

      # STEP 6: add clipped Gaussian noise and keep the action within bounds.
      noise=torch.Tensor(batch_action).data.normal_(0,policy_noise).to(device)
      noise=noise.clamp(-noise_clip,noise_clip)
      next_action=(next_action+noise).clamp(-self.max_action,self.max_action)

      # STEP 7: both target critics evaluate the next state-action pair.
      target_Q1,target_Q2=self.critic_target(next_state,next_action)

      # STEP 8: keep the element-wise minimum of the two estimates.
      target_Q=torch.min(target_Q1,target_Q2)

      # STEP 9: bootstrapped target; detached so no gradient reaches the target networks.
      target_Q=reward+((1-done)*discount*target_Q).detach()

      # STEP 10: both critics evaluate the sampled state-action pair.
      current_Q1,current_Q2=self.critic(state,action)

      # STEP 11: sum of both critics' mean squared errors against the shared target.
      critic_loss=F.mse_loss(current_Q1,target_Q)+F.mse_loss(current_Q2,target_Q)

      # STEP 12: one optimizer step on the critics.
      self.critic_optimizer.zero_grad()
      critic_loss.backward()
      self.critic_optimizer.step()

      # STEP 13: delayed actor update, maximising the first critic's estimate.
      if it % policy_freq==0:
        actor_loss=-self.critic.Q1(state,self.actor(state)).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # STEP 14: soft-update the actor target on the same delayed schedule.
        self._soft_update(self.actor,self.actor_target,tau)

        # STEP 15: soft-update the critic target on the same delayed schedule.
        self._soft_update(self.critic,self.critic_target,tau)
