In [1]:
import os
import time
import random
import numpy as np
import matplotlib.pyplot as plt
import gym
import pybullet_envs
import torch
import torch.nn as nn
import torch.nn.functional as F
from gym import wrappers
from torch.autograd import Variable
from collections import deque



  logger.warn(


In [2]:
class ReplayBuffer(object):
    
    def __init__(self, max_size=1000000):
        self.storage = []
        self.max_size = max_size
        self.ptr = 0
        
    def add(self, transition):
        if len(self.storage) >= self.max_size:
            self.storage[int(self.ptr)] = transition
            self.ptr = (self.ptr + 1) % self.max_size # This is a reset to set self.ptr back to 0 when it hits max size.
        else:
            self.storage.append(transition)
            
    def sample(self, batch_size):
        ind = np.random.randint(0, len(self.storage), batch_size)
        batch_states, batch_next_states, batch_actions, batch_rewards, batch_dones = [], [], [], [], []
        for i in ind:
            state, next_state, action, reward, done = self.storage[i]
            batch_states.append(np.array(state, copy=False))
            batch_next_states.append(np.array(next_state, copy=False))
            batch_actions.append(np.array(action, copy=False))
            batch_rewards.append(np.array(reward, copy=False))
            batch_dones.append(np.array(done, copy=False))
            
        batch_states = np.array(batch_states)
        batch_next_states = np.array(batch_next_states)
        batch_actions = np.array(batch_actions)
        batch_rewards = np.array(batch_rewards).reshape(-1, 1)
        batch_dones = np.array(batch_dones)
        
        return batch_states, batch_next_states, batch_actions, batch_rewards, batch_dones
    
    def can_sample(self, batch_size):
        if len(self.storage) > batch_size * 10:
            return True
        else:
            return False
    

In [3]:
class Actor(nn.Module):
    
    def __init__(self, state_dim, action_dim, max_action):
        super(Actor, self).__init__()
        self.layer_1 = nn.Linear(state_dim, 400)
        self.layer_2 = nn.Linear(400, 300)
        self.layer_3 = nn.Linear(300, action_dim)
        self.max_action = max_action
        
    def forward(self, x):
        x = F.relu(self.layer_1(x))
        x = F.relu(self.layer_2(x))
        x = self.max_action * torch.tanh(self.layer_3(x))
        return x

    def save_the_model(self, weights_filename='models/actor_latest.pt'):
        # Take the default weights filename(latest.pt) and save it
        torch.save(self.state_dict(), weights_filename)


    def load_the_model(self, weights_filename='models/actor_latest.pt'):
        try:
            self.load_state_dict(torch.load(weights_filename))
            print(f"Successfully loaded weights file {weights_filename}")
        except:
            print(f"No weights file available at {weights_filename}")
    
                

In [4]:
class Critic(nn.Module):
    
    def __init__(self, state_dim, action_dim):
        super(Critic, self).__init__()
        # First Critic Network.
        self.layer_1 = nn.Linear(state_dim + action_dim, 400)
        self.layer_2 = nn.Linear(400, 300)
        self.layer_3 = nn.Linear(300, 1)
        
        # Second critic network
        self.layer_4 = nn.Linear(state_dim + action_dim, 400)
        self.layer_5 = nn.Linear(400, 300)
        self.layer_6 = nn.Linear(300, 1)

        
    def forward(self, x, u):
        
        
        
        xu = torch.cat([x, u], 1)
        
        # First critic forward prop
        x1 = F.relu(self.layer_1(xu))
        x1 = F.relu(self.layer_2(x1))
        x1 = self.layer_3(x1)
        
        # Second critic forward prop
        x2 = F.relu(self.layer_4(xu))
        x2 = F.relu(self.layer_5(x2))
        x2 = self.layer_6(x2)
        
        return x1, x2
    
    def Q1(self, x, u):
        
        xu = torch.cat([x, u], 1)
        
        x1 = F.relu(self.layer_1(xu))
        x1 = F.relu(self.layer_2(x1))
        x1 = self.layer_3(x1)
        return x1

    def save_the_model(self, weights_filename='models/critic_latest.pt'):
        # Take the default weights filename(latest.pt) and save it
        torch.save(self.state_dict(), weights_filename)


    def load_the_model(self, weights_filename='models/critic_latest.pt'):
        try:
            self.load_state_dict(torch.load(weights_filename))
            print(f"Successfully loaded weights file {weights_filename}")
        except:
            print(f"No weights file available at {weights_filename}")
    
    

In [5]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [6]:
class TD3(object):
    
    def __init__(self, state_dim, action_dim, max_action, device=None):
        self.device = device
        self.actor = Actor(state_dim, action_dim, max_action).to(self.device)
        self.actor_target = Actor(state_dim, action_dim, max_action).to(self.device)
        self.actor.load_the_model()
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters())
        
        self.critic = Critic(state_dim, action_dim).to(self.device)
        self.critic_target = Critic(state_dim, action_dim).to(self.device)
        self.critic.load_the_model()
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters())
        
        self.max_action = max_action
    
    def select_action(self, state):
        state = torch.Tensor(state.reshape(1, -1)).to(self.device)
        return self.actor(state).cpu().data.numpy().flatten()
    
    def train(self, replay_buffer: ReplayBuffer, epochs, batch_size=100, discount=0.99, tau=0.005, 
              policy_noise=0.2, noise_clip=0.5, policy_freq=2):
        
        for epoch in range(epochs):
            if replay_buffer.can_sample(batch_size):
                batch_states, batch_next_states, batch_actions, batch_rewards, batch_dones = replay_buffer.sample(batch_size=100)
                
                state  = torch.Tensor(batch_states).to(self.device)
                next_state  = torch.Tensor(batch_next_states).to(self.device)
                action  = torch.Tensor(batch_actions).to(self.device)
                reward  = torch.Tensor(batch_rewards).to(self.device)
                done  = torch.Tensor(batch_dones).to(self.device)
                
                # Step 5: From the next state s', the actor target plays the next action a'
                next_action = self.actor_target(next_state).to(self.device)
                
                # Step 6: Add Gaussian noise
                noise = torch.Tensor(batch_actions).data.normal_(0, policy_noise).to(self.device)
                noise = noise.clamp(-noise_clip, +noise_clip)
                next_action = (next_action + noise).clamp(-self.max_action, self.max_action)
                
                # Step 7: Get critic q value
                target_q1, target_q2 = self.critic_target(next_state, next_action)
                                    
                # Step 8: We keep the minimum of these two Q-values
                target_q = torch.min(target_q1, target_q2)  
                
                # Step 9: We get the final target of the two Critic models, which is Qt = r + y * min(Qt1, Qt2), where y is the discount factor.
                target_q = reward + ((1 - done) * discount * target_q).detach()
                
                # Step 10: The two critic models should take each the couple (s, a) as input and return two Q-Values(Q1 of s,a and Q2 of s,a) 
                current_q1, current_q2 = self.critic(state, action)

                # Step 11                
                critic_loss = F.mse_loss(current_q1, target_q) + F.mse_loss(current_q2, target_q)
                
                # Step 12: Compute the loss between the two critic models: Critic Loss = MSE_Loss(Q(s,a), Qt) + MSE_Loss(Q(s,a), Qt
                self.critic_optimizer.zero_grad()
                critic_loss.backward()
                self.critic_optimizer.step()
                
                # Step 13: Once every two iterations, update the actor model by performing gradient ascent on the output of the first critic model.
                if epoch % policy_freq == 0:
                    actor_loss = -self.critic.Q1(state, self.actor(state)).mean()
                    self.actor_optimizer.zero_grad()
                    actor_loss.backward()
                    self.actor_optimizer.step()
                    
                    # Step 14: Still once every two iterations, use Polyak averaging to update the target weights
                    for target_param, main_param in zip(self.actor_target.parameters(), self.actor.parameters()):
                        target_param.data.copy_(tau * main_param.data + (1.0 - tau) * target_param.data)

                    for target_param, main_param in zip(self.critic_target.parameters(), self.critic.parameters()):
                        target_param.data.copy_(tau * main_param.data + (1.0 - tau) * target_param.data)
                
              # Making a save method to save a trained model
    def save(self, filename, directory):
        torch.save(self.actor.state_dict(), '%s/%s_actor.pth' % (directory, filename))
        torch.save(self.critic.state_dict(), '%s/%s_critic.pth' % (directory, filename))
    
    # Making a load method to load a pre-trained model
    def load(self, filename, directory):
        self.actor.load_state_dict(torch.load('%s/%s_actor.pth' % (directory, filename)))
        self.critic.load_state_dict(torch.load('%s/%s_critic.pth' % (directory, filename)))

                        
                    

In [7]:
def evaluate_policy(env, policy, eval_episodes=10):
  avg_reward = 0.
  for _ in range(eval_episodes):
    obs = env.reset()
    done = False
    while not done:
      action = policy.select_action(np.array(obs))
      obs, reward, done, _ = env.step(action)
      avg_reward += reward
          

  avg_reward /= eval_episodes
  print ("---------------------------------------")
  print ("Average Reward over the Evaluation Step: %f" % (avg_reward))
  print ("---------------------------------------")
  return avg_reward

In [8]:
env_name = "AntBulletEnv-v0" # Name of a environment (set it to any Continous environment you want)
seed = 0 # Random seed number
start_timesteps = 1e4 # Number of iterations/timesteps before which the model randomly chooses an action, and after which it starts to use the policy network
eval_freq = 5e3 # How often the evaluation step is performed (after how many timesteps)
max_timesteps = 5e5 # Total number of iterations/timesteps
save_models = True # Boolean checker whether or not to save the pre-trained model
expl_noise = 0.1 # Exploration noise - STD value of exploration Gaussian noise
batch_size = 100 # Size of the batch
discount = 0.99 # Discount factor gamma, used in the calculation of the total discounted reward
tau = 0.005 # Target network update rate
policy_noise = 0.2 # STD of Gaussian noise added to the actions for the exploration purposes
noise_clip = 0.5 # Maximum value of the Gaussian noise added to the actions (policy)
policy_freq = 2 # Number of iterations to wait before the policy network (Actor model) is updated
file_name = "%s_%s_%s" % ("TD3", env_name, str(seed))


In [9]:
if not os.path.exists("./results"):
  os.makedirs("./results")
if save_models and not os.path.exists("./models"):
  os.makedirs("./models")

In [10]:
env = gym.make(env_name)

  from pkg_resources import parse_version
  logger.warn(
  logger.warn(
  logger.warn(


In [11]:
env.seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])

In [12]:
policy = TD3(state_dim, action_dim, max_action)

replay_buffer = ReplayBuffer()

evaluations = [evaluate_policy(env, policy)]

def mkdir(base, name):
    path = os.path.join(base, name)
    if not os.path.exists(path):
        os.makedirs(path)
    return path

work_dir = mkdir('exp', 'brs')
monitor_dir = mkdir(work_dir, 'monitor')
max_episode_steps = env._max_episode_steps
save_env_vid = False
if save_env_vid:
  env = wrappers.Monitor(env, monitor_dir, force = True)
  env.reset()

  logger.warn(
  logger.warn(
  logger.warn(


No weights file available at models/actor_latest.pt
No weights file available at models/critic_latest.pt
---------------------------------------
Average Reward over the Evaluation Step: 9.807990
---------------------------------------


In [13]:
total_timesteps = 0
timesteps_since_eval = 0
episode_num = 0
done = True
t0 = time.time()

In [14]:
# We start the main loop over 500,000 timesteps
while total_timesteps < max_timesteps:
  
  # If the episode is done
  if done:

    # If we are not at the very beginning, we start the training process of the model
    if total_timesteps != 0:
      print("Total Timesteps: {} Episode Num: {} Reward: {}".format(total_timesteps, episode_num, episode_reward))
      policy.train(replay_buffer, episode_timesteps, batch_size, discount, tau, policy_noise, noise_clip, policy_freq)

    # We evaluate the episode and we save the policy
    if timesteps_since_eval >= eval_freq:
      timesteps_since_eval %= eval_freq
      evaluations.append(evaluate_policy(env, policy))
      policy.save(file_name, directory="./models")
      np.save("./results/%s" % (file_name), evaluations)
    
    # When the training step is done, we reset the state of the environment
    obs = env.reset()
    
    
    # Set the Done to False
    done = False
    
    # Set rewards and episode timesteps to zero
    episode_reward = 0
    episode_timesteps = 0
    episode_num += 1
  
  # Before 10000 timesteps, we play random actions
  if total_timesteps < start_timesteps:
    action = env.action_space.sample()
  else: # After 10000 timesteps, we switch to the model
    action = policy.select_action(np.array(obs))
    # If the explore_noise parameter is not 0, we add noise to the action and we clip it
    if expl_noise != 0:
      action = (action + np.random.normal(0, expl_noise, size=env.action_space.shape[0])).clip(env.action_space.low, env.action_space.high)
  
  # The agent performs the action in the environment, then reaches the next state and receives the reward
  new_obs, reward, done, _ = env.step(action)
  
  # We check if the episode is done
  done_bool = 0 if episode_timesteps + 1 == env._max_episode_steps else float(done)
  
  # We increase the total reward
  episode_reward += reward
  
  # We store the new transition into the Experience Replay memory (ReplayBuffer)
  replay_buffer.add((obs, new_obs, action, reward, done_bool))

  # We update the state, the episode timestep, the total timesteps, and the timesteps since the evaluation of the policy
  obs = new_obs
  episode_timesteps += 1
  total_timesteps += 1
  timesteps_since_eval += 1

# We add the last policy evaluation to our list of evaluations and we save our model
evaluations.append(evaluate_policy(env, policy))
if save_models: policy.save("%s" % (file_name), directory="./models")
np.save("./results/%s" % (file_name), evaluations)

Total Timesteps: 721 Episode Num: 1 Reward: 381.1608412336469
Total Timesteps: 742 Episode Num: 2 Reward: 3.8410106292678385
Total Timesteps: 1742 Episode Num: 3 Reward: 513.3346754733018


  critic_loss = F.mse_loss(current_q1, target_q) + F.mse_loss(current_q2, target_q)


Total Timesteps: 2076 Episode Num: 4 Reward: 182.53297325859165
Total Timesteps: 3076 Episode Num: 5 Reward: 435.0931396498288
Total Timesteps: 3222 Episode Num: 6 Reward: 65.37650211748844
Total Timesteps: 4222 Episode Num: 7 Reward: 503.2689344251618
Total Timesteps: 4632 Episode Num: 8 Reward: 187.77519935786697
Total Timesteps: 4825 Episode Num: 9 Reward: 89.61508074495067
Total Timesteps: 5825 Episode Num: 10 Reward: 480.918111548183
---------------------------------------
Average Reward over the Evaluation Step: 143.611270
---------------------------------------
Total Timesteps: 6825 Episode Num: 11 Reward: 525.7071220947724
Total Timesteps: 7825 Episode Num: 12 Reward: 506.999570887151
Total Timesteps: 8825 Episode Num: 13 Reward: 393.33442745288454
Total Timesteps: 9346 Episode Num: 14 Reward: 241.2137639942901
Total Timesteps: 10202 Episode Num: 15 Reward: 435.72408383395646
---------------------------------------
Average Reward over the Evaluation Step: 158.598200
-----------