# Implementation TD3

# Importing the libraries

In [1]:
import os
import time
import random
import numpy as np
import matplotlib.pyplot as plt
import pybullet_envs
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
from gym import wrappers
from torch.autograd import Variable
from collections import deque

# Step 1: We initialize the Experience Replay Memory

In [2]:
class ReplayBuffer(object):
    """Fixed-capacity store of transitions that overwrites the oldest slots once full.

    Each transition is a 5-tuple ``(state, next_state, action, reward, done)``.
    """

    def __init__(self, max_size = 1e6):
        """Create an empty buffer.

        Args:
            max_size: Maximum number of transitions kept. Accepts an int or a
                float such as ``1e6``; it is stored as an int so that the write
                pointer always stays a valid list index.

        Raises:
            ValueError: If ``max_size`` is smaller than 1.
        """
        max_size = int(max_size)
        if max_size < 1:
            raise ValueError("max_size must be at least 1")
        self.storage = []
        self.max_size = max_size
        self.ptr = 0

    def add(self, transition):
        """Store a transition, replacing the slot at the write pointer once the buffer is full."""
        if len(self.storage) >= self.max_size:
            self.storage[self.ptr] = transition
            # Wrap around so the next write lands on the following slot
            self.ptr = (self.ptr + 1) % self.max_size
        else:
            self.storage.append(transition)

    def sample(self, batch_size):
        """Draw ``batch_size`` transitions uniformly at random (with replacement).

        Returns:
            Tuple of NumPy arrays ``(states, next_states, actions, rewards, dones)``;
            rewards and dones are reshaped to ``(batch_size, 1)``.

        Raises:
            ValueError: If the buffer is empty.
        """
        if not self.storage:
            raise ValueError("Cannot sample from an empty replay buffer")
        ind = np.random.randint(0, len(self.storage), size = batch_size)
        batch_states, batch_next_states, batch_actions, batch_rewards, batch_dones = [], [], [], [], []
        for i in ind:
            state, next_state, action, reward, done = self.storage[i]
            # np.asarray copies only when needed; np.array(..., copy=False) raises
            # under NumPy 2 whenever a copy cannot be avoided (e.g. Python scalars)
            batch_states.append(np.asarray(state))
            batch_next_states.append(np.asarray(next_state))
            batch_actions.append(np.asarray(action))
            batch_rewards.append(np.asarray(reward))
            batch_dones.append(np.asarray(done))
        return (
            np.array(batch_states),
            np.array(batch_next_states),
            np.array(batch_actions),
            np.array(batch_rewards).reshape(-1, 1),
            np.array(batch_dones).reshape(-1, 1),
        )

# Step 2: Build Neural Network for Actors

In [3]:
class Actor(nn.Module):
    """Policy network mapping a batch of states to a batch of actions.

    The output is ``tanh`` of the last linear layer multiplied by ``max_action``.
    """

    def __init__(self, state_dim, action_dim, max_action):
        super().__init__()
        # Two hidden layers (400 and 300 units) followed by the action head
        self.layer_1 = nn.Linear(state_dim, 400)
        self.layer_2 = nn.Linear(400, 300)
        self.layer_3 = nn.Linear(300, action_dim)
        self.max_action = max_action

    def forward(self, x):
        """Return the action computed for the states ``x``."""
        hidden = torch.relu(self.layer_1(x))
        hidden = torch.relu(self.layer_2(hidden))
        squashed = torch.tanh(self.layer_3(hidden))
        return self.max_action * squashed

# Step 3: Build Neural Network for Critics

In [4]:
class Critic(nn.Module):
    """Two independent Q-networks that both score a (state, action) pair.

    Layers 1-3 form the first network and layers 4-6 the second; each ends in
    a single output unit.
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()
        in_features = state_dim + action_dim
        # First twin network
        self.layer_1 = nn.Linear(in_features, 400)
        self.layer_2 = nn.Linear(400, 300)
        self.layer_3 = nn.Linear(300, 1)
        # Second twin network
        self.layer_4 = nn.Linear(in_features, 400)
        self.layer_5 = nn.Linear(400, 300)
        self.layer_6 = nn.Linear(300, 1)

    def forward(self, x, u):
        """Return the outputs of both networks for states ``x`` and actions ``u``."""
        joint = torch.cat((x, u), dim=1)
        # First network
        q_first = self.layer_3(torch.relu(self.layer_2(torch.relu(self.layer_1(joint)))))
        # Second network
        q_second = self.layer_6(torch.relu(self.layer_5(torch.relu(self.layer_4(joint)))))
        return q_first, q_second

    def q1(self, x, u):
        """Return only the first network's output for states ``x`` and actions ``u``."""
        joint = torch.cat((x, u), dim=1)
        return self.layer_3(torch.relu(self.layer_2(torch.relu(self.layer_1(joint)))))

# Step 4 to 15: Training Process

In [5]:
# Selecting the device: CUDA when torch reports it as available, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Building the whole Training Process into a class

class TD3(object):
    """TD3 agent: an actor, a twin critic, their target copies and two Adam optimizers.

    All networks and tensors are placed on the module-level ``device``.
    """

    def __init__(self, state_dim, action_dim, max_action):
        """Build the networks; each target network starts as a copy of its online network."""
        self.actor = Actor(state_dim, action_dim, max_action).to(device)
        self.actor_target = Actor(state_dim, action_dim, max_action).to(device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters())
        self.critic = Critic(state_dim, action_dim).to(device)
        self.critic_target = Critic(state_dim, action_dim).to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters())
        self.max_action = max_action

    def select_action(self, state):
        """Return the actor's action for one state as a flat NumPy array."""
        state = torch.Tensor(state.reshape(1,-1)).to(device)
        # Inference only: no graph is needed
        with torch.no_grad():
            action = self.actor(state)
        return action.cpu().numpy().flatten()

    def train(self, replay_buffer, iterations, batch_size=100, discount = .99, tau=.005, policy_noise=.2, noise_clip=.5, policy_freq=2):
        """Run ``iterations`` update steps on batches sampled from ``replay_buffer``.

        Args:
            replay_buffer: Object whose ``sample(batch_size)`` returns
                ``(states, next_states, actions, rewards, dones)`` arrays.
            iterations: Number of critic updates to perform.
            batch_size: Number of transitions per update.
            discount: Discount factor applied to the target Q-value.
            tau: Interpolation weight of the online parameters in the target update.
            policy_noise: Standard deviation of the noise added to the target action.
            noise_clip: Absolute bound applied to that noise.
            policy_freq: The actor and the target networks are updated every
                ``policy_freq`` iterations.
        """
        for it in range(iterations):

            # Step 4: sample a batch of transitions (s, s', a, r, done) from memory
            batch_states, batch_next_states, batch_actions, batch_rewards, batch_dones = replay_buffer.sample(batch_size)
            state = torch.Tensor(batch_states).to(device)
            next_state = torch.Tensor(batch_next_states).to(device)
            action = torch.Tensor(batch_actions).to(device)
            reward = torch.Tensor(batch_rewards).to(device)
            done = torch.Tensor(batch_dones).to(device)

            # Steps 5-9 build the regression target; nothing is back-propagated
            # through the target networks, so no graph is recorded
            with torch.no_grad():
                # Step 5: from the next state s', the actor target plays the next action a'
                next_action = self.actor_target(next_state)

                # Step 6: add clipped Gaussian noise to a' and clamp it to the supported action range
                noise = (torch.randn_like(action) * policy_noise).clamp(-noise_clip, noise_clip)
                next_action = (next_action + noise).clamp(-self.max_action, self.max_action)

                # Step 7: the two critic targets return Qt1(s', a') and Qt2(s', a')
                target_q1, target_q2 = self.critic_target(next_state, next_action)

                # Step 8: keep the minimum of the two estimates
                target_q = torch.min(target_q1, target_q2)

                # Step 9: final target; (1 - done) removes the bootstrap term on terminal transitions
                target_q = reward + (1 - done) * discount * target_q

            # Step 10: the two critics return Q1(s, a) and Q2(s, a)
            current_q1, current_q2 = self.critic(state, action)

            # Step 11: sum of the two MSE losses against the shared target
            critic_loss = F.mse_loss(current_q1, target_q) + F.mse_loss(current_q2, target_q)

            # Step 12: back-propagate the critic loss and update the critic
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

            # Step 13: every policy_freq iterations, update the actor by gradient ascent
            # on Q1, implemented as gradient descent on -Q1
            if it % policy_freq == 0:
                actor_loss = -self.critic.q1(state, self.actor(state)).mean()
                self.actor_optimizer.zero_grad()
                actor_loss.backward()
                self.actor_optimizer.step()

                # Step 14: update the actor target by Polyak averaging
                for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
                    target_param.data.copy_(tau*param.data + (1 - tau)*target_param.data)

                # Step 15: update the critic target by Polyak averaging
                for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
                    target_param.data.copy_(tau*param.data + (1 - tau)*target_param.data)

    # Making a save method to save a trained model
    def save(self, filename, directory):
        """Write ``<filename>_actor.pth`` and ``<filename>_critic.pth`` inside ``directory``."""
        # os.path.join puts the files inside the directory whether or not it ends with a separator
        torch.save(self.actor.state_dict(), os.path.join(directory, "%s_actor.pth" % filename))
        torch.save(self.critic.state_dict(), os.path.join(directory, "%s_critic.pth" % filename))

    # Making a load method to load a pre-trained model
    def load(self, filename, directory):
        """Load the weights written by ``save`` and copy them into the target networks."""
        # map_location lets weights saved on a GPU be loaded on a CPU-only machine
        self.actor.load_state_dict(torch.load(os.path.join(directory, "%s_actor.pth" % filename), map_location=device))
        self.critic.load_state_dict(torch.load(os.path.join(directory, "%s_critic.pth" % filename), map_location=device))
        # Keep the targets consistent with the loaded weights in case training resumes
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.critic_target.load_state_dict(self.critic.state_dict())

# Make a function that evaluates the policy by calculating its average reward over 10 episodes

In [6]:
def evaluate_policy(policy, eval_episodes = 10):
    """Play ``eval_episodes`` episodes on the module-level ``env`` and report the mean episode reward.

    Actions come from ``policy.select_action``. The mean is printed between
    two separator lines and returned.
    """
    reward_sum = 0.
    for _episode in range(eval_episodes):
        observation, finished = env.reset(), False
        while not finished:
            chosen_action = policy.select_action(np.array(observation))
            observation, step_reward, finished, _info = env.step(chosen_action)
            reward_sum += step_reward
    mean_reward = reward_sum / eval_episodes
    print("\n".join([
        "----------------------------------------",
        "Average Reward over the evaluation step: %f" % (mean_reward),
        "----------------------------------------",
    ]))
    return mean_reward

# Set parameters

In [7]:
env_name = "HalfCheetahBulletEnv-v0" # Name of the environment (any continuous-action environment can be used)
seed = 0 # Random seed number
# Step counts are kept as ints so the counters compared and reduced against them stay ints
start_timesteps = 10000 # Number of timesteps during which actions are sampled randomly, before the policy network is used
eval_freq = 5000 # How often the evaluation step is performed (in timesteps)
max_timesteps = 500000 # Total number of timesteps
save_models = True # Whether or not to save the trained model
expl_noise = 0.1 # Exploration noise - STD of the Gaussian noise added to the actions played in the environment
batch_size = 100 # Size of the batch
discount = 0.99 # Discount factor gamma, used in the calculation of the total discounted reward
tau = 0.005 # Target network update rate
policy_noise = 0.2 # STD of the Gaussian noise added to the target action during training (not the exploration noise)
noise_clip = 0.5 # Maximum absolute value of that noise
policy_freq = 2 # Number of iterations to wait before the policy network (Actor model) is updated

# Create a file name for the two saved models: the Actor and Critic models

In [8]:
# Tag shared by the saved model files and the results file: "TD3_<env_name>_<seed>"
file_name = "_".join(("TD3", env_name, str(seed)))
print ("\n".join([
    "---------------------------------------",
    "Settings: %s" % (file_name),
    "---------------------------------------",
]))

---------------------------------------
Settings: TD3_HalfCheetahBulletEnv-v0_0
---------------------------------------


# Create a folder inside which the trained models will be saved

In [9]:
# Make sure the output folders exist; exist_ok avoids the check-then-create race
os.makedirs("./results", exist_ok=True)
if save_models:
    os.makedirs("./pytorch_models", exist_ok=True)

# Create the PyBullet environment

### This is for installing gym

https://towardsdatascience.com/how-to-install-openai-gym-in-a-windows-environment-338969e24d30

In [10]:
# Instantiate the environment registered under env_name
env = gym.make(env_name)



# Set the seeds and get the necessary information on the states and actions in the chosen environment

In [11]:
# Seed every random source used during training
env.seed(seed)
# The action space samples from its own generator, so the warm-up actions drawn
# with env.action_space.sample() are only reproducible if it is seeded as well
env.action_space.seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)
# Sizes of the state and action vectors, and the upper bound of the first action component
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])

# Create the Network

In [12]:
# Build the TD3 agent from the dimensions and action bound read from the environment
policy = TD3(state_dim, action_dim, max_action)

# Create the Experience Replay memory

In [13]:
# Empty replay buffer created with the class's default max_size
replay_buffer = ReplayBuffer()

# Define a list where all the evaluation results over 10 episodes are stored

In [14]:
# Start the evaluation history with the score of the policy before any training
evaluations = [evaluate_policy(policy)]

----------------------------------------
Average Reward over the evaluation step: -1429.426654
----------------------------------------


# Create a new directory in which the final results (videos of the agent) will be stored

In [15]:
def mkdir(base, name):
    """Create the directory ``base/name`` (including missing parents) and return its path.

    An existing directory is left untouched. ``FileExistsError`` is raised if
    the path already exists but is not a directory.
    """
    path = os.path.join(base, name)
    # exist_ok removes the race between checking for the directory and creating it
    os.makedirs(path, exist_ok=True)
    return path
# Output folders: exp/brs and, inside it, the monitor folder
work_dir = mkdir("exp", "brs")
monitor_dir = mkdir(work_dir, "monitor")
# Read the episode step limit from the environment before it is (optionally) wrapped below
max_episode_steps = env._max_episode_steps
# Set to True to wrap the environment in gym's Monitor wrapper
save_env_vid = False
if save_env_vid:
    # NOTE(review): presumably Monitor records the episodes into monitor_dir - confirm against the installed gym version
    env = wrappers.Monitor(env, monitor_dir, force = True)
    env.reset()

# Initialize the variables

In [16]:
# Counters used by the training loop, all starting at zero
total_timesteps = timesteps_since_eval = episode_num = 0
# Starting in the "done" state makes the first loop iteration reset the environment
done = True
# Wall-clock time at the start of training
t0 = time.time()

# Training

In [17]:
# Start the main loop over max_timesteps (500,000) timesteps
while total_timesteps < max_timesteps:

    # If the episode is done
    if done:

        # Unless we are at the very beginning, train the model for as many iterations as the episode had steps
        if total_timesteps != 0:
            print("Total Timesteps: {} Episode Num: {} Reward: {}".format(total_timesteps, episode_num, episode_reward))
            policy.train(replay_buffer, episode_timesteps, batch_size, discount, tau, policy_noise, noise_clip, policy_freq)

        # Evaluate periodically and store the results so far
        if timesteps_since_eval >= eval_freq:
            timesteps_since_eval %= eval_freq
            evaluations.append(evaluate_policy(policy))
            # Only write model files when saving is enabled; the trailing separator
            # keeps the files inside the folder and matches the path used for loading
            if save_models:
                policy.save(file_name, directory="./pytorch_models/")
            np.save("./results/%s" % (file_name), evaluations)

        # When the training step is done, we reset the state of the environment
        obs = env.reset()

        # Set done to False
        done = False

        # Set the episode reward and episode timesteps to zero
        episode_reward = 0
        episode_timesteps = 0
        episode_num += 1

    # Before start_timesteps (10,000) timesteps, we play random actions
    if total_timesteps < start_timesteps:
        action = env.action_space.sample()
    else: # Afterwards we switch to the model
        action = policy.select_action(np.array(obs))

        # If the exploration noise is not 0, we add noise to the action and clip it
        if expl_noise != 0:
            action = (action + np.random.normal(0, expl_noise, size=env.action_space.shape[0])).clip(env.action_space.low, env.action_space.high)

    # The agent performs the action in the environment, then reaches the next state and receives the reward
    new_obs, reward, done, _ = env.step(action)

    # An episode cut off by the step limit is stored as not done, so the target keeps bootstrapping.
    # max_episode_steps was read from the environment before any Monitor wrapping.
    done_bool = 0 if episode_timesteps + 1 == max_episode_steps else float(done)

    # We increase the total reward
    episode_reward += reward

    # We store the new transition in the experience replay memory
    replay_buffer.add((obs, new_obs, action, reward, done_bool))

    # We update the state, the episode timestep, the total timesteps, and the timesteps since the last evaluation
    obs = new_obs
    episode_timesteps += 1
    total_timesteps += 1
    timesteps_since_eval += 1


# We add the last policy evaluation to our list of evaluations and we save our model
evaluations.append(evaluate_policy(policy))
if save_models:
    policy.save(file_name, directory="./pytorch_models/")
np.save("./results/%s" % (file_name), evaluations)

Total Timesteps: 1000 Episode Num: 1 Reward: -1082.1488380716285
Total Timesteps: 2000 Episode Num: 2 Reward: -1182.3137219299354
Total Timesteps: 3000 Episode Num: 3 Reward: -1234.4807485970498


KeyboardInterrupt: 

# Inference

In [None]:
class Actor(nn.Module):
    """Policy network mapping a batch of states to a batch of actions.

    The output is ``tanh`` of the last linear layer multiplied by ``max_action``.
    """

    def __init__(self, state_dim, action_dim, max_action):
        super().__init__()
        # Two hidden layers (400 and 300 units) followed by the action head
        self.layer_1 = nn.Linear(state_dim, 400)
        self.layer_2 = nn.Linear(400, 300)
        self.layer_3 = nn.Linear(300, action_dim)
        self.max_action = max_action

    def forward(self, x):
        """Return the action computed for the states ``x``."""
        hidden = torch.relu(self.layer_1(x))
        hidden = torch.relu(self.layer_2(hidden))
        squashed = torch.tanh(self.layer_3(hidden))
        return self.max_action * squashed
    
class Critic(nn.Module):
    """Two independent Q-networks that both score a (state, action) pair.

    Layers 1-3 form the first network and layers 4-6 the second; each ends in
    a single output unit.
    """

    def __init__(self, state_dim, action_dim):
        super(Critic, self).__init__()
        # First twin network
        self.layer_1 = nn.Linear(state_dim+action_dim, 400)
        self.layer_2 = nn.Linear(400, 300)
        self.layer_3 = nn.Linear(300, 1)
        # Second twin network
        self.layer_4 = nn.Linear(state_dim+action_dim, 400)
        self.layer_5 = nn.Linear(400, 300)
        self.layer_6 = nn.Linear(300, 1)

    def forward(self, x, u):
        """Return the outputs of both networks for states ``x`` and actions ``u``."""
        xu = torch.cat([x, u], 1)
        # First propagation: each layer consumes the previous layer's output
        x1 = F.relu(self.layer_1(xu))
        x1 = F.relu(self.layer_2(x1))
        x1 = self.layer_3(x1)
        # Second propagation
        x2 = F.relu(self.layer_4(xu))
        x2 = F.relu(self.layer_5(x2))
        x2 = self.layer_6(x2)
        return x1, x2

    def q1(self, x, u):
        """Return only the first network's output for states ``x`` and actions ``u``."""
        xu = torch.cat([x, u], 1)
        x1 = F.relu(self.layer_1(xu))
        x1 = F.relu(self.layer_2(x1))
        x1 = self.layer_3(x1)
        return x1
    
# Selecting the device: CUDA when torch reports it as available, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class TD3(object):
    """TD3 agent: an actor, a twin critic, their target copies and two Adam optimizers.

    All networks and tensors are placed on the module-level ``device``.
    """

    def __init__(self, state_dim, action_dim, max_action):
        """Build the networks; each target network starts as a copy of its online network."""
        self.actor = Actor(state_dim, action_dim, max_action).to(device)
        self.actor_target = Actor(state_dim, action_dim, max_action).to(device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = torch.optim.Adam(params = self.actor.parameters())
        self.critic = Critic(state_dim, action_dim).to(device)
        self.critic_target = Critic(state_dim, action_dim).to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = torch.optim.Adam(params = self.critic.parameters())
        self.max_action = max_action

    def select_action(self, state):
        """Return the actor's action for one state as a flat NumPy array."""
        state = torch.Tensor(state.reshape(1,-1)).to(device)
        # Inference only: no graph is needed
        with torch.no_grad():
            action = self.actor(state)
        return action.cpu().numpy().flatten()

    def train(self, replay_buffer, iterations, batch_size=100, discount = .99, tau=.005, policy_noise=.2, noise_clip=.5, policy_freq=2):
        """Run ``iterations`` update steps on batches sampled from ``replay_buffer``.

        Args:
            replay_buffer: Object whose ``sample(batch_size)`` returns
                ``(states, next_states, actions, rewards, dones)`` arrays.
            iterations: Number of critic updates to perform.
            batch_size: Number of transitions per update.
            discount: Discount factor applied to the target Q-value.
            tau: Interpolation weight of the online parameters in the target update.
            policy_noise: Standard deviation of the noise added to the target action.
            noise_clip: Absolute bound applied to that noise.
            policy_freq: The actor and the target networks are updated every
                ``policy_freq`` iterations.
        """
        for it in range(iterations):

            # Step 4: sample a batch of transitions (s, s', a, r, done) from memory
            batch_states, batch_next_states, batch_actions, batch_rewards, batch_dones = replay_buffer.sample(batch_size)
            state = torch.Tensor(batch_states).to(device)
            next_state = torch.Tensor(batch_next_states).to(device)
            action = torch.Tensor(batch_actions).to(device)
            reward = torch.Tensor(batch_rewards).to(device)
            done = torch.Tensor(batch_dones).to(device)

            # Steps 5-9 build the regression target; nothing is back-propagated
            # through the target networks, so no graph is recorded
            with torch.no_grad():
                # Step 5: from the next state s', the actor target plays the next action a'
                next_action = self.actor_target(next_state)

                # Step 6: add clipped Gaussian noise to a' and clamp it to the supported action range
                noise = (torch.randn_like(action) * policy_noise).clamp(-noise_clip, noise_clip)
                next_action = (next_action + noise).clamp(-self.max_action, self.max_action)

                # Step 7: the two critic targets return Qt1(s', a') and Qt2(s', a')
                target_q1, target_q2 = self.critic_target(next_state, next_action)

                # Step 8: keep the minimum of the two estimates
                target_q = torch.min(target_q1, target_q2)

                # Step 9: final target; (1 - done) removes the bootstrap term on terminal transitions
                target_q = reward + (1 - done) * discount * target_q

            # Step 10: the two critics return Q1(s, a) and Q2(s, a)
            current_q1, current_q2 = self.critic(state, action)

            # Step 11: sum of the two MSE losses against the shared target
            critic_loss = F.mse_loss(current_q1, target_q) + F.mse_loss(current_q2, target_q)

            # Step 12: back-propagate the critic loss and update the critic
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

            # Step 13: every policy_freq iterations, update the actor by gradient ascent
            # on Q1, implemented as gradient descent on -Q1
            if it % policy_freq == 0:
                actor_loss = -self.critic.q1(state, self.actor(state)).mean()
                self.actor_optimizer.zero_grad()
                actor_loss.backward()
                self.actor_optimizer.step()

                # Step 14: update the actor target by Polyak averaging
                for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
                    target_param.data.copy_(tau*param.data + (1 - tau)*target_param.data)

                # Step 15: update the critic target by Polyak averaging
                for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
                    target_param.data.copy_(tau*param.data + (1 - tau)*target_param.data)

    # Making a save method to save a trained model
    def save(self, filename, directory):
        """Write ``<filename>_actor.pth`` and ``<filename>_critic.pth`` inside ``directory``."""
        # os.path.join puts the files inside the directory whether or not it ends with a separator
        torch.save(self.actor.state_dict(), os.path.join(directory, "%s_actor.pth" % filename))
        torch.save(self.critic.state_dict(), os.path.join(directory, "%s_critic.pth" % filename))

    # Making a load method to load a pre-trained model
    def load(self, filename, directory):
        """Load the weights written by ``save`` and copy them into the target networks."""
        # map_location lets weights saved on a GPU be loaded on a CPU-only machine
        self.actor.load_state_dict(torch.load(os.path.join(directory, "%s_actor.pth" % filename), map_location=device))
        self.critic.load_state_dict(torch.load(os.path.join(directory, "%s_critic.pth" % filename), map_location=device))
        # Keep the targets consistent with the loaded weights in case training resumes
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.critic_target.load_state_dict(self.critic.state_dict())
        
        
def evaluate_policy(policy, eval_episodes = 10):
    """Play ``eval_episodes`` episodes on the module-level ``env`` and report the mean episode reward.

    Actions come from ``policy.select_action``. The mean is printed between
    two separator lines and returned.
    """
    avg_reward = 0.
    for _ in range(eval_episodes):
        obs = env.reset()
        done = False
        while not done:
            action = policy.select_action(np.array(obs))
            obs, reward, done, _ = env.step(action)
            avg_reward += reward
    # Average, report and return only after every episode has been played
    avg_reward /= eval_episodes
    print("----------------------------------------")
    print("Average Reward over the evaluation step: %f" % (avg_reward))
    print("----------------------------------------")

    return avg_reward
    

env_name = "HalfCheetahBulletEnv-v0" # Name of the environment (any continuous-action environment can be used)
seed = 0 # Random seed number

# Must match the name the model was saved under during training: "TD3_<env_name>_<seed>"
file_name = "%s_%s_%s" %("TD3", env_name, str(seed))
print ("---------------------------------------")
print ("Settings: %s" % (file_name))
print ("---------------------------------------")

eval_episodes = 10
save_env_vid = True


env = gym.make(env_name)
# Read the episode step limit before the environment is (optionally) wrapped
max_episode_steps = env._max_episode_steps


if save_env_vid:
    # Same folder as the training section uses; created here so that this section also runs on its own
    monitor_dir = os.path.join("exp", "brs", "monitor")
    os.makedirs(monitor_dir, exist_ok=True)
    env = wrappers.Monitor(env, monitor_dir, force = True)
    env.reset()


env.seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])
policy = TD3(state_dim, action_dim, max_action)

# Load the trained weights from the models folder
policy.load(file_name, './pytorch_models/')

_ = evaluate_policy(policy, eval_episodes=eval_episodes)