# Implementation TD3

# Importing the libraries

In [1]:
import os
import time
import random
import numpy as np
import matplotlib.pyplot as plt
import pybullet_envs
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
from gym import wrappers
from torch.autograd import Variable
from collections import deque

# Step1: we initialize ehr Experience Replay Memory

In [2]:
class ReplayBuffer(object):
    def __init__(self, max_size = 1e6):
        self.storage = []
        self.max_size = max_size
        self.ptr = 0
        
    def add(self, transition):
        if len(self.storage) == self.max_size:
            self.storage[int(self.ptr)] = transition
            self.ptr = (self.ptr +1) % self.max_size
        else:
            self.storage.append(transition)
            
    def sample(self, batch_size):
        ind = np.random.randint(0, len(self.storage), size = batch_size)
        batch_states, batch_next_states, batch_actions, batch_rewards, batch_dones = [], [], [], [], []
        for i in ind:
            state, next_state, action, reward, done = self.storage[i]
            batch_states.append(np.array(state, copy=False))
            batch_next_states.append(np.array(next_state, copy=False))
            batch_actions.append(np.array(action, copy=False))
            batch_rewards.append(np.array(reward, copy=False))
            batch_dones.append(np.array(done, copy=False))
        return np.array(batch_states), np.array(batch_next_states), np.array(batch_actions), np.array(batch_rewards).reshape(-1,1), np.array(batch_dones).reshape(-1,1)

# Step 2: Build Neural Network for Actors

In [3]:
class Actor(nn.Module):
    
    def __init__(self, state_dim, action_dim, max_action):
        super(Actor, self).__init__()
        self.layer_1 = nn.Linear(state_dim, 400)
        self.layer_2 = nn.Linear(400, 300)
        self.layer_3 = nn.Linear(300, action_dim)
        self.max_action = max_action
        
    def forward(self, x):
        x = F.relu(self.layer_1(x))
        x = F.relu(self.layer_2(x))
        x = self.max_action*torch.tanh(self.layer_3(x))
        return x

# Step 3: Build Neural Network for crtitic

In [4]:
class Critic(nn.Module):
    
    def __init__(self, state_dim, action_dim):
        super(Critic, self).__init__()
        # first twin NN model
        self.layer_1 = nn.Linear(state_dim+action_dim, 400)
        self.layer_2 = nn.Linear(400, 300)
        self.layer_3 = nn.Linear(300, 1)
        # Second twin NN model
        self.layer_4 = nn.Linear(state_dim+action_dim, 400)
        self.layer_5 = nn.Linear(400, 300)
        self.layer_6 = nn.Linear(300, 1)
        
    def forward(self, x, u):
        xu = torch.cat([x, u], 1)
        #first propoagation
        x1 = F.relu(self.layer_1(xu))
        x1 = F.relu(self.layer_2(x1))
        x1 = self.layer_3(x1)
        # second propogation
        x2 = F.relu(self.layer_4(xu))
        x2 = F.relu(self.layer_5(x2))
        x2 = self.layer_6(x2)
        return x1, x2
    
    def q1(self, x, u):
        xu = torch.cat([x, u], 1)
        x1 = F.relu(self.layer_1(xu))
        x1 = F.relu(self.layer_2(x1))
        x1 = self.layer_3(x1)
        return x1

# Step 4 to 15: Training Process

In [5]:
#Slecting the device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device.type

'cuda'

In [6]:
# Building the whole Training Process into a class

class TD3(object):
    
    def __init__(self, state_dim, action_dim, max_action):
        self.actor = Actor(state_dim, action_dim, max_action).to(device)
        self.actor_target = Actor (state_dim, action_dim, max_action).to(device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters())
        self.critic = Critic(state_dim, action_dim).to(device)
        self.critic_target = Critic (state_dim, action_dim).to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters())
        self.max_action = max_action
        
    def select_action(self, state):
        state = torch.Tensor(state.reshape(1,-1)).to(device)
        return self.actor(state).cpu().data.numpy().flatten()
    
    def train(self, replay_buffer, iterations, batch_size=100, discount = .99, tau=.005, policy_noise=.2, noise_clip=.5, policy_freq=2):
        
        for it in range(iterations):
            
            # Step4: we sample a batch of transitions (s,s',a,r) from emory
            batch_states, batch_next_states, batch_actions, batch_rewards, batch_dones = replay_buffer.sample(batch_size)
            state = torch.Tensor(batch_states).to(device)
            next_state = torch.Tensor(batch_next_states).to(device)
            action = torch.Tensor(batch_actions).to(device)
            reward = torch.Tensor(batch_rewards).to(device)
            done = torch.Tensor(batch_dones).to(device)
            
            # Step5: from the next state s', the Actor target plays the next action a'
            next_action = self.actor_target(next_state)
            
            # Step6: add gaussian noise to next_action and we clamp(clip) it in arange of values supported by the environmenrt
            noise = torch.Tensor(batch_actions).data.normal_(0, policy_noise).to(device)
            noise = noise.clamp(-noise_clip,noise_clip)
            next_action = (next_action + noise).clamp(-self.max_action, self.max_action)
            
            # Step7: two critic target get (s',a') and return qt1(s',a'), qt2(s', a')
            target_q1, target_q2 = self.critic_target(next_state, next_action)
            
            # Step8: min(q_values) to represent approximate values of the next state
            target_q = torch.min(target_q1, target_q2)
            
            # Step9: Final target of the two critic model(here we have to consider if we are in the last transition of episode or not)
            target_q = reward + ((1-done) * discount * target_q).detach()
            
            # Step10: two critic model take (s,a) and return Q1(s,a), Q2(s,a) with target Q
            current_q1, current_q2 = self.critic(state, action)
            
            # Step11: calculate loss function with mse
            critic_loss = F.mse_loss(current_q1, target_q) + F.mse_loss(current_q2, target_q)
            
            # Step12: BackPropogation
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()
            
            # Step13: update actor model by gradient ascent
            # Gradient ascent that is very impormant:
            # Gradient descent is minimizing a function and gradient ascent is like gradient descent but in minus of that function
            if it % policy_freq == 0:
                actor_loss = -self.critic.q1(state, self.actor(state)).mean()
                self.actor_optimizer.zero_grad()
                actor_loss.backward()
                self.actor_optimizer.step()
                
                # Step14: update weights of the actor target by polyak average
                for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
                    target_param.data.copy_(tau*param.data +(1 - tau)*target_param.data)
                    
                    
                # Step15: update wights of the critic target by polyak average
                for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
                    target_param.data.copy_(tau*param.data +(1-tau)*target_param.data)
               
    # Making a save method to save a trained model
    def save(self, filename, directory):
        torch.save(self.actor.state_dict(), "%s%s_actor.pth" % (directory, filename))
        torch.save(self.critic.state_dict(), "%s%s_critic.pth" % (directory, filename))
        
    # Making a load method to load a pre_trained model
    def load(self, filename, directory):
        self.actor.load_state_dict(torch.load("%s%s_actor.pth" % (directory, filename)))
        self.critic.load_state_dict(torch.load("%s%s_critic.pth" % (directory, filename)))

# Make a function that evaluate the policy by calculating rits average reward over 10 episodes

In [7]:
def evaluate_policy(policy, eval_episodes = 10):
    avg_reward = 0
    for _ in range(eval_episodes):
        obs = env.reset()
        done = False
        while not done:
            action = policy.select_action(np.array(obs))
            obs, reward, done, _ = env.step(action)
            avg_reward += reward
    avg_reward /= eval_episodes
    print("----------------------------------------")
    print("Average Reward over the evaluation step: %f" % (avg_reward))
    print("----------------------------------------")
        
    return avg_reward

# Set parameters

In [8]:
env_name = "Walker2DBulletEnv-v0" # Name of a environment (set it to any Continous environment you want)
seed = 0 # Random seed number
start_timesteps = 1e4 ## Number of iterations/timesteps before which the model randomly chooses an action, and after which it starts to use the policy network
eval_freq = 5e3 ## How often the evaluation step is performed (after how many timesteps)
max_timesteps = 5e5 # Total number of iterations/timesteps
save_models = True # Boolean checker whether or not to save the pre-trained model
expl_noise = 0.1 # Exploration noise - STD value of exploration Gaussian noise
batch_size = 100 # Size of the batch
discount = 0.99 # Discount factor gamma, used in the calculation of the total discounted reward
tau = 0.005 # Target network update rate
policy_noise = 0.2 # STD of Gaussian noise added to the actions for the exploration purposes
noise_clip = 0.5 # Maximum value of the Gaussian noise added to the actions (policy)
policy_freq = 2 # Number of iterations to wait before the policy network (Actor model) is updated

# Create a file name for the two saved models: the Actor and Critic models

In [9]:
file_name = "%s_%s_%s" %("TD3", env_name, str(seed))
print ("---------------------------------------")
print ("Settings: %s" % (file_name))
print ("---------------------------------------")

---------------------------------------
Settings: TD3_Walker2DBulletEnv-v0_0
---------------------------------------


# Create a folde inside which will be saved the trained model

In [10]:
if not os.path.exists("./results"):
    os.makedirs("./results")
if save_models and not os.path.exists("./pytorch_models"):
    os.makedirs("./pytorch_models")

# Create the PyBullet environment

### This is for installing gym

https://towardsdatascience.com/how-to-install-openai-gym-in-a-windows-environment-338969e24d30

In [11]:
env = gym.make(env_name)



# Set seeds and we get the necessary information on the states and actions in the choosen environmet

In [12]:
env.seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])

# Create the Network

In [13]:
policy = TD3(state_dim, action_dim, max_action)

# Create the Exprience Replay memory

In [14]:
replay_buffer = ReplayBuffer()

# Define a list where all the evaluation results over 10 episodes are stored

In [15]:
evaluations = [evaluate_policy(policy)]

----------------------------------------
Average Reward over the evaluation step: 264.967711
----------------------------------------


# Create a new folder directory in which the final results (video of the agent will be populated

In [16]:
def mkdir(base, name):
    path = os.path.join(base, name)
    if not os.path.exists(path):
        os.makedirs(path)
    return path
work_dir = mkdir("exp", "brs")
monitor_dir = mkdir(work_dir, "monitor")
max_episode_steps = env._max_episode_steps
save_env_vid = False
if save_env_vid:
    env = wrappers.Monitor(env, monitor_dir, force = True)
    env.reset()

# Initialize the variables

In [17]:
total_timesteps = 0
timesteps_since_eval = 0
episode_num = 0
done = True
t0 = time.time()

# Training

In [18]:
# Start the main loop over 500,000 timesteps
while total_timesteps < max_timesteps:
    
    # if the episode is done
    if done:
        
        # if we are not at the very begining, we start the training process of the model
        if total_timesteps !=0:
            print("Total Timesteps: {} Episode Num: {} Reward: {}".format(total_timesteps, episode_num, episode_reward))
            policy.train(replay_buffer, episode_timesteps, batch_size, discount, tau, policy_noise, noise_clip, policy_freq)
            
        if timesteps_since_eval >= eval_freq:
            timesteps_since_eval %= eval_freq
            evaluations.append(evaluate_policy(policy))
            policy.save(file_name, directory="./pytorch_models")
            np.save("./results/%s" % (file_name), evaluations)
            
        # When the training step is done, we reset the state of the environment
        
        obs = env.reset()
        
        #et the Done to False
        
        #set rewards and episode timesteps to zero
        episode_reward = 0
        episode_timesteps = 0
        episode_num += 1
        
        
    # Before 10000 timesteps, we play random actions
    if total_timesteps < start_timesteps:
        action = env.action_space.sample()
    else: #After 10000 times we switch to the model
        action = policy.select_action(np.array(obs))
        
        # if the explore_noise parameter is not 0, we add noise to the action and we clip it
        if expl_noise != 0:
            action = (action +np.random.normal(0, expl_noise, size=env.action_space.shape[0])).clip(env.action_space.low, env.action_space.high)
    
    # the agent performs the action in the environment, then reaches the next state and receives the reward
    new_obs, reward, done, _ = env.step(action)
    
    #we check if the episode is done
    done_bool = 0 if episode_timesteps + 1 == env._max_episode_steps else float(done)
    
    # We increase the total rewrd
    episode_reward += reward
    
    # We store the new transition into the experiemce replay memory
    replay_buffer.add((obs, new_obs, action, reward, done_bool))
    
    # We update the state, the episode timestep, the total timesteps, and the timesteps scince the evaluation of the policy
    obs = new_obs
    episode_timesteps +=1
    total_timesteps +=1
    timesteps_since_eval +=1
    
    
# We add the last policy evaluation to our list of evaluation and we save our model
evaluations.append(evaluate_policy(policy))
if save_models: policy.save("%s" % (file_name), directory="./pytorch_models")
np.save("./results/%s" % (file_name), evaluations)

Total Timesteps: 8 Episode Num: 1 Reward: 12.367021041171391
Total Timesteps: 18 Episode Num: 2 Reward: 15.138441208351287
Total Timesteps: 28 Episode Num: 3 Reward: 14.53532861542044
Total Timesteps: 46 Episode Num: 4 Reward: 19.255210655427074
Total Timesteps: 59 Episode Num: 5 Reward: 16.760800392518284
Total Timesteps: 77 Episode Num: 6 Reward: 20.737017455465686
Total Timesteps: 116 Episode Num: 7 Reward: 33.58704801278073
Total Timesteps: 123 Episode Num: 8 Reward: 13.203495901290443
Total Timesteps: 141 Episode Num: 9 Reward: 19.716654729914442
Total Timesteps: 161 Episode Num: 10 Reward: 22.022580836257838
Total Timesteps: 176 Episode Num: 11 Reward: 19.437172561748593
Total Timesteps: 193 Episode Num: 12 Reward: 17.776449667652194
Total Timesteps: 207 Episode Num: 13 Reward: 15.969880578648006
Total Timesteps: 217 Episode Num: 14 Reward: 15.107119100460842
Total Timesteps: 230 Episode Num: 15 Reward: 14.874611434785766
Total Timesteps: 254 Episode Num: 16 Reward: 22.1294342466

Total Timesteps: 1796 Episode Num: 130 Reward: 22.558692131495626
Total Timesteps: 1804 Episode Num: 131 Reward: 13.194153127520986
Total Timesteps: 1819 Episode Num: 132 Reward: 17.589314975070007
Total Timesteps: 1833 Episode Num: 133 Reward: 17.137415794239494
Total Timesteps: 1847 Episode Num: 134 Reward: 15.917769729891731
Total Timesteps: 1860 Episode Num: 135 Reward: 15.310148253277294
Total Timesteps: 1875 Episode Num: 136 Reward: 18.58137039417052
Total Timesteps: 1887 Episode Num: 137 Reward: 15.866694850425123
Total Timesteps: 1898 Episode Num: 138 Reward: 14.726162315173132
Total Timesteps: 1918 Episode Num: 139 Reward: 18.19551192680665
Total Timesteps: 1943 Episode Num: 140 Reward: 24.623230324783073
Total Timesteps: 1950 Episode Num: 141 Reward: 12.011448140497668
Total Timesteps: 1965 Episode Num: 142 Reward: 15.741926856135251
Total Timesteps: 1975 Episode Num: 143 Reward: 15.201354245898257
Total Timesteps: 1996 Episode Num: 144 Reward: 21.274259909923423
Total Timest

Total Timesteps: 3607 Episode Num: 256 Reward: 16.101130156882572
Total Timesteps: 3617 Episode Num: 257 Reward: 14.38412207742367
Total Timesteps: 3624 Episode Num: 258 Reward: 11.959721170019474
Total Timesteps: 3635 Episode Num: 259 Reward: 14.777752165710263
Total Timesteps: 3646 Episode Num: 260 Reward: 15.281862333424213
Total Timesteps: 3665 Episode Num: 261 Reward: 18.288833982353395
Total Timesteps: 3676 Episode Num: 262 Reward: 15.061125986394472
Total Timesteps: 3724 Episode Num: 263 Reward: 53.80799055580283
Total Timesteps: 3734 Episode Num: 264 Reward: 14.706676120393968
Total Timesteps: 3745 Episode Num: 265 Reward: 14.363103585512727
Total Timesteps: 3755 Episode Num: 266 Reward: 13.730508910983918
Total Timesteps: 3763 Episode Num: 267 Reward: 12.448835083175798
Total Timesteps: 3782 Episode Num: 268 Reward: 19.728793328757458
Total Timesteps: 3789 Episode Num: 269 Reward: 12.334206196210289
Total Timesteps: 3809 Episode Num: 270 Reward: 20.434376678493575
Total Timest

Total Timesteps: 5429 Episode Num: 380 Reward: 16.61687327728141
Total Timesteps: 5456 Episode Num: 381 Reward: 21.72079730048718
Total Timesteps: 5464 Episode Num: 382 Reward: 13.68855036234163
Total Timesteps: 5475 Episode Num: 383 Reward: 16.55282506571384
Total Timesteps: 5498 Episode Num: 384 Reward: 19.686158923663605
Total Timesteps: 5508 Episode Num: 385 Reward: 14.579891589850012
Total Timesteps: 5525 Episode Num: 386 Reward: 18.231225417292443
Total Timesteps: 5542 Episode Num: 387 Reward: 17.307566976099043
Total Timesteps: 5553 Episode Num: 388 Reward: 13.849746461625909
Total Timesteps: 5569 Episode Num: 389 Reward: 15.14536233476101
Total Timesteps: 5584 Episode Num: 390 Reward: 17.445485346831266
Total Timesteps: 5602 Episode Num: 391 Reward: 18.399140578870718
Total Timesteps: 5612 Episode Num: 392 Reward: 14.046732780009917
Total Timesteps: 5623 Episode Num: 393 Reward: 16.791544164551308
Total Timesteps: 5639 Episode Num: 394 Reward: 19.87494580472412
Total Timesteps:

Total Timesteps: 7261 Episode Num: 506 Reward: 16.26240486650349
Total Timesteps: 7275 Episode Num: 507 Reward: 16.91204993466818
Total Timesteps: 7285 Episode Num: 508 Reward: 13.054259463731432
Total Timesteps: 7301 Episode Num: 509 Reward: 17.298508802962896
Total Timesteps: 7310 Episode Num: 510 Reward: 13.902472873774238
Total Timesteps: 7320 Episode Num: 511 Reward: 14.201885166692954
Total Timesteps: 7335 Episode Num: 512 Reward: 17.38947200183902
Total Timesteps: 7346 Episode Num: 513 Reward: 15.777005292019748
Total Timesteps: 7358 Episode Num: 514 Reward: 14.964738208879133
Total Timesteps: 7371 Episode Num: 515 Reward: 16.57395812786999
Total Timesteps: 7385 Episode Num: 516 Reward: 19.294177534534537
Total Timesteps: 7405 Episode Num: 517 Reward: 16.53274793088203
Total Timesteps: 7415 Episode Num: 518 Reward: 15.283285212519697
Total Timesteps: 7427 Episode Num: 519 Reward: 16.518314105269386
Total Timesteps: 7448 Episode Num: 520 Reward: 20.992701839010987
Total Timesteps

Total Timesteps: 8952 Episode Num: 631 Reward: 17.052547198919637
Total Timesteps: 8963 Episode Num: 632 Reward: 14.03064864987682
Total Timesteps: 8978 Episode Num: 633 Reward: 17.97963068436657
Total Timesteps: 8989 Episode Num: 634 Reward: 15.834356876612583
Total Timesteps: 9007 Episode Num: 635 Reward: 20.588358357390096
Total Timesteps: 9015 Episode Num: 636 Reward: 13.177536991839586
Total Timesteps: 9026 Episode Num: 637 Reward: 15.512420442339497
Total Timesteps: 9048 Episode Num: 638 Reward: 21.036219735005577
Total Timesteps: 9063 Episode Num: 639 Reward: 15.738459922181207
Total Timesteps: 9079 Episode Num: 640 Reward: 16.873638672694508
Total Timesteps: 9089 Episode Num: 641 Reward: 13.120488613445195
Total Timesteps: 9098 Episode Num: 642 Reward: 13.210543833622069
Total Timesteps: 9110 Episode Num: 643 Reward: 15.85660334630229
Total Timesteps: 9120 Episode Num: 644 Reward: 13.683427257127187
Total Timesteps: 9144 Episode Num: 645 Reward: 23.638294647283328
Total Timeste

Total Timesteps: 13106 Episode Num: 754 Reward: 73.66198368813971
Total Timesteps: 13163 Episode Num: 755 Reward: 71.6656871492502
Total Timesteps: 13223 Episode Num: 756 Reward: 6.712902750050707
Total Timesteps: 13273 Episode Num: 757 Reward: 59.06979966687099
Total Timesteps: 13341 Episode Num: 758 Reward: 76.17109877267535
Total Timesteps: 13433 Episode Num: 759 Reward: 64.63992451237567
Total Timesteps: 13528 Episode Num: 760 Reward: 99.52363468855842
Total Timesteps: 13670 Episode Num: 761 Reward: 143.37352519061344
Total Timesteps: 13733 Episode Num: 762 Reward: 70.55841846294597
Total Timesteps: 13843 Episode Num: 763 Reward: 118.30430087552016
Total Timesteps: 13915 Episode Num: 764 Reward: 58.4376046477095
Total Timesteps: 13978 Episode Num: 765 Reward: 13.435424107692839
Total Timesteps: 14040 Episode Num: 766 Reward: 71.3588426555755
Total Timesteps: 14106 Episode Num: 767 Reward: 80.4316159059913
Total Timesteps: 14162 Episode Num: 768 Reward: 56.00798740220731
Total Times

Total Timesteps: 21701 Episode Num: 874 Reward: 116.1830221350875
Total Timesteps: 21808 Episode Num: 875 Reward: 113.90998355361745
Total Timesteps: 21876 Episode Num: 876 Reward: 49.79204118758286
Total Timesteps: 21938 Episode Num: 877 Reward: 71.00237783097415
Total Timesteps: 21990 Episode Num: 878 Reward: 30.556516980169686
Total Timesteps: 22098 Episode Num: 879 Reward: 103.03217463166817
Total Timesteps: 22152 Episode Num: 880 Reward: 58.678561447277666
Total Timesteps: 22219 Episode Num: 881 Reward: 80.33030143387673
Total Timesteps: 22285 Episode Num: 882 Reward: 64.60055941206275
Total Timesteps: 22340 Episode Num: 883 Reward: 61.994923291716574
Total Timesteps: 22417 Episode Num: 884 Reward: 78.14375711074865
Total Timesteps: 22501 Episode Num: 885 Reward: 96.37532794600982
Total Timesteps: 22593 Episode Num: 886 Reward: 99.72910166026745
Total Timesteps: 22661 Episode Num: 887 Reward: 70.51798965184798
Total Timesteps: 22730 Episode Num: 888 Reward: 73.82325504434606
Total

Total Timesteps: 30661 Episode Num: 994 Reward: 78.43451070040676
Total Timesteps: 30728 Episode Num: 995 Reward: 59.78895266213696
Total Timesteps: 30796 Episode Num: 996 Reward: 88.52879033349883
Total Timesteps: 30881 Episode Num: 997 Reward: 102.56895386500571
Total Timesteps: 30968 Episode Num: 998 Reward: 99.96248887064446
Total Timesteps: 31050 Episode Num: 999 Reward: 99.41576500034691
Total Timesteps: 31118 Episode Num: 1000 Reward: 82.77707238880707
Total Timesteps: 31183 Episode Num: 1001 Reward: 88.08770353685733
Total Timesteps: 31251 Episode Num: 1002 Reward: 86.17711320147278
Total Timesteps: 31307 Episode Num: 1003 Reward: 74.48609443065952
Total Timesteps: 31400 Episode Num: 1004 Reward: 111.14107703692564
Total Timesteps: 31473 Episode Num: 1005 Reward: 89.68970661041031
Total Timesteps: 31549 Episode Num: 1006 Reward: 90.60571579186377
Total Timesteps: 31609 Episode Num: 1007 Reward: 77.48498371106841
Total Timesteps: 31689 Episode Num: 1008 Reward: 97.02952115739532

Total Timesteps: 40607 Episode Num: 1113 Reward: 101.72237635665896
Total Timesteps: 40700 Episode Num: 1114 Reward: 120.51505288678224
Total Timesteps: 40789 Episode Num: 1115 Reward: 95.49728912308879
Total Timesteps: 40880 Episode Num: 1116 Reward: 112.01556852088903
Total Timesteps: 40965 Episode Num: 1117 Reward: 46.80155807711173
Total Timesteps: 41005 Episode Num: 1118 Reward: 35.34531893429569
Total Timesteps: 41068 Episode Num: 1119 Reward: 73.57850124623496
Total Timesteps: 41151 Episode Num: 1120 Reward: 97.49628623625023
Total Timesteps: 41255 Episode Num: 1121 Reward: 141.77397023964184
Total Timesteps: 41375 Episode Num: 1122 Reward: 140.14513797022383
Total Timesteps: 41481 Episode Num: 1123 Reward: 103.20541832904027
Total Timesteps: 41563 Episode Num: 1124 Reward: 95.62506503602224
Total Timesteps: 41708 Episode Num: 1125 Reward: 164.86023420875946
Total Timesteps: 41834 Episode Num: 1126 Reward: 155.45787349691602
Total Timesteps: 41915 Episode Num: 1127 Reward: 82.32

Total Timesteps: 68309 Episode Num: 1225 Reward: 1176.6668690503036
Total Timesteps: 69309 Episode Num: 1226 Reward: 1211.5727071240005
Total Timesteps: 69538 Episode Num: 1227 Reward: 261.55002721693467
Total Timesteps: 70039 Episode Num: 1228 Reward: 517.8518339396004
----------------------------------------
Average Reward over the evaluation step: 396.024582
----------------------------------------
Total Timesteps: 70179 Episode Num: 1229 Reward: 172.3119915608943
Total Timesteps: 70242 Episode Num: 1230 Reward: 83.13300141670693
Total Timesteps: 70288 Episode Num: 1231 Reward: 57.48861507235078
Total Timesteps: 70343 Episode Num: 1232 Reward: 69.15044920837308
Total Timesteps: 70502 Episode Num: 1233 Reward: 175.09717974473602
Total Timesteps: 70659 Episode Num: 1234 Reward: 181.67904755183798
Total Timesteps: 70763 Episode Num: 1235 Reward: 121.07855028229831
Total Timesteps: 70807 Episode Num: 1236 Reward: 52.81221020009786
Total Timesteps: 70925 Episode Num: 1237 Reward: 143.021

Total Timesteps: 109090 Episode Num: 1331 Reward: 129.71416162429216
Total Timesteps: 109146 Episode Num: 1332 Reward: 31.63238208085069
Total Timesteps: 110146 Episode Num: 1333 Reward: 689.8752792524631
----------------------------------------
Average Reward over the evaluation step: 507.427562
----------------------------------------
Total Timesteps: 110440 Episode Num: 1334 Reward: 262.4120693772605
Total Timesteps: 110519 Episode Num: 1335 Reward: 109.90926014981517
Total Timesteps: 111094 Episode Num: 1336 Reward: 507.36929250113786
Total Timesteps: 111215 Episode Num: 1337 Reward: 138.2327807056969
Total Timesteps: 111330 Episode Num: 1338 Reward: 149.10487862056863
Total Timesteps: 111420 Episode Num: 1339 Reward: 48.29456755063165
Total Timesteps: 112045 Episode Num: 1340 Reward: 637.7997653504335
Total Timesteps: 113045 Episode Num: 1341 Reward: 791.1491286461535
Total Timesteps: 114045 Episode Num: 1342 Reward: 1048.3357331092539
Total Timesteps: 115045 Episode Num: 1343 Rew

Total Timesteps: 165467 Episode Num: 1429 Reward: 493.84138013293006
Total Timesteps: 165721 Episode Num: 1430 Reward: 362.71335433274595
Total Timesteps: 165938 Episode Num: 1431 Reward: 319.3509475323228
Total Timesteps: 166938 Episode Num: 1432 Reward: 933.2188877455782
Total Timesteps: 167060 Episode Num: 1433 Reward: 182.65523759385678
Total Timesteps: 167296 Episode Num: 1434 Reward: 302.79192914045143
Total Timesteps: 167518 Episode Num: 1435 Reward: 311.5514239220517
Total Timesteps: 168518 Episode Num: 1436 Reward: 1352.8490286184058
Total Timesteps: 168812 Episode Num: 1437 Reward: 385.41066345892347
Total Timesteps: 169158 Episode Num: 1438 Reward: 481.55007608605604
Total Timesteps: 169372 Episode Num: 1439 Reward: 278.22798938118734
Total Timesteps: 169503 Episode Num: 1440 Reward: 190.1232023197964
Total Timesteps: 170503 Episode Num: 1441 Reward: 1398.06178113054
----------------------------------------
Average Reward over the evaluation step: 1478.323808
---------------

----------------------------------------
Average Reward over the evaluation step: 1703.051678
----------------------------------------
Total Timesteps: 241287 Episode Num: 1521 Reward: 1680.6878135350341
Total Timesteps: 242287 Episode Num: 1522 Reward: 1473.4880179435252
Total Timesteps: 243287 Episode Num: 1523 Reward: 1546.3581875600833
Total Timesteps: 244287 Episode Num: 1524 Reward: 1472.5913961090887
Total Timesteps: 245287 Episode Num: 1525 Reward: 1527.3972695209513
----------------------------------------
Average Reward over the evaluation step: 788.630017
----------------------------------------
Total Timesteps: 245773 Episode Num: 1526 Reward: 750.9373621145756
Total Timesteps: 246773 Episode Num: 1527 Reward: 1555.3702081068266
Total Timesteps: 247773 Episode Num: 1528 Reward: 1577.5942164214714
Total Timesteps: 248773 Episode Num: 1529 Reward: 1597.2445920650084
Total Timesteps: 249773 Episode Num: 1530 Reward: 1619.0079224923957
Total Timesteps: 250773 Episode Num: 1531 

Total Timesteps: 320780 Episode Num: 1609 Reward: 1522.793063265533
----------------------------------------
Average Reward over the evaluation step: 1743.423541
----------------------------------------
Total Timesteps: 321780 Episode Num: 1610 Reward: 1689.772096654745
Total Timesteps: 322780 Episode Num: 1611 Reward: 1705.833340497531
Total Timesteps: 323780 Episode Num: 1612 Reward: 1596.1842226573576
Total Timesteps: 324780 Episode Num: 1613 Reward: 1567.7583492920542
Total Timesteps: 325780 Episode Num: 1614 Reward: 1690.9190649314544
----------------------------------------
Average Reward over the evaluation step: 1740.308631
----------------------------------------
Total Timesteps: 326780 Episode Num: 1615 Reward: 1745.3250053337233
Total Timesteps: 327780 Episode Num: 1616 Reward: 1755.5512658748794
Total Timesteps: 328780 Episode Num: 1617 Reward: 1498.3907478218425
Total Timesteps: 329780 Episode Num: 1618 Reward: 1511.6130523062282
Total Timesteps: 330780 Episode Num: 1619 R

Total Timesteps: 401510 Episode Num: 1696 Reward: 1600.3109446801384
Total Timesteps: 402510 Episode Num: 1697 Reward: 1770.919349918719
Total Timesteps: 403510 Episode Num: 1698 Reward: 1718.590319169279
Total Timesteps: 404510 Episode Num: 1699 Reward: 1713.129806727512
Total Timesteps: 405510 Episode Num: 1700 Reward: 1678.056232828341
----------------------------------------
Average Reward over the evaluation step: 1584.356963
----------------------------------------
Total Timesteps: 406510 Episode Num: 1701 Reward: 1563.5897016693416
Total Timesteps: 407510 Episode Num: 1702 Reward: 1712.8069960055416
Total Timesteps: 408510 Episode Num: 1703 Reward: 1692.1023042287493
Total Timesteps: 409510 Episode Num: 1704 Reward: 1607.1763312764133
Total Timesteps: 410510 Episode Num: 1705 Reward: 1465.7355166543157
----------------------------------------
Average Reward over the evaluation step: 1598.752096
----------------------------------------
Total Timesteps: 411510 Episode Num: 1706 Re

Total Timesteps: 483852 Episode Num: 1784 Reward: 677.7792779342017
Total Timesteps: 484852 Episode Num: 1785 Reward: 1809.5496632925522
Total Timesteps: 485084 Episode Num: 1786 Reward: 387.1659734420589
----------------------------------------
Average Reward over the evaluation step: 1787.152556
----------------------------------------
Total Timesteps: 486084 Episode Num: 1787 Reward: 1705.0132770386085
Total Timesteps: 487084 Episode Num: 1788 Reward: 1686.649847147379
Total Timesteps: 488084 Episode Num: 1789 Reward: 1635.151775109866
Total Timesteps: 489084 Episode Num: 1790 Reward: 1674.8919472503412
Total Timesteps: 490084 Episode Num: 1791 Reward: 1774.8653637254597
----------------------------------------
Average Reward over the evaluation step: 1697.108845
----------------------------------------
Total Timesteps: 491084 Episode Num: 1792 Reward: 1694.2679610812984
Total Timesteps: 492084 Episode Num: 1793 Reward: 1734.220921662965
Total Timesteps: 493084 Episode Num: 1794 Rew

# Infrence

In [24]:
class Actor(nn.Module):
    
    def __init__(self, state_dim, action_dim, max_action):
        super(Actor, self).__init__()
        self.layer_1 = nn.Linear(state_dim, 400)
        self.layer_2 = nn.Linear(400, 300)
        self.layer_3 = nn.Linear(300, action_dim)
        self.max_action = max_action
        
    def forward(self, x):
        x = F.relu(self.layer_1(x))
        x = F.relu(self.layer_2(x))
        x = self.max_action*torch.tanh(self.layer_3(x))
        return x
    
class Critic(nn.Module):
    
    def __init__(self, state_dim, action_dim):
        super(Critic, self).__init__()
        # first twin NN model
        self.layer_1 = nn.Linear(state_dim+action_dim, 400)
        self.layer_2 = nn.Linear(400, 300)
        self.layer_3 = nn.Linear(300, 1)
        # Second twin NN model
        self.layer_4 = nn.Linear(state_dim+action_dim, 400)
        self.layer_5 = nn.Linear(400, 300)
        self.layer_6 = nn.Linear(300, 1)
        
    def forward(self, x, u):
        xu = torch.cat([x, u], axis = 1)
        #first propoagation
        x1 = F.relu(self.layer_1(xu))
        x1 = F.relu(self.layer_2(xu))
        x1 = self.layer_3(xu)
        # second propogation
        x2 = F.relu(self.layer_4(xu))
        x2 = F.relu(self.layer_5(xu))
        x2 = self.layer_6(xu)
        return x1, x2
    
    def q1(self, x, u):
        xu = torch.cat([x, u], axis = 1)
        x1 = F.relu(self.layer_1(xu))
        x1 = F.relu(self.layer_2(xu))
        x1 = self.layer_3(xu)
        return x1
    
#Slecting the device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class TD3(object):
    
    def __init__(self, state_dim, action_dim, max_action):
        self.actor = Actor(state_dim, action_dim, max_action).to(device)
        self.actor_target = Actor (state_dim, action_dim, max_action).to(device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = torch.optim.Adam(params = self.actor.parameters())
        self.critic = Critic(state_dim, action_dim).to(device)
        self.critic_target = Critic (state_dim, action_dim).to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = torch.optim.Adam(params = self.critic.parameters())
        self.max_action = max_action
        
    def select_action(self, state):
        state = torch.Tensor(state.reshape(1,-1)).to(device)
        return self.actor(state).cpu().data.numpy().flatten()
    
    def train(self, replay_buffer, iterations, batch_size=100, discount = .99, tau=.005, policy_noise=.2, noise_clip=.5, policy_freq=2):
        
        for it in range(iterations):
            # Step4: we sample a batch of transitions (s,s',a,r) from emory
            batch_states, batch_next_states, batch_actions, batch_rewards, batch_dones = replay_buffer.sample(batch_size)
            state = torch.Tensor(batch_states).to(device)
            next_state = torch.Tensor(batch_next_states).to(device)
            action = torch.Tensor(batch_actions).to(device)
            reward = torch.Tensor(batch_rewards).to(device)
            done = torch.Tensor(batch_dones).to(device)
            
            # Step5: from the next state s', the Actor target plays the next action a'
            next_action = self.actor_target.forward(next_state)
            
            # Step6: add gaussian noise to next_action and we clamp(clip) it in arange of values supported by the environmenrt
            noise = torch.Tensor(batch_actions).data.normal_(0, policy_noise).to(device)
            noise = noise.clamp(-noise_clip,noise_clip)
            next_action = (next_action + noise).clamp(-self.max_action, self.max_action)
            
            # Step7: two critic target get (s',a') and return qt1(s',a'), qt2(s', a')
            target_q1, target_q2 = self.critic_target.forward(next_state, next_action)
            
            # Step8: min(q_values) to represent approximate values of the next state
            target_q = torch.min(target_q1, target_q2)
            
            # Step9: Final target of the two critic model(here we have to consider if we are in the last transition of episode or not)
            target_q = reward + ((1-done) * discount * target_q).detach()
            
            # Step10: two critic model take (s,a) and return Q1(s,a), Q2(s,a) with target Q
            current_q1, current_q2 = self.critic.forward(state, action)
            
            # Step11: calculate loss function with mse
            critic_loss = F.mse_loss(current_q1, target_q) + F.mse_loss(current_q2, target_q)
            
            # Step12: BackPropogation
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()
            
            # Step13: update actor model by gradient ascent
            # Gradient ascent that is very impormant:
            # Gradient descent is minimizing a function and gradient ascent is like gradient descent but in minus of that function
            if it % policy_freq == 0:
                actor_loss = -self.critic.q1(state, self.actor(state)).mean()
                self.actor_optimizer.zero_grad()
                actor_loss.backward()
                self.actor_optimizer.step()
                
                # Step14: update weights of the actor target by polyak average
                for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
                    target_param.data.copy_(tau*param.data +(1 - tau)*target_patam.data)
                    
                    
                # Step15: update wights of the critic target by polyak average
                for param, target_param in zip(sel.critic.parameters(), self.critic_target.parameters()):
                    target_param.data.copy_(tau*param.data +(1-tau)*target_param.data)


        
        
def evaluate_policy(policy, eval_episodes = 10):
    avg_reward = 0
    for _ in range(eval_episodes):
        obs = env.reset()
        done = False
        while not done:
            action = policy.select_action(np.array(obs))
            obs, reward, done, _ = env.step(action)
            avg_reward += reward
    avg_reward /= eval_episodes
    print("----------------------------------------")
    print("Average Reward over the evaluation step: %f" % (avg_reward))
    print("----------------------------------------")
        
    return avg_reward
    

env_name = "Walker2DBulletEnv-v0" # Name of a environment (set it to any Continous environment you want)
seed = 0 # Random seed number

file_name = "%s%s%s" %("TD3", env_name, str(seed))
print ("---------------------------------------")
print ("Settings: %s" % (file_name))
print ("---------------------------------------")

eval_episodes = 10
save_env_vid = True


env = gym.make(env_name)
max_episode_steps = env._max_episode_steps


if save_env_vid:
    env = wrappers.Monitor(env, monitor_dir, force = True)
    env.reset()
    
    
env.seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])
policy = TD3(state_dim, action_dim, max_action)

policy.actor.load_state_dict(torch.load("pytorch_modelsTD3_Walker2DBulletEnv-v0_0_actor.pth"))
policy.critic.load_state_dict(torch.load("pytorch_modelsTD3_Walker2DBulletEnv-v0_0_critic.pth"))

_ = evaluate_policy(policy, eval_episodes=eval_episodes)

---------------------------------------
Settings: TD3Walker2DBulletEnv-v00
---------------------------------------
----------------------------------------
Average Reward over the evaluation step: 1672.311298
----------------------------------------
