<a href="https://colab.research.google.com/github/abhisheksuran/Reinforcement_Learning/blob/master/DDPGwithtau.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import gym
from tensorflow.keras.models import load_model
!pip3 install box2d-py

Collecting box2d-py
[?25l  Downloading https://files.pythonhosted.org/packages/06/bd/6cdc3fd994b0649dcf5d9bad85bd9e26172308bbe9a421bfc6fdbf5081a6/box2d_py-2.3.8-cp36-cp36m-manylinux1_x86_64.whl (448kB)
[K     |▊                               | 10kB 3.2MB/s eta 0:00:01[K     |█▌                              | 20kB 3.2MB/s eta 0:00:01[K     |██▏                             | 30kB 3.2MB/s eta 0:00:01[K     |███                             | 40kB 3.4MB/s eta 0:00:01[K     |███▋                            | 51kB 3.8MB/s eta 0:00:01[K     |████▍                           | 61kB 4.3MB/s eta 0:00:01[K     |█████▏                          | 71kB 4.6MB/s eta 0:00:01[K     |█████▉                          | 81kB 4.8MB/s eta 0:00:01[K     |██████▋                         | 92kB 5.0MB/s eta 0:00:01[K     |███████▎                        | 102kB 5.2MB/s eta 0:00:01[K     |████████                        | 112kB 5.2MB/s eta 0:00:01[K     |████████▊                       | 122

In [2]:
env= gym.make("LunarLanderContinuous-v2")
state_low = env.observation_space.low
state_high = env.observation_space.high
action_low = env.action_space.low 
action_high = env.action_space.high
print(state_low)
print(state_high)
print(action_low)
print(action_high)

[-inf -inf -inf -inf -inf -inf -inf -inf]
[inf inf inf inf inf inf inf inf]
[-1. -1.]
[1. 1.]


In [3]:
 len(env.action_space.high)

2

In [4]:
class RBuffer():
  def __init__(self, maxsize, statedim, naction):
    self.cnt = 0
    self.maxsize = maxsize
    self.state_memory = np.zeros((maxsize, *statedim), dtype=np.float32)
    self.action_memory = np.zeros((maxsize, naction), dtype=np.float32)
    self.reward_memory = np.zeros((maxsize,), dtype=np.float32)
    self.next_state_memory = np.zeros((maxsize, *statedim), dtype=np.float32)
    self.done_memory = np.zeros((maxsize,), dtype= np.bool)

  def storexp(self, state, next_state, action, done, reward):
    index = self.cnt % self.maxsize
    self.state_memory[index] = state
    self.action_memory[index] = action
    self.reward_memory[index] = reward
    self.next_state_memory[index] = next_state
    self.done_memory[index] = 1- int(done)
    self.cnt += 1

  def sample(self, batch_size):
    max_mem = min(self.cnt, self.maxsize)
    batch = np.random.choice(max_mem, batch_size, replace= False)  
    states = self.state_memory[batch]
    next_states = self.next_state_memory[batch]
    rewards = self.reward_memory[batch]
    actions = self.action_memory[batch]
    dones = self.done_memory[batch]
    return states, next_states, rewards, actions, dones



In [5]:
class Critic(tf.keras.Model):
  def __init__(self):
    super(Critic, self).__init__()
    self.f1 = tf.keras.layers.Dense(512, activation='relu')
    self.f2 = tf.keras.layers.Dense(512, activation='relu')
    self.v =  tf.keras.layers.Dense(1, activation=None)

  def call(self, inputstate, action):
    x = self.f1(tf.concat([inputstate, action], axis=1))
    x = self.f2(x)
    x = self.v(x)
    return x


class Actor(tf.keras.Model):
  def __init__(self, no_action):
    super(Actor, self).__init__()    
    self.f1 = tf.keras.layers.Dense(512, activation='relu')
    self.f2 = tf.keras.layers.Dense(512, activation='relu')
    self.mu =  tf.keras.layers.Dense(no_action, activation='tanh')

  def call(self, state):
    x = self.f1(state)
    x = self.f2(x)
    x = self.mu(x)  
    return x

 

In [6]:
class Agent():
  def __init__(self, n_action= len(env.action_space.high)):
    self.actor_main = Actor(n_action)
    self.actor_target = Actor(n_action)
    self.critic_main = Critic()
    self.critic_target = Critic()
    self.batch_size = 64
    self.n_actions = len(env.action_space.high)
    self.a_opt = tf.keras.optimizers.Adam(1e-4)
    # self.actor_target = tf.keras.optimizers.Adam(.001)
    self.c_opt = tf.keras.optimizers.Adam(1e-4)
    # self.critic_target = tf.keras.optimizers.Adam(.002)
    self.memory = RBuffer(1_00_000, env.observation_space.shape, len(env.action_space.high))
    self.trainstep = 0
    self.replace = 5
    self.gamma = 0.99
    self.min_action = env.action_space.low[0]
    self.max_action = env.action_space.high[0]
    self.tau = 0.005
    self.actor_target.compile(optimizer=self.a_opt)
    self.critic_target.compile(optimizer=self.c_opt)


  def act(self, state, evaluate=False):
      state = tf.convert_to_tensor([state], dtype=tf.float32)
      actions = self.actor_main(state)
      if not evaluate:
          actions += tf.random.normal(shape=[self.n_actions], mean=0.0, stddev=0.1)

      actions = self.max_action * (tf.clip_by_value(actions, self.min_action, self.max_action))
      #print(actions)
      return actions[0]


  def savexp(self,state, next_state, action, done, reward):
        self.memory.storexp(state, next_state, action, done, reward)

  def update_target(self, tau=None):
    
      if tau is None:
          tau = self.tau

      weights1 = []
      targets1 = self.actor_target.weights
      for i, weight in enumerate(self.actor_main.weights):
          weights1.append(weight * tau + targets1[i]*(1-tau))
      self.actor_target.set_weights(weights1)

      weights2 = []
      targets2 = self.critic_target.weights
      for i, weight in enumerate(self.critic_main.weights):
          weights2.append(weight * tau + targets2[i]*(1-tau))
      self.critic_target.set_weights(weights2)

  
  def train(self):
      if self.memory.cnt < self.batch_size:
        return 


      states, next_states, rewards, actions, dones = self.memory.sample(self.batch_size)
  
      states = tf.convert_to_tensor(states, dtype= tf.float32)
      next_states = tf.convert_to_tensor(next_states, dtype= tf.float32)
      rewards = tf.convert_to_tensor(rewards, dtype= tf.float32)
      actions = tf.convert_to_tensor(actions, dtype= tf.float32)
      #dones = tf.convert_to_tensor(dones, dtype= tf.bool)

      with tf.GradientTape() as tape1, tf.GradientTape() as tape2:
            
          target_actions = self.actor_target(next_states)
          target_next_state_values = tf.squeeze(self.critic_target(next_states, target_actions), 1)
          critic_value = tf.squeeze(self.critic_main(states, actions), 1)
          target_values = rewards + self.gamma * target_next_state_values * dones
          critic_loss = tf.keras.losses.MSE(target_values, critic_value)

          new_policy_actions = self.actor_main(states)
          actor_loss = -self.critic_main(states, new_policy_actions)
          actor_loss = tf.math.reduce_mean(actor_loss)

      grads1 = tape1.gradient(actor_loss, self.actor_main.trainable_variables)
      grads2 = tape2.gradient(critic_loss, self.critic_main.trainable_variables)
      self.a_opt.apply_gradients(zip(grads1, self.actor_main.trainable_variables))
      self.c_opt.apply_gradients(zip(grads2, self.critic_main.trainable_variables))

      #if self.trainstep % self.replace == 0:
      self.update_target()
           
      self.trainstep +=1
 




In [None]:
with tf.device('GPU:0'):
  tf.random.set_seed(336699)
  agent = Agent(2)
  episods = 2000
  ep_reward = []
  total_avgr = []
  target = False

  for s in range(episods):
    if target == True:
      break
    total_reward = 0 
    state = env.reset()
    done = False

    while not done:
      action = agent.act(state)
      next_state, reward, done, _ = env.step(action)
      agent.savexp(state, next_state, action, done, reward)
      agent.train()
      state = next_state
      total_reward += reward
      if done:
          ep_reward.append(total_reward)
          avg_reward = np.mean(ep_reward[-100:])
          total_avgr.append(avg_reward)
          print("total reward after {} steps is {} and avg reward is {}".format(s, total_reward, avg_reward))
          if int(avg_reward) == 200:
            target = True





total reward after 0 steps is -211.5812908651993 and avg reward is -211.5812908651993
total reward after 1 steps is -452.65019673188436 and avg reward is -332.1157437985418
total reward after 2 steps is -714.5135918110798 and avg reward is -459.5816931360544
total reward after 3 steps is -560.6350625743586 and avg reward is -484.84503549563044
total reward after 4 steps is -480.7438413774672 and avg reward is -484.02479667199776
total reward after 5 steps is -120.42183431364717 and avg reward is -423.42430294560603
total reward after 6 steps is -152.5965019924016 and avg reward is -384.7346170951482
total reward after 7 steps is -92.38920609456164 and avg reward is -348.191440720075
total reward after 8 steps is -386.82989880290575 and avg reward is -352.4846027292784
total reward after 9 steps is -273.7853605603385 and avg reward is -344.6146785123844
total reward after 10 steps is -2.6461276727801106 and avg reward is -313.52662843605674
total reward after 11 steps is -169.0821032646

In [None]:
ep = [i  for i in range(len(avg_rewards_list))]
plt.plot( range(len(avg_rewards_list)),avg_rewards_list,'b')
plt.title("Avg Test Aeward Vs Test Episods")
plt.xlabel("Test Episods")
plt.ylabel("Average Test Reward")
plt.grid(True)
plt.show()

In [None]:
total_reward = 0
state = env.reset()
while not done:
    action = agent.act(state, True)
    next_state, reward, done, _ = env.step(action)
    state = next_state
    total_reward += reward
    if done:
       print(total_reward)