In [None]:
!pip install roboschool==1.0.48 gym==0.15.4

Collecting roboschool==1.0.48
[?25l  Downloading https://files.pythonhosted.org/packages/da/31/ce69340a0698e85de2db787023aee5c9416d4ab2ded8cbccf97168ceec81/roboschool-1.0.48-cp37-cp37m-manylinux1_x86_64.whl (44.9MB)
[K     |████████████████████████████████| 44.9MB 156kB/s 
[?25hCollecting gym==0.15.4
[?25l  Downloading https://files.pythonhosted.org/packages/1d/85/a7a462d7796f097027d60f9a62b4e17a0a94dcf12ac2a9f9a913333b11a6/gym-0.15.4.tar.gz (1.6MB)
[K     |████████████████████████████████| 1.6MB 35.8MB/s 
Collecting pyglet<=1.3.2,>=1.2.0
[?25l  Downloading https://files.pythonhosted.org/packages/1c/fc/dad5eaaab68f0c21e2f906a94ddb98175662cc5a654eee404d59554ce0fa/pyglet-1.3.2-py2.py3-none-any.whl (1.0MB)
[K     |████████████████████████████████| 1.0MB 23.2MB/s 
[?25hCollecting cloudpickle~=1.2.0
  Downloading https://files.pythonhosted.org/packages/c1/49/334e279caa3231255725c8e860fa93e72083567625573421db8875846c14/cloudpickle-1.2.2-py2.py3-none-any.whl
Building wheels for collec

In [None]:
import tensorflow as tf
import numpy as np
import gym
import roboschool
import tensorflow_probability as tfp

In [None]:
class SAC:
  """Soft Actor-Critic style agent with twin critics and a fixed temperature.

  The policy network outputs the mean and log standard deviation of a
  Gaussian; samples are squashed with ``tanh`` before being sent to the
  environment. Two critics are trained against a shared target built from the
  element-wise minimum of two slowly-updated target critics minus
  ``alpha * log_prob``.
  """

  def __init__(self,env,memory):
    """Store the environment and replay memory and set the hyper-parameters.

    Args:
      env: environment object. This class reads ``observation_space.shape``
        and ``action_space.shape[0]`` and calls ``action_space.sample()``,
        ``reset()`` and ``step(action)`` on it.
      memory: replay memory providing
        ``push(state, action, reward, next_state, not_done)`` and
        ``sample(batch_size)``.
    """
    self.env=env
    self.state_dimension=env.observation_space.shape
    self.action_dimension=env.action_space.shape[0]
    # The five networks are created lazily by train().
    self.Train_actor=None
    self.Train_critic_1=None
    self.Target_critic_1=None
    self.Train_critic_2=None
    self.Target_critic_2=None
    self.memory=memory
    self.batch_size=256
    # Number of random-action transitions collected by initialize_buffer().
    self.collect_initial_=10000
    self.cr_1_opt=tf.keras.optimizers.Adam(0.0003)
    self.cr_2_opt=tf.keras.optimizers.Adam(0.0003)
    self.ac_opt=tf.keras.optimizers.Adam(0.0003)
    self.steps_to_train=1000000
    # Interpolation factor used when blending online weights into the targets.
    self.tau=0.005
    # Fixed weight of the log-probability (entropy) term.
    self.alpha=0.2
    # Clipping range applied to the predicted log standard deviation.
    self.MAX_CLIP_sigma=2
    self.MIN_CLIP_sigma=-20
    self.gamma=0.99

  def get_critic(self):
    """Build a Q-network mapping ``[state, action]`` to one unbounded scalar."""
    state_in=tf.keras.layers.Input(shape=self.state_dimension)
    # Pass the shape as a tuple; action_dimension itself is a plain int.
    action_in=tf.keras.layers.Input(shape=(self.action_dimension,))
    joined=tf.keras.layers.concatenate([state_in,action_in],axis=-1)
    hidden=tf.keras.layers.Dense(256,activation="relu")(joined)
    hidden=tf.keras.layers.Dense(256,activation="relu")(hidden)
    out_Q=tf.keras.layers.Dense(1,activation=None)(hidden)

    return tf.keras.Model(inputs=[state_in,action_in],outputs=[out_Q])

  def get_actor(self):
    """Build the policy network; it returns ``(mu, log_sigma)`` per action dimension."""
    state_in=tf.keras.layers.Input(shape=self.state_dimension)
    hidden=tf.keras.layers.Dense(256,activation="relu")(state_in)
    hidden=tf.keras.layers.Dense(256,activation="relu")(hidden)
    mu=tf.keras.layers.Dense(self.action_dimension,activation=None)(hidden)
    log_sigma=tf.keras.layers.Dense(self.action_dimension,activation=None)(hidden)

    return tf.keras.Model(inputs=[state_in],outputs=[mu,log_sigma])

  def _policy_distribution(self,actor,s):
    """Return the (unsquashed) Gaussian the actor predicts for states ``s``."""
    mu,log_sigma=actor(s)
    sigma=tf.exp(tf.clip_by_value(log_sigma,self.MIN_CLIP_sigma,self.MAX_CLIP_sigma))
    return tfp.distributions.Normal(mu,sigma)

  @staticmethod
  def _to_float_tensor(x,as_column=False):
    """Convert ``x`` to a float32 tensor, optionally reshaped to ``(-1, 1)``."""
    tensor=tf.convert_to_tensor(np.asarray(x,dtype="float32"))
    if as_column:
      tensor=tf.reshape(tensor,(-1,1))
    return tensor

  def get_action(self,actor,s):
    """Sample actions for states ``s``.

    Returns:
      ``(action, bounded_action)``: the raw Gaussian sample and its ``tanh``
      squashed version.
    """
    action=self._policy_distribution(actor,s).sample()
    bounded_action=tf.tanh(action)

    return action,bounded_action

  def get_log_prob(self,actor,s,a,bounded_a):
    """Log-probability of the squashed action under the actor's policy.

    Args:
      actor: policy network.
      s: batch of states.
      a: raw (pre-``tanh``) actions.
      bounded_a: ``tanh(a)``.

    Returns:
      Tensor of shape ``(batch, 1)``: the joint log-density over all action
      dimensions, so it lines up with the critics' ``(batch, 1)`` output.
    """
    dist=self._policy_distribution(actor,s)
    # The Gaussian is factorised, so the joint log-density is the SUM of the
    # per-dimension terms.
    gaussian_log_pr=tf.reduce_sum(dist.log_prob(a),axis=1,keepdims=True)
    ### Modifying the log_pr for the action as mentioned in appendix c of the paper
    # The tanh change-of-variables term is likewise summed over dimensions.
    squash_correction=tf.reduce_sum(tf.math.log(1-bounded_a**2+1e-8),axis=1,keepdims=True)

    return gaussian_log_pr-squash_correction

  def get_Q_value(self,critic,s,a):
    """Evaluate ``critic`` on the state/action batch ``(s, a)``."""
    return critic([s,a])

  def initialize_buffer(self):
    """Fill the replay memory with ``self.collect_initial_`` random-action transitions."""
    curr_state=self.env.reset()
    for _ in range(self.collect_initial_):
      action=self.env.action_space.sample()
      next_state,reward,done,_=self.env.step(action)
      # The last field is stored as "not done" so it can be used directly as a
      # multiplicative mask in the critic target.
      self.memory.push(curr_state,action,reward,next_state,not done)

      curr_state=self.env.reset() if done else next_state

  def update_networks(self,target_net,train_net,tau):
    """Blend weights: ``target <- tau * train + (1 - tau) * target``."""
    blended=[
        tau*w_train+(1-tau)*w_target
        for w_target,w_train in zip(target_net.get_weights(),train_net.get_weights())
    ]
    target_net.set_weights(blended)

  def _update_critics(self,states,actions,rewards,next_states,not_dones):
    """Take one gradient step on both critics towards the shared target."""
    # The target is built outside the tapes: it only involves the actor and the
    # target critics, so it is a constant with respect to the trained critics.
    n_actions,b_n_actions=self.get_action(self.Train_actor,next_states)
    log_pr=self.get_log_prob(self.Train_actor,next_states,n_actions,b_n_actions)
    next_value=tf.math.minimum(
        self.get_Q_value(self.Target_critic_1,next_states,b_n_actions),
        self.get_Q_value(self.Target_critic_2,next_states,b_n_actions))
    target_value=rewards+self.gamma*(next_value-self.alpha*log_pr)*not_dones

    for critic,optimizer in ((self.Train_critic_1,self.cr_1_opt),
                             (self.Train_critic_2,self.cr_2_opt)):
      with tf.GradientTape() as tape:
        pred_value=self.get_Q_value(critic,states,actions)
        critic_loss=tf.reduce_mean(tf.math.squared_difference(target_value,pred_value))
      grads=tape.gradient(critic_loss,critic.trainable_variables)
      optimizer.apply_gradients(zip(grads,critic.trainable_variables))

  def _update_actor(self,states):
    """Take one gradient step on the actor, maximising ``min(Q1, Q2) - alpha * log_prob``."""
    with tf.GradientTape() as tape:
      new_actions,b_new_actions=self.get_action(self.Train_actor,states)
      log_pr_a=self.get_log_prob(self.Train_actor,states,new_actions,b_new_actions)

      state_value_1=self.get_Q_value(self.Train_critic_1,states,b_new_actions)
      state_value_2=self.get_Q_value(self.Train_critic_2,states,b_new_actions)

      value=tf.math.minimum(state_value_1,state_value_2)-self.alpha*log_pr_a
      act_loss=-1*tf.reduce_mean(value)

    grad_actor=tape.gradient(act_loss,self.Train_actor.trainable_variables)
    self.ac_opt.apply_gradients(zip(grad_actor,self.Train_actor.trainable_variables))

  def train(self):
    """Create the networks, pre-fill the replay memory and run the training loop.

    Each of the ``self.steps_to_train`` iterations performs one environment
    step with the current policy, stores the transition, samples a batch and
    updates both critics, the actor and the two target critics. The average
    episode reward is printed every 20 completed episodes.
    """
    self.Train_actor=self.get_actor()
    self.Train_critic_1=self.get_critic()
    self.Target_critic_1=self.get_critic()
    self.Target_critic_1.set_weights(self.Train_critic_1.get_weights())
    self.Train_critic_2=self.get_critic()
    self.Target_critic_2=self.get_critic()
    self.Target_critic_2.set_weights(self.Train_critic_2.get_weights())

    self.initialize_buffer()

    curr_state=self.env.reset()
    overall_Reward=0
    episode_reward=0
    no_of_comp=0

    for i in range(self.steps_to_train):

      policy_input=np.asarray(curr_state,dtype="float32").reshape(1,-1)
      _,b_action=self.get_action(self.Train_actor,policy_input)
      # Drop the batch axis and leave TensorFlow: the same NumPy action goes to
      # the environment and into the replay memory, matching the layout of the
      # transitions stored by initialize_buffer().
      env_action=b_action.numpy()[0]
      next_state,reward,done,_=self.env.step(env_action)
      episode_reward+=reward

      self.memory.push(curr_state,env_action,reward,next_state,not done)

      if done:
        curr_state=self.env.reset()
        overall_Reward+=episode_reward
        episode_reward=0  ### Updating the reward to 0
        # Count the episode first so every report covers exactly 20 episodes.
        no_of_comp+=1
        if no_of_comp%20==0:
          print('On step {}, no. of complete episodes {} average episode reward {}'.format(i,no_of_comp,overall_Reward/20))
          overall_Reward=0
      else:
        curr_state=next_state

      states, actions, rewards, next_states, not_dones = self.memory.sample(self.batch_size)
      # Normalise dtype once for every consumer. Rewards and masks are forced to
      # column vectors so they broadcast element-wise against (batch, 1)
      # Q-values whatever layout the replay memory returns.
      states=self._to_float_tensor(states)
      actions=self._to_float_tensor(actions)
      next_states=self._to_float_tensor(next_states)
      rewards=self._to_float_tensor(rewards,as_column=True)
      not_dones=self._to_float_tensor(not_dones,as_column=True)

      self._update_critics(states,actions,rewards,next_states,not_dones)
      self._update_actor(states)

      self.update_networks(self.Target_critic_1,self.Train_critic_1,self.tau)
      self.update_networks(self.Target_critic_2,self.Train_critic_2,self.tau)



In [None]:
# Task the agent is trained on; the id lives in one named constant so it is easy to swap.
ENV_ID = "RoboschoolInvertedPendulum-v1"
env = gym.make(ENV_ID)

In [None]:
from memory_module import replayBuffer

# Single positional argument handed to the project's replay buffer
# (presumably the maximum number of stored transitions -- confirm against memory_module).
REPLAY_BUFFER_SIZE = 100000
memory = replayBuffer(REPLAY_BUFFER_SIZE)

In [None]:
# Wire the environment and the replay memory into the agent.
agent = SAC(env=env, memory=memory)

In [None]:
# Builds the networks, pre-fills the replay memory with random-action transitions and
# then runs the full training loop (agent.steps_to_train environment steps), printing
# the average episode reward as training progresses. This is a long-running cell.
agent.train()