In [None]:
!pip install roboschool==1.0.48 gym==0.15.4

Collecting roboschool==1.0.48
[?25l  Downloading https://files.pythonhosted.org/packages/da/31/ce69340a0698e85de2db787023aee5c9416d4ab2ded8cbccf97168ceec81/roboschool-1.0.48-cp37-cp37m-manylinux1_x86_64.whl (44.9MB)
[K     |████████████████████████████████| 44.9MB 165kB/s 
[?25hCollecting gym==0.15.4
[?25l  Downloading https://files.pythonhosted.org/packages/1d/85/a7a462d7796f097027d60f9a62b4e17a0a94dcf12ac2a9f9a913333b11a6/gym-0.15.4.tar.gz (1.6MB)
[K     |████████████████████████████████| 1.6MB 40.8MB/s 
Collecting pyglet<=1.3.2,>=1.2.0
[?25l  Downloading https://files.pythonhosted.org/packages/1c/fc/dad5eaaab68f0c21e2f906a94ddb98175662cc5a654eee404d59554ce0fa/pyglet-1.3.2-py2.py3-none-any.whl (1.0MB)
[K     |████████████████████████████████| 1.0MB 25.8MB/s 
[?25hCollecting cloudpickle~=1.2.0
  Downloading https://files.pythonhosted.org/packages/c1/49/334e279caa3231255725c8e860fa93e72083567625573421db8875846c14/cloudpickle-1.2.2-py2.py3-none-any.whl
Building wheels for collec

In [None]:
import tensorflow as tf
import numpy as np

In [None]:
import gym
import roboschool

In [None]:
# Build the Roboschool inverted-pendulum environment used by every cell below.
env = gym.make('RoboschoolInvertedPendulum-v1')

In [None]:
# Shape of the observation space; passed to the Keras Input layers as the state input shape.
state_dimension=env.observation_space.shape

In [None]:
# Length of the first axis of the action space; used as the actor's output width.
action_dimension=env.action_space.shape[0]

In [None]:
# Lower bound of the first action component; used as the clip minimum for actions.
# NOTE(review): assumes every action component shares this bound — confirm.
min_action=env.action_space.low[0]

In [None]:
# Upper bound of the first action component; used as the clip maximum and as the action scale.
# NOTE(review): assumes every action component shares this bound — confirm.
max_action=env.action_space.high[0]

In [None]:
def get_critic():
  """Build a Q-network mapping a (state, action) pair to one scalar value.

  The state and action inputs are concatenated along the last axis and fed
  through two ReLU hidden layers (400 and 300 units) into a linear 1-unit
  output layer.

  Returns:
    A tf.keras.Model called as ``model([state, action])``.
  """
  input_state=tf.keras.layers.Input(shape=state_dimension)
  # action_dimension is a plain int, while Input expects a shape tuple; wrap it
  # explicitly rather than relying on Keras coercing a bare integer.
  input_action=tf.keras.layers.Input(shape=(action_dimension,))
  layer_1=tf.keras.layers.concatenate([input_state,input_action],axis=-1)
  layer_2=tf.keras.layers.Dense(400,activation="relu")(layer_1)
  layer_3=tf.keras.layers.Dense(300,activation="relu")(layer_2)
  out_Q=tf.keras.layers.Dense(1,activation=None)(layer_3)

  model=tf.keras.Model(inputs=[input_state,input_action],outputs=[out_Q])
  return model

def get_actor():
  """Build the policy network: state -> action squashed by tanh.

  Two ReLU hidden layers (400 then 300 units) feed an output layer with
  ``action_dimension`` units and a tanh activation.

  Returns:
    A tf.keras.Model taking a batch of states.
  """
  state_in=tf.keras.layers.Input(state_dimension)

  hidden=tf.keras.layers.Dense(400,activation="relu")(state_in)
  hidden=tf.keras.layers.Dense(300,activation="relu")(hidden)
  squashed_action=tf.keras.layers.Dense(action_dimension,activation="tanh")(hidden)

  return tf.keras.Model(inputs=[state_in],outputs=[squashed_action])


In [None]:
# Build each online/target pair (actor, then the two critics) ...
Train_actor,Target_actor=get_actor(),get_actor()
Train_critic_1,Target_critic_1=get_critic(),get_critic()
Train_critic_2,Target_critic_2=get_critic(),get_critic()

# ... and copy the online weights into every target so each pair starts identical.
Target_actor.set_weights(Train_actor.get_weights())
Target_critic_1.set_weights(Train_critic_1.get_weights())
Target_critic_2.set_weights(Train_critic_2.get_weights())

In [None]:
from memory_module import replayBuffer
# Shared replay memory; transitions are push()-ed into it and batches are sample()-d from it below.
# NOTE(review): 100000 is presumably the maximum number of stored transitions — confirm against memory_module.
memory=replayBuffer(100000)

In [None]:
def get_action(actor,s,sigma=0,noise=False):
  """Compute an environment-ready action for a batch of states.

  Args:
    actor: policy network; its tanh output is treated as a normalised action.
    s: batch of states passed straight to ``actor``.
    sigma: standard deviation of the Gaussian exploration noise, expressed in
      the normalised (pre-scaling) action units.
    noise: when True, add exploration noise to the actor output.

  Returns:
    Tensor of actions lying inside ``[min_action, max_action]``.
  """
  action=actor(s)
  if noise:
    # Draw noise with the shape of the actor output so every row of the batch
    # gets its own perturbation.
    action=action+tf.random.normal(shape=tf.shape(action),mean=0,stddev=sigma)

  # Scale the normalised action to environment units first and clip afterwards.
  # Clipping before scaling lets the result exceed the bounds by a factor of
  # max_action whenever max_action != 1.
  action=tf.clip_by_value(max_action*action,min_action,max_action)
  return action

In [None]:
def get_Q_value(critic,s,a):
  """Evaluate ``critic`` on the state/action pair, passed as the list ``[s, a]``."""
  return critic([s,a])

In [None]:
def initialize_buffer():
    """Pre-fill the replay memory with 10000 transitions from sampled actions.

    Actions come from ``env.action_space.sample()``; the environment is reset
    whenever an episode ends. Each stored transition carries ``not done`` as
    its last field.
    """
    observation=env.reset()
    for _ in range(10000):
      sampled_action=env.action_space.sample()
      following,step_reward,finished,_info=env.step(sampled_action)
      memory.push(observation,sampled_action,step_reward,following,not finished)

      # Start a fresh episode after termination, otherwise keep rolling forward.
      observation=env.reset() if finished else following

In [None]:
# Pre-fill the replay memory with random-action transitions before training samples from it.
initialize_buffer()

In [None]:
# One Adam optimizer per trained network: the two critics and the actor.
# Each is constructed with 0.001 as its first argument (the learning rate).
cr_1_opt=tf.keras.optimizers.Adam(0.001)
cr_2_opt=tf.keras.optimizers.Adam(0.001)
ac_opt=tf.keras.optimizers.Adam(0.001)

In [None]:
def update_networks(target_net,train_net,tau):
  """Soft-update ``target_net`` towards ``train_net``.

  Every target weight becomes ``tau*train + (1-tau)*target``; the blended list
  is written back with ``target_net.set_weights``. ``train_net`` is not modified.
  """
  online_weights=train_net.get_weights()
  blended=[
      tau*online_weights[idx] + (1-tau)*old
      for idx,old in enumerate(target_net.get_weights())
  ]
  target_net.set_weights(blended)


In [None]:
def critic_pred(critic,states):
  """Evaluate ``critic`` on ``states`` using smoothed target-policy actions.

  The target actor's output is perturbed with Gaussian noise (stddev 0.2)
  clipped to ``[-c, c]``, scaled to environment units and clipped to the
  action bounds before being scored by ``critic``.

  Args:
    critic: network called as ``critic([states, actions])``.
    states: batch of states to evaluate.

  Returns:
    The critic's output for the smoothed target actions.
  """
  c=0.5
  mu=Target_actor(states)
  # Sample noise with the shape of mu so each transition in the batch gets an
  # independent perturbation; a shape of [action_dimension] would broadcast a
  # single sample over the whole batch.
  smoothing_noise=tf.clip_by_value(tf.random.normal(shape=tf.shape(mu),mean=0,stddev=0.2),-c,c)
  noise_action=mu+smoothing_noise
  # Scale first, then clip, so the action stays inside the environment bounds
  # even when max_action != 1.
  predicted_actions=tf.clip_by_value(max_action*noise_action,min_action,max_action)

  next_state_value=get_Q_value(critic,states,predicted_actions)
  return next_state_value


In [None]:
def loss_critics(states, actions, rewards, next_states, not_dones, gamma=0.99):
  """Return the mean-squared TD losses of the two online critics.

  The bootstrap value is the element-wise minimum of the two target critics
  evaluated on ``next_states``; it is discounted by ``gamma`` and multiplied
  by ``not_dones`` before ``rewards`` is added.

  Returns:
    Tuple ``(critic_loss_1, critic_loss_2)``.
  """
  # Target critics are evaluated first, in this order.
  target_q_1=tf.squeeze(critic_pred(Target_critic_1,next_states))
  target_q_2=tf.squeeze(critic_pred(Target_critic_2,next_states))

  # Convert the sampled batch once and reuse it for both online critics.
  state_batch=np.array(states,dtype="float32")
  action_batch=np.array(actions,dtype="float32")
  online_q_1=tf.squeeze(get_Q_value(Train_critic_1,state_batch,action_batch))
  online_q_2=tf.squeeze(get_Q_value(Train_critic_2,state_batch,action_batch))

  bootstrap=tf.math.minimum(target_q_1,target_q_2)
  td_target= rewards + gamma*bootstrap*not_dones

  return (
      tf.reduce_mean(tf.math.squared_difference(td_target,online_q_1)),
      tf.reduce_mean(tf.math.squared_difference(td_target,online_q_2)),
  )

In [None]:
def train(steps_to_train=1000000,batch_size=128,update_actor_step=2,tau=0.005,
          steps_to_stop_exp=2000,log_every=20):
  """Run the actor-critic training loop against ``env``.

  Each step acts in the environment, stores the transition in ``memory``,
  samples a batch and updates both critics. Every ``update_actor_step`` steps
  the actor is updated through critic 1 and all target networks are
  soft-updated with rate ``tau``.

  Args:
    steps_to_train: total number of environment steps.
    batch_size: number of transitions sampled per update.
    update_actor_step: actor/target update period, in steps.
    tau: soft-update rate passed to ``update_networks``.
    steps_to_stop_exp: number of initial steps taken with exploration noise.
    log_every: print the average episode reward every this many episodes.
  """
  curr_state=env.reset()
  overall_Reward=0
  episode_reward=0
  no_of_comp=0
  for i in range(steps_to_train):

    if i<steps_to_stop_exp:
      action=get_action(Train_actor,curr_state.reshape(1,-1),sigma=0.1,noise=True)
    else:
      action=get_action(Train_actor,curr_state.reshape(1,-1))
    # Strip the batch axis once and reuse the numpy action for both the
    # environment and the replay memory, so stored actions have the same type
    # and shape as the ones pushed by initialize_buffer().
    env_action=action.numpy()[0]
    next_state,reward,done,_=env.step(env_action)
    episode_reward+=reward

    memory.push(curr_state,env_action,reward,next_state,not done)

    if done:
      curr_state=env.reset()
      overall_Reward+=episode_reward
      episode_reward=0  ### Updating the reward to 0
      # Count the finished episode before the check so each report averages
      # exactly log_every completed episodes.
      no_of_comp+=1
      if no_of_comp%log_every==0:
        print('On step {}, no. of complete episodes {} average episode reward {}'.format(i,no_of_comp,overall_Reward/log_every))
        overall_Reward=0
    else:
      curr_state=next_state

    states, actions, rewards, next_states, not_dones = memory.sample(batch_size)

    # One tape per critic so each gradient is taken from its own recording.
    with tf.GradientTape() as t1, tf.GradientTape() as t2:
      critic_loss_1,critic_loss_2=loss_critics(states, actions, rewards, next_states, not_dones)

    grad_crit_1=t1.gradient(critic_loss_1,Train_critic_1.trainable_variables)
    grad_crit_2=t2.gradient(critic_loss_2,Train_critic_2.trainable_variables)

    cr_1_opt.apply_gradients(zip(grad_crit_1,Train_critic_1.trainable_variables))
    cr_2_opt.apply_gradients(zip(grad_crit_2,Train_critic_2.trainable_variables))

    if i % update_actor_step==0:

      with tf.GradientTape() as t:
        # Apply the same max_action scaling that get_action uses, so the critic
        # is queried with actions in environment units.
        new_actions=max_action*Train_actor(states)
        act_loss=-1*tf.reduce_mean(Train_critic_1([states,new_actions]))

      grad_act=t.gradient(act_loss,Train_actor.trainable_variables)
      ac_opt.apply_gradients(zip(grad_act,Train_actor.trainable_variables))

      update_networks(Target_actor,Train_actor,tau)
      update_networks(Target_critic_1,Train_critic_1,tau)
      update_networks(Target_critic_2,Train_critic_2,tau)



In [None]:
# Run the training loop with its default settings; progress is printed as episodes complete.
train()

  return np.array(s),np.array(a),np.array(r),np.array(s_),np.uint8(nd)


On step 9, no. of complete episodes 0 average episode reward 0.5
On step 190, no. of complete episodes 20 average episode reward 9.05
On step 375, no. of complete episodes 40 average episode reward 9.25
On step 545, no. of complete episodes 60 average episode reward 8.5
On step 741, no. of complete episodes 80 average episode reward 9.8
On step 927, no. of complete episodes 100 average episode reward 9.3
On step 1109, no. of complete episodes 120 average episode reward 9.1
On step 1292, no. of complete episodes 140 average episode reward 9.15
On step 1473, no. of complete episodes 160 average episode reward 9.05
On step 1664, no. of complete episodes 180 average episode reward 9.55
On step 1855, no. of complete episodes 200 average episode reward 9.55
On step 2638, no. of complete episodes 220 average episode reward 39.15
On step 4816, no. of complete episodes 240 average episode reward 108.9
On step 6597, no. of complete episodes 260 average episode reward 89.05
On step 8313, no. of c