In [1]:
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp

In [2]:
import gym

In [None]:
### pi_old is the policy from the (n-1)-th iteration; it is the reference policy while we compute the n-th iteration.
### The constraint is applied only within a particular iteration, across that iteration's batch updates.

In [None]:
### PPO trains on the same set of collected samples, split into different batches, for k epochs. So it can
### update the models k * (number of batches) times per iteration.

### The samples are refreshed only when the agent moves from one iteration to the next. The old_policy for
### iteration i is the policy obtained after all k * (number of batches) updates of iteration i-1.
### For all the updates of iteration i the old policy remains constant. So the sample efficiency
### is said to be high compared to A2C, which updates only once per iteration.

In [18]:
class PPO_single_agent:
  """Single-agent PPO (clipped surrogate objective) for a discrete-action gym environment.

  Every training iteration collects `m` rollouts of `steps_in_epi` steps with the
  current policy, turns them into GAE-based returns, and then optimises the actor
  and the critic on shuffled mini-batches of those same samples for `epochs`
  passes. The log-probabilities recorded while collecting act as the fixed
  "old policy" for every update of that iteration.

  The environment is driven through the classic gym API: `reset()` returns the
  observation and `step()` returns `(obs, reward, done, info)`.
  """

  def __init__(self,env,learning_rate=0.0003,steps_in_epi=512,iterations=5000,m=8,target=1000,epochs=10):
    """Store the environment and the hyper-parameters.

    Args:
      env: environment exposing `observation_space.shape`, `action_space.n`,
        `reset()` and `step(action)`.
      learning_rate: step size of the shared Adam optimiser.
      steps_in_epi: number of environment steps recorded per rollout.
      iterations: maximum number of collect-then-optimise iterations.
      m: number of rollouts collected per iteration.
      target: average test score at which training stops.
      epochs: number of passes over the collected samples per iteration.
    """
    self.env=env
    self.state_dimension=env.observation_space.shape   ### Input state dimension
    self.no_of_action=env.action_space.n              ### No of actions
    self.Actor=None      ### the learner (built in train())
    self.Critic=None     ### Critic (built in train())
    self.opt=tf.keras.optimizers.Adam(learning_rate)
    self.steps_in_epi=steps_in_epi    ### Fixed number of steps recorded per rollout
    self.iterations=iterations        ### Number of training iterations
    self.m=m                          ### Number of rollouts per iteration
    self.target=target
    self.epochs=epochs

  def _build_mlp(self,output_units,output_activation=None):
    """Build a 3x128 ReLU network from the state input to `output_units` outputs."""
    input_layer=tf.keras.layers.Input(self.state_dimension)
    hidden=input_layer
    for _ in range(3):
      hidden=tf.keras.layers.Dense(128,activation="relu")(hidden)
    output_layer=tf.keras.layers.Dense(output_units,activation=output_activation)(hidden)
    return tf.keras.Model(inputs=[input_layer],outputs=[output_layer])

  def get_actor(self):
    """Return a new policy network: state -> softmax probabilities, one per action."""
    return self._build_mlp(self.no_of_action,"softmax")

  def get_critic(self):
    """Return a new value network: state -> a single linear output V(s)."""
    return self._build_mlp(1)

  def get_action(self,state):
    """Sample one action for a single state from the actor's distribution."""
    state=state.reshape(1,-1)
    acts=self.Actor(state)
    act_probs=tfp.distributions.Categorical(probs=acts)
    return act_probs.sample().numpy()[0]

  def get_value(self,state):
    """Return the critic's estimate for a single state as a length-1 array."""
    state=state.reshape(1,-1)
    value=self.Critic(state).numpy()[0]
    return value

  def get_prob(self,state,action):
    """Return the LOG-probability of `action` under the current actor.

    `state` must already carry a batch dimension. Despite the method name the
    result is a log-probability (it comes from `Categorical.log_prob`).
    """
    acts=self.Actor(state)
    act_probs=tfp.distributions.Categorical(probs=acts)
    return act_probs.log_prob(action)

  def actor_loss(self,S,A,Adv,log_old_probs,eps=0.2):
    """Clipped surrogate policy loss (negated, so that it can be minimised).

    Args:
      S: batch of states.
      A: batch of actions; flattened before use.
      Adv: batch of advantages.
      log_old_probs: log-probabilities recorded when the samples were collected.
      eps: clipping range for the probability ratio.
    """
    log_new_probs=tf.reshape(self.get_prob(S,tf.reshape(A,[-1])),shape=(-1,1))
    ### Force both operands to column vectors of the same dtype so the products below
    ### stay element-wise instead of silently broadcasting to (N,N).
    log_old_probs=tf.reshape(tf.cast(log_old_probs,log_new_probs.dtype),shape=(-1,1))
    Adv=tf.reshape(tf.cast(Adv,log_new_probs.dtype),shape=(-1,1))
    r=tf.math.exp(tf.math.subtract(log_new_probs,log_old_probs))
    surr_1=tf.multiply(r,Adv)
    surr_2=tf.multiply(tf.clip_by_value(r,1-eps,1+eps),Adv)
    l_clip=-1*tf.reduce_mean(tf.minimum(surr_1,surr_2))
    return l_clip

  def critic_loss(self,S,ret):
    """Half the mean squared error between the returns and the critic's prediction."""
    l_vf= 0.5*tf.reduce_mean(tf.math.squared_difference(ret,self.Critic(S)))
    return l_vf

  def entropy(self,S):
    """Mean entropy of the policy over the batch of states (induces exploration)."""
    ### The actor ends in a softmax, so its output is a probability vector and must be
    ### passed as `probs`, not `logits`.
    probs=self.Actor(S)
    dist=tfp.distributions.Categorical(probs=probs)
    L_S=tf.reduce_mean(dist.entropy())
    return L_S

  def total_loss(self,old_probs,S,A,rets,Adv,c1=0.5,c2=0.001):
    """Combined objective: policy loss + c1 * value loss - c2 * entropy bonus."""
    act_loss=self.actor_loss(S,A,Adv,old_probs)
    crit_loss=self.critic_loss(S,rets)
    entropy_loss=self.entropy(S)

    total_loss=act_loss+c1*crit_loss-c2*entropy_loss
    return total_loss

  def train_on_batch(self,probs,s,a,adv,r):
    """Apply one Adam step on actor and critic for a single mini-batch. Returns 0."""
    variables=self.Actor.trainable_variables+self.Critic.trainable_variables
    with tf.GradientTape() as t:
      ### Calculating loss
      loss=self.total_loss(probs,s,a,r,adv)
    grads=t.gradient(loss,variables)
    #### Applying gradients.
    self.opt.apply_gradients(zip(grads,variables))
    return 0

  def get_episodes(self):
    """Record `steps_in_epi` environment steps with the current policy.

    The environment is reset whenever `done` is reported, so one rollout may span
    several episodes. Only the `done` flag is inspected, so a step at which the
    environment reports `done` for any reason is recorded as terminal.

    Returns:
      Tuple of arrays `(states, actions, rewards, values, next_values, not_done,
      old_log_probs)`, each with one row per recorded step.
    """
    states=[]
    actions=[]
    rewards=[]
    values=[]
    next_values=[]
    not_done=[]
    old_probs=[]

    curr_state=self.env.reset()
    for _ in range(self.steps_in_epi):
      ### Recording fixed number of steps.
      action=self.get_action(curr_state)
      next_state,reward,done,info=self.env.step(action)
      value=self.get_value(curr_state)
      next_value=self.get_value(next_state)
      ### Stored as a plain array so the buffer does not hold eager tensors.
      prob=self.get_prob(curr_state.reshape(1,-1),action).numpy()

      ### logging the essential required values for loss estimation
      states.append(curr_state)
      actions.append([action])
      rewards.append([reward])
      values.append(value)
      next_values.append(next_value)
      not_done.append([not done])
      old_probs.append(prob)

      ### If done, start a fresh episode; otherwise continue from the next state.
      curr_state=self.env.reset() if done else next_state

    return np.array(states),np.array(actions),np.array(rewards),np.array(values),np.array(next_values),np.array(not_done),np.array(old_probs)  ### All logs returned

  def get_gae(self,next_values,values,rewards,not_dones,gamma=0.99,lam=0.95):
    """Compute GAE-based returns (advantage + value) for one rollout.

    Args:
      next_values: value estimate of the state following each step.
      values: value estimate of the state at each step.
      rewards: reward received at each step.
      not_dones: truthy where the step did NOT end an episode; a falsy entry cuts
        both the bootstrap term and the accumulated advantage.
      gamma: discount factor.
      lam: GAE smoothing factor.

    Returns:
      Array of returns in the same order as the inputs.
    """
    gae=0
    returns=[]
    ### Walk backwards through time; append + reverse avoids the quadratic insert(0, ...).
    for step in reversed(range(len(rewards))):
      TD_error_delta = rewards[step] + gamma * next_values[step] * not_dones[step] - values[step]
      gae = TD_error_delta + gamma * lam * not_dones[step] * gae
      returns.append(gae + values[step])
    returns.reverse()

    return np.array(returns)

  def get_experience(self,m):
    """Collect `m` rollouts and concatenate them.

    Returns:
      Tuple of arrays `(states, actions, returns, values, old_log_probs)`.
    """
    states=[]
    actions=[]
    returns=[]
    values=[]
    old_probs=[]

    for _ in range(m):
      s,a,r,v,nv,nd,op=self.get_episodes()
      ret=self.get_gae(nv,v,r,nd)

      ### For each rollout, collecting experience
      old_probs.extend(op)
      states.extend(s)
      actions.extend(a)
      returns.extend(ret)
      values.extend(v)

    return np.array(states),np.array(actions),np.array(returns),np.array(values),np.array(old_probs)

  def test_play(self,n_episodes=5):
    """Play `n_episodes` full episodes with the current (sampling) policy.

    Returns:
      The average total reward per episode.
    """
    overall=0
    for _ in range(n_episodes):
      curr_state=self.env.reset()
      total_reward=0
      done=False
      while not done:
        a=self.get_action(curr_state)
        curr_state,reward,done,info=self.env.step(a)
        total_reward+=reward
      overall+=total_reward
    return overall/n_episodes

  def train(self,batch_size=128):
    """Build fresh actor/critic networks and run the PPO training loop.

    Every 10th iteration the policy is evaluated with `test_play`; training stops
    as soon as the average score reaches `self.target`.

    Args:
      batch_size: mini-batch size; incomplete trailing batches are dropped.
    """
    self.Actor=self.get_actor()
    self.Critic=self.get_critic()
    for i in range(self.iterations):

      s,a,r,v,op=self.get_experience(self.m)
      r=r.astype('float32')
      adv=(r-v).astype('float32')
      ### Small epsilon keeps the normalisation finite when all advantages are equal.
      adv=(adv-adv.mean())/(adv.std()+1e-8)
      total_no_of_samples=len(s)

      ### Same samples, reshuffled and replayed for `epochs` passes.
      dataset=tf.data.Dataset.from_tensor_slices((op,s,a,adv,r)).shuffle(total_no_of_samples).repeat(self.epochs).batch(batch_size,drop_remainder=True)

      for (prob_sample,s_sample,a_sample,adv_sample,r_sample) in dataset:
        self.train_on_batch(prob_sample,s_sample,a_sample,adv_sample,r_sample)

      if i%10==0:
        score=self.test_play()
        print(f"On Iteration {i} scores: {score}")
        ### `>=` rather than `==`: the score is a float average and may overshoot the target.
        if score>=self.target:
          break


  


In [19]:
### Create the CartPole-v0 environment that the agent is trained on.
env=gym.make('CartPole-v0')
### Overwrite the environment's private step-limit attribute with 1000.
### NOTE(review): presumably this raises the per-episode cap of the time-limit wrapper
### (the logged score of 1000.0 suggests so) -- confirm against the installed gym version.
env._max_episode_steps=1000

In [20]:
### Build the PPO agent around the environment; the networks themselves are created in train().
agent=PPO_single_agent(env)

In [21]:
### Run the training loop; a test score is printed every 10 iterations.
agent.train()

On Iteration 0 scores: 58.2
On Iteration 10 scores: 1000.0
