In [None]:
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp

In [None]:
import gym

In [None]:
class PPO_single_agent:
  """PPO (clipped surrogate objective) agent with separate actor and critic networks.

  Written against the classic Gym interface used in this notebook:
  ``env.reset()`` returns an observation and ``env.step(a)`` returns the
  4-tuple ``(next_state, reward, done, info)``. The action space has to be
  discrete because ``env.action_space.n`` is read.
  """

  def __init__(self,env):
    """Store the environment and the training hyper-parameters.

    Args:
      env: environment exposing ``observation_space.shape``,
        ``action_space.n``, ``reset()`` and ``step(action)``.
    """
    self.env=env
    self.state_dimension=env.observation_space.shape   ### Input state dimension
    self.no_of_action=env.action_space.n               ### Number of discrete actions
    self.Actor=None      ### Policy network, created in train()
    self.Critic=None     ### Value network, created in train()
    self.opt=tf.keras.optimizers.Adam(0.0003)   ### Single optimizer shared by both networks
    self.steps_in_epi=512    ### Environment steps recorded per rollout
    self.epochs=5000         ### Maximum number of training iterations
    self.m=8                 ### Rollouts collected per training iteration
    self.target=500          ### Average test score at which training stops

  def _build_mlp(self,output_units,output_activation=None):
    """Build state -> 3 x Dense(128, relu) -> Dense(output_units) as a Keras model."""
    input_layer=tf.keras.layers.Input(shape=self.state_dimension)
    hidden=input_layer
    for _ in range(3):
      hidden=tf.keras.layers.Dense(128,activation="relu")(hidden)
    output_layer=tf.keras.layers.Dense(output_units,activation=output_activation)(hidden)
    return tf.keras.Model(inputs=[input_layer],outputs=[output_layer])

  def get_actor(self):
    """Return a fresh policy network.

    The final layer has one node per action and a softmax activation, so the
    model outputs a probability distribution over actions (not logits).
    """
    return self._build_mlp(self.no_of_action,"softmax")

  def get_critic(self):
    """Return a fresh value network with a single linear output, V(s)."""
    return self._build_mlp(1)

  def action(self,s):
    """Sample one action for a single state.

    Args:
      s: one observation; it is reshaped to a batch of size 1.

    Returns:
      The sampled action index as a scalar.
    """
    s=np.asarray(s).reshape(1,-1)
    out=self.Actor(s)
    action_prob=tfp.distributions.Categorical(probs=out)
    ### The sample has one entry (batch of 1) -> return it as a scalar.
    return action_prob.sample().numpy()[0]

  def prob(self,s,a):
    """Return P[a|s] under the current actor for a batch of states/actions.

    Note: this is the plain probability, not the log-probability.
    """
    out=self.Actor(s)
    action_prob=tfp.distributions.Categorical(probs=out)
    return action_prob.prob(a)

  def actor_loss(self,old_probs,S,A,Adv,epsilon=0.2):
    """Clipped surrogate policy loss (negated so that it can be minimised).

    Args:
      old_probs: probabilities of the taken actions recorded at rollout time.
      S: batch of states.
      A: batch of taken actions.
      Adv: batch of advantages.
      epsilon: clipping range for the importance ratio.
    """
    new_probs=self.prob(S,A)
    ### Inputs arrive as numpy arrays that may be float64; align them with
    ### the network dtype before mixing them with tensors.
    old_probs=tf.cast(old_probs,new_probs.dtype)
    Adv=tf.cast(Adv,new_probs.dtype)

    importance_ratio=tf.divide(new_probs,old_probs)
    surr_1=tf.multiply(importance_ratio,Adv)
    surr_2=tf.multiply(tf.clip_by_value(importance_ratio,1-epsilon,1+epsilon),Adv)
    L_clip=-1*tf.reduce_mean(tf.minimum(surr_1,surr_2))

    return L_clip

  def critic_loss(self,S,ret):
    """Half mean-squared error between the returns and the critic's values."""
    ### The critic outputs one column per state, while `ret` is one value per
    ### sample. Without the squeeze the difference broadcasts to a square
    ### matrix and every return is compared with every prediction.
    values=tf.squeeze(self.Critic(S),axis=-1)
    ret=tf.cast(ret,values.dtype)
    L_vf=0.5*tf.reduce_mean(tf.math.squared_difference(ret,values))
    return L_vf

  def entropy(self,S):
    """Mean entropy of the policy over a batch of states (exploration bonus)."""
    ### The actor ends in a softmax, so its output must be passed as `probs`;
    ### passing it as `logits` would apply a second softmax.
    probs=self.Actor(S)
    dist=tfp.distributions.Categorical(probs=probs)
    L_S=tf.reduce_mean(dist.entropy())
    return L_S

  def total_loss(self,old_probs,S,A,rets,Adv,c1=0.5,c2=0.001):
    """Combine the losses: actor + c1 * critic - c2 * entropy."""
    act_loss=self.actor_loss(old_probs,S,A,Adv)
    crit_loss=self.critic_loss(S,rets)
    entropy_loss=self.entropy(S)

    total_loss=act_loss+c1*crit_loss-c2*entropy_loss
    return total_loss

  def train_on_batch(self,probs,s,a,adv,r):
    """Apply one gradient step on both networks for one mini-batch.

    Args:
      probs: old action probabilities.
      s: states.
      a: actions.
      adv: advantages.
      r: returns (critic targets).

    Returns:
      Always 0.
    """
    variables=self.Actor.trainable_variables+self.Critic.trainable_variables
    with tf.GradientTape() as t:
      ### Note the argument order of total_loss: returns come before advantages.
      loss=self.total_loss(probs,s,a,r,adv)
    grads=t.gradient(loss,variables)
    self.opt.apply_gradients(zip(grads,variables))
    return 0

  def get_episodes(self):
    """Record `steps_in_epi` consecutive environment steps with the current policy.

    The environment is reset whenever an episode ends, so one rollout may
    contain several episodes.

    Returns:
      Lists ``(states, actions, rewards, next_states, not_done)`` of equal length.
    """
    states=[]
    actions=[]
    rewards=[]
    next_states=[]
    not_done=[]

    curr_state=self.env.reset()
    for _ in range(self.steps_in_epi):
      action=self.action(curr_state)
      next_state,reward,done,_=self.env.step(action)

      ### Logging the values required for the loss estimation.
      states.append(curr_state)
      actions.append(action)
      rewards.append(reward)
      next_states.append(next_state)
      not_done.append(not done)

      ### Reset the agent's own environment (not a notebook global) on episode end.
      curr_state=self.env.reset() if done else next_state

    return states,actions,rewards,next_states,not_done

  def get_value_funcs(self,states,next_states):
    """Return the critic's values for `states` and `next_states` as 1D arrays."""
    ### verbose=0 keeps predict() from printing a progress bar on every rollout.
    values=self.Critic.predict(np.array(states),verbose=0).flatten()
    next_values=self.Critic.predict(np.array(next_states),verbose=0).flatten()
    return values,next_values

  def get_old_probs(self,states,actions):
    """Return the current actor's probabilities of the taken actions as a 1D array."""
    probs=self.prob(np.array(states),np.array(actions)).numpy()
    return np.array(probs.flatten())

  def get_gae(self,next_values,values,rewards,not_dones,gamma=0.99,lam=0.95):
    """Compute GAE-based returns (advantage estimate + value) for one rollout.

    Args:
      next_values: critic values of the next states.
      values: critic values of the states.
      rewards: rewards per step.
      not_dones: per-step flag, falsy where the episode ended on that step.
      gamma: discount factor.
      lam: GAE lambda.

    Returns:
      Array with one return per step, in chronological order.
    """
    gae=0
    returns=[]
    ### Walk backwards so each step can reuse the accumulated advantage of the
    ### following step; `not_dones` cuts the recursion at episode boundaries.
    for step in reversed(range(len(rewards))):
      TD_error_delta=rewards[step]+gamma*next_values[step]*not_dones[step]-values[step]
      gae=TD_error_delta+gamma*lam*not_dones[step]*gae
      returns.append(gae+values[step])
    ### Appending and reversing once avoids a quadratic number of front insertions.
    returns.reverse()

    return np.array(returns)

  def get_experience(self,m):
    """Collect `m` rollouts and concatenate their training data.

    Returns:
      Arrays ``(states, actions, returns, values, old_probs)``.
    """
    states=[]
    actions=[]
    returns=[]
    values=[]
    old_probs=[]

    for _ in range(m):
      S,A,R,Ns,Nd=self.get_episodes()
      vals,next_vals=self.get_value_funcs(S,Ns)
      old_prob=self.get_old_probs(S,A)
      rets=self.get_gae(next_vals,vals,R,Nd)

      old_probs.extend(old_prob)
      states.extend(S)
      actions.extend(A)
      returns.extend(rets)
      values.extend(vals)

    return np.array(states),np.array(actions),np.array(returns),np.array(values),np.array(old_probs)

  def test_play(self,episodes=5):
    """Play full episodes with the current weights and return the mean total reward.

    Args:
      episodes: number of evaluation episodes to average over.
    """
    overall=0
    for _ in range(episodes):
      curr_state=self.env.reset()
      total_reward=0
      done=False
      while not done:
        a=self.action(curr_state)   ### action() reshapes the state itself
        curr_state,reward,done,_=self.env.step(a)
        total_reward+=reward
      overall+=total_reward
    return overall/episodes

  def train(self,batch_size=128):
    """Run the PPO training loop.

    Fresh actor and critic networks are created on every call. Every 10th
    iteration the agent is evaluated with test_play(); training stops early
    once the score reaches `self.target`.

    Args:
      batch_size: mini-batch size; incomplete trailing batches are dropped.
    """
    self.Actor=self.get_actor()
    self.Critic=self.get_critic()

    for i in range(self.epochs):
      s,a,r,v,op=self.get_experience(self.m)
      adv=r-v
      ### Normalizing the advantages; the epsilon guards against a zero std.
      adv=(adv-adv.mean())/(adv.std()+1e-8)
      total_no_of_samples=len(s)

      dataset=(tf.data.Dataset.from_tensor_slices((op,s,a,adv,r))
               .shuffle(total_no_of_samples)
               .batch(batch_size,drop_remainder=True))

      for prob_sample,s_sample,a_sample,adv_sample,r_sample in dataset.as_numpy_iterator():
        self.train_on_batch(prob_sample,s_sample,a_sample,adv_sample,r_sample)

      if i%10==0:
        score=self.test_play()
        print(f"On Iteration {i} scores: {score}")
        ### `>=` instead of `==`: the score is a float average.
        if score>=self.target:
          break

  


In [None]:
# Create the Gym environment the agent is trained on.
# NOTE(review): `_max_episode_steps` is a private attribute; presumably it is the
# time-limit wrapper's per-episode step cap, raised to 500 so that an episode can
# reach a total reward of 500 — confirm against the installed gym version.
env=gym.make('CartPole-v0')
env._max_episode_steps=500

In [None]:
# Wrap the environment in a PPO agent; the actor/critic networks are only built inside train().
agent=PPO_single_agent(env)

In [None]:
# Run the training loop with the default batch size; a test score is printed every 10 iterations.
agent.train()

On Iteration 0 scores: 29.6
On Iteration 10 scores: 192.2
On Iteration 20 scores: 326.6
On Iteration 30 scores: 379.4
On Iteration 40 scores: 472.4
