This module contains the implementation of the PPO algorithm.
Ci basiamo sullo pseudocodice presente sul sito di OpenAI per la realizzazione del ppo.
https://spinningup.openai.com/en/latest/algorithms/ppo.html#id7
Utilizzando un Actor-Critic Method.
Ciò suddivide l'implementazione in 8 passi principali:
1. Inizializzazione dell'ambiente con policy parameters theta_0, e l'inizial value function parameters w_0.
2. Ciclare per k iterazioni
3. Raccogliere un set di traiettorie D_k = {τ_i} con una policy pi_k = pi(theta_k)
4. Calcolare i reward-to-go R_t
5. Calcolare gli advantage estimates A_t basandoci sulla value function V_{w_k}
6. Aggiornare la policy massimizzando la PPO-Clip objective (Gradient ascent con adam) . Non scriverò la formula che è complessa
7. Aggiornare la value function minimizzando la MSE tra V_{w_k} e R_t (Gradient descent con adam)
8. Fine ciclo.

Implementiamo tutti i passi nella funzione learn.

In [9]:
import warnings
warnings.filterwarnings('ignore') #ignora warnings
#Check if colab is used:
try:
  import google.colab
  IN_COLAB = True
except:
  IN_COLAB = False
  print("Not running on CoLab")
if IN_COLAB:
  !pip install procgen
  !pip install tensorflow_probability
  !pip install numpy
from rete import ReteNeurale
import tensorflow as tf
import tensorflow_probability as tfp
import gym
import numpy as np
from tensorflow import keras
import random
import math




In [19]:
class PPO:
    def learn(self,env):
        #Passo 1 --> Inizializzazione dell'ambiente con policy parameters theta_0, e l'inizial value function parameters w_0.
        #Dobbiamo creare una rete neurale per la policy e per la value function.
        self.env=env
        self.nAzioni=env.action_space.n
        self.nStati=env.observation_space.shape
        self.listaAzioni=list(range(env.action_space.n))

        print("N STATI ENV:",self.nStati)
        print("N AZIONI ENV:",self.nAzioni)
        #self.stepsPerEpisode=2048 Per produzione
        #self.episodesPerBatch=8 per produzione
        #self.nEpoche=200 per produzione.
        self.stepsPerEpisode=512
        self.episodesPerBatch=1
        self.nEpoche=10

        self.gamma=0.95
        self.epsilon=0.2
        self.nUpdatesPerIteration=10
        self.cov_mat=tf.linalg.diag(tf.fill([self.nAzioni], 0.5))
        self.policyNN=ReteNeurale(self.nStati,self.nAzioni,softmax=True) #Actor
        self.valueNN=ReteNeurale(self.nStati,1,False) #Critic
        self.policy_optimizer=keras.optimizers.Adam(learning_rate=0.0005)
        self.value_optimizer=keras.optimizers.Adam(learning_rate=0.0005)
        self.policyNN.compile(optimizer=self.policy_optimizer)
        self.valueNN.compile(optimizer=self.value_optimizer)
        #passo 2 ciclare per k iterazioni.
        for k in range(self.nEpoche):
            states, actions, rewards, rewards_to_go, log_probs =self.collect_trajectories()
            print("Trajectories collected")


            num_samples=states.shape
            print("numSamples_:",num_samples)
            num_samples=num_samples[0]
            batch_size=64 #Faccio calcoli con mini-batches perchè altrimenti vado in Run out of memory fisso.
            for i in range(0,num_samples, batch_size):
              print("Batches")
              batch_states=states[i:i+batch_size]
              batch_actions=actions[i:i+batch_size]
              batch_rewards=rewards[i:i+batch_size]
              batch_rewards_to_go=rewards_to_go[i:i+batch_size]
              batch_log_probs=log_probs[i:i+batch_size]


              V,latest_log_probs=self.evaluate(batch_states,batch_actions)
              advantage=self.calcAdvantages(batch_rewards_to_go,V)
              print("Advantages calculated")
              std_advantages=tf.math.reduce_std(advantage)
              mean_advantages=tf.math.reduce_mean(advantage)
              print("Mean and std of advantages:",mean_advantages.numpy(),std_advantages.numpy())

              with tf.GradientTape(persistent=True) as tape:
                  _,latest_log_probs=self.evaluate(batch_states,batch_actions)
                  print("log_probs is tensor:", isinstance(batch_log_probs, tf.Tensor))
                  print("advantage is tensor:", isinstance(advantage, tf.Tensor))
                  print("rewards_to_go is tensor:", isinstance(batch_rewards_to_go, tf.Tensor))
                  print("V is tensor:", isinstance(V, tf.Tensor))


                  surrogated_loss_1, surrogated_loss_2=self.calcSurrogatedLoss(batch_log_probs,latest_log_probs,advantage)
                  policy_loss = -tf.reduce_mean(tf.minimum(surrogated_loss_1, surrogated_loss_2))
                  value_loss=tf.reduce_mean(tf.square(batch_rewards_to_go-V)) #MSE tra rewards to go e V
                  print("Policy Loss:", policy_loss)
                  print("Value Loss:", value_loss)
              gradientsPolicy = tape.gradient(policy_loss, self.policyNN.trainable_variables)


              # Debug: Controlla se i gradienti sono None
              print("Gradients for policy:", gradientsPolicy)

              # Verifica che i gradienti non siano None
              if gradientsPolicy and all(grad is not None for grad in gradientsPolicy):
                  self.policy_optimizer.apply_gradients(zip(gradientsPolicy, self.policyNN.trainable_variables))
              else:
                  print("Policy gradients are None!")

              gradientsValue = tape.gradient(value_loss, self.valueNN.trainable_variables)
              print("Gradients for value:", gradientsValue)
              # Verifica che i gradienti non siano None
              if gradientsValue and all(grad is not None for grad in gradientsValue):
                  self.value_optimizer.apply_gradients(zip(gradientsValue, self.valueNN.trainable_variables))
              else:
                  print("Value gradients are None!")

              print("EPOCA:",k," POLICY LOSS:",policy_loss," VALUE LOSS:",value_loss)


    def collect_trajectories(self):
        #Passo 3 --> Raccogliere un set di traiettorie D_k = {τ_i} con una policy pi_k = pi(theta_k)
        #Dobbiamo raccogliere un set di traiettorie e per fare ciò dobbiamo raccogliere: stati, azioni, rewards, rewards to go, log_prob delle azioni.
        batch={
            'states':[],
            'actions':[],
            'rewards':[],
            'rewards_to_go':[],
            'log_probs':[],
        }
        stato = self.env.reset()
        done = False
        #Abbiamo un fisso di 8 episodi per batch con 2048 steps per episodio
        for i in range(self.episodesPerBatch):
            rewardPerEpisode=[]
            print("episode: ",i)
            for j in range(self.stepsPerEpisode):
                batch['states'].append(stato)
                azione,log_prob=self.getAction(stato)
                #azione sarà un int, mentre log_prob sarà il logaritmo della probabilità dell'azione
                batch['actions'].append(azione)
                batch['log_probs'].append(log_prob)
                stato, reward, done, info = self.env.step(azione)
                #info non usata.
                rewardPerEpisode.append(reward)
                #if done:
                #    break #Ha raggiunto il termine dell'episodio.
            batch['rewards'].append(rewardPerEpisode)
        #Calcoliamo i rewards to go --> PASSO 4
        batch['rewards_to_go']=self.calcRTG(batch['rewards'])
        #return batch states, actions, rewards, rewards to go, log_probs

        batch_statiTensor=tf.convert_to_tensor(batch['states'],dtype=tf.uint8)
        batch_azioniTensor=tf.convert_to_tensor(batch['actions'],dtype=tf.int32)
        batch_rewardsTensor=tf.convert_to_tensor(batch['rewards'],dtype=tf.float32)
        batch_rewards_to_goTensor=tf.convert_to_tensor(batch['rewards_to_go'],dtype=tf.float32)
        batch_log_probsTensor=tf.convert_to_tensor(batch['log_probs'],dtype=tf.float32)


        return batch_statiTensor, batch_azioniTensor,batch_rewardsTensor,batch_rewards_to_goTensor,batch_log_probsTensor

    def getAction(self,stato):
        stato= np.expand_dims(stato, axis=0)  # Diventa (1, 64, 64, 3)
        stato=tf.convert_to_tensor(stato,dtype=tf.float32)
        azione_pred=self.policyNN(stato)
        #print last column values softmax

        #dist=tfp.distributions.Categorical(probs=azione_pred)
        # azionePresa=dist.sample()
        azionePresa=random.choices(self.listaAzioni, weights=tf.squeeze(azione_pred), k=1)[0]
        #dist=tfp.distributions.MultivariateNormalTriL(loc=azione_prob, scale_tril=tf.linalg.cholesky(self.cov_mat))
        #azionePresa=dist.sample()
        #log_prob=dist.log_prob(azionePresa)

        log_prob=tf.math.log(azione_pred[0][azionePresa]+ 1e-10) #Aggiungo un 1e-10 per evitare problemi nel calcolo del gradiente
        return tf.squeeze(azionePresa), log_prob

    def calcRTG(self,rewards):
        print("CALC REWARDS TO GO")
        print(rewards)
        #Prendo la formula per calcolare i rewards to go e richiede i cumulative rewards e un fattore di sconto.
        rtg=[]
        for episode_reward in reversed(rewards):
            cumulative_reward=0
            for single_reward in reversed(episode_reward):
                cumulative_reward=single_reward+cumulative_reward*self.gamma
                rtg.append(cumulative_reward)
        return tf.convert_to_tensor(rtg,dtype=tf.float32)

    def calcAdvantages(self, rtg,values):
        advantages=rtg-tf.stop_gradient(values)
        return (advantages - tf.reduce_mean(advantages)) / (tf.math.reduce_std(advantages) + 1e-10)

    def calcSurrogatedLoss(self,log_probs_old, log_probs_new, advantages):
        advantages = tf.stop_gradient(advantages)
        policy_ratio = tf.exp(log_probs_old - log_probs_new)
        surrogated_loss_1 = policy_ratio * advantages
        surrogated_loss_2 = tf.clip_by_value(policy_ratio, clip_value_min=1.0-self.epsilon, clip_value_max=1.0+self.epsilon) * advantages
        return surrogated_loss_1, surrogated_loss_2

    def evaluate(self, batch_states,batch_actions):
        batch_states=tf.cast(batch_states, tf.float32)
        retVal=self.valueNN(batch_states)
        V= tf.squeeze(retVal)
        mean=self.policyNN(batch_states)
        dist=tfp.distributions.Categorical(probs=mean)
        log_probs=dist.log_prob(batch_actions)
        return V, log_probs



In [21]:
# Configurazione ed esecuzione
env = gym.make('procgen:procgen-coinrun-v0',distribution_mode='easy', start_level=0, num_levels=1)
ppo_model=PPO()
ppo_model.learn(env)

N STATI ENV: (64, 64, 3)
N AZIONI ENV: 15
episode:  0
CALC REWARDS TO GO
[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 

ValueError: Total of weights must be finite