This module contains the implementation of the PPO algorithm.
Ci basiamo sullo pseudocodice presente sul sito di OpenAI per la realizzazione del ppo.
https://spinningup.openai.com/en/latest/algorithms/ppo.html#id7
Utilizzando un Actor-Critic Method. 
Ciò suddivide l'implementazione in 8 passi principali:
1. Inizializzazione dell'ambiente con policy parameters theta_0, e l'inizial value function parameters w_0.
2. Ciclare per k iterazioni
3. Raccogliere un set di traiettorie D_k = {τ_i} con una policy pi_k = pi(theta_k)
4. Calcolare i reward-to-go R_t 
5. Calcolare gli advantage estimates A_t basandoci sulla value function V_{w_k}
6. Aggiornare la policy massimizzando la PPO-Clip objective (Gradient ascent con adam) . Non scriverò la formula che è complessa
7. Aggiornare la value function minimizzando la MSE tra V_{w_k} e R_t (Gradient descent con adam)
8. Fine ciclo.

Implementiamo tutti i passi nella funzione learn.

In [4]:
import warnings 
warnings.filterwarnings('ignore') #ignora warnings

from rete import ReteNeurale
import tensorflow as tf
import tensorflow_probability as tfp
import gym
import numpy as np
from tensorflow import keras
import random
import math


In [None]:
class PPO:
    def learn(self,env):
        #Passo 1 --> Inizializzazione dell'ambiente con policy parameters theta_0, e l'inizial value function parameters w_0.
        #Dobbiamo creare una rete neurale per la policy e per la value function. 
        self.env=env
        self.nAzioni=env.action_space.n
        self.nStati=env.observation_space.shape
        self.listaAzioni=list(range(env.action_space.n))

        print("N STATI ENV:",self.nStati)
        print("N AZIONI ENV:",self.nAzioni)
        #self.stepsPerEpisode=2048 Per produzione
        #self.episodesPerBatch=8 per produzione
        #self.nEpoche=200 per produzione.
        self.stepsPerEpisode=512
        self.episodesPerBatch=4
        self.nEpoche=10
        
        self.gamma=0.95
        self.epsilon=0.2
        self.nUpdatesPerIteration=10
        self.cov_mat=tf.linalg.diag(tf.fill([self.nAzioni], 0.5))
        self.policyNN=ReteNeurale(self.nStati,self.nAzioni,softmax=True) #Actor
        self.valueNN=ReteNeurale(self.nStati,1,False) #Critic
        self.policy_optimizer=keras.optimizers.Adam(learning_rate=0.0005)
        self.value_optimizer=keras.optimizers.Adam(learning_rate=0.0005)
        self.policyNN.compile(optimizer=self.policy_optimizer)
        self.valueNN.compile(optimizer=self.value_optimizer)
        #passo 2 ciclare per k iterazioni.
        for k in range(self.nEpoche):
            states, actions, rewards, rewards_to_go, log_probs =self.collect_trajectories()
            V,latest_log_probs=self.evaluate(states,actions)
            advantage=self.calcAdvantages(rewards_to_go,V)

            with tf.GradientTape(persistent=True) as tape:
                _,latest_log_probs=self.evaluate(states,actions)
                print("log_probs is tensor:", isinstance(log_probs, tf.Tensor))
                print("advantage is tensor:", isinstance(advantage, tf.Tensor))
                print("rewards_to_go is tensor:", isinstance(rewards_to_go, tf.Tensor))
                print("V is tensor:", isinstance(V, tf.Tensor))


                surrogated_loss_1, surrogated_loss_2=self.calcSurrogatedLoss(log_probs,latest_log_probs,advantage)
                policy_loss = -tf.reduce_mean(tf.minimum(surrogated_loss_1, surrogated_loss_2))
                value_loss=tf.reduce_mean(tf.square(rewards_to_go-V)) #MSE tra rewards to go e V
                print("Policy Loss:", policy_loss)
                print("Value Loss:", value_loss)
            gradientsPolicy = tape.gradient(policy_loss, self.policyNN.trainable_variables)
                
                
                 # Debug: Controlla se i gradienti sono None
            print("Gradients for policy:", gradientsPolicy)
                
                # Verifica che i gradienti non siano None
            if gradientsPolicy and all(grad is not None for grad in gradientsPolicy):
                self.policy_optimizer.apply_gradients(zip(gradientsPolicy, self.policyNN.trainable_variables))
            else:
                print("Policy gradients are None!")

            gradientsValue = tape.gradient(value_loss, self.valueNN.trainable_variables)
            print("Gradients for value:", gradientsValue)
            # Verifica che i gradienti non siano None
            if gradientsValue and all(grad is not None for grad in gradientsValue):
                self.value_optimizer.apply_gradients(zip(gradientsValue, self.valueNN.trainable_variables))
            else:
                print("Value gradients are None!")

                print("EPOCA:",k," POLICY LOSS:",policy_loss," VALUE LOSS:",value_loss)                 

    
    def collect_trajectories(self):
        #Passo 3 --> Raccogliere un set di traiettorie D_k = {τ_i} con una policy pi_k = pi(theta_k)
        #Dobbiamo raccogliere un set di traiettorie e per fare ciò dobbiamo raccogliere: stati, azioni, rewards, rewards to go, log_prob delle azioni.
        batch={
            'states':[],
            'actions':[],
            'rewards':[],
            'rewards_to_go':[],
            'log_probs':[],
        }
        stato = self.env.reset()
        done = False
        #Abbiamo un fisso di 8 episodi per batch con 2048 steps per episodio
        for i in range(self.episodesPerBatch):
            rewardPerEpisode=[]
            for j in range(self.stepsPerEpisode):
                batch['states'].append(stato)
                azione,log_prob=self.getAction(stato)
                #azione sarà un int, mentre log_prob sarà il logaritmo della probabilità dell'azione
                batch['actions'].append(azione)
                batch['log_probs'].append(log_prob)
                stato, reward, done, info = self.env.step(azione)
                #info non usata.
                rewardPerEpisode.append(reward)
                #if done:
                #    break #Ha raggiunto il termine dell'episodio.
            batch['rewards'].append(rewardPerEpisode)
        #Calcoliamo i rewards to go --> PASSO 4
        batch['rewards_to_go']=self.calcRTG(batch['rewards'])
        #return batch states, actions, rewards, rewards to go, log_probs

        batch_statiTensor=tf.convert_to_tensor(batch['states'],dtype=tf.uint8)
        batch_azioniTensor=tf.convert_to_tensor(batch['actions'],dtype=tf.int32)
        batch_rewardsTensor=tf.convert_to_tensor(batch['rewards'],dtype=tf.float32)
        batch_rewards_to_goTensor=tf.convert_to_tensor(batch['rewards_to_go'],dtype=tf.float32)
        batch_log_probsTensor=tf.convert_to_tensor(batch['log_probs'],dtype=tf.float32)


        return batch_statiTensor, batch_azioniTensor,batch_rewardsTensor,batch_rewards_to_goTensor,batch_log_probsTensor
                
    def getAction(self,stato):
        stato= np.expand_dims(stato, axis=0)  # Diventa (1, 64, 64, 3)
        stato=tf.convert_to_tensor(stato,dtype=tf.float32)
        azione_pred=self.policyNN(stato)
        #print last column values softmax        
        
        #dist=tfp.distributions.Categorical(probs=azione_pred)
        # azionePresa=dist.sample()
        azionePresa=random.choices(self.listaAzioni, weights=tf.squeeze(azione_pred), k=1)[0]
        #dist=tfp.distributions.MultivariateNormalTriL(loc=azione_prob, scale_tril=tf.linalg.cholesky(self.cov_mat)) 
        #azionePresa=dist.sample()
        #log_prob=dist.log_prob(azionePresa)
        
        log_prob=tf.math.log(azione_pred[0][azionePresa])
        return tf.squeeze(azionePresa), log_prob

    def calcRTG(self,rewards):
        print("CALC REWARDS TO GO")
        print(rewards)
        #Prendo la formula per calcolare i rewards to go e richiede i cumulative rewards e un fattore di sconto.
        rtg=[]
        for episode_reward in reversed(rewards):
            cumulative_reward=0
            for single_reward in reversed(episode_reward):
                cumulative_reward=single_reward+cumulative_reward*self.gamma
                rtg.append(cumulative_reward)
        return tf.convert_to_tensor(rtg,dtype=tf.float32)

    def calcAdvantages(self, rtg,values):
        advantages=rtg-tf.stop_gradient(values)
        return (advantages - tf.reduce_mean(advantages)) / (tf.math.reduce_std(advantages) + 1e-10)
    
    def calcSurrogatedLoss(self,log_probs_old, log_probs_new, advantages):
        advantages = tf.stop_gradient(advantages)
        policy_ratio = tf.exp(log_probs_old - log_probs_new)
        surrogated_loss_1 = policy_ratio * advantages
        surrogated_loss_2 = tf.clip_by_value(policy_ratio, clip_value_min=1.0-self.epsilon, clip_value_max=1.0+self.epsilon) * advantages
        return surrogated_loss_1, surrogated_loss_2
    
    def evaluate(self, batch_states,batch_actions):
        batch_states=tf.cast(batch_states, tf.float32)
        retVal=self.valueNN(batch_states)
        V= tf.squeeze(retVal)
        mean=self.policyNN(batch_states)
        dist=tfp.distributions.Categorical(probs=mean)
        log_probs=dist.log_prob(batch_actions)
        return V, log_probs



In [6]:
# Configurazione ed esecuzione
env = gym.make('procgen:procgen-coinrun-v0',distribution_mode='easy', start_level=0, num_levels=1)
ppo_model=PPO()
ppo_model.learn(env)

N STATI ENV: (64, 64, 3)
N AZIONI ENV: 15
AZIONE PRED: tf.Tensor(
[[1.2067530e-11 1.1340640e-10 3.0913321e-21 1.5011385e-07 1.5926174e-06
  9.2075989e-13 1.1562271e-20 3.4746024e-22 3.4785750e-03 3.4611369e-10
  5.8165565e-22 9.9651968e-01 1.3500002e-18 9.5670185e-27 8.6524734e-14]], shape=(1, 15), dtype=float32)
AZIONE PRESA LIST: [11]
Azione presa primo val:  11
LOG PROB: tf.Tensor(-0.0034863856, shape=(), dtype=float32)
AZIONE PRED: tf.Tensor(
[[1.2879944e-11 5.2041910e-10 3.0433639e-21 2.3656295e-07 1.8197314e-06
  8.5957765e-13 3.3186990e-20 5.1047479e-22 1.4503877e-02 1.8922390e-10
  7.0536873e-22 9.8549402e-01 8.2143684e-18 4.6557038e-27 3.1222700e-13]], shape=(1, 15), dtype=float32)
AZIONE PRESA LIST: [11]
Azione presa primo val:  11
LOG PROB: tf.Tensor(-0.014612223, shape=(), dtype=float32)
AZIONE PRED: tf.Tensor(
[[1.3412329e-11 5.0810894e-10 2.7894647e-21 1.8809405e-07 2.3024600e-06
  1.0525906e-12 3.3067527e-20 5.5269998e-22 1.3229559e-02 1.8361745e-10
  7.8793396e-22 9.867

2024-12-14 14:09:48.403722: W external/local_xla/xla/tsl/framework/bfc_allocator.cc:306] Allocator (GPU_0_bfc) ran out of memory trying to allocate 977.00MiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.
W0000 00:00:1734181788.403746    6213 gpu_utils.cc:68] Failed to allocate memory for convolution redzone checking; skipping this check. This is benign and only means that we won't check cudnn for out-of-bounds reads and writes. This message will only be printed once.
2024-12-14 14:09:48.591150: W external/local_xla/xla/tsl/framework/bfc_allocator.cc:306] Allocator (GPU_0_bfc) ran out of memory trying to allocate 826.84MiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.
2024-12-14 14:09:48.712278: W external/local_xla/xla/tsl/framework/bfc_allocator.cc:306] Allocator

ResourceExhaustedError: Exception encountered when calling Conv2D.call().

[1m{{function_node __wrapped__Conv2D_device_/job:localhost/replica:0/task:0/device:GPU:0}} OOM when allocating tensor with shape[2048,62,62,32] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:Conv2D][0m

Arguments received by Conv2D.call():
  • inputs=tf.Tensor(shape=(2048, 64, 64, 3), dtype=float32)