This module contains the implementation of the PPO algorithm.
Per la realizzazione di PPO ci basiamo sullo pseudocodice presente sul sito OpenAI Spinning Up:
https://spinningup.openai.com/en/latest/algorithms/ppo.html#id7
L'implementazione utilizza un metodo Actor-Critic.
Ciò suddivide l'implementazione in 8 passi principali:
1. Inizializzazione dei parametri della policy theta_0 e dei parametri iniziali della value function w_0.
2. Ciclare per k iterazioni
3. Raccogliere un set di traiettorie D_k = {τ_i} con una policy pi_k = pi(theta_k)
4. Calcolare i reward-to-go R_t
5. Calcolare gli advantage estimates A_t basandoci sulla value function V_{w_k}
6. Aggiornare la policy massimizzando la PPO-Clip objective (gradient ascent con Adam). La formula, piuttosto complessa, non è riportata qui: si veda la pagina linkata sopra.
7. Aggiornare la value function minimizzando l'MSE tra V_{w_k} e R_t (gradient descent con Adam).
8. Fine ciclo.

Tutti i passi sono orchestrati dal metodo learn della classe PPO, che si appoggia agli altri metodi della classe.

In [9]:
import warnings
warnings.filterwarnings('ignore') #ignora warnings
#Check if colab is used:
try:
  import google.colab
  IN_COLAB = True
except:
  IN_COLAB = False
  print("Not running on CoLab")
if IN_COLAB:
  !pip install procgen
  !pip install tensorflow_probability
  !pip install numpy
from rete import ReteNeurale
import tensorflow as tf
import tensorflow_probability as tfp
import gym
import numpy as np
from tensorflow import keras
import math
import glfw


Not running on CoLab


In [10]:
#!pip install moviepy
#!pip install IPython
#!pip install pyvirtualdisplay
from moviepy import ImageSequenceClip
from IPython.display import Video
import os
from pyvirtualdisplay.smartdisplay import SmartDisplay
# Create and start a 1400x900 pyvirtualdisplay SmartDisplay with visible=0
# (presumably an off-screen display so rendering works without a physical
# screen -- TODO confirm) using /tmp as its framebuffer directory.
# NOTE: this binds the module-level name `display`, which shadows
# IPython's built-in `display()` helper for the rest of the notebook.
display = SmartDisplay(visible=0, size=(1400, 900),fbdir='/tmp')
display.start()
# Initialise GLFW; the return value of glfw.init() is not checked here.
glfw.init()
# Despite its name, `available_fbconfigs` holds the video modes of the
# primary monitor (the cell output shows GLFWvidmode entries).
available_fbconfigs = glfw.get_video_modes(glfw.get_primary_monitor())
print(available_fbconfigs)
# Select the 'osmesa' platform for PyOpenGL.
# NOTE(review): presumably this must be set before PyOpenGL is first
# imported to take effect -- confirm against the import order.
os.environ['PYOPENGL_PLATFORM'] = 'osmesa'




[GLFWvidmode(size=Size(width=1400, height=900), bits=Bits(red=8, green=8, blue=8), refresh_rate=0)]


In [None]:
class PPO:
    """PPO-Clip agent for a gym-style environment with discrete actions.

    A single ``ReteNeurale`` network is used as both actor and critic: when
    called on a batch of states it returns two outputs, which this class
    treats as (action probabilities, state values). Both heads are trained
    together with one Adam optimizer on the combined loss
    ``policy_loss + 0.5 * value_loss - entropyCoefficient * entropy``.
    """

    def __init__(self,env):
        """Store the environment, the hyper-parameters and build the network.

        Args:
            env: environment exposing ``action_space.n``,
                ``observation_space.shape``, ``reset()`` and ``step()``.
        """
        self.env = env
        self.nAzioni = env.action_space.n
        self.nStati = env.observation_space.shape
        self.listaAzioni = list(range(self.nAzioni))
        self.episodesPerBatch = 10   # episodes collected per training iteration
        self.nEpoche = 4             # collect/update iterations run by learn()
        self.stepsPerEpisode = 512   # maximum number of steps per collected episode
        self.gamma = 0.99            # discount factor
        self.epsilon = 0.2           # PPO clipping range for the policy ratio
        self.learningRate = 5e-3
        self.policyNN = ReteNeurale(self.nStati, self.nAzioni)  # actor-critic network
        self.policy_optimizer = keras.optimizers.Adam(learning_rate=self.learningRate)
        self.policyNN.compile(optimizer=self.policy_optimizer)
        self.entropyCoefficient = 0.01  # weight of the entropy bonus (encourages exploration)
        self.lambdaGAE = 0.95           # lambda used by calcGAE

    def learn(self,env):
        """Run ``self.nEpoche`` iterations of rollout collection and PPO updates.

        Each iteration collects a fresh batch of trajectories, performs one
        pass of mini-batch gradient updates over it and then prints the
        average greedy-policy reward via ``evaluate_policy``.

        Args:
            env: unused; kept for backward compatibility. Rollouts always use
                ``self.env``.
        """
        # Mini-batches keep memory usage bounded (full-batch updates ran out
        # of memory).
        batch_size = 32
        for k in range(self.nEpoche):
            states, actions, rewards_to_go, log_probs, _ = self.collect_trajectories(k)

            num_samples = states.shape[0]
            for start in range(0, num_samples, batch_size):
                end = start + batch_size
                batch_states = states[start:end]
                batch_actions = actions[start:end]
                batch_rewards_to_go = rewards_to_go[start:end]
                batch_log_probs = log_probs[start:end]

                with tf.GradientTape() as tape:
                    # The forward pass must happen inside the tape: V has to
                    # be recorded, otherwise value_loss has no gradient and
                    # the critic is never trained.
                    V, latest_log_probs, probs = self.evaluate(batch_states, batch_actions)
                    # calcAdvantages stops the gradient through V, so the
                    # policy term does not back-propagate into the critic.
                    advantage = self.calcAdvantages(batch_rewards_to_go, V)
                    policy_loss = self.getPolicyLoss(batch_log_probs, latest_log_probs, advantage)
                    # MSE between the rewards-to-go and the predicted values.
                    value_loss = tf.reduce_mean(tf.square(batch_rewards_to_go - V))
                    # Entropy bonus to encourage exploration.
                    # NOTE(review): reduce_mean averages over the action axis
                    # too, so this is the per-state entropy divided by the
                    # number of actions; kept as-is so the effective
                    # coefficient does not change.
                    entropy = -tf.reduce_mean(probs * tf.math.log(probs + 1e-10))
                    total_loss = policy_loss + value_loss*0.5 - entropy*self.entropyCoefficient
                gradients = tape.gradient(total_loss, self.policyNN.trainable_variables)
                self.policy_optimizer.apply_gradients(zip(gradients, self.policyNN.trainable_variables))
                print("EPOCA:",k," TOTAL LOSS:",total_loss," POLICY LOSS:",policy_loss," VALUE LOSS:",value_loss," ENTROPY:",entropy,)
            self.evaluate_policy()

    def evaluate_policy(self, episodes=10):
        """Play ``episodes`` full episodes greedily and print the mean return.

        The action with the highest probability is chosen at every step
        (no sampling). Each episode runs until the environment reports
        ``done``.

        Args:
            episodes: number of evaluation episodes to play.
        """
        total_rewards = []
        for _ in range(episodes):
            state = self.env.reset()
            done = False
            cumulative_reward = 0
            while not done:
                state_tensor = tf.convert_to_tensor(state, dtype=tf.float32)
                state_tensor = tf.expand_dims(state_tensor, axis=0)  # add batch axis
                probs, _ = self.policyNN(state_tensor)
                action = np.argmax(probs.numpy())
                state, reward, done, info = self.env.step(action)
                cumulative_reward += reward
            total_rewards.append(cumulative_reward)
        print(f"Average Reward: {np.mean(total_rewards):.2f}")

    def collect_trajectories(self,epoca):
        """Collect ``self.episodesPerBatch`` rollouts with the current policy.

        Each rollout lasts at most ``self.stepsPerEpisode`` steps. The
        environment is reset only when the previous rollout ended with
        ``done``; a rollout cut by the step limit is continued by the next
        one from the same environment state. The frames of every rollout are
        written to ``coinrun_video<i>.mp4`` (``i`` is the rollout index, so
        the files are overwritten on every call).

        Args:
            epoca: index of the current training iteration (unused).

        Returns:
            Tuple ``(states, actions, rewards_to_go, log_probs, epLen)``:
            the first four are tensors with one entry per collected step
            (uint8, int32, float32, float32), ``epLen`` is the list of
            rollout lengths.
        """
        batch = {
            'states': [],
            'actions': [],
            'rewards': [],
            'rewards_to_go': [],
            'log_probs': [],
            'epLen': [],
        }
        done = False
        stato = self.env.reset()
        for i in range(self.episodesPerBatch):
            if done:
                stato = self.env.reset()
                done = False
            rewardPerEpisode = []
            frames = []
            print("----------------------------\nEpisode: ",i)
            for j in range(self.stepsPerEpisode):
                batch['states'].append(stato)
                # azione is the sampled action, log_prob its log-probability
                # under the policy that generated it.
                azione, log_prob = self.getAction(stato)
                batch['actions'].append(azione)
                batch['log_probs'].append(log_prob)
                stato, reward, done, info = self.env.step(azione)  # info is not used
                rewardPerEpisode.append(reward)
                frames.append(stato)

                if done:
                    break  # the episode terminated

            batch['epLen'].append(j+1)
            batch['rewards'].append(rewardPerEpisode)
            clip = ImageSequenceClip(frames, fps=15)
            nameVideo = "coinrun_video"+str(i)+".mp4"
            clip.write_videofile(nameVideo, fps=15,logger=None)
            # NOTE(review): the Video object is created but never displayed;
            # it only renders when it is the last expression of a cell.
            Video(nameVideo)

        # Step 4 of the algorithm: rewards-to-go for every collected step.
        batch['rewards_to_go'] = self.calcRTG(batch['rewards'])
        # Stack the frames with NumPy first: converting a Python list of
        # arrays directly to a tensor is much slower.
        batch_statiTensor = tf.convert_to_tensor(np.asarray(batch['states']), dtype=tf.uint8)
        batch_azioniTensor = tf.convert_to_tensor(batch['actions'], dtype=tf.int32)
        batch_rewards_to_goTensor = tf.convert_to_tensor(batch['rewards_to_go'], dtype=tf.float32)
        batch_log_probsTensor = tf.convert_to_tensor(batch['log_probs'], dtype=tf.float32)

        return batch_statiTensor, batch_azioniTensor, batch_rewards_to_goTensor, batch_log_probsTensor, batch['epLen']

    def getAction(self,stato):
        """Sample an action for a single observation.

        Args:
            stato: one observation without batch axis.

        Returns:
            Tuple ``(action, log_prob)``: the sampled action and its
            log-probability (with gradients stopped), both as tensors.
        """
        # Add the batch axis expected by the network, e.g. (1, 64, 64, 3).
        stato = tf.convert_to_tensor(np.expand_dims(stato, axis=0), dtype=tf.float32)
        azione_pred, _ = self.policyNN(stato)
        dist = tfp.distributions.Categorical(probs=tf.squeeze(azione_pred))
        azionePresa = dist.sample()
        log_prob = dist.log_prob(azionePresa)
        return azionePresa, tf.stop_gradient(log_prob)

    def _discountedRewardsToGo(self, rewards):
        """Return the discounted rewards-to-go as a flat, chronological list.

        Args:
            rewards: list of episodes, each a list of per-step rewards.

        Returns:
            One value per step, ordered exactly like the steps were
            collected (episode by episode, first step first).
        """
        rtg = []
        for episode_reward in rewards:
            cumulative_reward = 0
            totalRewardPerEpisode = 0
            episode_rtg = []
            # Walk the episode backwards so that each step only needs the
            # already-computed return of its successor.
            for single_reward in reversed(episode_reward):
                cumulative_reward = single_reward + cumulative_reward*self.gamma
                totalRewardPerEpisode += single_reward
                episode_rtg.append(cumulative_reward)
            # Restore chronological order so entry t matches state/action t.
            episode_rtg.reverse()
            rtg.extend(episode_rtg)
            print("Total reward per episode:",totalRewardPerEpisode)
        return rtg

    def calcRTG(self,rewards):
        """Compute the discounted rewards-to-go for a batch of episodes.

        The result is aligned with the order in which the steps were
        collected, so it can be sliced together with the states, actions and
        log-probabilities of the same batch.

        Args:
            rewards: list of episodes, each a list of per-step rewards.

        Returns:
            1-D float32 tensor with one reward-to-go per collected step.
        """
        return tf.convert_to_tensor(self._discountedRewardsToGo(rewards), dtype=tf.float32)

    def calcAdvantages(self, rtg,values):
        """Return normalised advantages ``rtg - values``.

        Gradients through ``values`` are stopped; the result is shifted to
        zero mean and divided by its standard deviation (plus 1e-10 to avoid
        division by zero).
        """
        advantages = rtg - tf.stop_gradient(values)
        return (advantages - tf.reduce_mean(advantages)) / (tf.math.reduce_std(advantages) + 1e-10)

    def calcGAE(self, rewards, values):
        """Compute GAE(lambda) returns (advantage + value) for one episode.

        Args:
            rewards: per-step rewards of a single episode.
            values: value estimates for the same steps. It may contain one
                extra trailing entry, the value of the state after the last
                step (bootstrap). When that entry is missing the value after
                the last step is taken to be 0, i.e. the episode is assumed
                to have terminated.

        Returns:
            List with one return estimate per step, in chronological order.
        """
        n_values = len(values)
        gae = 0
        returns = []
        for i in reversed(range(len(rewards))):
            next_value = values[i + 1] if i + 1 < n_values else 0
            delta = rewards[i] + self.gamma * next_value - values[i]
            gae = delta + self.gamma * self.lambdaGAE * gae
            returns.append(gae + values[i])
        # Built back-to-front with O(1) appends; flip once at the end.
        returns.reverse()
        return returns

    def getPolicyLoss(self,log_probs_old, log_probs_new, advantages):
        """Return the PPO-Clip policy loss (negated clipped surrogate objective).

        Args:
            log_probs_old: log-probabilities of the taken actions under the
                policy that collected the data.
            log_probs_new: log-probabilities of the same actions under the
                current policy.
            advantages: advantage estimates; treated as constants.

        Returns:
            Scalar tensor to be minimised.
        """
        advantages = tf.stop_gradient(advantages)
        policy_ratio = tf.exp(log_probs_new-log_probs_old)
        print("advantages"  ,advantages)
        surrogated_loss_1 = policy_ratio * advantages
        clipped_policy_ratio = tf.clip_by_value(policy_ratio, clip_value_min=1.0-self.epsilon, clip_value_max=1.0+self.epsilon)
        print("clipped policy ration",clipped_policy_ratio)
        surrogated_loss_2 = clipped_policy_ratio * advantages
        clip_loss = tf.minimum(surrogated_loss_1, surrogated_loss_2)
        # Negated because the optimizer minimises while PPO maximises.
        return -tf.reduce_mean(clip_loss)

    def evaluate(self, batch_states,batch_actions):
        """Evaluate the network on a batch of states and taken actions.

        Args:
            batch_states: batch of observations (cast to float32 here).
            batch_actions: actions taken in those states.

        Returns:
            Tuple ``(V, log_probs, probs)``: the squeezed value output, the
            log-probabilities of ``batch_actions`` and the action
            probabilities returned by the network.
        """
        batch_states = tf.cast(batch_states, tf.float32)
        mean, retVal = self.policyNN(batch_states)
        V = tf.squeeze(retVal)
        dist = tfp.distributions.Categorical(probs=mean)
        log_probs = dist.log_prob(batch_actions)
        return V, log_probs, mean

    def loadModel(self, path):
        """Load network weights from ``path`` on a best-effort basis.

        Nothing happens when ``path`` is empty. A failure while loading is
        reported on stdout, with its cause, and otherwise ignored so that
        training can start from scratch.

        Args:
            path: weights file to load, or an empty string to skip loading.
        """
        if not path:
            return
        # NOTE(review): build() receives the observation shape without a
        # batch axis -- confirm this is what ReteNeurale expects.
        self.policyNN.build(self.nStati)
        try:
            self.policyNN.load_weights(path)
        except Exception as err:
            # Exception (not a bare except) so Ctrl-C is not swallowed.
            print("Errore nel caricamento del modello:", err)

    def saveModel(self, path):
        """Save the network weights to ``path``."""
        self.policyNN.save_weights(path)


In [12]:
# Configuration and execution: train a PPO agent on a single "easy" CoinRun
# level, resuming from previously saved weights when they can be loaded.
WEIGHTS_FILE = "ppo_coinrun.weights.h5"

env = gym.make(
    'procgen:procgen-coinrun-v0',
    distribution_mode='easy',
    start_level=0,
    num_levels=1,
)
ppo_model = PPO(env)

# Load model weights, train, then save model weights back to the same file.
ppo_model.loadModel(WEIGHTS_FILE)
ppo_model.learn(env)
ppo_model.saveModel(WEIGHTS_FILE)


Errore nel caricamento del modello
----------------------------
Episode:  0
----------------------------
Episode:  1
----------------------------
Episode:  2
----------------------------
Episode:  3
----------------------------
Episode:  4
----------------------------
Episode:  5
----------------------------
Episode:  6
----------------------------
Episode:  7
----------------------------
Episode:  8
----------------------------
Episode:  9
Total reward per episode: 0.0
Total reward per episode: 0.0
Total reward per episode: 0.0
Total reward per episode: 0.0
Total reward per episode: 0.0
Total reward per episode: 0.0
Total reward per episode: 0.0
Total reward per episode: 0.0
Total reward per episode: 0.0
Total reward per episode: 0.0
advantages tf.Tensor(
[-0.9931043  -0.53673786 -1.1529847   1.4524926  -1.0116411  -1.7550758
  1.1977935  -0.39407426  1.1805238   0.31454524  1.0753247   0.5605353
  1.4104245  -0.45841616  1.6663804  -0.69678426 -0.37813538  1.3131282
 -0.7306473  -0.5

KeyboardInterrupt: 