This module contains the implementation of the PPO algorithm.
Ci basiamo sullo pseudocodice presente sul sito di OpenAI per la realizzazione del ppo.
https://spinningup.openai.com/en/latest/algorithms/ppo.html#id7
Utilizzando un Actor-Critic Method.
Ciò suddivide l'implementazione in 8 passi principali:
1. Inizializzazione dell'ambiente con policy parameters theta_0, e l'inizial value function parameters w_0.
2. Ciclare per k iterazioni
3. Raccogliere un set di traiettorie D_k = {τ_i} con una policy pi_k = pi(theta_k)
4. Calcolare i reward-to-go R_t
5. Calcolare gli advantage estimates A_t basandoci sulla value function V_{w_k}
6. Aggiornare la policy massimizzando la PPO-Clip objective (Gradient ascent con adam) . Non scriverò la formula che è complessa
7. Aggiornare la value function minimizzando la MSE tra V_{w_k} e R_t (Gradient descent con adam)
8. Fine ciclo.

Implementiamo tutti i passi nella funzione learn.

In [9]:
import warnings
warnings.filterwarnings('ignore') #ignora warnings
#Check if colab is used:
from rete import ReteNeurale
import tensorflow as tf
import tensorflow_probability as tfp
import gym
import numpy as np
from tensorflow import keras
import pandas as pd
import matplotlib.pyplot as plt
import glfw
try:
  import google.colab
  IN_COLAB = True
except:
  IN_COLAB = False
  print("Not running on CoLab")
  #print list of GPUs
  #tf. config. list_physical_devices('GPU')
  print("Devices: ", tf.config.list_physical_devices())
  print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
if IN_COLAB:
  !pip install procgen
  !pip install tensorflow_probability
  !pip install numpy


Not running on CoLab
Devices:  [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
Num GPUs Available:  1


In [10]:
#Tutta questa parte è relativa alla visualizzazione del gioco tramite salvataggio degli stati di un episode in una specifica cartella.
#Tutto questo viene fatto perchè non si riesce a visualizzare procgen in live per problemi con glfw e openGL su Ubuntu.
from moviepy import ImageSequenceClip
from IPython.display import Video
import os
from pyvirtualdisplay.smartdisplay import SmartDisplay
display = SmartDisplay(visible=0, size=(1920,1080),fbdir='/tmp')
display.start()
glfw.init()
available_fbconfigs = glfw.get_video_modes(glfw.get_primary_monitor())
os.environ['PYOPENGL_PLATFORM'] = 'osmesa'




In [None]:
class PPO:
    def __init__(self,env,gameName,totalSteps=2000000):
        self.env=env
        self.gameName=gameName
        self.nAzioni=env.action_space.n
        self.nStati=env.observation_space.shape
        self.listaAzioni=[i for i in range(self.nAzioni)]
        self.nTimestampsPerBatch=8192
        #self.nTimestampsPerBatch=4096
        self.stepsPerEpisode=512
        self.nTotalTimestamps=totalSteps
        self.episodesPerBatch=10
        self.nEpoche=300
        self.gamma=0.99
        self.epsilon=0.2
        self.learningRate=1e-4
        self.policyNN=ReteNeurale(self.nStati,self.nAzioni) #Actor
        self.policy_optimizer=keras.optimizers.Adam(learning_rate=self.learningRate, clipnorm=1.0)
        self.policyNN.compile(optimizer=self.policy_optimizer)
        self.entropyCoefficient=0.01 #Per invogliare l'esplorazione un po di più.
        self.lambdaGAE=0.95
        self.updateLearningRateEveryTimesteps=10000000 #Aggiorna il learning rate ogni x steps dove x è il valore della variabile
        self.csvPath="./rewards/"+self.gameName+"_rewards.csv"
        self.offsetCsv=0 #Usato per capire da quale riga iniziare a scrivere i rewards. Per non sovrascrivere i vecchi rewards.
        self.batchSize=512
        #Creo il file csv per salvare i rewards
        if not os.path.isfile(self.csvPath):
            data = {"Epoch": [], "Average reward": [], "Min reward": [], "Max reward": []}
            df=pd.DataFrame(data,columns=["Epoch","Average reward","Min reward","Max reward"])
            df.to_csv(self.csvPath,index=False,header=True)
        else:
            df=pd.read_csv(self.csvPath)
            self.offsetCsv=len(df.index)
        

    def learn(self):
        #passo 2 ciclare per k iterazioni.
        stepsTot=0
        iterazioniTot=0
        while stepsTot<self.nTotalTimestamps:
            print("Step totali eseguiti: {}".format(stepsTot)," Step totali rimasti:",self.nTotalTimestamps-stepsTot)
            self.updateLearningRate(stepsTot) 
            states, actions, rewards_to_go, log_probs, dones,len_ep =self.collect_trajectories()
            stepsTot+=np.sum(len_ep)
            iterazioniTot+=1
            num_samples=np.sum(len_ep)
            print("NUM SAMPLES:",num_samples)

            samplesInPiu=num_samples%self.batchSize
            batch_size=self.batchSize+samplesInPiu

            i=0
            while i <num_samples:
                batch_states=states[i:i+batch_size]
                batch_actions=actions[i:i+batch_size]
                batch_rewards_to_go=rewards_to_go[i:i+batch_size]
                batch_log_probs=log_probs[i:i+batch_size]
                batch_dones=dones[i:i+batch_size]

                v,latest_log_probs,_=self.evaluate(batch_states,batch_actions)
                #advantage=self.calcAdvantages(batch_rewards_to_go,V)
                advantage, targets =self.calcGaeAndTargets(batch_rewards_to_go, v, batch_dones)
                
                with tf.GradientTape() as tape:
                    _,latest_log_probs,probs=self.evaluate(batch_states,batch_actions)
                    policy_loss = self.getPolicyLoss(batch_log_probs,latest_log_probs,advantage)
                    #MSE valueLoss
                    value_loss = tf.reduce_mean(tf.square(targets - v))
                    
                    #Aggiungo entropia alla loss per incentivare l'esplorazione
                    entropy = -tf.reduce_mean(probs * tf.math.log(probs + 1e-10))
                    total_loss=policy_loss+ value_loss*0.5 - entropy*self.entropyCoefficient
                gradientsPolicy = tape.gradient(total_loss, self.policyNN.trainable_variables)
                self.policy_optimizer.apply_gradients(zip(gradientsPolicy, self.policyNN.trainable_variables))
                i+=batch_size
                batch_size=self.batchSize
                print("EPOCA:",iterazioniTot," TOTAL LOSS:",total_loss.numpy()," POLICY LOSS:",policy_loss.numpy()," VALUE LOSS:",value_loss.numpy()," ENTROPY:",entropy.numpy())
            self.evaluate_policy(epoch=iterazioniTot)
            if iterazioniTot%10==0:
                self.saveModel("./weights/ppo_"+self.gameName+".weights.h5")
                



    def evaluate_policy(self, episodes=10,epoch=0):
        total_rewards = []
        for i in range(episodes):
            frames=[]

            state = self.env.reset()
            frames.append(state)
            done = False
            cumulative_reward = 0
            while not done:
                state_tensor = tf.convert_to_tensor(state, dtype=tf.float32)
                state_tensor = tf.expand_dims(state_tensor, axis=0)
                probs, _ = self.policyNN(state_tensor)
                action = np.argmax(probs.numpy())
                state, reward, done, _ =self.env.step(action)
                frames.append(state)

                cumulative_reward += reward
            total_rewards.append(cumulative_reward)
            self.saveClip(frames,i)
            
            print("Episode reward:", cumulative_reward)
        print(f"Average Reward: {np.mean(total_rewards):.2f}")
        self.saveReward(np.mean(total_rewards),np.min(total_rewards),np.max(total_rewards),epoch,"./rewards/"+self.gameName+"_rewards.csv")

    def collect_trajectories(self):
        #Passo 3 --> Raccogliere un set di traiettorie D_k = {τ_i} con una policy pi_k = pi(theta_k)
        #Dobbiamo raccogliere un set di traiettorie e per fare ciò dobbiamo raccogliere: stati, azioni, rewards, rewards to go, log_prob delle azioni.
        batch={
            'states':[],
            'actions':[],
            'rewards':[],
            'rewards_to_go':[],
            'log_probs':[],
            'done':[],
            'lengths':[]
        }

        t = 0 # Keeps track of how many timesteps we've run so far this batch
        nEpisodes=0
        while t < self.nTimestampsPerBatch:
            rewardPerEpisode=[]
            stato = self.env.reset()
            done = False
            frames=[]
            for i in range(self.stepsPerEpisode):
                t+=1
                batch['states'].append(stato)
                azione,log_prob=self.getAction(stato)
                batch['actions'].append(azione)
                batch['log_probs'].append(log_prob)
                stato, reward, done ,_= self.env.step(azione)  #al posto di _ ci sarebbe info ma non ci serve
                rewardPerEpisode.append(reward)
                frames.append(stato)
                batch['done'].append(done)
                if done :
                    break #Ha raggiunto il termine dell'episodio.
            batch['rewards'].append(rewardPerEpisode)
            batch['lengths'].append(i+1)
            nEpisodes+=1
            #self.saveClip(frames,nEpisodes)
            frames=[]
        #Calcoliamo i rewards to go --> PASSO 4
        batch['rewards_to_go']=self.calcRTG(batch['rewards'])
        batch_statiTensor=tf.convert_to_tensor(batch['states'],dtype=tf.uint8)
        batch_azioniTensor=tf.convert_to_tensor(batch['actions'],dtype=tf.int32)
        batch_rewards_to_goTensor=tf.convert_to_tensor(batch['rewards_to_go'],dtype=tf.float32)
        batch_log_probsTensor=tf.convert_to_tensor(batch['log_probs'],dtype=tf.float32)
        batch_len=tf.convert_to_tensor(batch['lengths'],dtype=tf.int32)


        return batch_statiTensor, batch_azioniTensor,batch_rewards_to_goTensor,batch_log_probsTensor, batch['done'],batch_len

    def getAction(self,stato):
        stato=tf.convert_to_tensor(np.expand_dims(stato, axis=0) ,dtype=tf.float32)# Diventa (1, 64, 64, 3)
        azione_pred,_=self.policyNN(stato)
        #Somma probabilità
        dist=tfp.distributions.Categorical(probs=tf.squeeze(azione_pred))
        azionePresa=dist.sample()
        log_prob=dist.log_prob(azionePresa)
        return azionePresa, tf.stop_gradient(log_prob)

    def calcRTG(self,rewards):
        #Prendo la formula per calcolare i rewards to go e richiede i cumulative rewards e un fattore di sconto.
        rtg=[]
        for episode_reward in reversed(rewards):
            cumulative_reward=0
            totalRewardPerEpisode=0
            for single_reward in reversed(episode_reward):
                cumulative_reward=single_reward+cumulative_reward*self.gamma
                totalRewardPerEpisode+=single_reward
                rtg.append(cumulative_reward)
            print("Total reward per episode RTG:",totalRewardPerEpisode)
        return tf.convert_to_tensor(rtg,dtype=tf.float32)

   
    def calcGaeAndTargets(self,rewards,values,dones):
        advantages = []
        targets = []
        advantage = 0
        try:
            tf.debugging.check_numerics(rewards, "Ricompense non valide")
            tf.debugging.check_numerics(values, "Valori non validi")
            
        except:
            print("Errore: ",rewards , values, dones)
        for t in reversed(range(len(rewards))):

            #Se una delle variabili è solo un valore, allora non posso fare slicing e devo fare un controllo.
            if t+1<len(rewards):
                delta=rewards[t]+ (1-dones[t])*self.gamma*values[t+1]-values[t]
            else:
                delta=rewards[t]-values[t]
            advantage=delta+self.gamma*self.lambdaGAE*(1-dones[t])*advantage
            advantages.insert(0,advantage)
            targets.insert(0, advantage + values[t])
        return tf.convert_to_tensor(advantages, dtype=tf.float32), tf.convert_to_tensor(targets, dtype=tf.float32)


    def getPolicyLoss(self,log_probs_old, log_probs_new, advantages):
        advantages = tf.stop_gradient(advantages)
        policy_ratio = tf.exp(log_probs_new-log_probs_old)
        surrogated_loss_1 = policy_ratio * advantages
        clipped_policy_ratio=tf.clip_by_value(policy_ratio, clip_value_min=1.0-self.epsilon, clip_value_max=1.0+self.epsilon)
        surrogated_loss_2 = clipped_policy_ratio * advantages
        clip_loss=tf.minimum(surrogated_loss_1,surrogated_loss_2)
        return -tf.reduce_mean(clip_loss)

    def evaluate(self, batch_states,batch_actions):
        batch_states=tf.cast(batch_states, tf.float32)
        mean,v=self.policyNN(batch_states)
        mean = tf.clip_by_value(mean, 1e-10, 1.0)  # Evita valori molto bassi
        mean /= tf.reduce_sum(mean, axis=-1, keepdims=True)  # Normalizza
        v= tf.squeeze(v)
        dist=tfp.distributions.Categorical(probs=mean)
        log_probs=dist.log_prob(batch_actions)
        return v, log_probs, mean

    def loadModel(self, path):
        if path is "":
            return
        self.policyNN.build(self.nStati)
        try:
            #Check if weights contains Nan or Inf
            self.policyNN.load_weights(path)
            for var in self.policyNN.trainable_variables:
                tf.debugging.check_numerics(var, "LOAD Contiene NAN o INF")

        except:
            print("Errore nel caricamento del modello")


    def saveModel(self, path):
        #Controllo che la cartella esista e che non ci siano NaN nei pesi
        if not os.path.exists("weights"):
            os.makedirs("weights")
        try:
            for var in self.policyNN.trainable_variables:
                tf.debugging.check_numerics(var, "SAVE Contiene NAN o INF")

        except: 
            print("Errore: i pesi contengono NaN o Inf. Non verrà salvato")
            return
        
        self.policyNN.save_weights(path)
    
    def updateLearningRate(self, epoch):
      if epoch % self.updateLearningRateEveryTimesteps == 0 and epoch > 0:
        self.learningRate *= 0.9  # Riduci il learning rate del 10%
        self.policy_optimizer.learning_rate = self.learningRate #Aggiorno solo dentro l'if che tanto è uguale per tutte le altre volte.   

    def saveReward(self,reward,minReward,maxReward,epoch,path):
        #Devo controllare se c'è davvero il file o meno. In caso affermativo conto quante righe ci sono.Da li ci sarà un offset così da incrementare correttamente
        epoch+=self.offsetCsv        
        data = {"Epoch": [epoch], "Average reward": [reward], "Min reward": [minReward], "Max reward": [maxReward]}
        df=pd.DataFrame(data,columns=["Epoch","Average reward","Min reward","Max reward"])
        df.to_csv(path,mode='a',index=False,header=False)

    def showGraph(self):
        rewards=pd.read_csv("./rewards/"+self.gameName+"_rewards.csv")
        rewards.plot(x='Epoch',y='Average reward',kind='line',title="Average reward per epoch")
        plt.show()
    
    def saveClip(self,frames,i):
        clip = ImageSequenceClip(list(frames), fps=15)
        nameVideo="./clip/"+self.gameName+"/"+self.gameName+"_video"+str(i)+".mp4"
        clip.write_videofile(nameVideo, fps=15,logger=None)
        Video(nameVideo)


In [None]:
# Configurazione ed esecuzione
#Lista di giochi a disposizione di Procgen:
""" 
    bigfish, bossfight, caveflyer, chaser, climber
    coinrun, dodgeball, fruitbot, heist, jumper
    leaper, maze, miner, ninja, plumber, starpilot
"""
seed=42
gameName="starpilot" #Scelto starpilot perchè è un gioco che ha episode corti, quindi allenamenti più rapidi.
env = gym.make('procgen:procgen-'+gameName+'-v0',distribution_mode='easy',start_level=seed,rand_seed=seed, num_levels=100, use_backgrounds=False)

#Creo l'oggetto PPO
ppo_model=PPO(env,gameName)

#load model weights if available 
ppo_model.loadModel("./weights/ppo_"+gameName+".weights.h5")
ppo_model.learn()

#save model weights
ppo_model.saveModel("./weights/ppo_"+gameName+".weights.h5")

ppo_model.showGraph()


Step totali eseguiti: 0  Step totali rimasti: 2000000
Total reward per episode RTG: 4.0
Total reward per episode RTG: 0.0
Total reward per episode RTG: 0.0
Total reward per episode RTG: 0.0
Total reward per episode RTG: 9.0
Total reward per episode RTG: 0.0
Total reward per episode RTG: 5.0
Total reward per episode RTG: 0.0
Total reward per episode RTG: 2.0
Total reward per episode RTG: 1.0
Total reward per episode RTG: 0.0
Total reward per episode RTG: 0.0
Total reward per episode RTG: 0.0
Total reward per episode RTG: 1.0
Total reward per episode RTG: 4.0
Total reward per episode RTG: 0.0
Total reward per episode RTG: 0.0
Total reward per episode RTG: 2.0
Total reward per episode RTG: 4.0
Total reward per episode RTG: 0.0
Total reward per episode RTG: 0.0
Total reward per episode RTG: 2.0
Total reward per episode RTG: 1.0
Total reward per episode RTG: 3.0
Total reward per episode RTG: 4.0
Total reward per episode RTG: 0.0
Total reward per episode RTG: 0.0
Total reward per episode RTG

2024-12-27 19:40:47.318191: W external/local_xla/xla/tsl/framework/bfc_allocator.cc:306] Allocator (GPU_0_bfc) ran out of memory trying to allocate 2.18GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.


EPOCA: 1  TOTAL LOSS: 353.27365  POLICY LOSS: -20.805464  VALUE LOSS: 748.1618  ENTROPY: 0.1805003


2024-12-27 19:40:50.188419: W external/local_xla/xla/tsl/framework/bfc_allocator.cc:306] Allocator (GPU_0_bfc) ran out of memory trying to allocate 2.06GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.


EPOCA: 1  TOTAL LOSS: 43.057766  POLICY LOSS: -6.423038  VALUE LOSS: 98.96522  ENTROPY: 0.18048307
EPOCA: 1  TOTAL LOSS: 58.521786  POLICY LOSS: -7.389023  VALUE LOSS: 131.82523  ENTROPY: 0.18046436
EPOCA: 1  TOTAL LOSS: 97.78147  POLICY LOSS: -10.419102  VALUE LOSS: 216.40475  ENTROPY: 0.18046522
EPOCA: 1  TOTAL LOSS: 93.354034  POLICY LOSS: -6.659072  VALUE LOSS: 200.02983  ENTROPY: 0.18046357
EPOCA: 1  TOTAL LOSS: 67.47445  POLICY LOSS: -9.81668  VALUE LOSS: 154.58588  ENTROPY: 0.18047728
EPOCA: 1  TOTAL LOSS: 458.95752  POLICY LOSS: -23.190792  VALUE LOSS: 964.30023  ENTROPY: 0.18046346
EPOCA: 1  TOTAL LOSS: 34.941624  POLICY LOSS: -7.32469  VALUE LOSS: 84.53623  ENTROPY: 0.18047796
EPOCA: 1  TOTAL LOSS: 87.08109  POLICY LOSS: -7.865466  VALUE LOSS: 189.89673  ENTROPY: 0.18045117
EPOCA: 1  TOTAL LOSS: 200.49197  POLICY LOSS: -15.495915  VALUE LOSS: 431.97937  ENTROPY: 0.18044956
EPOCA: 1  TOTAL LOSS: 114.73942  POLICY LOSS: -9.700619  VALUE LOSS: 248.88368  ENTROPY: 0.18045503
EPOC

2024-12-27 19:43:13.311357: W external/local_xla/xla/tsl/framework/bfc_allocator.cc:306] Allocator (GPU_0_bfc) ran out of memory trying to allocate 2.40GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.


EPOCA: 2  TOTAL LOSS: 510.15067  POLICY LOSS: -22.208626  VALUE LOSS: 1064.7222  ENTROPY: 0.1804635
EPOCA: 2  TOTAL LOSS: 88.03669  POLICY LOSS: -8.991774  VALUE LOSS: 194.06055  ENTROPY: 0.18047611
EPOCA: 2  TOTAL LOSS: 290.999  POLICY LOSS: -16.958138  VALUE LOSS: 615.91785  ENTROPY: 0.18047671
EPOCA: 2  TOTAL LOSS: 19.068398  POLICY LOSS: -4.6353226  VALUE LOSS: 47.41105  ENTROPY: 0.18046884
EPOCA: 2  TOTAL LOSS: 245.11281  POLICY LOSS: -16.352522  VALUE LOSS: 522.93427  ENTROPY: 0.18047412
EPOCA: 2  TOTAL LOSS: 49.928078  POLICY LOSS: -6.1571846  VALUE LOSS: 112.17413  ENTROPY: 0.18045394
EPOCA: 2  TOTAL LOSS: 110.4002  POLICY LOSS: -12.485287  VALUE LOSS: 245.77458  ENTROPY: 0.18046935
EPOCA: 2  TOTAL LOSS: 25.15339  POLICY LOSS: -3.9739149  VALUE LOSS: 58.25822  ENTROPY: 0.1804779
EPOCA: 2  TOTAL LOSS: 14.532725  POLICY LOSS: -2.5741055  VALUE LOSS: 34.21727  ENTROPY: 0.1804772
EPOCA: 2  TOTAL LOSS: 173.01265  POLICY LOSS: -14.471516  VALUE LOSS: 374.97192  ENTROPY: 0.1804626
EPO

2024-12-27 19:45:37.432979: W external/local_xla/xla/tsl/framework/bfc_allocator.cc:306] Allocator (GPU_0_bfc) ran out of memory trying to allocate 2.24GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.


EPOCA: 3  TOTAL LOSS: 186.23575  POLICY LOSS: -13.072756  VALUE LOSS: 398.6206  ENTROPY: 0.18045683
EPOCA: 3  TOTAL LOSS: 152.47513  POLICY LOSS: -8.807084  VALUE LOSS: 322.56802  ENTROPY: 0.18044427
EPOCA: 3  TOTAL LOSS: 140.13841  POLICY LOSS: -12.613979  VALUE LOSS: 305.5084  ENTROPY: 0.1804341
EPOCA: 3  TOTAL LOSS: 326.34906  POLICY LOSS: -16.679913  VALUE LOSS: 686.0615  ENTROPY: 0.18044981
EPOCA: 3  TOTAL LOSS: 590.92993  POLICY LOSS: -24.450846  VALUE LOSS: 1230.7653  ENTROPY: 0.18046163
EPOCA: 3  TOTAL LOSS: 221.16542  POLICY LOSS: -14.485254  VALUE LOSS: 471.30496  ENTROPY: 0.1804488
EPOCA: 3  TOTAL LOSS: 175.36885  POLICY LOSS: -13.868215  VALUE LOSS: 378.47772  ENTROPY: 0.1804464
EPOCA: 3  TOTAL LOSS: 211.09784  POLICY LOSS: -15.615809  VALUE LOSS: 453.4309  ENTROPY: 0.18044068
EPOCA: 3  TOTAL LOSS: 263.73096  POLICY LOSS: -17.398794  VALUE LOSS: 562.2631  ENTROPY: 0.18044439
EPOCA: 3  TOTAL LOSS: 385.83224  POLICY LOSS: -17.920084  VALUE LOSS: 807.50824  ENTROPY: 0.18044092

2024-12-27 19:48:01.251407: W external/local_xla/xla/tsl/framework/bfc_allocator.cc:306] Allocator (GPU_0_bfc) ran out of memory trying to allocate 2.26GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.


EPOCA: 4  TOTAL LOSS: 151.54216  POLICY LOSS: -11.491862  VALUE LOSS: 326.07166  ENTROPY: 0.18041968
EPOCA: 4  TOTAL LOSS: 314.03442  POLICY LOSS: -18.761354  VALUE LOSS: 665.59515  ENTROPY: 0.18042584
EPOCA: 4  TOTAL LOSS: 107.044075  POLICY LOSS: -10.334691  VALUE LOSS: 234.76114  ENTROPY: 0.18039052
EPOCA: 4  TOTAL LOSS: 281.61673  POLICY LOSS: -19.167654  VALUE LOSS: 601.5724  ENTROPY: 0.18043175
EPOCA: 4  TOTAL LOSS: 162.71959  POLICY LOSS: -14.431625  VALUE LOSS: 354.30603  ENTROPY: 0.18040633
EPOCA: 4  TOTAL LOSS: 140.04948  POLICY LOSS: -11.338355  VALUE LOSS: 302.77927  ENTROPY: 0.18042152
EPOCA: 4  TOTAL LOSS: 348.85843  POLICY LOSS: -18.886662  VALUE LOSS: 735.4938  ENTROPY: 0.18043074
EPOCA: 4  TOTAL LOSS: 152.1595  POLICY LOSS: -12.275954  VALUE LOSS: 328.8745  ENTROPY: 0.18045513
EPOCA: 4  TOTAL LOSS: 127.42175  POLICY LOSS: -13.609829  VALUE LOSS: 282.06677  ENTROPY: 0.18041159
EPOCA: 4  TOTAL LOSS: 115.918304  POLICY LOSS: -10.719498  VALUE LOSS: 253.2792  ENTROPY: 0.18

2024-12-27 19:50:25.389830: W external/local_xla/xla/tsl/framework/bfc_allocator.cc:306] Allocator (GPU_0_bfc) ran out of memory trying to allocate 2.11GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.


EPOCA: 5  TOTAL LOSS: 403.02432  POLICY LOSS: -20.985323  VALUE LOSS: 848.0229  ENTROPY: 0.18040663
EPOCA: 5  TOTAL LOSS: 31.32247  POLICY LOSS: -5.3109946  VALUE LOSS: 73.27054  ENTROPY: 0.18041077
EPOCA: 5  TOTAL LOSS: 215.89386  POLICY LOSS: -14.5107  VALUE LOSS: 460.8127  ENTROPY: 0.1803737
EPOCA: 5  TOTAL LOSS: 309.89188  POLICY LOSS: -19.762701  VALUE LOSS: 659.31274  ENTROPY: 0.18038301
EPOCA: 5  TOTAL LOSS: 180.927  POLICY LOSS: -12.8804  VALUE LOSS: 387.6184  ENTROPY: 0.18041204
EPOCA: 5  TOTAL LOSS: 133.51271  POLICY LOSS: -11.180158  VALUE LOSS: 289.38934  ENTROPY: 0.18040213
EPOCA: 5  TOTAL LOSS: 129.64282  POLICY LOSS: -8.883165  VALUE LOSS: 277.05557  ENTROPY: 0.1804068
EPOCA: 5  TOTAL LOSS: 49.192947  POLICY LOSS: -9.445576  VALUE LOSS: 117.280655  ENTROPY: 0.18042181
EPOCA: 5  TOTAL LOSS: 272.03558  POLICY LOSS: -16.799269  VALUE LOSS: 577.6733  ENTROPY: 0.1803884
EPOCA: 5  TOTAL LOSS: 256.10474  POLICY LOSS: -16.760035  VALUE LOSS: 545.73315  ENTROPY: 0.18039526
EPOCA:

2024-12-27 19:52:53.017225: W external/local_xla/xla/tsl/framework/bfc_allocator.cc:306] Allocator (GPU_0_bfc) ran out of memory trying to allocate 2.44GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.


EPOCA: 6  TOTAL LOSS: 72.72773  POLICY LOSS: -10.026315  VALUE LOSS: 165.51169  ENTROPY: 0.18039098
EPOCA: 6  TOTAL LOSS: 434.0488  POLICY LOSS: -23.793455  VALUE LOSS: 915.6881  ENTROPY: 0.18038869
EPOCA: 6  TOTAL LOSS: 101.32627  POLICY LOSS: -11.701423  VALUE LOSS: 226.05899  ENTROPY: 0.18038645
EPOCA: 6  TOTAL LOSS: 142.55992  POLICY LOSS: -13.968439  VALUE LOSS: 313.06033  ENTROPY: 0.1803462
EPOCA: 6  TOTAL LOSS: 191.68782  POLICY LOSS: -15.329535  VALUE LOSS: 414.0383  ENTROPY: 0.18039398
EPOCA: 6  TOTAL LOSS: 73.69233  POLICY LOSS: -9.002012  VALUE LOSS: 165.39229  ENTROPY: 0.18037866
EPOCA: 6  TOTAL LOSS: 139.7578  POLICY LOSS: -13.518872  VALUE LOSS: 306.55695  ENTROPY: 0.18034077
EPOCA: 6  TOTAL LOSS: 177.80084  POLICY LOSS: -11.14063  VALUE LOSS: 377.88654  ENTROPY: 0.18039058
EPOCA: 6  TOTAL LOSS: 155.67094  POLICY LOSS: -11.268826  VALUE LOSS: 333.88315  ENTROPY: 0.18039985
EPOCA: 6  TOTAL LOSS: 103.27449  POLICY LOSS: -10.4613905  VALUE LOSS: 227.47536  ENTROPY: 0.1803749

2024-12-27 19:55:33.312189: W external/local_xla/xla/tsl/framework/bfc_allocator.cc:306] Allocator (GPU_0_bfc) ran out of memory trying to allocate 2.43GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.


EPOCA: 7  TOTAL LOSS: 333.55417  POLICY LOSS: -20.7217  VALUE LOSS: 708.55536  ENTROPY: 0.18031773
EPOCA: 7  TOTAL LOSS: 58.364113  POLICY LOSS: -5.538212  VALUE LOSS: 127.80826  ENTROPY: 0.1803969
EPOCA: 7  TOTAL LOSS: 155.52531  POLICY LOSS: -15.041944  VALUE LOSS: 341.13812  ENTROPY: 0.18033415
EPOCA: 7  TOTAL LOSS: 77.93266  POLICY LOSS: -6.426591  VALUE LOSS: 168.7221  ENTROPY: 0.1803853
EPOCA: 7  TOTAL LOSS: 38.580738  POLICY LOSS: -5.3058834  VALUE LOSS: 87.776855  ENTROPY: 0.18036194
EPOCA: 7  TOTAL LOSS: 176.191  POLICY LOSS: -13.261597  VALUE LOSS: 378.90878  ENTROPY: 0.18040083
EPOCA: 7  TOTAL LOSS: 70.785484  POLICY LOSS: -8.878288  VALUE LOSS: 159.33115  ENTROPY: 0.18027279
EPOCA: 7  TOTAL LOSS: 96.411736  POLICY LOSS: -10.467566  VALUE LOSS: 213.7622  ENTROPY: 0.18032764
EPOCA: 7  TOTAL LOSS: 186.61502  POLICY LOSS: -13.622781  VALUE LOSS: 400.47922  ENTROPY: 0.18031362
EPOCA: 7  TOTAL LOSS: 194.0403  POLICY LOSS: -15.638239  VALUE LOSS: 419.3607  ENTROPY: 0.18032622
EPOC

2024-12-27 19:58:03.300708: W external/local_xla/xla/tsl/framework/bfc_allocator.cc:306] Allocator (GPU_0_bfc) ran out of memory trying to allocate 2.08GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.


EPOCA: 8  TOTAL LOSS: 29.49027  POLICY LOSS: -4.7480946  VALUE LOSS: 68.48034  ENTROPY: 0.18034741
EPOCA: 8  TOTAL LOSS: 82.55812  POLICY LOSS: -8.234407  VALUE LOSS: 181.58865  ENTROPY: 0.18031305
EPOCA: 8  TOTAL LOSS: 171.68361  POLICY LOSS: -8.870294  VALUE LOSS: 361.11142  ENTROPY: 0.18034106
EPOCA: 8  TOTAL LOSS: 174.6306  POLICY LOSS: -13.043031  VALUE LOSS: 375.35086  ENTROPY: 0.18029906
EPOCA: 8  TOTAL LOSS: 174.96568  POLICY LOSS: -13.270273  VALUE LOSS: 376.47552  ENTROPY: 0.18028395
EPOCA: 8  TOTAL LOSS: 271.25186  POLICY LOSS: -16.21835  VALUE LOSS: 574.94403  ENTROPY: 0.18028978
EPOCA: 8  TOTAL LOSS: 21.65012  POLICY LOSS: -3.0456293  VALUE LOSS: 49.395103  ENTROPY: 0.18027939
EPOCA: 8  TOTAL LOSS: 121.27277  POLICY LOSS: -13.91964  VALUE LOSS: 270.38843  ENTROPY: 0.18026468
EPOCA: 8  TOTAL LOSS: 248.54204  POLICY LOSS: -16.282154  VALUE LOSS: 529.652  ENTROPY: 0.18025698
EPOCA: 8  TOTAL LOSS: 58.744972  POLICY LOSS: -7.6556726  VALUE LOSS: 132.8049  ENTROPY: 0.1803143
EPO

2024-12-27 20:00:32.999182: W external/local_xla/xla/tsl/framework/bfc_allocator.cc:306] Allocator (GPU_0_bfc) ran out of memory trying to allocate 2.25GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.


EPOCA: 9  TOTAL LOSS: 145.7368  POLICY LOSS: -11.911183  VALUE LOSS: 315.29956  ENTROPY: 0.18019427
EPOCA: 9  TOTAL LOSS: 56.491547  POLICY LOSS: -5.8797965  VALUE LOSS: 124.746284  ENTROPY: 0.18004222
EPOCA: 9  TOTAL LOSS: 257.6365  POLICY LOSS: -15.320757  VALUE LOSS: 545.91815  ENTROPY: 0.18025695
EPOCA: 9  TOTAL LOSS: 168.04688  POLICY LOSS: -12.94652  VALUE LOSS: 361.9904  ENTROPY: 0.18006973
EPOCA: 9  TOTAL LOSS: 328.29163  POLICY LOSS: -21.707806  VALUE LOSS: 700.00244  ENTROPY: 0.18020923
EPOCA: 9  TOTAL LOSS: 205.85326  POLICY LOSS: -15.165813  VALUE LOSS: 442.04175  ENTROPY: 0.18008512
EPOCA: 9  TOTAL LOSS: 120.17937  POLICY LOSS: -11.766372  VALUE LOSS: 263.89508  ENTROPY: 0.18004885
EPOCA: 9  TOTAL LOSS: 851.351  POLICY LOSS: -33.600197  VALUE LOSS: 1769.9061  ENTROPY: 0.18019885
EPOCA: 9  TOTAL LOSS: 85.48908  POLICY LOSS: -8.134366  VALUE LOSS: 187.2505  ENTROPY: 0.17999524
EPOCA: 9  TOTAL LOSS: 322.33224  POLICY LOSS: -19.25231  VALUE LOSS: 683.1727  ENTROPY: 0.18019922
