In [None]:
!pip install memory_profiler psutil gymnasium ale-py tensorflow matplotlib

Collecting memory_profiler
  Downloading memory_profiler-0.61.0-py3-none-any.whl.metadata (20 kB)
Downloading memory_profiler-0.61.0-py3-none-any.whl (31 kB)
Installing collected packages: memory_profiler
Successfully installed memory_profiler-0.61.0


In [None]:
# Mount Google Drive at /content/drive (Colab-only API). The checkpoint
# directory configured later in this notebook lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import ale_py
import gymnasium as gym
import numpy as np
import tensorflow as tf
from tensorflow.keras import Model, layers
import time
import gc
import matplotlib.pyplot as plt
import psutil

class TrainingConfig:
    """Hyper-parameters and paths for the PPO Ms. Pac-Man run.

    Class attributes hold settings that do not depend on the machine. The
    constructor inspects the currently available RAM and picks a larger or
    smaller batch size / frame stack / learning rate accordingly, then sets
    the remaining PPO coefficients and prints a short summary.
    """

    CHECKPOINT_DIR = "/content/drive/MyDrive/ppo_pacman_models"
    TOTAL_EPISODES = 1000
    CHECKPOINT_FREQ = 50
    MIN_REPLAY_HISTORY = 10000

    def __init__(self):
        available_bytes = psutil.virtual_memory().available
        bytes_per_gib = 1024**3

        # Strictly more than 12 GiB free selects the larger profile.
        roomy = available_bytes > 12 * bytes_per_gib
        self.BATCH_SIZE, self.FRAME_STACK_SIZE, self.LEARNING_RATE = (
            (64, 4, 2.5e-4) if roomy else (32, 3, 1e-4)
        )

        # PPO / GAE coefficients shared by both memory profiles.
        self.GAMMA = 0.99
        self.LAMBDA_GAE = 0.95
        self.EPS_CLIP = 0.1
        self.UPDATE_EPOCHS = 4
        self.FRAME_SKIP = 4
        self.BETA_ENTROPY = 0.01
        self.VF_COEF = 0.5

        summary_lines = (
            f"Using configuration (RAM available: {available_bytes/bytes_per_gib:.1f}GB):",
            f"- Batch size: {self.BATCH_SIZE}",
            f"- Frame stack: {self.FRAME_STACK_SIZE}",
        )
        for line in summary_lines:
            print(line)

class PPOPolicy(Model):
    """Actor-critic network: a shared convolutional trunk with two linear heads.

    Three ReLU convolutions are flattened into a 512-unit ReLU dense layer.
    From that shared representation one head produces `action_size`
    unnormalised action logits and the other a single state-value estimate.
    """

    def __init__(self, action_size):
        super().__init__()
        # Shared feature extractor (attribute names and creation order are
        # kept stable because they determine the saved weight layout).
        self.conv1 = layers.Conv2D(filters=32, kernel_size=8, strides=4, activation='relu')
        self.conv2 = layers.Conv2D(filters=64, kernel_size=4, strides=2, activation='relu')
        self.conv3 = layers.Conv2D(filters=64, kernel_size=3, strides=1, activation='relu')
        self.flatten = layers.Flatten()
        self.dense = layers.Dense(units=512, activation='relu')

        # Linear output heads (no activation).
        self.policy_logits = layers.Dense(units=action_size)
        self.value = layers.Dense(units=1)

    def call(self, x):
        """Return `(policy_logits, value)` for a batch of stacked frames `x`."""
        features = x
        for stage in (self.conv1, self.conv2, self.conv3, self.flatten, self.dense):
            features = stage(features)

        logits = self.policy_logits(features)
        state_value = self.value(features)
        return logits, state_value

class PPOAgent:
    """Proximal Policy Optimization agent for the ALE/MsPacman-v5 environment.

    The agent owns the Gymnasium environment, the actor-critic network and
    its optimizer. Each training iteration collects one full episode with the
    current policy, computes GAE advantages, and then runs several epochs of
    shuffled minibatch PPO updates over that episode.
    """

    def __init__(self, config):
        """Create the environment, network and optimizer.

        Args:
            config: object exposing the attributes read in this class
                (CHECKPOINT_DIR, FRAME_SKIP, FRAME_STACK_SIZE, LEARNING_RATE,
                BATCH_SIZE, GAMMA, LAMBDA_GAE, EPS_CLIP, UPDATE_EPOCHS,
                BETA_ENTROPY, VF_COEF, TOTAL_EPISODES, CHECKPOINT_FREQ).
        """
        self.config = config
        os.makedirs(self.config.CHECKPOINT_DIR, exist_ok=True)

        self.env = gym.make("ALE/MsPacman-v5", render_mode="rgb_array",
                            frameskip=self.config.FRAME_SKIP)

        self.action_size = self.env.action_space.n
        self.policy_net = PPOPolicy(self.action_size)

        # One dummy forward pass so the network's variables exist before the
        # optimizer and checkpointing touch them. 88x80 is the frame size
        # produced by preprocess_frame for a 210x160 frame.
        dummy_input = tf.random.normal((1, 88, 80, self.config.FRAME_STACK_SIZE))
        _ = self.policy_net(dummy_input)

        self.optimizer = tf.keras.optimizers.Adam(learning_rate=self.config.LEARNING_RATE)
        self.episode_rewards = []

    def preprocess_frame(self, frame):
        """Downsample an RGB frame and convert it to grayscale in [0, 1].

        Keeps every second row from row 1 up to (not including) row 176 and
        every second column, averages the colour channels, and divides by 255.
        """
        frame = frame[1:176:2, ::2]
        return np.mean(frame, axis=-1, dtype=np.float32) / 255.0

    def create_initial_state(self, frame):
        """Build the first stacked state by repeating `frame` along a new last axis."""
        return np.stack([frame] * self.config.FRAME_STACK_SIZE, axis=-1)

    def update_state(self, state, new_frame):
        """Drop the oldest frame of `state` and append `new_frame` as the newest."""
        return np.concatenate([state[..., 1:], np.expand_dims(new_frame, axis=-1)], axis=-1)

    def collect_trajectories(self):
        """Play one episode with the current policy and record the rollout.

        Returns:
            Tuple `(states, actions, rewards, log_probs, values, dones)` of
            equal-length lists, one entry per environment step. `log_probs`
            and `values` hold plain floats.
        """
        states, actions, rewards, log_probs, values, dones = [], [], [], [], [], []

        frame, _ = self.env.reset()
        state = self.create_initial_state(self.preprocess_frame(frame))

        done = False
        while not done:
            policy_logits, value = self.policy_net(np.expand_dims(state, axis=0))

            # log_softmax is numerically stable where log(softmax(x)) can
            # underflow to -inf for very unlikely actions.
            step_log_probs = tf.nn.log_softmax(policy_logits)[0].numpy()
            action_probs = np.exp(step_log_probs.astype(np.float64))
            action_probs /= action_probs.sum()  # exact normalisation for np.random.choice
            action = int(np.random.choice(self.action_size, p=action_probs))

            # Gymnasium returns (obs, reward, terminated, truncated, info).
            # The episode is over when EITHER flag is set; ignoring
            # `truncated` would keep stepping a finished environment.
            next_frame, reward, terminated, truncated, _ = self.env.step(action)
            done = bool(terminated or truncated)
            next_state = self.update_state(state, self.preprocess_frame(next_frame))

            states.append(state)
            actions.append(action)
            rewards.append(reward)
            log_probs.append(float(step_log_probs[action]))
            values.append(float(value.numpy()[0, 0]))
            dones.append(done)

            state = next_state

        return states, actions, rewards, log_probs, values, dones

    def compute_advantages(self, rewards, values, dones):
        """Compute GAE(lambda) advantages and the matching value targets.

        Args:
            rewards: per-step rewards.
            values: per-step value estimates, same length as `rewards`.
            dones: per-step episode-end flags; a set flag stops bootstrapping
                and resets the advantage accumulation at that step.

        Returns:
            `(advantages, returns)` as arrays with one entry per step, where
            `returns = advantages + values`. The value after the final step
            is taken to be 0.
        """
        rewards = np.asarray(rewards, dtype=np.float32)
        values = np.asarray(values, dtype=np.float32)
        not_done = 1.0 - np.asarray(dones, dtype=np.float32)

        gamma = self.config.GAMMA
        lam = self.config.LAMBDA_GAE
        num_steps = len(rewards)

        advantages = np.zeros(num_steps, dtype=np.float32)
        last_advantage = 0.0
        for t in reversed(range(num_steps)):
            next_value = values[t + 1] if t + 1 < num_steps else 0.0
            delta = rewards[t] + not_done[t] * gamma * next_value - values[t]
            last_advantage = delta + gamma * lam * not_done[t] * last_advantage
            advantages[t] = last_advantage

        returns = advantages + values
        return advantages, returns

    def _update_policy(self, states, actions, old_log_probs, returns, advantages):
        """Run UPDATE_EPOCHS passes of shuffled minibatch PPO updates over one rollout."""
        num_samples = len(states)
        batch_size = max(1, int(self.config.BATCH_SIZE))
        eps_clip = self.config.EPS_CLIP

        for _ in range(self.config.UPDATE_EPOCHS):
            order = np.random.permutation(num_samples)
            for start in range(0, num_samples, batch_size):
                idx = order[start:start + batch_size]
                batch_advantages = advantages[idx]

                with tf.GradientTape() as tape:
                    policy_logits, new_values = self.policy_net(states[idx])
                    all_log_probs = tf.nn.log_softmax(policy_logits)
                    all_probs = tf.exp(all_log_probs)
                    action_mask = tf.one_hot(actions[idx], self.action_size)
                    new_log_probs = tf.reduce_sum(all_log_probs * action_mask, axis=1)

                    ratio = tf.exp(new_log_probs - old_log_probs[idx])
                    clipped_ratio = tf.clip_by_value(ratio, 1 - eps_clip, 1 + eps_clip)
                    policy_loss = -tf.reduce_mean(
                        tf.minimum(ratio * batch_advantages, clipped_ratio * batch_advantages))

                    # Squeeze only the trailing unit axis so a size-1
                    # minibatch stays a vector instead of becoming a scalar.
                    value_error = tf.squeeze(new_values, axis=-1) - returns[idx]
                    value_loss = tf.reduce_mean(tf.square(value_error))

                    # Entropy of each action distribution (sum over actions),
                    # averaged over the minibatch.
                    entropy = -tf.reduce_mean(
                        tf.reduce_sum(all_probs * all_log_probs, axis=1))

                    loss = (policy_loss
                            + self.config.VF_COEF * value_loss
                            - self.config.BETA_ENTROPY * entropy)

                grads = tape.gradient(loss, self.policy_net.trainable_variables)
                self.optimizer.apply_gradients(zip(grads, self.policy_net.trainable_variables))

    def train(self):
        """Train for TOTAL_EPISODES episodes, checkpointing and logging periodically.

        Each iteration collects one episode, computes normalised advantages,
        and performs minibatch PPO updates of size BATCH_SIZE. A checkpoint is
        written every CHECKPOINT_FREQ episodes and once more at the end.
        """
        print("Starting training...")
        start_time = time.time()

        for episode in range(1, self.config.TOTAL_EPISODES + 1):
            states, actions, rewards, log_probs, values, dones = self.collect_trajectories()

            advantages, returns = self.compute_advantages(rewards, values, dones)

            states = np.asarray(states, dtype=np.float32)
            actions = np.asarray(actions, dtype=np.int32)
            log_probs = np.asarray(log_probs, dtype=np.float32)
            returns = np.asarray(returns, dtype=np.float32)
            # Normalise over the whole rollout (not per minibatch).
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-10)
            advantages = advantages.astype(np.float32)

            self._update_policy(states, actions, log_probs, returns, advantages)

            total_reward = sum(rewards)
            self.episode_rewards.append(total_reward)

            if episode % self.config.CHECKPOINT_FREQ == 0:
                self.save_checkpoint(episode)
                gc.collect()

            if episode % 10 == 0 or episode == 1:
                avg_reward = np.mean(self.episode_rewards[-10:])
                print(f"Ep {episode:4d} | R: {total_reward:6.1f} | Avg R: {avg_reward:6.1f}")

        self.save_checkpoint('final')
        print(f"Training completed in {(time.time() - start_time)/60:.2f} minutes")

    def save_checkpoint(self, episode):
        """Save the policy network (without optimizer state) under CHECKPOINT_DIR.

        Args:
            episode: value interpolated into the file name
                `ppo_pacman_ep{episode}.keras` (an episode number or 'final').
        """
        path = os.path.join(self.config.CHECKPOINT_DIR, f"ppo_pacman_ep{episode}.keras")
        self.policy_net.save(path, include_optimizer=False)
        print(f"Saved checkpoint: {path}")

if __name__ == "__main__":
    # Release unreachable Python objects and reset Keras' global state before
    # any new model is built, so reruns of this cell start from a clean slate.
    gc.collect()
    tf.keras.backend.clear_session()
    # Build the configuration (its constructor prints the chosen settings),
    # create the agent, and run the full training loop.
    config = TrainingConfig()
    agent = PPOAgent(config)
    agent.train()


Using configuration (RAM available: 11.0GB):
- Batch size: 32
- Frame stack: 3
Starting training...
Ep    1 | R:  190.0 | Avg R:  190.0
Ep   10 | R:  220.0 | Avg R:  287.0
Ep   20 | R:  220.0 | Avg R:  333.0
Ep   30 | R:  410.0 | Avg R:  347.0
Ep   40 | R:  590.0 | Avg R:  344.0
Saved checkpoint: /content/drive/MyDrive/ppo_pacman_models/ppo_pacman_ep50.keras
Ep   50 | R:  740.0 | Avg R:  613.0
Ep   60 | R:  460.0 | Avg R:  334.0
Ep   70 | R:  190.0 | Avg R:  345.0
Ep   80 | R:  190.0 | Avg R:  392.0
Ep   90 | R:  190.0 | Avg R:  357.0
Saved checkpoint: /content/drive/MyDrive/ppo_pacman_models/ppo_pacman_ep100.keras
Ep  100 | R:  230.0 | Avg R:  302.0
Ep  110 | R:  230.0 | Avg R:  276.0
Ep  120 | R:  180.0 | Avg R:  395.0
Ep  130 | R:  250.0 | Avg R:  414.0
Ep  140 | R:  230.0 | Avg R:  418.0
Saved checkpoint: /content/drive/MyDrive/ppo_pacman_models/ppo_pacman_ep150.keras
Ep  150 | R:  380.0 | Avg R:  434.0
Ep  160 | R:  390.0 | Avg R:  473.0
Ep  170 | R:  240.0 | Avg R:  337.0
Ep  180