### Here, I used PPO with CnnPolicy

In [2]:
# Usual fix needed for GPT-generated code: import ale_py and explicitly
# register its Atari environments with Gymnasium
import gymnasium as gym
import ale_py
gym.register_envs(ale_py)

# Import Stable-Baselines3 => PPO as the base algorithm
# DummyVecEnv => wraps the environment in SB3's vectorized-environment interface
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
import time

In [3]:

class SurvivalAssaultWrapper(gym.Wrapper):
    """Reward wrapper that pays a fixed bonus for every step the agent survives.

    The reward produced by the wrapped environment is discarded and replaced
    with ``survival_reward``, so the agent is trained purely to stay alive.

    Args:
        env: The environment to wrap.
        survival_reward: Reward returned on every step (default ``0.1``).

    Attributes:
        survival_reward: The per-step reward handed to the agent.
        time_alive: Number of steps taken since the last ``reset()``.
    """

    def __init__(self, env, survival_reward=0.1):
        super().__init__(env)
        self.survival_reward = survival_reward
        self.time_alive = 0

    def reset(self, **kwargs):
        """Reset the wrapped environment and restart the survival counter.

        All keyword arguments are forwarded unchanged to the wrapped
        environment's ``reset``; its return value is passed through as-is.
        """
        # Without this, time_alive would keep growing across episodes.
        self.time_alive = 0
        return self.env.reset(**kwargs)

    def step(self, action):
        """Advance the wrapped environment by one step.

        Returns:
            The wrapped environment's ``(obs, reward, terminated, truncated,
            info)`` tuple, with ``reward`` replaced by ``survival_reward``.
        """
        # The game's own reward is intentionally ignored.
        obs, _game_reward, terminated, truncated, info = self.env.step(action)
        self.time_alive += 1
        return obs, self.survival_reward, terminated, truncated, info
    
# Build the Assault environment with render_mode="rgb_array" and wrap it so
# that every step yields the survival reward.
# Note: no video recorder is attached in this cell.
env = SurvivalAssaultWrapper(
    gym.make("AssaultNoFrameskip-v4", render_mode="rgb_array")
)

In [4]:
# Build the PPO agent. CnnPolicy is most likely a better fit for Atari
# environments than the basic MlpPolicy.
model = PPO(policy='CnnPolicy', env=env, verbose=1)

# Author's timings:
#   CPU: ~110k timesteps = ~40 min (MlpPolicy; agent performance was horrible,
#        it crashed into a wall within a second)
#   GPU: ~120k timesteps = ~28-29 min (CnnPolicy, heavier to train due to the CNN)
# NOTE(review): the log captured for this run reports "Using cpu device" --
# confirm which device was actually used.
model.learn(total_timesteps=120_000)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 2.05e+03 |
|    ep_rew_mean     | 205      |
| time/              |          |
|    fps             | 103      |
|    iterations      | 1        |
|    time_elapsed    | 19       |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 2.05e+03    |
|    ep_rew_mean          | 205         |
| time/                   |             |
|    fps                  | 20          |
|    iterations           | 2           |
|    time_elapsed         | 202         |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.019265182 |
|    clip_fraction        | 0.127       |
|    clip_range      

<stable_baselines3.ppo.ppo.PPO at 0x1ac7d8e5b40>

#### Saving the model after training

In [5]:
# Persist the trained PPO model to disk under the name "custom_assault_v4_ppo"
model.save(path="custom_assault_v4_ppo")