In [None]:
# pip install gymnasium
# pip install "gymnasium[atari, accept-rom-license]"
# pip install ale-py
# pip install stable-baselines3
# pip install "stable-baselines3[extra]"
# pip install tensorboard


In [None]:
# Example prompt for Riverraid:

# I want to train an RL agent with stable baselines. Riverraid as the environment 
# (Farama Gymnasium). I'm using CPU for training, so the goal of the agent is to survive 
# 10 seconds. The total training time should be less than 45 minutes on a CPU.

# 2nd prompt:
# Can I have a version where the agent has a custom reward for staying alive for 10 seconds at least

In [1]:
# import gymnasium 
import gymnasium as gym
from gymnasium.utils import play
import numpy as np
import os

import ale_py
gym.register_envs(ale_py)

# import stable baselines => PPO as the base algorithm
# DummyVecEnv => runs the environment(s) one after another in this process, behind the vectorized API that SB3 expects
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
import time
#import matplotlib.pyplot as plt
#import time
#from IPython.display import clear_output

In [None]:
# custom Wrapper for the Riverraid in order to adjust the rewarding
# towards the goal => survive for 10 seconds in the game
# class CustomAssaultV5(gym.Env):
class CustomRiverraidV5(gym.Env):
    """Riverraid ('ALE/Riverraid-v5') with an extra reward for staying alive.

    The wrapped environment's reward is passed through unchanged until the
    agent has survived ``survival_seconds`` of game time in the current
    episode; from that point on, every further non-terminal step earns an
    additional ``survival_bonus``.

    Survival is tracked in agent steps, so the seconds threshold is converted
    with ``steps = ceil(seconds * FRAMES_PER_SECOND / frameskip)``. With the
    defaults (10 s, 60 fps, frameskip 4) the bonus starts after 150 steps.

    Args:
        survival_seconds: Game-time seconds the agent must survive before the
            bonus is paid. Must be >= 0.
        survival_bonus: Amount added to the reward on each qualifying step.
        frameskip: Emulator frames per agent step (integer >= 1). Passed to
            ``gym.make`` explicitly so the seconds-to-steps conversion always
            matches the emulator setting; 4 is the v5 default.
        render_mode: Forwarded to ``gym.make`` (e.g. ``"human"`` or
            ``"rgb_array"``); ``None`` disables rendering.

    Raises:
        ValueError: If ``survival_seconds`` is negative or ``frameskip`` is
            smaller than 1.
    """

    # Game-time frame rate of the emulated console (NTSC), used to convert
    # seconds into emulator frames.
    FRAMES_PER_SECOND = 60

    def __init__(self, survival_seconds=10, survival_bonus=1, frameskip=4,
                 render_mode=None):
        super().__init__()
        if survival_seconds < 0:
            raise ValueError("survival_seconds must be >= 0")
        if frameskip < 1:
            raise ValueError("frameskip must be >= 1")

        self.env = gym.make(
            'ALE/Riverraid-v5',
            frameskip=frameskip,
            render_mode=render_mode,
        )
        self.render_mode = render_mode

        self.survival_bonus = survival_bonus
        # Threshold in agent steps. At least 1 so the terminal step (where the
        # counter is reset to 0) can never qualify for the bonus.
        self.survival_steps = max(
            1, int(np.ceil(survival_seconds * self.FRAMES_PER_SECOND / frameskip))
        )
        self.time_alive = 0  # agent steps survived in the current episode

        # Expose the same action and observation spaces as the wrapped environment
        self.action_space = self.env.action_space
        self.observation_space = self.env.observation_space

    def reset(self, *args, **kwargs):
        """Start a new episode and clear the survival counter.

        All arguments are forwarded to the wrapped environment's ``reset``,
        and its return value is returned unchanged.
        """
        self.time_alive = 0
        return self.env.reset(*args, **kwargs)

    def step(self, action):
        """Advance the wrapped environment by one step and shape the reward.

        Returns the wrapped environment's
        ``(obs, reward, terminated, truncated, info)`` tuple, with
        ``survival_bonus`` added to ``reward`` once the survival threshold
        has been reached.
        """
        obs, reward, terminated, truncated, info = self.env.step(action)

        # `terminated` is the wrapped environment's end-of-episode flag; only
        # then is the counter cleared. A truncated step still counts as survived.
        if terminated:
            self.time_alive = 0
        else:
            self.time_alive += 1

        if self.time_alive >= self.survival_steps:
            reward += self.survival_bonus

        return obs, reward, terminated, truncated, info

    def render(self, *args, **kwargs):
        """Delegate rendering to the wrapped environment."""
        return self.env.render(*args, **kwargs)

    def close(self):
        """Release the wrapped environment's resources."""
        self.env.close()



# Build the vectorized environment that Stable Baselines3 works with.
# DummyVecEnv takes a list of zero-argument factories and calls each one to
# create an environment. Passing the class itself as the factory avoids a
# lambda that closes over the `env` name, which this very statement rebinds
# to the vectorized wrapper.
env = DummyVecEnv([CustomRiverraidV5])

In [None]:
# PPO agent on the vectorized environment. CnnPolicy is most likely a better
# fit for Atari environments than the basic MlpPolicy.
model = PPO(policy="CnnPolicy", env=env, verbose=1)

# Training budget, with the author's measured timings for reference:
#   CPU, MlpPolicy: ~110k timesteps in ~40 min (agent performance was horrible,
#                   crashing into a wall within a second)
#   GPU, CnnPolicy: ~120k timesteps in ~28-29 min (heavier to train due to the CNN)
model.learn(total_timesteps=120_000)

In [None]:
# Persist the trained agent under the name "custom_riverraid_v5_ppo".
model.save(path="custom_riverraid_v5_ppo")