In [1]:
# Imports for the DQN Breakout agent, followed by file-based logging configuration
import sys
import os
import random
import numpy as np
import pandas as pd
import gym
import time
import imageio
import ale_py
from collections import deque
from tensorflow.keras import optimizers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import matplotlib.pyplot as plt
from IPython.display import clear_output
import logging
import matplotlib.font_manager
from skimage.transform import resize
from skimage.color import rgb2gray

# Send every record at DEBUG level or above to debug.log, overwriting the file on each run
logging.basicConfig(
    filename="debug.log",
    filemode="w",
    level=logging.DEBUG,
    format="%(asctime)s - %(levelname)s - %(message)s",
)

# Keep Matplotlib out of the DEBUG log: raise the package logger to WARNING
logging.getLogger('matplotlib').setLevel(logging.WARNING)

# Also pin the font manager logger to WARNING so its DEBUG chatter is dropped
logging.getLogger('matplotlib.font_manager').setLevel(logging.WARNING)

In [2]:
class DQN:
    """Deep Q-Network agent with an online network and a periodically synced target network.

    States are flattened, downsampled grayscale frames of ``state_size``
    values (see ``preprocess_state``). Transitions are kept in a bounded
    replay memory and sampled in minibatches by ``replay``.
    """

    # Downsampled frame shape (rows, cols); 21 * 40 = 840 network inputs.
    FRAME_SHAPE = (21, 40)
    # Number of replay() calls over which epsilon is annealed down to epsilon_min.
    EPSILON_DECAY_STEPS = 500000
    # Maximum number of preprocessed frames kept in the memoization cache.
    PREPROCESS_CACHE_MAX = 10000

    def __init__(self, env):
        """Build the agent for ``env``.

        Args:
            env: Environment exposing ``action_space.n`` (number of discrete actions).
        """
        self.env = env
        self.memory = deque(maxlen=10000)
        self.preprocess_cache = {}
        self.gamma = 0.9
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        # Linear annealing from the initial epsilon down to epsilon_min.
        # (epsilon_min / 500000 would lower epsilon by only 2e-8 per replay,
        # leaving the agent almost fully random for the whole run.)
        self.epsilon_decay = (self.epsilon - self.epsilon_min) / self.EPSILON_DECAY_STEPS
        self.batch_size = 16
        self.train_start = 1000
        # Must match what preprocess_state() produces: a flattened FRAME_SHAPE image.
        self.state_size = self.FRAME_SHAPE[0] * self.FRAME_SHAPE[1]
        self.action_size = self.env.action_space.n
        self.learning_rate = 0.005

        self.evaluation_model = self.create_model()
        self.target_model = self.create_model()

    def create_model(self):
        """Create and compile the fully connected Q-network.

        Returns:
            A compiled Keras ``Sequential`` model mapping a ``state_size``
            vector to one Q-value per action.
        """
        model = Sequential()
        model.add(Dense(128 * 2, input_dim=self.state_size, activation='relu'))
        model.add(Dense(128 * 2, activation='relu'))
        model.add(Dense(128 * 2, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(
            # `rho` is RMSprop's moving-average discount. The former `decay=0.99`
            # is learning-rate decay (and is rejected by newer Keras optimizers).
            optimizer=optimizers.RMSprop(learning_rate=self.learning_rate, rho=0.99, epsilon=1e-3),
            loss='mean_squared_error'
        )
        return model

    def choose_action(self, state, steps):
        """Pick an action epsilon-greedily.

        Args:
            state: Array of shape (1, state_size).
            steps: Unused; kept so existing callers keep working.

        Returns:
            The chosen action index as an int.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        # verbose=0 stops Keras from printing a progress bar on every step.
        q_values = self.evaluation_model.predict(state, verbose=0)
        return int(np.argmax(q_values))

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory.

        Both states are stored reshaped to (1, state_size).
        """
        state = np.asarray(state).reshape(1, self.state_size)
        next_state = np.asarray(next_state).reshape(1, self.state_size)

        self.memory.append((state, action, reward, next_state, done))

    def replay(self):
        """Train the online network on one minibatch sampled from memory.

        Does nothing until the memory holds at least ``train_start``
        transitions. The whole minibatch is evaluated and fitted in single
        batched calls rather than one predict/fit per sample. Each call that
        trains also lowers epsilon by ``epsilon_decay``, never below
        ``epsilon_min``.
        """
        if len(self.memory) < self.train_start:
            return

        minibatch = random.sample(self.memory, min(len(self.memory), self.batch_size))

        states = np.vstack([np.asarray(t[0]).reshape(1, self.state_size) for t in minibatch])
        next_states = np.vstack([np.asarray(t[3]).reshape(1, self.state_size) for t in minibatch])
        actions = np.array([t[1] for t in minibatch], dtype=int)
        rewards = np.array([t[2] for t in minibatch], dtype=float)
        dones = np.array([t[4] for t in minibatch], dtype=bool)

        # Start from the current predictions so only the taken action's Q-value changes.
        targets = np.array(self.evaluation_model.predict(states, verbose=0), dtype=float)
        next_q = np.asarray(self.target_model.predict(next_states, verbose=0), dtype=float)
        bootstrapped = rewards + self.gamma * next_q.max(axis=1)
        # Terminal transitions use the raw reward; others bootstrap from the target net.
        targets[np.arange(len(minibatch)), actions] = np.where(dones, rewards, bootstrapped)

        self.evaluation_model.fit(states, targets, batch_size=len(minibatch), epochs=1, verbose=0)

        if self.epsilon > self.epsilon_min:
            # Clamp so a final step cannot undershoot epsilon_min.
            self.epsilon = max(self.epsilon_min, self.epsilon - self.epsilon_decay)

    def target_train(self):
        """Copy the online network's weights into the target network."""
        self.target_model.set_weights(self.evaluation_model.get_weights())

    def save_model(self, path):
        """Save the online network to ``path``, creating its parent directory if needed."""
        directory = os.path.dirname(path)
        if directory:
            os.makedirs(directory, exist_ok=True)
        self.evaluation_model.save(path)

    def visualize(self, graph_reward, graph_episodes, output_dir="plots", moving_avg_window=None):
        """Save a rewards-per-episode plot (and optionally a moving average) as PNG files.

        Args:
            graph_reward: Sequence of per-episode total rewards.
            graph_episodes: Sequence of episode numbers.
            output_dir: Directory for the PNG files; created if missing.
            moving_avg_window: If given and no larger than the number of
                episodes, also save a moving-average plot with this window.
        """
        os.makedirs(output_dir, exist_ok=True)

        # Ensure graph_episodes and graph_reward have the same length
        min_length = min(len(graph_episodes), len(graph_reward))
        graph_episodes = graph_episodes[:min_length]
        graph_reward = graph_reward[:min_length]

        # First plot: Rewards per Episode
        plt.figure(figsize=(10, 6))
        plt.plot(graph_episodes, graph_reward, marker='o')
        plt.xlabel('Episodes')
        plt.ylabel('Rewards')
        plt.title('Rewards per Episode')
        plt.grid(True)
        plt.savefig(os.path.join(output_dir, f"rewards_per_episode_{len(graph_episodes)}.png"))
        plt.close()

        # Second plot (optional): Moving Average of Rewards per Episode
        if moving_avg_window and min_length >= moving_avg_window:
            kernel = np.ones(moving_avg_window) / moving_avg_window
            moving_avg_rewards = np.convolve(graph_reward, kernel, mode='valid')
            plt.figure(figsize=(10, 6))
            # Each average is plotted at the last episode of its window.
            plt.plot(graph_episodes[moving_avg_window - 1:], moving_avg_rewards, color='orange')
            plt.xlabel('Episodes')
            plt.ylabel('Moving Average of Rewards')
            plt.title('Moving Average of Rewards per Episode')
            plt.grid(True)
            plt.savefig(os.path.join(output_dir, f"moving_avg_rewards_{len(graph_episodes)}.png"))
            plt.close()

    def preprocess_state(self, state):
        """Convert a raw RGB frame to a flattened, downsampled grayscale state.

        Results are memoized by the hash of the frame's bytes. The cache is
        capped at ``PREPROCESS_CACHE_MAX`` entries, evicting the oldest entry
        first, so long runs do not grow memory without bound.

        Args:
            state: RGB frame as a NumPy array.

        Returns:
            Array of shape (1, state_size).
        """
        state_key = hash(state.tobytes())  # Key identifying this exact frame

        cached = self.preprocess_cache.get(state_key)
        if cached is not None:
            return cached

        gray = rgb2gray(state)  # Convert to grayscale
        small = resize(gray, self.FRAME_SHAPE)  # Downsample to the network's input resolution
        processed_state = small.reshape(1, self.state_size)  # Flatten to the model input size

        if len(self.preprocess_cache) >= self.PREPROCESS_CACHE_MAX:
            # Dicts preserve insertion order, so the first key is the oldest entry.
            self.preprocess_cache.pop(next(iter(self.preprocess_cache)))
        self.preprocess_cache[state_key] = processed_state
        return processed_state

In [3]:
def save_gif(frames, path, fps=30):
    """Write ``frames`` to an animated GIF at ``path``.

    Args:
        frames: Sequence of image frames to encode.
        path: Output file path; its parent directory is created if missing.
        fps: Frames per second passed to ``imageio.mimsave``.

    Returns:
        None. Nothing is written when ``frames`` is empty.
    """
    if len(frames) == 0:
        print(f"No frames to save, skipping GIF: {path}")
        return

    # mimsave does not create missing directories (e.g. "gifs/"), so do it here.
    directory = os.path.dirname(path)
    if directory:
        os.makedirs(directory, exist_ok=True)

    imageio.mimsave(path, frames, fps=fps)
    print(f"Saved GIF: {path}")

In [4]:
def main(rom_path=None):
    """Train a DQN agent on Breakout, periodically saving reward plots and GIFs.

    Args:
        rom_path: Optional path to the Breakout ROM. When omitted, the
            ``BREAKOUT_ROM_PATH`` environment variable is used, falling back
            to the path originally hard-coded in this notebook.

    Returns:
        The DQN agent after the final episode.
    """
    if rom_path is None:
        rom_path = os.environ.get(
            "BREAKOUT_ROM_PATH",
            "/Users/bradrichardson/.pyenv/versions/3.11.9/lib/python3.11/site-packages/AutoROM/roms/breakout.bin",
        )

    env = gym.make('Breakout-v4', render_mode='rgb_array')

    episodes = 2000
    plot_interval = 10
    gif_interval = 10
    graph_reward = []
    graph_episodes = []
    total_steps = 0
    early_stopping_threshold = 200  # Stop an episode once it has earned this much reward
    max_steps_per_episode = 3000  # Hard cap on steps per episode

    try:
        # Reload the ROM into the environment's own emulator. The separate
        # ALEInterface the notebook used to create was never used by the env.
        if os.path.exists(rom_path):
            env.unwrapped.ale.loadROM(rom_path)
        else:
            logging.warning(f"ROM not found at {rom_path}; using the ROM loaded by gym.make")

        dqn_agent = DQN(env)

        for i_episode in range(episodes):
            episode_number = i_episode + 1
            # Only the episode that will be written out is recorded; buffering
            # frames for every episode in the interval can use gigabytes.
            record_gif = episode_number % gif_interval == 0
            gif_frames = []

            start_time = time.time()
            reset_result = env.reset()
            # Newer gym returns (observation, info); older gym returns the observation.
            first_frame = reset_result[0] if isinstance(reset_result, tuple) else reset_result
            cur_state = dqn_agent.preprocess_state(first_frame)  # Use the caching method

            total_reward = 0

            logging.debug(f"Starting episode {episode_number}")

            for step in range(max_steps_per_episode):  # Use max_steps_per_episode as a limit
                action = dqn_agent.choose_action(cur_state, total_steps)

                step_result = env.step(action)

                if len(step_result) == 5:
                    new_frame, reward, terminated, truncated, _ = step_result
                    done = terminated or truncated
                else:
                    new_frame, reward, done, _ = step_result
                    terminated = done

                # The step observation is the frame itself; indexing it with [0]
                # would pass only the first pixel row to preprocessing.
                new_state = dqn_agent.preprocess_state(new_frame)

                total_reward += reward
                # Clip positive rewards to 1 before storing the transition.
                clipped_reward = 1 if reward > 0 else reward

                dqn_agent.remember(cur_state, action, clipped_reward, new_state, terminated)

                if total_steps > 100:
                    if total_steps % 10 == 0:
                        dqn_agent.replay()
                    if total_steps % 10000 == 0:
                        dqn_agent.target_train()

                cur_state = new_state
                total_steps += 1

                # Capture frame for GIF
                if record_gif:
                    gif_frames.append(env.render())

                # Check for early stopping
                if total_reward >= early_stopping_threshold:
                    logging.debug(f"Early stopping at step {step + 1} with total reward {total_reward}")
                    break

                if done:
                    # No reset here: the next episode resets the environment itself.
                    break

            # Append graph data after every episode
            graph_reward.append(total_reward)
            graph_episodes.append(episode_number)

            elapsed = time.time() - start_time

            logging.debug(f"Episode {episode_number} completed with total reward {total_reward}")
            logging.debug(f"Rewards per episode is now {graph_reward[-1]}")
            logging.debug(f"Episode {episode_number} took {elapsed:.2f} seconds")

            # Update the plot every plot_interval episodes
            if episode_number % plot_interval == 0:
                output_dir = "plots"
                dqn_agent.visualize(graph_reward, graph_episodes, output_dir=output_dir)
                logging.debug(f"Plots sent to {output_dir} folder")

            # Save the GIF every gif_interval episodes
            if record_gif and gif_frames:
                gif_path = f"gifs/breakout_episode_{episode_number}.gif"
                os.makedirs(os.path.dirname(gif_path), exist_ok=True)
                save_gif(gif_frames, gif_path)
                logging.debug(f"GIF sent to {gif_path} folder")

        logging.debug("Training completed")
        return dqn_agent
    finally:
        env.close()

In [5]:
%%capture
if __name__ == '__main__':
    main()
    dqn_agent.save_model("model/breakout_dqn.h5")

# log_file.close()

A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7)
[Powered by Stella]
Game console created:
  ROM file:  /Users/bradrichardson/.pyenv/versions/3.11.9/lib/python3.11/site-packages/AutoROM/roms/breakout.bin
  Cart Name: Breakout - Breakaway IV (1978) (Atari)
  Cart MD5:  f34f08e5eb96e500e851a80be3277a56
  Display Format:  AUTO-DETECT ==> NTSC
  ROM Size:        2048
  Bankswitch Type: AUTO-DETECT ==> 2K

Running ROM file...
Random seed is 1723440468
A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7)
[Powered by Stella]


KeyboardInterrupt: 