In [None]:
# Install the legacy stable-baselines package (with the MPI extra), pinned to 2.8.0.
# NOTE(review): no later cell imports stable_baselines -- the training code uses
# gymnasium + ale_py -- so confirm this install is still required.
!pip install stable-baselines[mpi]==2.8.0
# Download the Atari ROM archive, extract it into ./roms_atari and import the
# ROMs through atari_py. Output and errors of the last two commands are sent
# to /dev/null, so a failure here is silent.
!gdown -q http://www.atarimania.com/roms/Roms.rar
!pip install -q unrar
!mkdir ./roms_atari
!unrar x Roms.rar ./roms_atari > /dev/null 2>&1
!python -m atari_py.import_roms ./roms_atari > /dev/null 2>&1
from google.colab import drive
# Mount Google Drive under /content/gdrive; the save/load cell writes the model
# to /content/gdrive/MyDrive. The recorded output of this cell ends with
# "ValueError: mount failed", so verify the mount before relying on that path.
drive.mount('/content/gdrive', force_remount=True)

Collecting stable-baselines==2.8.0 (from stable-baselines[mpi]==2.8.0)
  Downloading stable_baselines-2.8.0-py3-none-any.whl.metadata (4.3 kB)
Collecting mpi4py (from stable-baselines[mpi]==2.8.0)
  Downloading mpi4py-4.0.3.tar.gz (466 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m466.3/466.3 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
INFO: pip is looking at multiple versions of gym[atari,classic-control] to determine which version is compatible with other requirements. This could take a while.
Collecting gym[atari,classic_control]>=0.10.9 (from stable-baselines==2.8.0->stable-baselines[mpi]==2.8.0)
  Downloading gym-0.26.2.tar.gz (721 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m721.7/721.7 kB[0m [31m30.5 MB/s

ValueError: mount failed

In [None]:
import gymnasium as gym
from gymnasium import wrappers
import ale_py
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [None]:
# Make the ALE environments known to Gymnasium before requesting one by id.
gym.register_envs(ale_py)

# Breakout wrapped (inside-out) in Atari preprocessing and a 4-frame stack.
env = wrappers.FrameStackObservation(
    wrappers.AtariPreprocessing(
        gym.make('BreakoutNoFrameskip-v4', render_mode="rgb_array")
    ),
    4,
)

# Move the leading axis of the first observation to the end; the Q-network
# input is declared as (84, 84, 4), i.e. with the stacked frames last.
obs, info = env.reset()
obs = np.transpose(obs, (1, 2, 0))

In [None]:
# Size of the network's output layer (one Q-value per action).
num_actions = 4


def create_q_model():
    """Build the convolutional Q-network.

    The network takes a (84, 84, 4) input, applies three ReLU convolutions
    (32 filters 8x8 stride 4, 64 filters 4x4 stride 2, 64 filters 3x3
    stride 1), flattens, applies a 512-unit ReLU dense layer and ends in a
    linear dense layer with `num_actions` outputs.

    Returns:
        An uncompiled `keras.Model` mapping the input to the action values.
    """
    frames = layers.Input(shape=(84, 84, 4))

    # (filters, kernel size, stride) for each convolution, in order.
    conv_specs = ((32, 8, 4), (64, 4, 2), (64, 3, 1))
    features = frames
    for n_filters, kernel, stride in conv_specs:
        features = layers.Conv2D(
            n_filters, kernel, strides=stride, activation='relu'
        )(features)

    hidden = layers.Dense(512, activation='relu')(layers.Flatten()(features))
    q_out = layers.Dense(num_actions, activation='linear')(hidden)
    return keras.Model(inputs=frames, outputs=q_out)


# Online network (trained every update) and target network (synced
# periodically by the training loop via set_weights).
model = create_q_model()
model_target = create_q_model()

In [None]:
# Discount factor applied to the bootstrapped future value
gamma = 0.99

# Epsilon-greedy exploration: epsilon starts at 1.0 and is lowered through
# three consecutive phases, each phase's ceiling being the previous floor.
epsilon = 1.0
epsilon_max_1, epsilon_min_1 = 1.0, 0.2
epsilon_max_2, epsilon_min_2 = epsilon_min_1, 0.1
epsilon_max_3, epsilon_min_3 = epsilon_min_2, 0.02

# Total drop of each phase; the training loop subtracts
# interval / epsilon_greedy_frames per frame.
epsilon_interval_1 = epsilon_max_1 - epsilon_min_1
epsilon_interval_2 = epsilon_max_2 - epsilon_min_2
epsilon_interval_3 = epsilon_max_3 - epsilon_min_3

# Length (in frames) of one decay phase
epsilon_greedy_frames = 1_000_000.0

# Frames at the start during which every action is random
epsilon_random_frames = 50_000

# Maximum number of transitions kept in the replay buffer
max_memory_length = 190_000

# Transitions sampled from the replay buffer per gradient step
batch_size = 32
max_steps_per_episode = 10_000

# One gradient step is taken every `update_after_actions` frames
update_after_actions = 20

# Frames between copies of the online weights into the target network
update_target_network = 10_000

# The DeepMind paper uses RMSProp; Adam is used here because it improves
# training time.
optimizer = keras.optimizers.Adam(clipnorm=1.0, learning_rate=0.00025)

# Huber loss for stability
loss_function = keras.losses.Huber()

In [None]:
# Replay buffer kept as parallel lists: index i across the five lists
# describes one transition (state, action, reward, next state, done flag).
action_history = []
state_history = []
state_next_history = []
rewards_history = []
done_history = []

# Episode bookkeeping
episode_reward_history = []  # returns of the most recent (up to 100) episodes
running_reward = 0           # mean of episode_reward_history
episode_count = 0
frame_count = 0              # environment steps taken across all episodes
loss = float("nan")          # last training loss; defined up front for the progress print

while True:
    state, _ = env.reset()
    # Move the leading axis to the end (same layout the model is fed below)
    state = np.transpose(state, axes=[1, 2, 0])
    episode_reward = 0

    for timestep in range(1, max_steps_per_episode):
        frame_count += 1

        # Epsilon-greedy action selection; always random during the warm-up
        if frame_count < epsilon_random_frames or epsilon > np.random.rand():
            action = np.random.choice(num_actions)
        else:
            state_tensor = tf.convert_to_tensor(state)
            state_tensor = tf.expand_dims(state_tensor, 0)
            action_probs = model(state_tensor, training=False)
            action = tf.argmax(action_probs[0]).numpy()

        # Decay the probability of taking a random action. Exactly one phase
        # applies per frame (if/elif/else), so the frames that fall exactly on
        # a phase boundary are no longer skipped.
        if frame_count < epsilon_greedy_frames:
            epsilon -= epsilon_interval_1 / epsilon_greedy_frames
            epsilon = max(epsilon, epsilon_min_1)
        elif frame_count < 2 * epsilon_greedy_frames:
            epsilon -= epsilon_interval_2 / epsilon_greedy_frames
            epsilon = max(epsilon, epsilon_min_2)
        else:
            epsilon -= epsilon_interval_3 / epsilon_greedy_frames
            epsilon = max(epsilon, epsilon_min_3)

        state_next, reward, done, truncated, _ = env.step(action)
        state_next = np.transpose(state_next, axes=[1, 2, 0])

        episode_reward += reward

        # Store the transition in the replay buffer
        action_history.append(action)
        state_history.append(state)
        state_next_history.append(state_next)
        done_history.append(done or truncated)
        rewards_history.append(reward)
        state = state_next

        # One gradient step every `update_after_actions` frames, once the
        # buffer holds more than a batch of transitions
        if frame_count % update_after_actions == 0 and len(done_history) > batch_size:
            indices = np.random.choice(range(len(done_history)), size=batch_size)
            state_sample = np.array([state_history[i] for i in indices])
            state_next_sample = np.array([state_next_history[i] for i in indices])
            rewards_sample = [rewards_history[i] for i in indices]
            action_sample = [action_history[i] for i in indices]
            done_sample = tf.convert_to_tensor([float(done_history[i]) for i in indices])

            # Bootstrapped target from the target network; verbose=0 keeps
            # predict() from printing a progress bar on every update.
            future_rewards = model_target.predict(state_next_sample, verbose=0)
            updated_q_values = rewards_sample + gamma * tf.reduce_max(future_rewards, axis=1)
            # Transitions flagged as done get a fixed target of -1
            updated_q_values = updated_q_values * (1 - done_sample) - done_sample

            # One-hot mask selecting the Q-value of the action actually taken
            masks = tf.one_hot(action_sample, num_actions)
            with tf.GradientTape() as tape:
                q_values = model(state_sample)
                # Sum over the action axis only, giving one Q-value per sample
                # (shape (batch_size,)). Without axis=1 the whole batch is
                # collapsed into a single scalar that is then broadcast
                # against every target.
                q_action = tf.reduce_sum(tf.multiply(q_values, masks), axis=1)
                loss = loss_function(updated_q_values, q_action)
            grads = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))

        # Periodically sync the target network and report progress
        if frame_count % update_target_network == 0:
            model_target.set_weights(model.get_weights())
            template = "running reward: {:.2f} at episode {}, frame count {}, epsilon {:.3f}, loss {:.5f}"
            print(template.format(running_reward, episode_count, frame_count, epsilon, loss))

        # Drop the oldest transition once the buffer exceeds its capacity
        if len(rewards_history) > max_memory_length:
            del rewards_history[:1]
            del state_history[:1]
            del state_next_history[:1]
            del action_history[:1]
            del done_history[:1]

        # A truncated episode also needs a reset before the next step, so it
        # ends the episode just like a terminal state does.
        if done or truncated:
            break

    # Update running reward to check condition for solving
    episode_reward_history.append(episode_reward)
    if len(episode_reward_history) > 100:
        del episode_reward_history[:1]
    running_reward = np.mean(episode_reward_history)

    episode_count += 1

    if running_reward > 3:  # Condition to consider the task solved
        print("Solved at episode {}!".format(episode_count))
        break

In [None]:
# Persist the trained online network to Google Drive in the .keras format.
# To work with another checkpoint (e.g. 'breakout_alpha_progress_21.44'),
# change `model_name` here.
model_name = 'breakout_model_1'
path = f"/content/gdrive/MyDrive/{model_name}.keras"
model.save(path)

# Read the model back from the file that was just written, replacing the
# in-memory `model` with the deserialized copy.
model = tf.keras.models.load_model(path)

In [None]:
def make_env():
  """Create the Breakout evaluation environment.

  Registers the ALE environments, builds 'BreakoutNoFrameskip-v4' in
  rgb_array render mode, wraps it in AtariPreprocessing and a 4-frame
  FrameStackObservation, resets it once and returns the wrapped env.
  Observations are returned as the wrappers produce them (no transpose).
  """
  gym.register_envs(ale_py)
  base_env = gym.make('BreakoutNoFrameskip-v4', render_mode="rgb_array")
  stacked_env = wrappers.FrameStackObservation(wrappers.AtariPreprocessing(base_env), 4)
  # Initial reset; the first observation is not needed by the caller here.
  stacked_env.reset()
  return stacked_env
import matplotlib.pyplot as plt

def show_frame(frame):
    """Render one frame into the live notebook display.

    Draws `frame` on the module-level figure `fig` and pushes the result to
    the display handle `hfig` (both are created right after this definition),
    then clears the figure so the next call starts from an empty canvas.

    The figure is addressed through its own methods rather than through
    pyplot's "current figure", so creating or closing other figures between
    calls does not redirect the drawing.

    Args:
        frame: image array accepted by `Axes.imshow` (here: the value
            returned by `env.render()`).
    """
    ax = fig.add_subplot(111)
    ax.imshow(frame)
    ax.axis('off')
    # The original also called `plt.imshow()` with no arguments here, which
    # raises TypeError (the image argument is required); it has been removed.
    fig.canvas.draw()
    hfig.update(fig)
    fig.clf()  # Clear figure for next frame
# Live-view figure and its display handle (used by show_frame)
fig = plt.figure()
hfig = display(fig, display_id=True)

# Evaluation environment; RecordVideo writes the recording into ./vid1
env = make_env()
env = gym.wrappers.RecordVideo(env, "./vid1")

# Reset exactly once and keep both return values (the original reset twice
# and then overwrote `info` with 0).
obs, info = env.reset()
observation = obs  # left in the wrappers' layout; transposed just before the model call

reward_window = []          # rewards of the last (up to 200) steps
reward_signal_history = []  # per-step rewards since the last plot
epsilon_history = []        # per-step epsilon since the last plot

hits = []   # rewards collected since the last "done"
bltd = 10   # total bricks to destroy

for i_episode in range(1):
    reward_window = []
    epsilon = 0
    for t in range(4000):

        # Near-greedy policy: random action with probability epsilon
        if epsilon > np.random.rand(1)[0]:
            action = np.random.choice(num_actions)
        else:
            state_tensor = tf.convert_to_tensor(observation)
            # Move the leading axis to the end, as done for training states
            state_tensor = tf.transpose(state_tensor, perm=[1, 2, 0])
            state_tensor = tf.expand_dims(state_tensor, 0)
            action_probs = model(state_tensor, training=False)
            action = tf.argmax(action_probs[0]).numpy()

        observation, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated

        frame = env.render()
        show_frame(frame)

        hits.append(reward)
        reward_window.append(reward)
        if len(reward_window) > 200:
            del reward_window[:1]
        # After 200 consecutive steps without reward, raise epsilon so a
        # random action can get the agent unstuck
        if len(reward_window) == 200 and np.sum(reward_window) == 0:
            epsilon = 0.01
        else:
            epsilon = 0.0001

        epsilon_history.append(epsilon)
        reward_signal_history.append(reward)

        if done:
            print("Lost one life after {} timesteps".format(t+1))
            print(info)

            # Plot epsilon and reward signal on a figure of their own. It is
            # bound to a separate name so the live-view `fig` used by
            # show_frame is not replaced, and it is displayed and closed
            # individually instead of via plt.show(), which would act on
            # every open figure including the live view.
            stats_fig, ax = plt.subplots(figsize=(20, 3))
            ax.plot(epsilon_history, color="red")
            ax.set_ylabel("epsilon", color="red", fontsize=14)
            ax2 = ax.twinx()
            ax2.plot(reward_signal_history, color="blue")
            ax2.set_ylabel("reward_signal", color="blue", fontsize=14)
            display(stats_fig)
            plt.close(stats_fig)

            epsilon_history = []
            reward_signal_history = []

            bltd = bltd - np.sum(hits)
            hits = []
            print("Bricks left to destroy ", bltd)

            # Stop when no lives remain. Both the current ('lives') and the
            # legacy ('ale.lives') info keys are checked; the original looked
            # only at the legacy key inside a bare except, so a missing key
            # was silently ignored and the loop never stopped here.
            lives = info.get('lives', info.get('ale.lives'))
            if lives == 0:
                break

            # Start the next episode from its real first observation; the
            # original discarded the reset result and kept feeding the model
            # the last observation of the finished episode.
            observation, info = env.reset()
env.close()

In [None]:
# `%matplotlib inline` (the IPython magic selecting inline figure rendering)
# is left disabled here; current Jupyter renders matplotlib inline by default.
gym.register_envs(ale_py)
# Figure plus a display handle so the same output area can be refreshed in place
fig = plt.figure()
hfig = display(fig, display_id=True)

# Unwrapped Breakout environment (no preprocessing or frame stacking applied here)
env = gym.make('ALE/Breakout-v5', render_mode="rgb_array")
obs, info = env.reset()
# Play 1000 steps with randomly sampled actions, redrawing the observation each step
for _ in range(1000):
    action = env.action_space.sample()
    obs, reward, terminated, truncated, info = env.step(action)
    plt.clf()
    # presumably `obs` is an image array that imshow can draw directly -- confirm
    plt.imshow(obs)
    fig.canvas.draw()
    hfig.update(fig)
    # print(obs, reward, terminated, truncated, info)
    # Start a new episode whenever the current one ends for either reason
    if terminated or truncated:
        obs, info = env.reset()
# NOTE(review): this reset immediately before close() looks unnecessary -- confirm
env.reset()
env.close()