In [1]:
%pip install gymnasium
%pip install gymnasium[atari]
%pip install gymnasium[accept-rom-license]
%pip install --upgrade ipykernel



In [2]:
import gymnasium as gym
import random
import matplotlib.pyplot as plt
from collections import deque, namedtuple
import tensorflow as tf
import numpy as np
from tqdm import tqdm
from matplotlib import get_backend

# set up matplotlib
is_ipython = 'inline' in get_backend()
if is_ipython:
    from IPython import display

plt.ion()

2023-11-15 14:17:26.497410: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-15 14:17:26.497463: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-15 14:17:26.497496: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-11-15 14:17:26.507018: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from tensorflow.tsl.python.lib.core impo

<contextlib.ExitStack at 0x7aa932d3a020>

In [3]:
Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward', 'final_state_bool'))
ramDict = dict(player_y=51, player_x=46, enemy_y=50, enemy_x=45, ball_x=49, ball_y=54) # Retrieved from github.com/mila-iqia/atari-representation-learning

class ReplayMemory(object):

    def __init__(self, capacity):
        self.memory = deque([], maxlen=capacity)

    def push(self, *args):
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        sample = random.sample(self.memory, batch_size)
        return sample

    def __len__(self):
        return len(self.memory)

def create_model(num_actions):
    model = tf.keras.models.Sequential([
        tf.keras.layers.Dense(300, activation='relu'),
        tf.keras.layers.Dense(num_actions)
    ])
    return model

def map_action(action):
    action_map = {0:0, 1:4, 2:5}
    return action_map[action]

def moving_average(a, n=3):
    ret = np.cumsum(a, dtype=float)
    ret[n:] = ret[n:] - ret[:-n]
    return ret[n - 1:] / n

def process_state(state):
    state = state.reshape(1, -1)
    state = state/255
    return state

In [4]:
env = gym.make("ALE/Pong-ram-v5") # Since we aren't using a convolution layer, we can use the ram version of the game
state, info = env.reset()
state = process_state(state)
n_actions = 3 # Reduce the action space to only the relevant actions

device = tf.device("/GPU:0")
weightDir = "./q2a/policyWeights"

A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7)
[Powered by Stella]
2023-11-15 14:17:31.249080: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-11-15 14:17:31.295699: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-11-15 14:17:31.296055: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See mor

In [5]:
BATCH_SIZE = 128
GAMMA = 0.99
EPS_START = 1.0
EPS_END = 0.3
EPS_DECAY = 50000
TAU = 0.005
LR = 1e-4

loss_object = tf.keras.losses.Huber()
optimizer = tf.keras.optimizers.Adam(learning_rate=LR, clipvalue = 100)


policy_net = create_model(n_actions)
target_net = create_model(n_actions)

policy_net(state)
target_net(state)
target_net.set_weights(policy_net.get_weights())

memory = ReplayMemory(10000)

In [6]:
def select_action(state, steps_done):
    sample = random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END) * np.exp(-1.0 * steps_done / EPS_DECAY)

    if sample > eps_threshold:
        return policy_net(state).numpy().argmax()
    else:
        return random.randrange(0, n_actions)

def plot_state(state):

    plt.clf()
    plt.imshow(state)

    if is_ipython:
        display.display(plt.gcf())
        display.clear_output(wait=True)

In [7]:
@tf.function
def optimize_model():
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)
    # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for detailed explanation). This converts batch-array of Transitions to Transition of batch-arrays.
    batch = Transition(*zip(*transitions))

    # Compute a mask of non-final states and concatenate the batch elements (a final state would've been the one after which simulation ended)
    final_mask = tf.convert_to_tensor(batch.final_state_bool)
    non_final_next_states = tf.reshape(tf.convert_to_tensor([s.next_state for s in transitions if not s.final_state_bool]), [sum(~final_mask.numpy()), -1])

    state_batch = tf.reshape(tf.convert_to_tensor(batch.state), [BATCH_SIZE, -1])
    action_batch = tf.reshape(tf.convert_to_tensor(batch.action), [BATCH_SIZE, -1])
    reward_batch = tf.reshape(tf.convert_to_tensor(batch.reward), [BATCH_SIZE, -1])
    reward_batch = (reward_batch - tf.reduce_mean(reward_batch)) / tf.math.reduce_std(reward_batch)

    with tf.GradientTape(watch_accessed_variables=False) as tape:
      tape.watch(policy_net.trainable_variables)

      # Compute Q(s_t, a) - the model computes Q(s_t), then we select the columns of actions taken. These are the actions which would've been taken for each batch state according to policy_net
      state_action_values = tf.gather(policy_net(state_batch), action_batch, axis = 1, batch_dims = 1)

      # Compute V(s_{t+1}) for all next states. Expected values of actions for non_final_next_states are computed based on the "older" target_net; selecting their best reward with max(1)[0]. This is merged based on the mask, such that we'll have either the expected state value or 0 in case the state was final.
      next_state_values = np.zeros([BATCH_SIZE, 1])
      next_state_values[~final_mask] = tf.reshape(tf.reduce_max(target_net(non_final_next_states), axis = 1), [sum(~final_mask.numpy()), 1])
      # Compute the expected Q values
      expected_state_action_values = (next_state_values * GAMMA) + reward_batch

      # Compute Huber loss
      loss = loss_object(state_action_values, expected_state_action_values)


    # Optimize the model
    gradients = tape.gradient(loss, policy_net.trainable_variables)
    optimizer.apply_gradients(zip(gradients, policy_net.trainable_variables))

In [8]:
num_episodes = 500
sumr = 0
steps_done = 0
rewards = []

for i_episode in tqdm(range(0, num_episodes)):
    # Initialize the environment and get its state
    state, info = env.reset(seed = i_episode)
    state = process_state(state)
    done = False
    while not done:
        action = select_action(state, steps_done)
        steps_done += 1
        exec_action = map_action(action)
        next_state, reward, terminated, truncated, info = env.step(exec_action)
        next_state = process_state(next_state)
        done = terminated or truncated

        sumr += reward
        reward -= abs(next_state[0][ramDict["ball_y"]] - next_state[0][ramDict["player_y"]]) # Punishment for not moving towards the ball

        # Store the transition in memory
        memory.push(state, action, next_state, reward, done)

        # Move to the next state
        state = next_state

        # Perform one step of the optimization (on the policy network)
        optimize_model()

        new_weights = [TAU*x + (1-TAU)*y for x,y in zip(policy_net.get_weights(), target_net.get_weights())]
        target_net.set_weights(new_weights)

        if done:
            if (i_episode + 1) % 50 == 0:
                policy_net.save_weights(weightDir + "_" + str(i_episode+1))
            rewards.append(sumr)
            if (i_episode % 100) == 0:
              print('episode: %3d \t return: %.3f' % (i_episode, np.mean(rewards)))
            sumr = 0
            break

  0%|          | 1/500 [00:08<1:08:58,  8.29s/it]

episode:   0 	 return: -18.000


 20%|██        | 101/500 [09:43<37:27,  5.63s/it]

episode: 100 	 return: -20.366


 40%|████      | 201/500 [19:39<30:11,  6.06s/it]

episode: 200 	 return: -20.478


 60%|██████    | 301/500 [29:33<19:18,  5.82s/it]

episode: 300 	 return: -20.512


 80%|████████  | 401/500 [39:29<10:10,  6.17s/it]

episode: 400 	 return: -20.506


100%|██████████| 500/500 [49:16<00:00,  5.91s/it]


In [10]:
from time import sleep
rewards = []
num_episodes = 10
sumr = 0
env = gym.make("ALE/Pong-ram-v5") # Since we aren't using a convolution layer, we can use the ram version of the game
state, info = env.reset()

for i_episode in tqdm(range(num_episodes)):
    # Initialize the environment and get its state
    state, info = env.reset(seed=i_episode)
    state = process_state(state)
    done = False
    while not done:
        action = policy_net(state).numpy().argmax()
        next_state, reward, terminated, truncated, info = env.step(action)
        next_state = process_state(next_state)
        done = terminated or truncated

        sumr += reward

        if done:
            rewards.append(sumr)
            sumr = 0
            break

plt.plot(rewards)

100%|██████████| 10/10 [00:25<00:00,  2.55s/it]
