<a href="https://colab.research.google.com/github/adedert/IANNWTF_FinalProject/blob/main/Delayed_DQN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install swig
!pip install gymnasium[box2d]

Collecting swig
  Downloading swig-4.2.1-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: swig
Successfully installed swig-4.2.1
Collecting gymnasium[box2d]
  Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
Collecting farama-notifications>=0.0.1 (from gymnasium[box2d])
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Collecting box2d-py==2.3.5 (from gymnasium[box2d])
  Downloading box2d-py-2.3.5.tar.gz (374 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.4/374.4 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: box2d-py
  Building wheel for box2d-py (setup

In [None]:
# Mount Google Drive: the training loop further down writes its checkpoints and
# the pickled reward history under /content/drive/MyDrive/Lunar Lander/.
from google.colab import drive
drive.mount('/content/drive')
import tensorflow as tf
import gymnasium as gym
import keras
import numpy as np
import matplotlib.pyplot as plt
import pickle

Mounted at /content/drive


In [None]:
class ExperienceReplayBuffer():
  """Replay buffer fed by a vectorised Gymnasium environment.

  Every call to `fill_buffer` steps all parallel environments
  `steps_per_sample` times and stores the resulting transitions as one shuffled
  `tf.data.Dataset` chunk in `self.data`. The oldest chunks are dropped once
  the total number of stored transitions exceeds `max_size`.

  Args:
    max_size: maximum number of transitions kept in the buffer.
    env_name: Gymnasium environment id passed to `gym.make_vec`.
    parallel_games: number of environments stepped in parallel.
    steps_per_sample: environment steps collected per `fill_buffer` call.
  """

  def __init__(self, max_size, env_name, parallel_games, steps_per_sample):
    self.max_size = max_size
    self.parallel_games = parallel_games
    self.steps_per_sample = steps_per_sample
    self.envs = gym.make_vec(env_name, num_envs=self.parallel_games)
    self.current_states, _ = self.envs.reset()
    # Read the action count from the vector env itself instead of creating
    # (and never closing) an extra single environment.
    self.num_possible_actions = self.envs.single_action_space.n
    # List of per-call dataset chunks, oldest first.
    self.data = []

  @staticmethod
  def _true_next_states(next_states, infos):
    """Return `next_states` with auto-reset rows replaced by the real final observation.

    When a sub-environment ends, the Gymnasium 0.29 vector env resets it
    immediately and returns the first observation of the *new* episode; the
    last observation of the finished episode is reported in
    `infos["final_observation"]` (mask in `infos["_final_observation"]`).
    Storing the reset observation as `next_state` would give a wrong bootstrap
    target for truncated (time-limited) episodes.

    If `infos` carries no `"final_observation"` entry, `next_states` is returned
    unchanged (same object). Otherwise a corrected copy is returned and the
    input array is left untouched.
    """
    if "final_observation" not in infos:
      return next_states
    final_observations = infos["final_observation"]
    finished_mask = infos.get("_final_observation")
    if finished_mask is None:
      finished_mask = np.array([obs is not None for obs in final_observations], dtype=bool)
    corrected = np.array(next_states, copy=True)
    for env_idx in np.flatnonzero(finished_mask):
      corrected[env_idx] = final_observations[env_idx]
    return corrected

  def fill_buffer(self, dqn, epsilon):
    """Collect `steps_per_sample` steps from every environment and store them.

    Args:
      dqn: callable mapping a batch of states to Q-values (one column per action).
      epsilon: probability of taking a uniformly random action.
    """
    states_list = []
    actions_list = []
    rewards_list = []
    terminateds_list = []
    next_states_list = []

    for _ in range(self.steps_per_sample):
      actions = self.sample_from_policy(dqn, epsilon)
      next_states, rewards, terminateds, _, infos = self.envs.step(actions)
      states_list.append(self.current_states)
      actions_list.append(actions)
      rewards_list.append(rewards)
      terminateds_list.append(terminateds)
      # Store the observation that really followed the action, not the
      # post-reset observation of the next episode.
      next_states_list.append(self._true_next_states(next_states, infos))
      # Acting continues from what the env returned (already reset if needed).
      self.current_states = next_states

    # Flatten (steps, envs, ...) into one row per transition, in the same
    # step-major / env-minor order as before. The observation shape is taken
    # from the data instead of being hard-coded.
    states = np.concatenate(states_list, axis=0).astype(np.float32)
    actions = np.concatenate(actions_list, axis=0).astype(np.int32)
    rewards = np.concatenate(rewards_list, axis=0).astype(np.float32)
    next_states = np.concatenate(next_states_list, axis=0).astype(np.float32)
    terminateds = np.concatenate(terminateds_list, axis=0).astype(bool)

    # Elements are (state, action, reward, next_state, terminated).
    new_samples_dataset = tf.data.Dataset.from_tensor_slices(
        (states, actions, rewards, next_states, terminateds))
    new_samples_dataset = new_samples_dataset.shuffle(
        buffer_size=states.shape[0], reshuffle_each_iteration=True)

    self.data.append(new_samples_dataset)

    # Drop the oldest chunks while the buffer is over capacity, but always keep
    # the newest chunk so that get_data() never sees an empty buffer.
    datapoints_per_chunk = self.parallel_games * self.steps_per_sample
    while len(self.data) > 1 and len(self.data) * datapoints_per_chunk > self.max_size:
      self.data.pop(0)

  def get_data(self):
    """Return a dataset that samples uniformly from all stored chunks."""
    uniform_weight = 1 / float(len(self.data))
    erp_data = tf.data.Dataset.sample_from_datasets(
        self.data,
        weights=[uniform_weight for _ in self.data],
        stop_on_empty_dataset=False)
    return erp_data

  def sample_from_policy(self, dqn, epsilon):
    """Pick one epsilon-greedy action per parallel environment.

    Returns:
      A numpy array of shape (parallel_games,) with integer action indices.
    """
    q_values = dqn(self.current_states)
    greedy_actions = tf.argmax(q_values, axis=1)
    # Uniformly random action for every environment.
    random_actions = tf.random.uniform(shape=(self.parallel_games,), minval=0, maxval=self.num_possible_actions, dtype=tf.int64)
    # True -> act greedily, False -> explore.
    sample_epsilon = tf.random.uniform(shape=(self.parallel_games,), minval=0, maxval=1, dtype=tf.float32) > epsilon
    actions = tf.where(sample_epsilon, greedy_actions, random_actions).numpy()
    return actions

In [None]:
class DQN(tf.keras.Model):
  """Fully connected Q-network.

  Two 64-unit ReLU layers followed by a linear layer with one output per
  action. The model carries its own Adam optimizer and a mean "loss" metric.
  """

  def __init__(self, num_actions):
    super().__init__()

    hidden_units = 64
    self.fc1 = tf.keras.layers.Dense(hidden_units, activation="relu")
    self.fc2 = tf.keras.layers.Dense(hidden_units, activation="relu")
    # Linear head: one Q-value per action.
    self.out = tf.keras.layers.Dense(num_actions)

    self.metrics_list = [tf.keras.metrics.Mean(name="loss")]

    self.optimizer = tf.keras.optimizers.Adam()

  @property
  def metrics(self):
    """Metrics tracked by this model (the contents of `metrics_list`)."""
    return self.metrics_list

  def reset_metrics(self):
    """Reset the state of every tracked metric."""
    for tracked_metric in self.metrics:
      tracked_metric.reset_state()

  def call(self, input):
    """Run `input` through fc1, fc2 and the output layer and return the result."""
    activations = input
    for layer in (self.fc1, self.fc2, self.out):
      activations = layer(activations)
    return activations

In [None]:
class dqn_agent():
  """DQN agent with a Polyak-averaged target network.

  Args:
    dqn_network: online Q-network; must expose `optimizer`,
      `trainable_variables`, `get_weights` and `save_weights`.
    target_network: network used to compute bootstrap targets.
    erp: experience replay buffer providing `fill_buffer` and `get_data`.
    env_name: Gymnasium environment id used for evaluation during training.
    gamma: discount factor.
    tau: Polyak factor, i.e. the fraction of the target weights that is kept
      on every target update.
  """

  def __init__(self, dqn_network, target_network, erp, env_name, gamma, tau):
    self.dqn_network = dqn_network
    self.target_network = target_network
    self.erp = erp
    self.gamma = gamma
    self.tau = tau
    # Exploration rate; starts fully random and is decayed in train().
    self.epsilon = 1
    self.env_name = env_name
    # Graph-compiled version of _td_update, created on first use.
    self._compiled_td_update = None

  def polyak_average(self, polyak_factor):
    """Set target weights to polyak_factor * target + (1 - polyak_factor) * online.

    A factor of 0.0 copies the online network into the target network.
    """
    online_weights = self.dqn_network.get_weights()
    target_weights = self.target_network.get_weights()
    averaged_weights = [
        polyak_factor * target_weight + (1 - polyak_factor) * source_weight
        for source_weight, target_weight in zip(online_weights, target_weights)
    ]
    self.target_network.set_weights(averaged_weights)

  def _td_update(self, q_target, states, actions):
    """Apply one gradient step on the mean squared TD error and return the loss."""
    with tf.GradientTape() as tape:
      q_pred = self.dqn_network(states)  # one row per sample, one column per action
      # Q-value of the action that was actually taken in each sample.
      q_taken = tf.gather(q_pred, actions, batch_dims=1)
      loss = tf.reduce_mean(tf.square(q_taken - q_target))
    gradients = tape.gradient(loss, self.dqn_network.trainable_variables)
    self.dqn_network.optimizer.apply_gradients(zip(gradients, self.dqn_network.trainable_variables))
    return loss

  def train_step(self, dataset, gamma, num_training_steps, batch_size=128):
    """Run `num_training_steps` gradient updates on batches drawn from `dataset`.

    Args:
      dataset: unbatched dataset of (state, action, reward, next_state, terminated).
      gamma: discount factor used for the bootstrap target.
      num_training_steps: number of batches (gradient updates) to process.
      batch_size: samples per batch.

    Returns:
      (mean loss, mean target-network Q-value over all processed batches);
      both are NaN if the dataset yielded no batch.
    """
    # Compile the update once per agent instead of re-creating (and re-tracing)
    # a tf.function on every call.
    if getattr(self, "_compiled_td_update", None) is None:
      self._compiled_td_update = tf.function(self._td_update)

    # take() caps the loop at exactly num_training_steps batches; the previous
    # `i >= num_training_steps` check after the update ran one batch too many.
    batches = dataset.batch(batch_size).take(num_training_steps).prefetch(3)

    losses, q_values = [], []
    for state, action, reward, next_state, terminated in batches:
      q_vals = self.target_network(next_state)
      q_values.append(q_vals.numpy())
      max_q_values = tf.reduce_max(q_vals, axis=1)
      # 0 for terminal transitions (no bootstrapping), 1 otherwise.
      not_terminal = 1.0 - tf.cast(terminated, tf.float32)
      q_target = reward + (gamma * max_q_values * not_terminal)
      loss = self._compiled_td_update(q_target, state, action)
      losses.append(loss.numpy())

    if not losses:
      return float("nan"), float("nan")
    # Concatenate so that a smaller final batch cannot produce a ragged array.
    return np.mean(losses), np.mean(np.concatenate(q_values, axis=0))

  def test_step(self, env_name, num_test_envs, gamma):
    """Evaluate the greedy policy for one episode in each of `num_test_envs` environments.

    Args:
      env_name: Gymnasium environment id.
      num_test_envs: number of parallel evaluation environments.
      gamma: unused; kept for interface compatibility.

    Returns:
      Mean undiscounted episode return over all environments.
    """
    envs = gym.make_vec(env_name, num_envs=num_test_envs)
    try:
      states, _ = envs.reset()
      # Per-environment return and a flag for environments whose first episode is over.
      score = np.zeros(num_test_envs)
      episodes_finished = np.zeros(num_test_envs, dtype=bool)
      test_steps = 0
      done = False
      while not done:
        q_values = self.dqn_network(states)
        actions = tf.argmax(q_values, axis=1)
        states, rewards, terminateds, truncateds, _ = envs.step(actions.numpy())
        # Credit this step's reward to every episode that was still running
        # *before* the step, so the terminal reward is included.
        still_running = np.logical_not(episodes_finished)
        score += rewards * still_running
        # An episode is over when it terminates or hits the time limit; after
        # either, the vector env auto-resets and later rewards belong to a new
        # episode that must not be counted.
        episodes_finished = np.logical_or(
            episodes_finished, np.logical_or(terminateds, truncateds))
        done = np.all(episodes_finished)
        test_steps += 1
        if test_steps % 100 == 0:
           print(f"test_steps: {test_steps} {np.sum(episodes_finished)/num_test_envs} {terminateds.shape} {episodes_finished.shape}")
    finally:
      # Release the evaluation environments even if evaluation fails.
      envs.close()
    return np.mean(score)

  def train(self, epochs):
    """Train for `epochs` iterations of collect -> update -> target averaging.

    Prints the loss every 20 epochs. Every 100 epochs the weights of both
    networks are saved, the greedy policy is evaluated on 64 environments and
    the list of evaluation rewards is pickled to Google Drive.
    """
    # Start with the target network equal to the online network.
    self.polyak_average(polyak_factor=0.0)
    rewards = []
    # Pre-fill the buffer before the first update.
    for _ in range(20):
      self.erp.fill_buffer(self.dqn_network, self.epsilon)

    for step in range(epochs):
      self.erp.fill_buffer(self.dqn_network, epsilon=self.epsilon)
      data = self.erp.get_data()
      avg_loss, avg_q_values = self.train_step(dataset=data, gamma=self.gamma, num_training_steps=4)
      # Exponential epsilon decay with a floor of 0.05.
      self.epsilon = max(self.epsilon*0.992, 0.05)
      self.polyak_average(self.tau)
      if (step+1) % 20 == 0:
        print(f"epoch: {step+1}, avg_loss: {avg_loss}, epsilon: {self.epsilon}")
      if (step+1) % 100 == 0:
        self.dqn_network.save_weights('/content/drive/MyDrive/Lunar Lander/checkpoints/dqn_checkpoint')
        self.target_network.save_weights('/content/drive/MyDrive/Lunar Lander/checkpoints/target_checkpoint')
        # Evaluate on the environment this agent was configured with.
        avg_reward = self.test_step(env_name=self.env_name, num_test_envs=64, gamma=self.gamma)
        rewards.append(avg_reward)
        # The with-block closes the (binary) file; no explicit close() needed.
        with open('/content/drive/MyDrive/Lunar Lander/dqn_rewards.pkl', 'wb') as f:
          pickle.dump(rewards, f)
        print(f"saved model, avg reward on test step: {avg_reward}")


In [None]:
# Online network; the dummy forward pass on a (1, 8) input is presumably there
# to build the layer weights before they are read or copied.
dqn_network = DQN(num_actions=4)
dqn_network(tf.random.uniform((1, 8)))

# Target network with the same architecture, built the same way.
target_network = DQN(num_actions=4)
target_network(tf.random.uniform((1, 8)))

# Replay buffer collecting 5 steps from 64 parallel games per fill.
erp = ExperienceReplayBuffer(
    max_size=50000,
    env_name="LunarLander-v2",
    parallel_games=64,
    steps_per_sample=5,
)

basic_agent = dqn_agent(
    dqn_network=dqn_network,
    target_network=target_network,
    erp=erp,
    env_name="LunarLander-v2",
    gamma=0.99,
    tau=0.99,
)

In [None]:
# Separate DQN instance holding a copy of the online network's current weights.
model = DQN(num_actions=4)
model(tf.random.uniform((1, 8)))  # forward pass first, so the weights exist before set_weights
online_weights = dqn_network.get_weights()
model.set_weights(online_weights)

In [None]:
# `model` is a plain DQN (a Keras model); the evaluation routine that takes
# (env_name, num_test_envs, gamma) is defined on dqn_agent. Wrap the snapshot
# network in an agent so this cell also works on a fresh kernel.
# test_step only reads the agent's online network, so the other arguments are
# simply reused from the setup cell.
snapshot_agent = dqn_agent(dqn_network=model, target_network=target_network, erp=erp, env_name="LunarLander-v2", gamma=0.99, tau=0.99)
snapshot_agent.test_step("LunarLander-v2", 64, 0.99)

test_steps: 100 0.09375 (64,) (64,)
test_steps: 200 0.8125 (64,) (64,)
test_steps: 300 0.953125 (64,) (64,)


-430.1063479820993

In [None]:
# Long-running cell: 1000 training epochs. Progress is printed every 20 epochs;
# every 100 epochs the networks are checkpointed to Drive and evaluated.
basic_agent.train(
    epochs=1000,
)

epoch: 20, avg_loss: 37.61334991455078, epsilon: 0.8515956670851491
epoch: 40, avg_loss: 72.8701400756836, epsilon: 0.7252151801981999
epoch: 60, avg_loss: 84.17674255371094, epsilon: 0.6175901051611625
epoch: 80, avg_loss: 42.7049674987793, epsilon: 0.5259370575899076
epoch: 100, avg_loss: 74.7244644165039, epsilon: 0.447885719403078
test_steps: 100 0.0 (64,) (64,)
test_steps: 200 0.421875 (64,) (64,)
test_steps: 300 0.703125 (64,) (64,)
test_steps: 400 0.765625 (64,) (64,)
test_steps: 500 0.765625 (64,) (64,)
test_steps: 600 0.796875 (64,) (64,)
test_steps: 700 0.796875 (64,) (64,)
test_steps: 800 0.796875 (64,) (64,)
test_steps: 900 0.796875 (64,) (64,)
test_steps: 1000 0.796875 (64,) (64,)
test_steps: 1100 0.796875 (64,) (64,)
test_steps: 1200 0.921875 (64,) (64,)
test_steps: 1300 0.9375 (64,) (64,)
test_steps: 1400 0.953125 (64,) (64,)
test_steps: 1500 0.953125 (64,) (64,)
test_steps: 1600 0.953125 (64,) (64,)
test_steps: 1700 0.96875 (64,) (64,)
test_steps: 1800 0.96875 (64,) (64