In [1]:
from environment.models.simple_control_fixed import SimpleControlledFixedEnv

import matplotlib.pyplot as plt
import numpy as np
from tensorflow import keras
import tensorflow as tf
import random
from collections import namedtuple, deque


2024-07-08 08:42:33.273175: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-07-08 08:42:33.334057: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [2]:
class Agent(object):
    ''' Base agent class, used as a parent class

        Args:
            n_actions (int): number of actions

        Attributes:
            n_actions (int): where we store the number of actions
            last_action (np.array): last action taken by the agent
    '''
    def __init__(self, n_actions: int):
        self.n_actions = n_actions
        self.last_action = None

    def forward(self, state: np.ndarray):
        ''' Performs a forward computation '''
        pass

    def backward(self):
        ''' Performs a backward pass on the network '''
        pass

In [3]:
class RandomAgent(Agent):
    ''' Agent taking actions uniformly at random, child of the class Agent'''
    def __init__(self, n_actions: int):
        super(RandomAgent, self).__init__(n_actions)

    def forward(self, state: np.ndarray) -> int:
        ''' Compute an action uniformly at random across n_actions possible
            choices

            Returns:
                action np.array(int): the random action for each angle
        '''
        action = []
        for i in range(self.n_actions):
            action.append(np.random.randint(-2*np.pi, 2*np.pi))
        self.last_action = np.array(action)
        return self.last_action

Create a class for the experience replay buffer

In [4]:
Experience = namedtuple('Experience',
                        ['state', 'action', 'reward', 'next_state', 'done'])
class ExperienceReplayBuffer(object):
    """ Class used to store a buffer containing experiences of the RL agent.
    """
    def __init__(self, maximum_length):
        # Create buffer of maximum length
        self.buffer = deque(maxlen=maximum_length)
        self.latest_experience = None

    def append(self, experience):
        # Append experience to the buffer
        if(self.latest_experience is not None):
            self.buffer.append(self.latest_experience)

        self.latest_experience = experience

    def __len__(self):
        # overload len operator
        return len(self.buffer)

    def sample_batch(self, n):
        """ Function used to sample experiences from the buffer.
            returns 5 lists, each of size n. Returns a list of state, actions,
            rewards, next states and done variables.
        """
        # If we try to sample more elements that what are available from the
        # buffer we raise an error
        if n > len(self.buffer):
            raise IndexError('Tried to sample too many elements from the buffer!')
        
        # combined experience replay
        # # inclued latest experience in the sampled batch
                    
        batch = random.sample(self.buffer, n - 1)
        batch.append(self.latest_experience)

        # batch is a list of size n, where each element is an Experience tuple
        # of 5 elements. To convert a list of tuples into
        # a tuple of list we do zip(*batch). In this case this will return a
        # tuple of 5 elements where each element is a list of n elements.

        return zip(*batch)

In [5]:
class QNetwork(keras.models.Model):
    def __init__(self, input_size, output_size):
        super(QNetwork, self).__init__()

        self.input_layer = keras.layers.keras.layers.Dense(64, activation='relu')
        self.hidden_layer1 = keras.layers.keras.layers.Dense(16, activation='relu')
        
        self.hidden_value_layer1 = keras.layers.keras.layers.Dense(128, activation='relu')
        self.hidden_advantage_layer1 = keras.layers.keras.layers.Dense(128, activation='relu')
        self.value_layer = keras.layers.keras.layers.Dense(1)
        self.advantage_layer = keras.layers.keras.layers.Dense(output_size)

    def call(self, x):
        _in = keras.layers.ReLU()(self.input_layer(x))
        l1 = keras.layers.ReLU()(self.hidden_layer1(_in))

        v1 = keras.layers.ReLU()(self.hidden_value_layer1(l1))
        v2 = self.value_layer(v1)

        a1 = keras.layers.ReLU()(self.hidden_advantage_layer1(l1))
        a2 = self.advantage_layer(a1)

        q = v2 + a2 - tf.reduce_mean(a2, axis=-1, keepdims=True)
        return q
    
    def compute_q_values(self, states, actions):
        q_values = self(states)
        selected_q_values = tf.gather(q_values, actions, axis=1)
        return selected_q_values

    def update(self, optimizer, loss_function, predicted_q_values, target_values):
        with tf.GradientTape() as tape:
            loss = loss_function(predicted_q_values, target_values)
        gradients = tape.gradient(loss, self.trainable_variables)
        optimizer.apply_gradients(zip(gradients, self.trainable_variables))
        return loss

def epsilon_decay(epsilon_min, epsilon_max, decay_step, k):
    decayed_epsilon = max(epsilon_min, epsilon_max * (epsilon_min / epsilon_max) ** ((k - 1)/(decay_step - 1)))
    return decayed_epsilon

In [6]:
class DDPGAgent(Agent):
    def __init__(self, state_size, action_size, replay_length=5000, batch_size=64, gamma=0.99, learning_rate=1e-3, n_episodes=800):
        super(DDPGAgent, self).__init__(action_size)
        
        self.learning_rate = learning_rate
        self.n_episodes = n_episodes
        self.episode = 0
        self.epsilon = 1
        self.Z = 0.9*self.n_episodes
        self.epsilon_max = 0.99
        self.epsilon_min = 0.05
        
        # env specific
        self.min_action = -2*np.pi
        self.max_action = 2*np.pi
        
        # step 1:
        ### Create critic network
        self.optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
        self.critic_q_network = self._build_network(state_size, action_size)
        self.critic_target_network = self._build_network(state_size, action_size)
        self.critic_target_network.set_weights(self.critic_q_network.get_weights())
        ### Create actor network
        self.actor_q_network = self._build_network(state_size, action_size)
        self.actor_target_network = self._build_network(state_size, action_size)
        self.actor_target_network.set_weights(self.actor_q_network.get_weights())
        
        # step 2:
        ### Create Experience replay buffer
        self.buffer = ExperienceReplayBuffer(maximum_length=replay_length)
        self.batch_size = batch_size
        self.gamma = gamma
        
        ### Agent init
        self.state_size = state_size
        self.action_size = action_size
        
        ### Steps
        self.target_update_rate = int(replay_length/batch_size) # suggested as tip
        self.steps = 0  # Counter for steps taken
        
    def _build_network(self, state_size, action_size):
        model = keras.models.Sequential()
        model.add(keras.layers.Dense(24, input_dim=state_size, activation='relu'))
        model.add(keras.layers.Dense(24, activation='relu'))
        model.add(keras.layers.Dense(action_size, activation='linear'))
        model.compile(loss='mse', optimizer=self.optimizer)
        return model

    def forward(self, state):
        # step 7:
        # take noisy continuous action a_t at s_t   
        q = self.actor_q_network.predict(state)
        # loop over the actions and add noise
        for i in range(len(q)):
            # assign noise
            noise = np.random.uniform(self.min_action, self.max_action)
            q[i] = q[i] + noise

        return q
     
    def sample(self, state):
        if not isinstance(state[0], np.float32):
            state = np.array(state[0])
        q = self.q_network.predict(state)
        return np.argmax(q)

    def backward(self):
        if len(self.buffer.buffer) < self.batch_size:
            return
        
        # step 9:
        # Sample a batch of experiences from the buffer
        batch = Experience(*self.buffer.sample_batch(self.batch_size))        
        batch_mask = ~np.array(batch.done, dtype=bool)

        next_state = np.stack(batch.next_state)
        states = np.stack(batch.state)
        actions = np.concatenate(batch.action)
        rewards = np.concatenate(batch.reward)        

        # step 10:
        # Compute target values for each experience in the batch
        target_values = tf.where(batch_mask, rewards, 0)
        target_values = tf.where(~batch_mask, target_values + self.gamma * self.critic_target_network.predict(next_state), target_values)


        # step 11:
        # Compute predicted Q-values for the states and actions in the batch
        predicted_q_values = self.critic_q_network.predict(states)
        predicted_q_values = predicted_q_values[tf.range(len(actions)), actions]

        # Update critic Q-network weights using the computed values (backward pass SGD on the MSE loss)
        # self.critic_q_network.fit(np.stack(states, actions), target_values, epochs=1, verbose=0)
        def loss_function_MSE(predicted_q_values, target_values):
            _ret = target_values - predicted_q_values
            return tf.reduce_mean(tf.square(_ret))
        self.critic_q_network.update(self.optimizer, loss_function_MSE, predicted_q_values, target_values)

        # step 12:
        if self.steps % self.target_update_rate == self.target_update_rate - 1:
            # step 13:
            # update critic
            def loss_function_J(states):
                _ret = -self.critic_q_network.compute_q_values(states, self.actor_q_network.predict(states))
                return tf.reduce_mean(_ret)
            self.actor_q_network.update(self.optimizer, loss_function_J, states, target_values)

            # step 14:
            # stof update target networks
            
            pass
        
        # step 16:
        # Increment steps counter
        self.steps += 1

In [7]:
def running_average(x, N):
    ''' Function used to compute the running average
        of the last N elements of a vector x
    '''
    if len(x) >= N:
        y = np.copy(x)
        y[N-1:] = np.convolve(x, np.ones((N, )) / N, mode='valid')
    else:
        y = np.zeros_like(x)
    return y


def faster_running_average(x, N, last_average):
    if len(x) > N:
        return last_average + (1./N)*(x[-1] - x[-N - 1])
    else:
        return sum(x)/len(x)

In [8]:
# Parameters
N_episodes = 1000                 # Number of episodes # 100 - 1000
discount_factor = 0.99            # Value of the discount factor
n_ep_running_average = 50         # Running average of 50 episodes
n_actions = 4                     # Number of available actions (nr of angles)
dim_state = 2                     # State dimensionality
replay_size = 5000                # 5000 - 30000
batch_size = 64                   # 4 - 128
learning_rate = 1e-4              # 1e-3 - 1e-4
target_reward = 0                 # specified in lab
max_env_steps = 1000              # to stop the episode

# We will use these variables to compute the average episodic reward and
# the average number of steps per episode
episode_reward_list = []       # this list contains the total reward per episode
episode_number_of_steps = []   # this list contains the number of steps per episode

# agent initialization
# agent = RandomAgent(n_actions) # random
# step 1 & 2:
agent = DDPGAgent(dim_state, n_actions, replay_size, batch_size=batch_size, gamma=discount_factor, learning_rate=learning_rate, n_episodes=N_episodes)


2024-07-08 08:42:34.455640: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [9]:
env = SimpleControlledFixedEnv()

In [10]:
### Training process
from tqdm import trange
# trange is an alternative to range in python, from the tqdm library
# It shows a nice progression bar that you can update with useful information
EPISODES = trange(N_episodes, desc='Episode: ', leave=True)
actual_episodes = 0

Episode:   0%|          | 0/1000 [00:00<?, ?it/s]

To facilitate getting higher-quality training data, you may reduce the scale of the noise over the course of training. (We do not do this in our implementation, and keep noise scale fixed throughout.)

At test time, to see how well the policy exploits what it has learned, we do not add noise to the actions.

Our DDPG implementation uses a trick to improve exploration at the start of training. For a fixed number of steps at the beginning (set with the start_steps keyword argument), the agent takes actions which are sampled from a uniform random distribution over valid actions. After that, it returns to normal DDPG exploration.

In [11]:
avg_reward = 0.
avg_steps = 0.
# step 3:
# episode loop
for i in EPISODES:
    # Reset environment data and initialize variables
    done = False
    # step 4:
    state = env.reset()
    state = np.array(state, dtype=np.float32)
 
    total_episode_reward = 0.
    # step 5:
    t = 0
    # step 6: 
    # environment loop
    while not done:
        # necessary for lunar lander. It doesn't implement a default
        # max-timesteps and rover hovers forever

        # step 7:
        # noisy actor action
        try:
            state = state.reshape(1, dim_state)
            print(state)
        except:
            print("passed state reshape")
            print(state)
            pass
        action = agent.forward(state)

        # step 8:
        # Execute action in the environment and append
        next_state, reward, done = env.step(action)
        # append to buffer
        agent.buffer.append(Experience(state, action, np.array([reward]), next_state, done))
        
        # step 9-15: see function definition
        agent.backward()
        
        # Update episode reward
        total_episode_reward += reward

        # step 16: Update state for next iteration
        state = next_state
        t += 1

    # Append episode reward and total number of steps
    episode_reward_list.append(total_episode_reward)
    episode_number_of_steps.append(t)

    avg_reward = faster_running_average(episode_reward_list, n_ep_running_average, avg_reward)
    avg_steps = faster_running_average(episode_number_of_steps, n_ep_running_average, avg_steps)

    agent.episode += 1

    # Updates the tqdm update bar with fresh information
    # (episode number, total reward of the last episode, total number of Steps
    # of the last episode, average reward, average number of steps)
    EPISODES.set_description(
        "Episode {} - Reward/Steps: {:.1f}/{} - Avg. Reward/Steps: {:.1f}/{:.1f}".format(
        i, total_episode_reward, t,
        avg_reward,
        avg_steps)
        )

    actual_episodes += 1
    
    # stop if we hit reward threshold
    if avg_reward >= target_reward:
        break

[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[[0.19564064 0.06074065]]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[[1]]
passed state reshape
[0.199595711118774, 0.05903001212455178]


Episode:   0%|          | 0/1000 [00:00<?, ?it/s]


InvalidArgumentError: Graph execution error:

Detected at node 'sequential_2/dense_6/Relu' defined at (most recent call last):
    File "/usr/lib/python3.8/runpy.py", line 194, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "/usr/lib/python3.8/runpy.py", line 87, in _run_code
      exec(code, run_globals)
    File "/root/.local/lib/python3.8/site-packages/ipykernel_launcher.py", line 18, in <module>
      app.launch_new_instance()
    File "/usr/local/lib/python3.8/dist-packages/traitlets/config/application.py", line 1075, in launch_instance
      app.start()
    File "/root/.local/lib/python3.8/site-packages/ipykernel/kernelapp.py", line 739, in start
      self.io_loop.start()
    File "/root/.local/lib/python3.8/site-packages/tornado/platform/asyncio.py", line 205, in start
      self.asyncio_loop.run_forever()
    File "/usr/lib/python3.8/asyncio/base_events.py", line 570, in run_forever
      self._run_once()
    File "/usr/lib/python3.8/asyncio/base_events.py", line 1859, in _run_once
      handle._run()
    File "/usr/lib/python3.8/asyncio/events.py", line 81, in _run
      self._context.run(self._callback, *self._args)
    File "/root/.local/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 545, in dispatch_queue
      await self.process_one()
    File "/root/.local/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 534, in process_one
      await dispatch(*args)
    File "/root/.local/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 437, in dispatch_shell
      await result
    File "/root/.local/lib/python3.8/site-packages/ipykernel/ipkernel.py", line 359, in execute_request
      await super().execute_request(stream, ident, parent)
    File "/root/.local/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 778, in execute_request
      reply_content = await reply_content
    File "/root/.local/lib/python3.8/site-packages/ipykernel/ipkernel.py", line 446, in do_execute
      res = shell.run_cell(
    File "/root/.local/lib/python3.8/site-packages/ipykernel/zmqshell.py", line 549, in run_cell
      return super().run_cell(*args, **kwargs)
    File "/root/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3009, in run_cell
      result = self._run_cell(
    File "/root/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3064, in _run_cell
      result = runner(coro)
    File "/root/.local/lib/python3.8/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "/root/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3269, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "/root/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3448, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "/root/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3508, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "/tmp/ipykernel_6962/1418870914.py", line 30, in <module>
      action = agent.forward(state)
    File "/tmp/ipykernel_6962/4245071269.py", line 53, in forward
      q = self.actor_q_network.predict(state)
    File "/usr/local/lib/python3.8/dist-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/usr/local/lib/python3.8/dist-packages/keras/engine/training.py", line 2350, in predict
      tmp_batch_outputs = self.predict_function(iterator)
    File "/usr/local/lib/python3.8/dist-packages/keras/engine/training.py", line 2137, in predict_function
      return step_function(self, iterator)
    File "/usr/local/lib/python3.8/dist-packages/keras/engine/training.py", line 2123, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/usr/local/lib/python3.8/dist-packages/keras/engine/training.py", line 2111, in run_step
      outputs = model.predict_step(data)
    File "/usr/local/lib/python3.8/dist-packages/keras/engine/training.py", line 2079, in predict_step
      return self(x, training=False)
    File "/usr/local/lib/python3.8/dist-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/usr/local/lib/python3.8/dist-packages/keras/engine/training.py", line 561, in __call__
      return super().__call__(*args, **kwargs)
    File "/usr/local/lib/python3.8/dist-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/usr/local/lib/python3.8/dist-packages/keras/engine/base_layer.py", line 1132, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "/usr/local/lib/python3.8/dist-packages/keras/utils/traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "/usr/local/lib/python3.8/dist-packages/keras/engine/sequential.py", line 413, in call
      return super().call(inputs, training=training, mask=mask)
    File "/usr/local/lib/python3.8/dist-packages/keras/engine/functional.py", line 511, in call
      return self._run_internal_graph(inputs, training=training, mask=mask)
    File "/usr/local/lib/python3.8/dist-packages/keras/engine/functional.py", line 668, in _run_internal_graph
      outputs = node.layer(*args, **kwargs)
    File "/usr/local/lib/python3.8/dist-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/usr/local/lib/python3.8/dist-packages/keras/engine/base_layer.py", line 1132, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "/usr/local/lib/python3.8/dist-packages/keras/utils/traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "/usr/local/lib/python3.8/dist-packages/keras/layers/core/dense.py", line 255, in call
      outputs = self.activation(outputs)
    File "/usr/local/lib/python3.8/dist-packages/keras/activations.py", line 317, in relu
      return backend.relu(
    File "/usr/local/lib/python3.8/dist-packages/keras/backend.py", line 5369, in relu
      x = tf.nn.relu(x)
Node: 'sequential_2/dense_6/Relu'
In[0] is not a matrix
	 [[{{node sequential_2/dense_6/Relu}}]] [Op:__inference_predict_function_632]