## Custom Fit Function

In [1]:
import numpy as np
import gym
from gym import wrappers
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input, Concatenate, Layer
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import mean_squared_error
from keras import backend as K
from matplotlib import pyplot as plt
import random

In [2]:
# set random seed
seed = 0
np.random.seed(seed) 
tf.random.set_seed(seed)
random.seed(seed)

In [3]:
# list of phi matrix ranks after each step, used later for plotting rank collapse
ranks = []

In [4]:
# global variables used for custom loss functions 

global current_state_vector, next_state_vector # for dr3 regularizer
global phi_matrix # for phi penalty

In [5]:
# default loss function - mean squared error

def default_loss(y_true, y_pred):
    
    print("default loss function called!")
    
    loss = K.mean(K.square(y_true-y_pred))
    
    return loss

In [6]:
# custom loss function - implements explicit regularizer DR3

c_0 = 0.01 # dr3 coefficient

# add dot product between each state action and subsequent one’s feature vector to loss
def dr3(y_true, y_pred):
    
    global current_state_vector, next_state_vector 
    
    print("dr3 loss function called!")
    
    loss = K.mean(K.square(y_true-y_pred))
    
    # Explicit Regularization
    if (isinstance(current_state_vector, list) and isinstance(next_state_vector, list)):
        # take dot product of curr and next state
        loss += c_0 * np.dot(np.array(current_state_vector), np.array(next_state_vector))
    
    return loss

In [7]:
# custom loss function - random dot product from phi matrix

c = 0.01 # coefficient

# randomly sample two vectors from the phi matrix and add dot product of those vectors to loss
def random_dot(y_true, y_pred):

    global phi_matrix
    
    print("random dot product loss function called!")
    
    loss = K.mean(K.square(y_true-y_pred))
    
    # Explicit Regularization
    if ((phi_matrix is not None) and (len(phi_matrix) > 1)):
        
        v1 = phi_matrix(random.randrange(len(phi_matrix)))
        v2 = phi_matrix(random.randrange(len(phi_matrix)))
        
        loss += c * np.dot(np.array(v1), np.array(v2))
        
    return loss

In [8]:
# custom loss function - implements regulizer based on min/max singular values in phi matrix

alpha = 0.01 # tradeoff factor

# add difference between max entry in phi matrix ** 2 and min entry in phi matrix ** 2 to loss
def phi_penalty(y_true, y_pred):
    
    global phi_matrix
    
    print("phi penalty loss function called!")
    
    loss = K.mean(K.square(y_true-y_pred))
    
    # Explicit Regularization
    if ((phi_matrix is not None) and (len(phi_matrix) > 0)):
        minimum = min([min(value) for value in phi_matrix])
        maximum = max([max(value) for value in phi_matrix])
        loss += alpha * (maximum**2 - minimum**2)
            
    return loss

In [9]:
global loss
loss = None

class CustomModel(keras.Model):
    
    def train_step(self, data):
        
        global loss
        
        # Unpack the data. Its structure depends on your model and
        # on what you pass to `fit()`.
        x, y = data

        with tf.GradientTape() as tape:
            
            y_pred = self(x, training=True)  # Forward pass
            # Compute the loss value
            # (the loss function is configured in `compile()`)
            loss = self.compiled_loss(y, y_pred)

        # Compute gradients
        trainable_vars = self.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)
        # Update weights
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))
        # Update metrics (includes the metric that tracks the loss)
        self.compiled_metrics.update_state(y, y_pred)
        # Return a dict mapping metric names to current value
        return {m.name: m.result() for m in self.metrics}

In [10]:
class DQNAgent:
    def __init__(self, state_size, action_size):
        
        self.n_actions = action_size
        # we define some parameters and hyperparameters:
        # "lr" : learning rate
        # "gamma": discounted factor
        # "exploration_proba_decay": decay of the exploration probability
        # "batch_size": size of experiences we sample to train the DNN
        # "reward_scale": factor by which to scale stored rewards to maximize reproducibility
        self.lr = 0.001
        self.gamma = 0.99
        self.exploration_proba = 1.0
        self.exploration_proba_decay = 0.005
        self.batch_size = 32
        self.reward_scale = 0.1
        
        global current_state_vector, next_state_vector, phi_matrix
        current_state_vector, next_state_vector, phi_matrix = None, None, None
        
        # We define our memory buffer where we will store our experiences
        # We stores only the 2000 last time steps
        self.memory_buffer= list()
        self.max_memory_buffer = 2000
        
        # We create our model having to hidden layers of 24 units (neurones)
        # The first layer has the same size as a state size
        # The last layer has the size of actions space      
        input_layer = Input(shape=state_size)
        dense_layer_1 = Dense(24, activation='relu')(input_layer)
        dense_layer_2 = Dense(24, activation='relu')(input_layer)
        merged_layer = Concatenate()([dense_layer_1, dense_layer_2])
        final_layer = Dense(action_size, activation = 'linear')(merged_layer)

        self.model = CustomModel(inputs=input_layer, outputs=final_layer)
        self.model.compile(optimizer=Adam(learning_rate=self.lr), loss=dr3)
        
        print(self.model.summary())
        
    # The agent computes the action to perform given a state 
    def compute_action(self, current_state):
        # We sample a variable uniformly over [0,1]
        # if the variable is less than the exploration probability
        #     we choose an action randomly
        # else
        #     we forward the state through the DNN and choose the action 
        #     with the highest Q-value.
        if np.random.uniform(0,1) < self.exploration_proba:
            return np.random.choice(range(self.n_actions))
        q_values = self.model.predict(current_state)[0]
        return np.argmax(q_values) # returns index of highest q value - index corresponds to action

    # when an episode is finished, we update the exploration probability using 
    # espilon greedy algorithm
    def update_exploration_probability(self):
        self.exploration_proba = self.exploration_proba * np.exp(-self.exploration_proba_decay)
        print(self.exploration_proba)
    
    # At each time step, we store the corresponding experience
    def store_episode(self,current_state, action, reward, next_state, done):
        #We use a dictionnary to store them
        self.memory_buffer.append({
            "current_state":current_state,
            "action":action,
            "reward":reward*self.reward_scale, # reward scaling
            "next_state":next_state,
            "done" :done
        })
        # If the size of memory buffer exceeds its maximum, we remove the oldest experience
        if len(self.memory_buffer) > self.max_memory_buffer:
            self.memory_buffer.pop(0)
    
    # At the end of each step, we train our model
    def train(self):
        
        global current_state_vector, next_state_vector, phi_matrix
        global loss
        
        phi_matrix = []
        # We shuffle the memory buffer and select a batch size of experiences
        np.random.shuffle(self.memory_buffer)
        batch_sample = self.memory_buffer[0:self.batch_size]
        
        # We iterate over the selected experiences
        for experience in batch_sample:
            
            # We compute the Q-values of S_t
            q_current_state = self.model.predict(experience["current_state"])
            
            # We compute the Q-target using Bellman optimality equation
            q_target = experience["reward"]
            if not experience["done"]:
                q_target = q_target + self.gamma*np.max(self.model.predict(experience["next_state"])[0])
            q_current_state[0][experience["action"]] = q_target
            
            # function to output final hidden layer feature vectors
            last_hidden_layer = self.model.layers[-3]
            func = K.function([self.model.input], [last_hidden_layer.output])
            
            # vectors for loss function
            current_state_vector = list(func([experience["current_state"], ])[0][0]) # for loss function  
            next_state_vector = list(func([experience["next_state"], ])[0][0]) # for loss function
            
            # train the model
            self.model.fit(experience["current_state"], q_current_state, verbose=0) # input data, output data
            with tf.compat.v1.Session() as sess:
                loss.eval()
            
            # Add output from final hidden layer after each experience to phi matrix
            output = func([experience["current_state"], ])          
            row = list(output[0][0])
            phi_matrix.append(row)
        
        # print rank of phi matrix
        phi_matrix = np.array(phi_matrix)
        rank = np.linalg.matrix_rank(phi_matrix)
        ranks.append(rank)
        print("Rank: " + str(rank))      

In [11]:
# We create our gym environment 
env = gym.make("CartPole-v1")
env.seed(seed)
env.action_space.seed(seed)
# We get the shape of a state and the actions space size
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
# Number of episodes to run
n_episodes = 10
# Max iterations per epiode
max_iteration_ep = 500
# We define our agent
agent = DQNAgent(state_size, action_size)
total_steps = 0

# We iterate over episodes
for e in range(n_episodes):
    # We initialize the first state and reshape it to fit 
    #  with the input layer of the DNN
    current_state = env.reset()
    current_state = np.array([current_state])
    for step in range(max_iteration_ep):
        total_steps = total_steps + 1
        # the agent computes the action to perform
        action = agent.compute_action(current_state)
        # the envrionment runs the action and returns
        # the next state, a reward and whether the agent is done
        next_state, reward, done, _ = env.step(action)
        next_state = np.array([next_state])
        
        # We sotre each experience in the memory buffer
        agent.store_episode(current_state, action, reward, next_state, done)
        
        # if the episode is ended, we leave the loop after
        # updating the exploration probability
        if done:
            print("Episode " + str(e+1) + ":", end=" ")
            agent.update_exploration_probability()
            break
        current_state = next_state
        # if the have at least batch_size experiences in the memory buffer
        # than we train our model
        if total_steps >= agent.batch_size:
            agent.train()

Model: "custom_model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 4)]          0           []                               
                                                                                                  
 dense (Dense)                  (None, 24)           120         ['input_1[0][0]']                
                                                                                                  
 dense_1 (Dense)                (None, 24)           120         ['input_1[0][0]']                
                                                                                                  
 concatenate (Concatenate)      (None, 48)           0           ['dense[0][0]',                  
                                                                  'dense_1[0][0]']     

ValueError: Cannot use the default session to evaluate tensor: the tensor's graph is different from the session's graph. Pass an explicit session to `eval(session=sess)`.

In [None]:
# plot rank collapse

print("Rank Collapse: " + str(max(ranks) - min(ranks)))

plt.scatter(list(range(len(ranks))), ranks)

plt.ylim(0, 25) # set range for y axis
plt.ylabel('rank') # set the label for y axis
plt.xlabel('train iteration') # set the label for x-axis
plt.title("Rank Collapse") # set the title of the graph

plt.show() # display the graph

In [None]:
def make_video():
    env_to_wrap = gym.make('CartPole-v1')
    env = wrappers.Monitor(env_to_wrap, 'videos', force = True)
    env.seed(seed) # set env random seed
    env.action_space.seed(seed) # set env random seed
    rewards = 0
    steps = 0
    done = False
    state = env.reset()
    state = np.array([state])
    while not done:
        action = agent.compute_action(state)
        state, reward, done, _ = env.step(action)
        state = np.array([state])            
        steps += 1
        rewards += reward
    print(rewards)
    env.close()
    env_to_wrap.close()
make_video()