# In [1]:
import gym
from gym import wrappers
import numpy as np
import random, tempfile, os
from collections import deque
import tensorflow as tf

# In [2]:
class Brain:
    """Neural-network approximator of the action-value function Q(s, .).

    The network maps a batch of states (``nS`` features each) to one Q-value
    per action (``nA`` outputs) and is trained by regressing its outputs onto
    target Q-values with a mean-squared-error loss.
    """

    def __init__(self, size_of_the_state_inputs, size_of_the_action_output, scope='brain',
                 hidden_layer_sizes=(32, 32), lr=0.0001, global_step=None,
                 dir_summaries='./summaries/'):
        """Build the graph under ``scope`` and open a summary writer.

        Args:
            size_of_the_state_inputs: number of features in a state vector.
            size_of_the_action_output: number of actions (network outputs).
            scope: variable scope the network is created in; also used to
                name the summary sub-directory.
            hidden_layer_sizes: units of each hidden ReLU layer, in order.
            lr: learning rate handed to the Adam optimizer.
            global_step: optional step tensor incremented by the train op;
                when omitted, ``tf.train.get_global_step()`` is used.
            dir_summaries: parent directory for the summary files.
        """
        self.nS = size_of_the_state_inputs
        self.nA = size_of_the_action_output
        self.global_step = global_step
        # Copy so that the caller's sequence (or the shared default) is never aliased.
        self.hidden_layer_sizes = list(hidden_layer_sizes)
        self.lr = lr
        self.scope = scope
        with tf.variable_scope(scope):
            self.build_network(network=self.MLP)
        dir_summary = os.path.join(dir_summaries, 'summary_{}'.format(scope))
        if not os.path.exists(dir_summary):
            os.makedirs(dir_summary)
        self.summary_writer = tf.summary.FileWriter(dir_summary)

    def MLP(self, x):
        """Stack one ReLU layer per entry of ``hidden_layer_sizes`` plus a linear head.

        Each configured size yields exactly one hidden layer; an empty
        configuration degenerates to a single linear layer.
        """
        layer = x
        for units in self.hidden_layer_sizes:
            layer = tf.keras.layers.Dense(units, activation='relu')(layer)
        return tf.keras.layers.Dense(self.nA, activation='linear')(layer)

    def build_network(self, network):
        """Create placeholders, predictions, loss, train op and summaries.

        Args:
            network: callable mapping the state placeholder to the Q-value tensor.
        """
        self.X = tf.placeholder(shape=[None, self.nS], dtype=tf.float32, name='X')
        self.y = tf.placeholder(shape=[None, self.nA], dtype=tf.float32, name='y')
        self.predictions = network(self.X)
        self.loss = tf.reduce_mean(tf.squared_difference(self.y, self.predictions))
        # Honour the step passed to the constructor; fall back to the graph's
        # global step (which may itself be None) when none was given.
        if self.global_step is None:
            self.global_step = tf.train.get_global_step()
        self.train_op = tf.contrib.layers.optimize_loss(self.loss,
                                                        global_step=self.global_step,
                                                        learning_rate=self.lr,
                                                        optimizer='Adam')
        self.summaries = tf.summary.merge([
            tf.summary.scalar('loss', self.loss),
            tf.summary.scalar('max_Q_value', tf.reduce_max(self.predictions)),
            tf.summary.scalar('mean_Q_value', tf.reduce_mean(self.predictions)),
        ])

    def predict(self, sess, s):
        """Return the Q-values computed by ``sess`` for the batch of states ``s``."""
        return sess.run(self.predictions, {self.X: s})

    def fit(self, sess, s, r, epochs=1):
        """Run ``epochs`` training steps on states ``s`` against targets ``r``.

        The summaries of the last step are written once, tagged with the
        global-step value fetched in that same step (or without a step when
        no global step exists).

        Args:
            sess: session used to execute the graph.
            s: batch of states fed to the ``X`` placeholder.
            r: batch of target Q-values fed to the ``y`` placeholder.
            epochs: number of optimisation steps to run on this batch.
        """
        fetches = [self.summaries, self.train_op]
        has_step = self.global_step is not None
        if has_step:
            fetches.append(self.global_step)

        results = None
        for _ in range(epochs):
            results = sess.run(fetches, {self.X: s, self.y: r})

        if results is None:
            # epochs <= 0: nothing was run, so there is nothing to log.
            return
        step = results[2] if has_step else None
        self.summary_writer.add_summary(results[0], step)

# In [3]:
# Smoke-test instance: 4 state features in, 4 action values out, default scope.
brain = Brain(size_of_the_state_inputs=4, size_of_the_action_output=4)

# In [5]:
class Memory:
    """Bounded replay buffer of transitions backed by ``collections.deque``.

    Once ``memory_size`` transitions are stored, appending a new one silently
    discards the oldest.
    """

    def __init__(self, memory_size=5000):
        """Create an empty buffer holding at most ``memory_size`` transitions."""
        self.memory = deque(maxlen=memory_size)

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.memory)

    def add_to_memory(self, s, a, r, s_, status):
        """Append the transition ``(s, a, r, s_, status)`` to the buffer."""
        self.memory.append((s, a, r, s_, status))

    def recall_memories(self):
        """Return the stored transitions, oldest first, as a new list."""
        return list(self.memory)

# In [6]:
class Agent:
    """Epsilon-greedy agent trained by experience replay with a target network.

    Two ``Brain`` estimators are kept: ``model`` (trained on every replay) and
    ``target_model`` (updated only through ``target_model_update``).
    """

    def __init__(self, nS, nA, experiment_dir):
        """Create the estimators, the saver and an initialised session.

        Args:
            nS: number of features in a state vector.
            nA: number of available actions.
            experiment_dir: name of the sub-directory of ``./experiments``
                that receives the summaries of the trained model.
        """
        # Initializing
        self.nS = nS
        self.nA = nA
        self.epsilon = 1.0  # exploration-exploitation ratio
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.9994
        self.gamma = 0.99  # reward decay
        self.learning_rate = 0.0001
        self.epochs = 1  # training epochs
        self.batch_size = 32
        self.memory = Memory(memory_size=250000)

        # Creating estimators (keyword names must match Brain.__init__)
        self.experiment_dir = os.path.abspath("./experiments/{}".format(experiment_dir))
        self.global_step = tf.Variable(0, name='global_step', trainable=False)
        self.model = Brain(size_of_the_state_inputs=self.nS,
                           size_of_the_action_output=self.nA,
                           scope="q",
                           lr=self.learning_rate,
                           global_step=self.global_step,
                           dir_summaries=self.experiment_dir)
        self.target_model = Brain(size_of_the_state_inputs=self.nS,
                                  size_of_the_action_output=self.nA,
                                  scope="target_q",
                                  lr=self.learning_rate,
                                  global_step=self.global_step)

        # Assign ops are built once per (source, destination) pair and reused,
        # so repeated weight copies do not keep adding nodes to the graph.
        self._copy_ops = {}

        # Adding an op to initialize the variables.
        init_op = tf.global_variables_initializer()

        # Adding ops to save and restore all the variables.
        self.saver = tf.train.Saver()

        # Setting up the session
        self.sess = tf.Session()
        self.sess.run(init_op)

    def epsilon_update(self, t):
        """Decay epsilon by one multiplicative step while it is above its floor.

        Args:
            t: accepted for interface compatibility; not used by the decay rule.
        """
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def save_weights(self, filename):
        """Save all variables of the session to ``<filename>.ckpt``."""
        save_path = self.saver.save(self.sess, "%s.ckpt" % filename)
        print("Model saved in file: %s" % save_path)

    def load_weights(self, filename):
        """Restore all variables of the session from ``<filename>.ckpt``."""
        self.saver.restore(self.sess, "%s.ckpt" % filename)
        print("Model restored from file")

    @staticmethod
    def _scope_variables(scope):
        """Return the trainable variables under ``scope`` in creation order."""
        # The trailing slash keeps a scope from matching another scope that
        # merely shares its leading characters.
        prefix = scope + '/'
        return [v for v in tf.trainable_variables() if v.name.startswith(prefix)]

    def set_weights(self, model_1, model_2):
        """Copy the trainable parameters of one estimator into another.

        Variables are paired in creation order. Pairing by sorted name is
        unreliable because auto-numbered layer names do not sort numerically
        (``dense_10`` sorts before ``dense_8``).

        Args:
            model_1: estimator to copy the parameters from.
            model_2: estimator to copy the parameters to.

        Raises:
            ValueError: if the two estimators expose a different number of
                trainable variables.
        """
        key = (model_1.scope, model_2.scope)
        operations = self._copy_ops.get(key)
        if operations is None:
            source_params = self._scope_variables(model_1.scope)
            target_params = self._scope_variables(model_2.scope)
            if len(source_params) != len(target_params):
                raise ValueError(
                    "Cannot copy weights: scope '{}' has {} trainable variables "
                    "but scope '{}' has {}".format(model_1.scope, len(source_params),
                                                   model_2.scope, len(target_params)))
            operations = [dst.assign(src)
                          for src, dst in zip(source_params, target_params)]
            self._copy_ops[key] = operations
        self.sess.run(operations)

    def target_model_update(self):
        """Overwrite the target model's weights with the trained model's."""
        self.set_weights(self.model, self.target_model)

    def act(self, s):
        """Choose an action for state ``s`` with an epsilon-greedy policy.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the action with the highest predicted Q-value.

        Args:
            s: state batch; only the first row of the prediction is used, so
                a single state with a leading batch axis is expected.
        """
        if np.random.rand() <= self.epsilon:
            return np.random.choice(self.nA)
        # Estimating q for all possible actions and returning the best one
        q = self.model.predict(self.sess, s)[0]
        return np.argmax(q)

    def replay(self):
        """Train the model on one random mini-batch drawn from memory.

        For non-terminal transitions the target is
        ``r + gamma * Q_target(s', argmax_a Q_model(s', a))``; for terminal
        transitions it is the observed reward alone. Only the Q-value of the
        action actually taken is replaced in the regression target.

        Raises:
            ValueError: propagated from ``random.sample`` when memory holds
                fewer than ``batch_size`` transitions.
        """
        # Picking up a random batch from memory and splitting it column-wise.
        # Unzipping avoids building one array out of heterogeneous tuples.
        batch = random.sample(self.memory.recall_memories(), self.batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)

        s = np.vstack(states)
        a = np.asarray(actions, dtype=int)
        r = np.asarray(rewards, dtype=float)  # fresh array: safe to update in place
        s_p = np.vstack(next_states)
        # Row indices of the transitions that did NOT end the episode
        non_terminal = np.flatnonzero(~np.asarray(dones, dtype=bool))

        # Next-state Q-values from the trained model and from the target model
        q_next_model = self.model.predict(self.sess, s_p)
        q_next_target = self.target_model.predict(self.sess, s_p)

        if non_terminal.size > 0:
            # The trained model selects the next action, the target model scores it
            best_next_action = np.argmax(q_next_model[non_terminal], axis=1)
            r[non_terminal] += self.gamma * q_next_target[non_terminal, best_next_action]

        # Keep the model's own estimates except for the actions actually taken
        expected_reward = self.model.predict(self.sess, s)
        expected_reward[np.arange(self.batch_size), a] = r

        # Re-fitting the model against the updated targets
        self.model.fit(self.sess, s, expected_reward, epochs=self.epochs)