In [1]:
import numpy as np
import tensorflow as tf
import gym
import pandas as pd
import seaborn as sns
sns.set()

# Deep $Q$ networks (for real)
In the previous notebook, we used a neural network to learn to play `FrozenLake` with the same performance as our other tabular approaches (using `numpy` and `tensorflow`). In this notebook, we add additional elements to our agent that improve stability.

### Target Network
The $Q$-learning update rule involves a maximization over outputs of the same network:
$$
Q(s_t, a_t) \gets Q(s_t, a_t) + \alpha \left( r_t + (1-d_t)\gamma \max_{a_{t+1}} \left( Q(s_{t+1}, a_{t+1}) \right) - Q(s_t, a_t) \right)
$$
(Note that we write $Q_\theta$ to mean the $Q$ function parametrized by $\theta$).

When learning this function using a differentiable function approximator like a neural network, this causes a problem. Generally, this update *increases* the prediction for $Q(s_t, a_t)$. Since the function approximator is differentiable, it also increases predictions for states that are similar to $s_t$. You can imagine this like picking up a fishing net; the area around where you grab the net also gets pulled up around it. Frequently, $s_{t+1}$ is similar to $s_t$ due to a small temporal separation between them! Since we are boostrapping our predictions for $Q(s_t, a_t)$ (that is, we use the output of the network in the prediction target itself), our predictions tend to increase quickly and can diverge from what is reasonable.

To combat this, we have two networks: $\theta$ and $\theta_\text{targ}$ that share the same architecture. $\theta_\text{targ}$ initially has the same parameters as $\theta$, but we freeze it so that it stays the same for some period of time. We use $\theta_\text{targ}$ in our target $ \max_{a_{t+1}} \left( Q(s_{t+1}, a_{t+1}) \right)$ so that the maximization step doesn't influence $\theta$, which updates at every time step.

We implement this in `tensorflow` by making use of `scope`s (which allow us to describe hierarchies of `tensorflow` variables using a naming scheme similar to directories).

### Experience Replay

As a neural network, our DQN relies on data to perform well. If we only feed the agent the most recent data (updating it 'online'), then our agent is only good at making predictions about the most recent data. It may become less accurate for older or less common states, a problem known as **catastrophic forgetting**. Thus, we make use of a **buffer** of state transitions $\langle s_t, a_t, r_t, s_{t+1}, d_t \rangle$ that store the agent's experiences, and update the network by randomly sampling batches of transitions from the buffer. This improves stability of training.

Our implementation will assume that the environments are vectorized.

In [2]:
class ReplayBuffer:
    def __init__(self, state_shape, num_envs, max_size=1000000):
        self.max_size = max_size
        self.num_envs = num_envs
        self.s_t_buf = np.zeros((self.max_size, *state_shape))
        self.a_t_buf = np.zeros(self.max_size)
        self.r_t_buf = np.zeros(self.max_size)
        self.s_t_next_buf =  np.zeros((self.max_size, *state_shape))
        self.d_t_buf = np.zeros(self.max_size)
        self.pointer = 0
        self.filled = 0
    
    def store(self, s_t, a_t, r_t, s_t_next, d_t):
        indices = slice(self.pointer, self.pointer+self.num_envs)
        self.s_t_buf[indices] = s_t
        self.a_t_buf[indices] = a_t
        self.r_t_buf[indices] = r_t
        self.s_t_next_buf[indices] = s_t_next
        self.d_t_buf[indices] = d_t
        self.pointer = (self.pointer + num_envs) % self.max_size
        self.filled = min(self.max_size, self.filled+self.num_envs)
        
    def get(self, number=32):
        number = min(number, self.filled)
        indices = np.random.choice(np.arange(self.filled), number, replace=False)
        return self.s_t_buf[indices], self.a_t_buf[indices], self.r_t_buf[indices], self.s_t_next_buf[indices], self.d_t_buf[indices]

In [3]:
class Network:
    def __init__(self, hidden_units):
        self.layers = [tf.keras.layers.Dense(units, 
                                    activation='relu', 
                                    kernel_initializer='he_uniform', 
                                    use_bias=False) for units in hidden_units[:-1]]
        self.layers.append(tf.keras.layers.Dense(hidden_units[-1],
                                                activation='linear',
                                                kernel_initializer='zeros',
                                                use_bias=False))
    
    def call(self, x):
        for layer in self.layers:
            x = layer(x)
        return x
    
    def get_weights(self):
        return [layer.get_weights() for layer in self.layers]
    
    def set_weights(self, weights_list):
        for layer, weights in zip(self.layers, weights_list):
            layer.set_weights(weights)

In [4]:
class Agent:
    def __init__(self, state_shape, num_actions, num_envs,
                 epsilon_i=1.0, 
                 epsilon_f=0.0, 
                 n_epsilon=0.1, 
                 alpha=0.5, 
                 gamma = 0.95,
                 hidden_units = []
                ):
        
        tf.reset_default_graph()
        
        self.epsilon_i = epsilon_i
        self.epsilon_f = epsilon_f
        self.epsilon = tf.get_variable("epsilon", initializer=tf.constant(self.epsilon_i))
        self.n_epsilon = n_epsilon
        self.state_shape = state_shape
        self.num_actions = num_actions
        self.num_envs = num_envs
        self.alpha = alpha
        self.gamma = gamma

        self.s_t_ph = tf.placeholder(shape=(None, *state_shape), dtype=tf.float32, name="state")
        self.a_t_ph = tf.placeholder(shape=(None, ), dtype=tf.int32, name="action")
        self.r_t_ph = tf.placeholder(shape=(None, ), dtype=tf.float32, name="reward")
        self.s_t_next_ph = tf.placeholder(shape=(None, *state_shape), dtype=tf.float32, name="next_state")
        self.d_t_ph = tf.placeholder(shape=(None, ), dtype=tf.float32, name="done")
        self.n_ph = tf.placeholder(shape=(), dtype=tf.float32, name="n")
        
        self.Q = Network(hidden_units+[self.num_actions])
        self.Q_targ = Network(hidden_units+[self.num_actions])

        self.decay_epsilon = self.decay_epsilon_tf(self.n_ph)
        self.act = self.act_tf(self.s_t_ph)
        self.update = self.update_tf(self.s_t_ph, self.a_t_ph, self.r_t_ph, self.s_t_next_ph, self.d_t_ph)
        self.sync_params = self.sync_params_tf()
        
        self.test1 = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Q')
        self.test2 = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Q_targ')
        
    def sync_params_tf(self):
        self.Q_targ.set_weights(self.Q.get_weights())
        
    def decay_epsilon_tf(self, n):
        return tf.assign(self.epsilon, tf.maximum(
            self.epsilon_f, 
            self.epsilon_i - (n/self.n_epsilon)*(self.epsilon_i - self.epsilon_f)))
    
    def act_tf(self, s_t):
        return tf.where(tf.random_uniform(shape=(self.num_envs, ), minval=0, maxval=1, dtype=tf.float32) < self.epsilon,
                      tf.random_uniform(shape=(self.num_envs, ), minval=0, maxval=self.num_actions, dtype=tf.int32) ,
                      tf.argmax(self.Q.call(s_t), output_type=tf.int32, axis=1))
    
    def update_tf(self, s_t, a_t, r_t, s_t_next, d_t):
        Q_next = tf.reduce_max(self.Q.call(s_t_next), axis=1)
#         Q_next = tf.reduce_max(self.Q_targ.call(s_t_next), axis=1)
        TD = r_t + (1-d_t)*self.gamma*Q_next
        Q_pred = tf.reduce_sum(self.Q.call(s_t) * tf.one_hot(a_t, self.num_actions), axis=1)
        loss = tf.reduce_mean(0.5*(Q_pred - tf.stop_gradient(TD))**2)
        return tf.train.GradientDescentOptimizer(self.alpha).minimize(loss)

In [5]:
def train(env, agent, T=100000, sync_every=1):
    
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    
    t1, t2 = sess.run([agent.test1, agent.test2])
    
    print(t1)
    print(t2)
    
    sess.run(agent.sync_params)
    
    t_11, t_12 = sess.run([agent.test1, agent.test2])
    print(t_11)
    print(t_12)
    
    print(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES))
    
    buffer = ReplayBuffer(env.observation_space.shape, env.num_envs)
    
    rewards = []
    episode_rewards = 0
    
    s_t = env.reset()
    
    for t in range(T):
        if t%1000 == 0:
            print(f'{100*t/T}%', end='\r')
        if t%sync_every == 0:
            sess.run(agent.sync_params)
            
        a_t = sess.run(agent.act, 
                       feed_dict={
                           agent.s_t_ph:s_t
                       }
                      )   

        s_t_next, r_t, d_t, info = env.step(a_t)
        
        buffer.store(s_t, a_t, r_t, s_t_next, d_t)
        
        s_t_train, a_t_train, r_t_train, s_t_next_train, d_t_train = buffer.get()
        
        sess.run([agent.update, agent.decay_epsilon], 
                 feed_dict={
                     agent.s_t_ph:s_t_train,
                     agent.a_t_ph:a_t_train,
                     agent.r_t_ph:r_t_train,
                     agent.s_t_next_ph:s_t_next_train,
                     agent.d_t_ph:d_t_train,
                     agent.n_ph:t/T,
                 }
                )
        
        s_t = s_t_next
        episode_rewards += r_t
        
        for env_index in range(env.num_envs):
            if d_t[env_index]:
                rewards.append(episode_rewards[env_index])
                episode_rewards[env_index] = 0
                s_t[env_index] = env.reset_at(env_index)
            
    sess.close()
    return rewards

In [6]:
def plot(data, window=100):
    sns.lineplot(
        data=data.rolling(window=window).mean()[window-1::window]
    )

In [7]:
class DiscreteToBoxWrapper(gym.ObservationWrapper):
    def __init__(self, env):
        super().__init__(env)
        assert isinstance(env.observation_space, gym.spaces.Discrete), \
            "Should only be used to wrap Discrete envs."
        self.n = self.observation_space.n
        self.observation_space = gym.spaces.Box(0, 1, (self.n,))
    
    def observation(self, obs):
        new_obs = np.zeros(self.n)
        new_obs[obs] = 1
        return new_obs

In [8]:
class VectorizedEnvWrapper(gym.Wrapper):
    def __init__(self, make_env, num_envs=1):
        super().__init__(make_env())
        self.num_envs = num_envs
        self.envs = [make_env() for env_index in range(num_envs)]
    
    def reset(self):
        return np.asarray([env.reset() for env in self.envs])
    
    def reset_at(self, env_index):
        return self.envs[env_index].reset()
    
    def step(self, actions):
        next_states, rewards, dones, infos = [], [], [], []
        for env, action in zip(self.envs, actions):
            next_state, reward, done, info = env.step(action)
            next_states.append(next_state)
            rewards.append(reward)
            dones.append(done)
            infos.append(info)
        return np.asarray(next_states), np.asarray(rewards), \
            np.asarray(dones), np.asarray(infos)

In [9]:
num_envs = 10
make_env = lambda: DiscreteToBoxWrapper(gym.make("FrozenLake-v0"))
env = VectorizedEnvWrapper(make_env, num_envs=num_envs)
state_shape = env.observation_space.shape
num_actions = env.action_space.n
agent = Agent(state_shape, num_actions, num_envs, alpha=0.8, gamma=0.95, epsilon_i=1.0, epsilon_f=0.0)

rewards = train(env, agent, T=20000)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.


ValueError: You called `set_weights(weights)` on layer "dense_1" with a  weight list of length 1, but the layer was expecting 0 weights. Provided weights: [array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],...

In [None]:
plot(pd.DataFrame(rewards))