# Approximate q-learning

In this notebook you will teach a __tensorflow__ neural network to do Q-learning.

__Frameworks__ - we'll accept this homework in any deep learning framework. This particular notebook was designed for tensorflow, but you will find it easy to adapt it to almost any python-based deep learning framework.

In [1]:
# XVFB will be launched if you run on a server
# import os
# if os.environ.get("DISPLAY") is str and len(os.environ.get("DISPLAY"))!=0:
#     !bash ../xvfb start
#     %env DISPLAY=:1

In [2]:
import gym
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# env = gym.make("CartPole-v0")
env = gym.make("LunarLander-v2")

env.reset()
n_actions = env.action_space.n
state_dim = env.observation_space.shape
print(n_actions)
print(state_dim)
# env.render("rgb_array")
# plt.imshow(env.render("rgb_array"))

ImportError: No module named 'Box2D'

# Approximate (deep) Q-learning: building the network

In this section we will build and train naive Q-learning with theano/lasagne

First step is initializing input variables

In [3]:
import tensorflow as tf
import tensorflow.contrib.layers as tflayers  # Let's make TF simple again

In [4]:
#create input variables. We'll support multiple states at once
current_states = tf.placeholder(dtype=tf.float32,shape=(None,) + state_dim)
actions = tf.placeholder(tf.int32,shape=[None])
rewards = tf.placeholder(tf.float32,shape=[None])
next_states = tf.placeholder(tf.float32, shape=(None,) + state_dim)
is_end = tf.placeholder(tf.bool,shape=[None])

In [5]:
def net1(inputs, out_dim):
    n1 = tflayers.fully_connected(inputs=inputs, num_outputs = 4 * state_dim[0], activation_fn=tf.nn.tanh)
    n2 = tflayers.fully_connected(inputs=n1, num_outputs = state_dim[0], activation_fn=tf.nn.tanh)
    n3 = tflayers.fully_connected(inputs=n2, num_outputs = out_dim, activation_fn=None)
    return n3

def network(l_states, scope=None, reuse=False):
    assert l_states.get_shape().as_list() == list((None,) + state_dim)
    with tf.variable_scope(scope or "network") as scope:
        if reuse:
            scope.reuse_variables()

        # <Your architecture. Please start with a single-layer network>
        l_qvalues = net1(inputs=l_states,out_dim=n_actions)

        return l_qvalues

#### Predicting Q-values for `current_states`

In [6]:
#get q-values for ALL actions in current_states
predicted_qvalues = network(current_states)

In [7]:
#select q-values for chosen actions
predicted_qvalues_for_actions = predicted_qvalues * tf.one_hot(actions,depth=n_actions,axis=-1)

#### Loss function and `update`
Here we write a function similar to `agent.update`.

In [8]:
predicted_next_qvalues = network(next_states, reuse=True)
gamma = 0.8
# <target Q-values using rewards and predicted_next_qvalues>
target_qvalues_for_actions = rewards + gamma * predicted_next_qvalues
target_qvalues_for_actions = tf.where(
    is_end, 
    tf.zeros_like(target_qvalues_for_actions),
    target_qvalues_for_actions)

In [9]:
#mean squared error loss function
# <mean squared between target_qvalues_for_actions and predicted_qvalues_for_actions>
loss = tf.reduce_mean(tf.reduce_sum((target_qvalues_for_actions - predicted_qvalues_for_actions)**2, axis=-1))

In [10]:
#network updates. Note the small learning rate (for stability)
#Training function that resembles agent.update(state,action,reward,next_state) 
#with 1 more argument meaning is_end
train_step = tf.train.AdamOptimizer(1e-4).minimize(
    loss, var_list=tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="network"))

### Playing the game

In [11]:
# Tensorflow feature - session
sess = tf.InteractiveSession()
# Tensorflow feature 2 - variables initializer
sess.run(tf.global_variables_initializer())
# You can check all your valiables by:
[v.name for v in tf.trainable_variables()]
# they should all starts with "network"

['network/fully_connected/weights:0',
 'network/fully_connected/biases:0',
 'network/fully_connected_1/weights:0',
 'network/fully_connected_1/biases:0',
 'network/fully_connected_2/weights:0',
 'network/fully_connected_2/biases:0']

In [14]:
initial_epsilon = epsilon = 0.5
final_epsilon = 0.01
n_epochs = 1000

def generate_session(t_max=1000):
    """play env with approximate q-learning agent and train it at the same time"""
    
    total_reward = 0
    s = env.reset()
    total_loss = 0
    
    for t in range(t_max):
        
        #get action q-values from the network
        q_values = sess.run(
            predicted_qvalues,feed_dict={current_states : np.array([s])})[0]

        #   <sample action with epsilon-greedy strategy>
        if np.random.sample() < initial_epsilon:
            a = np.asarray(np.random.choice(np.arange(n_actions,dtype=int)))
        else :
            a = np.asarray(np.argmax(q_values))
        new_s,r,done,info = env.step(a)
        
        #train agent one step. Note that we use one-element arrays instead of scalars 
        #because that's what function accepts.
        curr_loss, _ = sess.run(
            [loss, train_step], 
            feed_dict = {
                    current_states:np.array([s]),
                    actions:np.array([a]),
                    rewards: np.array([r]),
                    next_states: np.array([new_s]),
                    is_end: np.array([done])})

        total_reward += r
        total_loss += curr_loss

        s = new_s
        if done: break
            
    return total_reward, total_loss/float(t), t

In [None]:
from tqdm import trange
tr = trange(
    n_epochs,
    desc="mean reward = {:.3f}\tepsilon = {:.3f}\tloss = {:.3f}\tsteps = {:.3f}".format(0.0, 0.0, 0.0, 0.0),
    leave=True)


for i in tr:
    
    sessions = [generate_session() for _ in range(100)] #generate new sessions
    session_rewards, session_loss, session_steps = map(np.array, zip(*sessions))
    
    epsilon -= (initial_epsilon - final_epsilon) / float(n_epochs) 
    
    tr.set_description("mean reward = {:.3f}\tepsilon = {:.3f}\tloss = {:.3f}\tsteps = {:.3f}".format(
        np.mean(session_rewards), epsilon, np.mean(session_loss), np.mean(session_steps)))

    if np.mean(session_rewards) > 0:
        print ("You Win!")
        break
        
    assert epsilon!=0, "Please explore environment"


mean reward = 0.000	epsilon = 0.000	loss = 0.000	steps = 0.000:   0%|          | 0/1000 [00:00<?, ?it/s][A
mean reward = -233.896	epsilon = 0.499	loss = 44.137	steps = 158.050:   0%|          | 2/1000 [01:27<12:39:48, 45.68s/it]

### Video

In [None]:
epsilon=0 #Don't forget to reset epsilon back to initial value if you want to go on training

In [None]:
#record sessions
import gym.wrappers
env = gym.wrappers.Monitor(env,directory="videos",force=True)
sessions = [generate_session() for _ in range(100)]
env.close()
#unwrap 
env = env.env.env
#upload to gym
#gym.upload("./videos/",api_key="<your_api_key>") #you'll need me later

#Warning! If you keep seeing error that reads something like"DoubleWrapError",
#run env=gym.make("CartPole-v0");env.reset();

In [None]:
#show video
from IPython.display import HTML
import os

video_names = list(filter(lambda s:s.endswith(".mp4"),os.listdir("./videos/")))

HTML("""
<video width="640" height="480" controls>
  <source src="{}" type="video/mp4">
</video>
""".format("./videos/"+video_names[-1])) #this may or may not be _last_ video. Try other indices