In [1]:
from __future__ import print_function, division, unicode_literals

import os
import sys
import numpy as np

In [2]:
def reset_graph(seed=42):
    """Reset the default TensorFlow graph and reseed TensorFlow and NumPy.

    Useful in case this notebook is rerun without restarting the kernel:
    the default graph is reset, then the TensorFlow random seed and NumPy's
    global random seed are both set to ``seed``.
    """
    tf.reset_default_graph()
    # The TensorFlow seed is set after the reset, followed by the NumPy seed.
    for set_seed in (tf.set_random_seed, np.random.seed):
        set_seed(seed)

In [3]:
%matplotlib nbagg
import matplotlib
import matplotlib.animation as animation
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

PROJECT_ROOT_DIR = '.'
CHAPTER_ID = 'rl'

In [4]:
def save_fig(fig_id, tight_layout=True):
    """Save the current matplotlib figure as a 300-dpi PNG.

    The file is written to
    ``<PROJECT_ROOT_DIR>/images/<CHAPTER_ID>/<fig_id>.png``; the directory is
    created when it does not exist yet.

    Args:
        fig_id: Base name of the image file (without extension).
        tight_layout: When true, call ``plt.tight_layout()`` before saving.
    """
    image_dir = os.path.join(PROJECT_ROOT_DIR, 'images', CHAPTER_ID)
    # savefig does not create missing directories, so on a fresh checkout the
    # first saved figure would otherwise fail with FileNotFoundError.
    os.makedirs(image_dir, exist_ok=True)
    path = os.path.join(image_dir, fig_id + '.png')
    print('Saving figure', fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format='png', dpi=300)

In [None]:
import gym
env = gym.make('MsPacman-v0')

In [None]:
obs = env.reset() # returns the first frame

In [None]:
obs.shape

In [None]:
# Render the current frame as an RGB array and show it without axes.
img = env.render(mode='rgb_array')
plt.figure(figsize=(5, 4))
plt.imshow(img)
plt.axis('off')
save_fig('MsPacman')  # also writes the figure to disk as a PNG
plt.show();

In [None]:
# to verify if the image is the same as the observation
(img == obs).all()

In [None]:
def plot_environment(env, figsize=(5, 4)):
    """Render ``env`` as an RGB array and display it in a new, axis-free figure."""
    # Close the current figure first, or else nbagg sometimes plots in the
    # previous cell.
    plt.close()
    plt.figure(figsize=figsize)
    frame = env.render(mode='rgb_array')
    plt.imshow(frame)
    plt.axis('off')
    plt.show()

In [None]:
# find the action space for the environment
env.action_space

In [None]:
# Restart the game, then replay a fixed action sequence: 110 steps of
# action 3 followed by 40 steps of action 8.
# NOTE(review): 3 and 8 are presumably "left" and "lower-left" in the
# MsPacman action set — confirm with env.unwrapped.get_action_meanings().
env.reset()
for step in range(110):
    env.step(3)
for step in range(40):
    env.step(8)

In [None]:
plot_environment(env)

In [None]:
obs, reward, done, info = env.step(0)

In [None]:
obs.shape

In [None]:
reward

In [None]:
done

In [None]:
info

In [None]:
# info is an environment-specific dictionary that can provide 
# some extra information about the internal state of the 
# environment. This is useful for debugging, but your 
# agent should not use this information for learning
# (it would be cheating)

In [None]:
# Play one game (capped at n_max_steps steps), drawing a new random action
# every n_change_steps steps and recording every rendered frame in `frames`.
frames = []

n_max_steps = 1000
n_change_steps = 10

obs = env.reset()
for step in range(n_max_steps):
    # Record the frame *before* acting, so frames[0] is the initial state.
    img = env.render('rgb_array')
    frames.append(img)
    if step % n_change_steps == 0:
        action = env.action_space.sample() # play randomly
    obs, reward, done, info = env.step(action)
    if done:
        # The environment reported the end of the episode.
        break

In [None]:
def update_scene(num, frames, patch):
    """Animation callback: display frame number ``num`` in the image artist.

    Returns the same ``patch`` object after its data has been replaced.
    """
    current_frame = frames[num]
    patch.set_data(current_frame)
    return patch

def plot_animation(frames, repeat=False, interval=40):
    """Create a matplotlib animation that plays ``frames`` in a fresh figure.

    Args:
        frames: Sequence of images; the first one is shown initially.
        repeat: Forwarded to ``FuncAnimation``.
        interval: Forwarded to ``FuncAnimation``.

    Returns:
        The ``animation.FuncAnimation`` object (keep a reference to it so the
        animation is not garbage collected).
    """
    plt.close()
    figure = plt.figure()
    image_artist = plt.imshow(frames[0])
    plt.axis('off')
    animation_options = dict(
        fargs=(frames, image_artist),
        frames=len(frames),
        repeat=repeat,
        interval=interval,
    )
    return animation.FuncAnimation(figure, update_scene, **animation_options)

In [None]:
video = plot_animation(frames)
plt.show()

In [None]:
env.close()

In [None]:
# starting with a simpler environment
env = gym.make('CartPole-v0')
obs = env.reset()
obs

In [None]:
# the four parameters in the observation array are:
# 0: the cart's horizontal position
# 1: its velocity
# 2: the angle of the pole
# 3: the angular velocity

In [None]:
# some environments open up in a separate window even when the rgb_array mode is specified
# helper function to run the environment in jupyter itself
from PIL import Image, ImageDraw

# Probe for pyglet's OpenGL bindings. If the import fails for any reason the
# flag is set to False and render_cart_pole draws the scene with PIL instead.
try:
    from pyglet.gl import gl_info
    openai_cart_pole_rendering = True
except Exception:
    openai_cart_pole_rendering = False
    
def render_cart_pole(env, obs):
    """Return an image of the cart-pole as an array.

    Uses the environment's own renderer when pyglet is available; otherwise
    draws a simplified scene with PIL from the observation alone.

    Args:
        env: The CartPole environment (only used on the pyglet path).
        obs: Observation unpacked as
            ``(position, velocity, angle, angular_velocity)``.

    Returns:
        The rendered frame: ``env.render(mode='rgb_array')`` on the pyglet
        path, otherwise a NumPy array built from a 600x500 RGB PIL image.
    """
    if openai_cart_pole_rendering:
        return env.render(mode='rgb_array')

    img_w = 600
    img_h = 500
    cart_w = img_w // 12
    cart_h = img_h // 15
    pole_len = img_h // 3.5
    pole_w = img_w // 80 + 1
    x_width = 2  # width of the visible world, in position units
    bg_col = (255, 255, 255)
    cart_col = 0x000000
    pole_col = 0x669acc

    # Only the position and the pole angle are needed to draw the scene.
    pos, _, ang, _ = obs
    img = Image.new('RGB', (img_w, img_h), bg_col)
    draw = ImageDraw.Draw(img)
    # Position 0 maps to the horizontal centre of the image. The centre is
    # img_w // 2 regardless of x_width (it used to be img_w // x_width, which
    # is the centre only because x_width happens to be 2).
    cart_x = pos * img_w // x_width + img_w // 2
    cart_y = img_h * 95 // 100
    top_pole_x = cart_x + pole_len * np.sin(ang)
    top_pole_y = cart_y - cart_h // 2 - pole_len * np.cos(ang)
    # Ground line, then the cart, then the pole starting at the cart's top.
    draw.line((0, cart_y, img_w, cart_y), fill=0)
    draw.rectangle((cart_x - cart_w // 2, cart_y - cart_h // 2, cart_x + cart_w // 2, cart_y + cart_h // 2), fill=cart_col)
    draw.line((cart_x, cart_y - cart_h // 2, top_pole_x, top_pole_y), fill=pole_col, width=pole_w)
    return np.array(img)
    
def plot_cart_pole(env, obs):
    """Show the cart-pole for observation ``obs`` in a fresh, axis-free plot."""
    plt.close()
    plt.imshow(render_cart_pole(env, obs))
    plt.axis('off')
    plt.show()

In [None]:
plot_cart_pole(env, obs)

In [None]:
# action space
env.action_space

In [None]:
obs = env.reset()
while True:
    obs, reward, done, info = env.step(0)
    if done:
        break

In [None]:
plt.close()

In [None]:
img = render_cart_pole(env, obs)
plt.imshow(img)
plt.axis('off')
save_fig('cart_pole_plot')

In [None]:
img.shape

In [None]:
obs = env.reset()
while True:
    obs, reward, done, info = env.step(1)
    if done:
        break

In [None]:
plot_cart_pole(env, obs)

In [None]:
# Test a simple hard-coded policy for one episode (at most n_max_steps steps),
# recording one frame per step in `frames`.
frames = []
n_max_steps = 1000
n_change_steps = 10  # not used by this cell

obs = env.reset()
for step in range(n_max_steps):
    img = render_cart_pole(env, obs)
    frames.append(img)
    
    # hard-coded policy: take action 0 when the pole angle is negative,
    # action 1 otherwise
    position, velocity, angle, angular_velocity = obs
    if angle < 0:
        action = 0
    else:
        action = 1
    
    obs, reward, done, info = env.step(action)
    if done:
        break

In [None]:
video = plot_animation(frames)
plt.show()

In [None]:
# Neural network policy
import tensorflow as tf

# Network sizes: one input per observation value, one small hidden layer and
# a single output.
n_inputs = 4 # env.observation_space.shape
n_hidden = 4
n_outputs = 1
initializer = tf.contrib.layers.variance_scaling_initializer()

# Observation batch -> ELU hidden layer -> one sigmoid output per row.
X = tf.placeholder(tf.float32, shape=[None, n_inputs])
hidden = tf.layers.dense(X, n_hidden, activation=tf.nn.elu, kernel_initializer=initializer)
outputs = tf.layers.dense(hidden, n_outputs, activation=tf.nn.sigmoid, kernel_initializer=initializer)

# Column 0 is the network output p, column 1 is 1 - p; one action index is
# then sampled per row (tf.multinomial takes log-probabilities, hence tf.log).
p_left_and_right = tf.concat(axis=1, values=[outputs, 1 - outputs])
action = tf.multinomial(tf.log(p_left_and_right), num_samples=1)

init = tf.global_variables_initializer()

In [None]:
# Randomly initialize this neural network and use it to play one game,
# recording one frame per step in `frames`.
n_max_steps = 1000
frames = []

with tf.Session() as sess:
    init.run()
    obs = env.reset()
    for step in range(n_max_steps):
        img = render_cart_pole(env, obs)
        frames.append(img)
        # The placeholder expects a batch, so feed the observation as a
        # single row; action_val[0][0] is the sampled action for that row.
        action_val = action.eval(feed_dict={X: obs.reshape(1, n_inputs)})
        obs, reward, done, info = env.step(action_val[0][0])
        if done:
            break
            
env.close()

In [None]:
video = plot_animation(frames)
plt.show()

In [None]:
import tensorflow as tf

# Rebuild the policy network from scratch, this time with a target
# placeholder `y` so it can be trained with a cross-entropy loss.
reset_graph()

n_inputs = 4
n_hidden = 4
n_outputs = 1
learning_rate = 0.01

initializer = tf.contrib.layers.variance_scaling_initializer()

X = tf.placeholder(tf.float32, shape=[None, n_inputs])
y = tf.placeholder(tf.float32, shape=[None, n_outputs])  # target probability of action 0

hidden = tf.layers.dense(X, n_hidden, activation=tf.nn.elu, kernel_initializer=initializer)
# Keep the raw logits: the loss below is computed from them directly.
logits = tf.layers.dense(hidden, n_outputs)
outputs = tf.nn.sigmoid(logits) # probability of action 0 (left)
p_left_and_right = tf.concat(axis=1, values=[outputs, 1 - outputs])
action = tf.multinomial(tf.log(p_left_and_right), num_samples=1)

cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=logits)
optimizer = tf.train.AdamOptimizer(learning_rate)
training_op = optimizer.minimize(cross_entropy)

init = tf.global_variables_initializer()
saver = tf.train.Saver()

In [None]:
# Play 10 different environments in parallel and train for 1000 iterations.
# Each iteration takes one step in every environment and performs one
# training step on the batch of current observations.
n_environments = 10
n_iterations = 1000

envs = [gym.make('CartPole-v0') for _ in range(n_environments)]
observations = [env.reset() for env in envs]

with tf.Session() as sess:
    init.run()
    for iteration in range(n_iterations):
        target_probas = np.array([([1.] if obs[2] < 0 else [0.]) for obs in observations]) # if angle < 0 we want proba(left)=1., or else proba(left)=0.
        action_val, _ = sess.run([action, training_op], feed_dict={X: np.array(observations), y: target_probas})
        # NOTE: the loop variable `env` rebinds the notebook-level `env`.
        for env_index, env in enumerate(envs):
            obs, reward, done, info = env.step(action_val[env_index][0])
            # Restart a finished environment so the batch always holds
            # n_environments live observations.
            observations[env_index] = obs if not done else env.reset()
    saver.save(sess, './my_policy_net_basic.ckpt')
        
for env in envs:
    env.close()

In [None]:
def render_policy_net(model_path, action, X, n_max_steps=1000):
    """Play one CartPole-v0 episode with a saved policy and return its frames.

    Args:
        model_path: Checkpoint path restored with the notebook-level ``saver``.
        action: Tensor evaluated to sample the action for the observation
            fed to ``X``.
        X: Placeholder receiving the observation as a single-row batch.
        n_max_steps: Upper bound on the number of steps played.

    Returns:
        List with one frame per step, as produced by ``render_cart_pole``.
    """
    frames = []
    env = gym.make('CartPole-v0')
    try:
        obs = env.reset()
        with tf.Session() as sess:
            saver.restore(sess, model_path)
            for step in range(n_max_steps):
                frames.append(render_cart_pole(env, obs))
                # reshape(1, -1) feeds the observation as one row without
                # relying on the notebook-level n_inputs being in sync.
                action_val = action.eval(feed_dict={X: obs.reshape(1, -1)})
                obs, reward, done, info = env.step(action_val[0][0])
                if done:
                    break
    finally:
        # Release the environment even when restoring the checkpoint or
        # stepping raises (e.g. missing checkpoint file).
        env.close()
    return frames

In [None]:
frames = render_policy_net('./my_policy_net_basic.ckpt', action, X)
video = plot_animation(frames)
plt.show()

In [None]:
# Policy gradients
import tensorflow as tf

# Build the policy-gradient graph: same network as before, but gradients are
# exposed so they can be computed per step and applied later after weighting.
reset_graph()

n_inputs = 4
n_hidden = 4
n_outputs = 1

learning_rate = 0.01

initializer = tf.contrib.layers.variance_scaling_initializer()

X = tf.placeholder(tf.float32, shape=[None, n_inputs])

hidden = tf.layers.dense(X, n_hidden, activation=tf.nn.elu, kernel_initializer=initializer)
logits = tf.layers.dense(hidden, n_outputs)
outputs = tf.nn.sigmoid(logits)
p_left_and_right = tf.concat(axis=1, values=[outputs, 1 - outputs])
action = tf.multinomial(tf.log(p_left_and_right), num_samples=1)

# Use the sampled action as its own target: y is 1 when action 0 was sampled
# and 0 when action 1 was sampled.
y = 1. - tf.to_float(action)
loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=logits)
optimizer = tf.train.AdamOptimizer(learning_rate)
# Split minimize() in two: compute the gradients here, apply them later.
grads_and_vars = optimizer.compute_gradients(loss)
gradients = [grad for grad, variable in grads_and_vars]
# One placeholder per gradient tensor, through which externally computed
# gradient values are fed to the optimizer.
gradient_placeholders = []
grads_and_vars_feed = []
for grad, variable in grads_and_vars:
    gradient_placeholder = tf.placeholder(tf.float32, shape=grad.get_shape())
    gradient_placeholders.append(gradient_placeholder)
    grads_and_vars_feed.append((gradient_placeholder, variable))
training_op = optimizer.apply_gradients(grads_and_vars_feed)

init = tf.global_variables_initializer()
saver = tf.train.Saver()

In [None]:
def discount_rewards(rewards, discount_rate):
    """Return the discounted cumulative reward of every step.

    Element ``i`` of the result is
    ``rewards[i] + discount_rate * result[i + 1]`` (with the value after the
    last step taken as 0).

    Args:
        rewards: Sequence of per-step rewards.
        discount_rate: Factor applied to the following step's value.

    Returns:
        A float NumPy array with ``len(rewards)`` elements.
    """
    n_steps = len(rewards)
    discounted = np.zeros(n_steps)
    running_total = 0
    # Walk backwards so each step can reuse the value of the step after it.
    for idx in range(n_steps - 1, -1, -1):
        running_total = rewards[idx] + running_total * discount_rate
        discounted[idx] = running_total
    return discounted

In [None]:
def discount_and_normalize_rewards(all_rewards, discount_rate):
    """Discount every game's rewards, then standardise them across all games.

    Args:
        all_rewards: One sequence of per-step rewards per game.
        discount_rate: Discount factor passed to ``discount_rewards``.

    Returns:
        A list with one NumPy array per game holding
        ``(discounted - mean) / std``, where mean and std are computed over
        all steps of all games. When there is no spread (std is zero up to
        rounding) the centred values are returned without dividing; an empty
        ``all_rewards`` yields ``[]``.
    """
    all_discounted_rewards = [discount_rewards(rewards, discount_rate) for rewards in all_rewards]
    if not all_discounted_rewards:
        # np.concatenate raises on an empty list; no games, nothing to do.
        return []
    flat_rewards = np.concatenate(all_discounted_rewards)
    if flat_rewards.size == 0:
        # Games without any step: mean/std are undefined, keep the empty arrays.
        return all_discounted_rewards
    reward_mean = flat_rewards.mean()
    reward_std = flat_rewards.std()
    if reward_std <= np.finfo(float).eps * np.abs(flat_rewards).max():
        # All values are (numerically) identical: dividing would produce
        # NaN/inf, which must never reach the gradient update.
        return [discounted_rewards - reward_mean for discounted_rewards in all_discounted_rewards]
    return [(discounted_rewards - reward_mean) / reward_std for discounted_rewards in all_discounted_rewards]

In [None]:
discount_rewards([10, 0, -50], discount_rate=0.8)

In [None]:
discount_and_normalize_rewards([[10, 0, -50], [10, 20]], discount_rate=0.8)

In [None]:
# Policy-gradient training loop: play several games per iteration, score every
# step by its normalized discounted reward, and apply the reward-weighted mean
# of the per-step gradients.
env = gym.make('CartPole-v0')

n_games_per_update = 10
n_max_steps = 1000
n_iterations = 250
save_iterations = 10
discount_rate = 0.95

with tf.Session() as sess:
    init.run()
    for iteration in range(n_iterations):
        print(f'\rIteration: {iteration}', end='')
        all_rewards = []    # one list of per-step rewards per game
        all_gradients = []  # one list of per-step gradient values per game
        for game in range(n_games_per_update):
            current_rewards = []
            current_gradients = []
            obs = env.reset()
            for step in range(n_max_steps):
                # Sample an action and, in the same run, get the gradients
                # for that sampled action.
                action_val, gradients_val = sess.run([action, gradients], feed_dict={X: obs.reshape(1, n_inputs)})
                obs, reward, done, info = env.step(action_val[0][0])
                current_rewards.append(reward)
                current_gradients.append(gradients_val)
                if done:
                    break

            all_rewards.append(current_rewards)
            all_gradients.append(current_gradients)
        all_rewards = discount_and_normalize_rewards(all_rewards, discount_rate=discount_rate)
        feed_dict = {}
        for var_index, gradient_placeholder in enumerate(gradient_placeholders):
            # For this variable: multiply every step's gradient by that step's
            # normalized reward, then average over all steps of all games.
            mean_gradients = np.mean([reward * all_gradients[game_index][step][var_index] for game_index, rewards in enumerate(all_rewards) for step, reward in enumerate(rewards)], axis=0)
            feed_dict[gradient_placeholder] = mean_gradients
        sess.run(training_op, feed_dict=feed_dict)
        # Checkpoint every save_iterations iterations (iterations 0, 10, ...).
        if iteration % save_iterations == 0:
            saver.save(sess, './my_policy_nt_pg.ckpt')

In [None]:
env.close()

In [None]:
frames = render_policy_net('./my_policy_nt_pg.ckpt', action, X, n_max_steps=1000)
video = plot_animation(frames)
plt.show()

In [None]:
# Policy gradients
import tensorflow as tf

# Start from a fresh graph and rebuild the policy network (same architecture
# as in the previous policy-gradient section).
reset_graph()

n_inputs = 4
n_hidden = 4
n_outputs = 1

learning_rate = 1e-2

initializer = tf.contrib.layers.variance_scaling_initializer()

X = tf.placeholder(tf.float32, shape=[None, n_inputs])

hidden = tf.layers.dense(X, n_hidden, activation=tf.nn.elu, kernel_initializer=initializer)
logits = tf.layers.dense(hidden, n_outputs)
outputs = tf.nn.sigmoid(logits)
# Column 0 is the sigmoid output p, column 1 is 1 - p; one action index is
# sampled per row from these two probabilities.
p_left_and_right = tf.concat(axis=1, values=[outputs, 1 - outputs])
action = tf.multinomial(tf.log(p_left_and_right), num_samples=1)

In [None]:
action.shape

In [None]:
action

In [None]:
# Use the sampled action as its own target: y is 1 when action 0 was sampled
# and 0 when action 1 was sampled.
y = 1. - tf.to_float(action)
cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=logits)
optimizer = tf.train.AdamOptimizer(learning_rate)
# Compute the gradients now; they are applied later through placeholders.
grads_and_vars = optimizer.compute_gradients(cross_entropy)
gradients = [grad for grad, variable in grads_and_vars]
# One placeholder per gradient tensor, paired with the variable it updates.
gradient_placeholders = []
grads_and_vars_feed = []
for grad, variable in grads_and_vars:
    gradient_placeholder = tf.placeholder(tf.float32, shape=grad.get_shape())
    gradient_placeholders.append(gradient_placeholder)
    grads_and_vars_feed.append((gradient_placeholder, variable))
training_op = optimizer.apply_gradients(grads_and_vars_feed)

init = tf.global_variables_initializer()
saver = tf.train.Saver()

In [None]:
def discount_rewards(rewards, discount_rate):
    """Return the discounted cumulative reward of every step.

    Element ``i`` of the result is
    ``rewards[i] + discount_rate * result[i + 1]`` (with the value after the
    last step taken as 0).

    Args:
        rewards: Sequence of per-step rewards.
        discount_rate: Factor applied to the following step's value.

    Returns:
        A float NumPy array with ``len(rewards)`` elements.
    """
    discounted_rewards = np.zeros(len(rewards))
    cumulative_rewards = 0
    # Walk backwards so each step can reuse the value of the step after it.
    for step in reversed(range(len(rewards))):
        cumulative_rewards = rewards[step] + cumulative_rewards * discount_rate
        discounted_rewards[step] = cumulative_rewards
    return discounted_rewards

def discount_and_normalize_rewards(all_rewards, discount_rate):
    """Discount every game's rewards, then standardise them across all games.

    Args:
        all_rewards: One sequence of per-step rewards per game.
        discount_rate: Discount factor passed to ``discount_rewards``.

    Returns:
        A list with one NumPy array per game holding
        ``(discounted - mean) / std``, where mean and std are computed over
        all steps of all games. When there is no spread (std is zero up to
        rounding) the centred values are returned without dividing; an empty
        ``all_rewards`` yields ``[]``.
    """
    all_discounted_rewards = [discount_rewards(rewards, discount_rate) for rewards in all_rewards]
    if not all_discounted_rewards:
        # np.concatenate raises on an empty list; no games, nothing to do.
        return []
    flat_rewards = np.concatenate(all_discounted_rewards)
    if flat_rewards.size == 0:
        # Games without any step: mean/std are undefined, keep the empty arrays.
        return all_discounted_rewards
    rewards_mean = flat_rewards.mean()
    rewards_std = flat_rewards.std()
    if rewards_std <= np.finfo(float).eps * np.abs(flat_rewards).max():
        # All values are (numerically) identical: dividing would produce
        # NaN/inf, which must never reach the gradient update.
        return [discounted_rewards - rewards_mean for discounted_rewards in all_discounted_rewards]
    return [(discounted_rewards - rewards_mean) / rewards_std for discounted_rewards in all_discounted_rewards]

In [None]:
discount_rewards([10, 0, -50], discount_rate=0.8)

In [None]:
discount_and_normalize_rewards([[10, 0, -50], [10, 20]], discount_rate=0.8)

In [None]:
# Policy-gradient training loop: play several games per iteration, score every
# step by its normalized discounted reward, and apply the reward-weighted mean
# of the per-step gradients.
env = gym.make('CartPole-v0')

n_iterations = 250
n_games_per_update = 10
save_iterations = 10
n_max_steps = 1000
discount_rate = 0.95

with tf.Session() as sess:
    init.run()
    for iteration in range(n_iterations):
        print(f'\rIteration: {iteration}', end='')
        all_rewards = []    # one list of per-step rewards per game
        all_gradients = []  # one list of per-step gradient values per game
        for game in range(n_games_per_update):
            current_rewards = []
            current_gradients = []
            obs = env.reset()
            for step in range(n_max_steps):
                # Sample an action and, in the same run, get the gradients
                # for that sampled action.
                action_val, gradients_val = sess.run([action, gradients], feed_dict={X: obs.reshape(1, n_inputs)})
                obs, reward, done, info = env.step(action_val[0][0])
                current_rewards.append(reward)
                current_gradients.append(gradients_val)
                if done:
                    break
            all_rewards.append(current_rewards)
            all_gradients.append(current_gradients)

        all_rewards = discount_and_normalize_rewards(all_rewards, discount_rate=discount_rate)
        feed_dict = {}
        for var_index, gradient_placeholder in enumerate(gradient_placeholders):
            # For this variable: multiply every step's gradient by that step's
            # normalized reward, then average over all steps of all games.
            mean_gradients = np.mean([reward * all_gradients[game_index][step][var_index] for game_index, rewards in enumerate(all_rewards) for step, reward in enumerate(rewards)], axis=0)
            feed_dict[gradient_placeholder] = mean_gradients
        sess.run(training_op, feed_dict=feed_dict)
        # Checkpoint every save_iterations iterations (iterations 0, 10, ...).
        if iteration % save_iterations == 0:
            saver.save(sess, './policy_gradients_cartpole-v0-v0.00')

In [None]:
env.close()

In [None]:
frames = render_policy_net('./policy_gradients_cartpole-v0-v0.00', action, X, n_max_steps=1000)
video = plot_animation(frames)
plt.show()

In [5]:
# Markov chains
# Row s holds the probabilities of moving from state s to states 0..3.
# State 3 only ever transitions to itself.
transition_probabilities = [
    [0.7, 0.2, 0.0, 0.1],
    [0.0, 0.0, 0.9, 0.1],
    [0.0, 1.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 1.0],
]

# Maximum number of states printed per sequence by print_sequence.
n_max_steps = 50

def print_sequence(start_state=0):
    """Print one random walk through the Markov chain on a single line.

    Starting from ``start_state``, prints each visited state and stops once
    state 3 has been printed. If ``n_max_steps`` states are printed without
    reaching state 3, the line ends with ``...`` instead.
    """
    state = start_state
    print('States:', end=' ')
    reached_end = False
    for _ in range(n_max_steps):
        print(state, end=' ')
        reached_end = state == 3
        if reached_end:
            break
        state = np.random.choice(range(4), p=transition_probabilities[state])
    if not reached_end:
        # The step budget ran out before state 3 was reached.
        print('...', end='')
    print()
    
for _ in range(10):
    print_sequence()

States: 0 0 0 0 0 1 3 
States: 0 0 3 
States: 0 3 
States: 0 1 2 1 2 1 2 1 2 1 2 1 2 1 3 
States: 0 0 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 ...
States: 0 0 3 
States: 0 0 0 0 0 0 0 1 2 1 2 1 3 
States: 0 3 
States: 0 0 0 0 1 3 
States: 0 3 


In [6]:
# Markov decision process tables, all indexed [state][action][next_state].
# A None entry marks an action that is not available in that state (compare
# with possible_actions below).
transition_probabilities = [
    [[0.7, 0.3, 0.0], [1.0, 0.0, 0.0], [0.8, 0.2, 0.0]],
    [[0.0, 1.0, 0.0], None, [0.0, 0.0, 1.0]],
    [None, [0.8, 0.1, 0.1], None]
]
# transition_probabilities have the dimensions s x a x s'

# Reward received for the transition (state, action) -> next_state.
rewards = [
    [[+10, 0, 0], [0, 0, 0], [0, 0, 0]],
    [[0, 0, 0], [0, 0, 0], [0, 0, -50]],
    [[0, 0, 0], [+40, 0, 0], [0, 0, 0]],
]

# Actions that may be taken in each state.
possible_actions = [[0, 1, 2], [0, 2], [1]]

def policy_fire(state):
    """Fixed policy: action 0 in state 0, action 2 in state 1, action 1 in state 2."""
    action_for_state = [0, 2, 1]
    return action_for_state[state]

def policy_random(state):
    """Pick at random among the actions listed for ``state`` in ``possible_actions``."""
    allowed_actions = possible_actions[state]
    return np.random.choice(allowed_actions)

def policy_safe(state):
    """Fixed policy: action 0 in states 0 and 1, action 1 in state 2."""
    action_for_state = [0, 0, 1]
    return action_for_state[state]

class MDPEnvironment(object):
    """Minimal simulator for the MDP described by the notebook-level tables.

    Transitions are sampled from ``transition_probabilities[state][action]``
    and rewards are read from ``rewards[state][action][next_state]``.
    """

    def __init__(self, start_state=0):
        self.start_state = start_state
        self.reset()

    def reset(self):
        """Go back to the start state and clear the accumulated reward."""
        self.total_rewards = 0
        self.state = self.start_state

    def step(self, action):
        """Take ``action`` in the current state.

        Returns:
            Tuple ``(next_state, reward)``; the reward is also added to
            ``self.total_rewards``.

        Raises:
            ValueError: If ``action`` is not available in the current state
                (its entry in ``transition_probabilities`` is ``None``).
        """
        probabilities = transition_probabilities[self.state][action]
        if probabilities is None:
            # np.random.choice(p=None) would silently sample a next state
            # uniformly, hiding the invalid action.
            raise ValueError('action {} is not possible in state {}'.format(action, self.state))
        # One candidate next state per entry of the probability row.
        next_state = np.random.choice(len(probabilities), p=probabilities)
        reward = rewards[self.state][action][next_state]
        self.state = next_state
        self.total_rewards += reward
        return self.state, reward
    
def run_episode(policy, n_steps, start_state=0, display=True):
    """Run one episode of the MDP under ``policy`` and return its total reward.

    Args:
        policy: Callable mapping the current state to the action to take.
        n_steps: Number of steps to simulate.
        start_state: State in which the episode starts.
        display: When true, print the first 10 visited states (each followed
            by its reward in parentheses when non-zero), then the total.

    Returns:
        Sum of the rewards collected during the episode.
    """
    n_displayed_steps = 10
    # Pass start_state through; constructing the environment without it
    # would start every episode in the default state 0.
    env = MDPEnvironment(start_state)
    if display:
        print('States (+rewards):', end=' ')
    for step in range(n_steps):
        if display:
            if step == n_displayed_steps:
                print('...', end=' ')
            elif step < n_displayed_steps:
                print(env.state, end=' ')
        action = policy(env.state)
        _, reward = env.step(action)
        if display and step < n_displayed_steps and reward:
            print(f'({reward})', end=' ')
    if display:
        print('Total rewards: ', env.total_rewards)
    return env.total_rewards

# Compare the three policies over 1000 episodes of 100 steps each, showing
# the first 5 episodes of each in detail.
for policy in (policy_fire, policy_random, policy_safe):
    all_totals = []
    print(policy.__name__)
    for episode in range(1000):
        all_totals.append(run_episode(policy, n_steps=100, display=(episode < 5)))
    # '.1f' requests one decimal; a bare '1f' is a minimum width and prints
    # the default six decimals.
    print('Summary: mean={:.1f}, std={:.1f}, min={}, max={}'.format(np.mean(all_totals), np.std(all_totals), np.min(all_totals), np.max(all_totals)))
    print()

policy_fire
States (+rewards): 0 (10) 0 (10) 0 (10) 0 1 (-50) 2 (40) 0 (10) 0 (10) 0 (10) 0 (10) ... Total rewards:  120
States (+rewards): 0 (10) 0 (10) 0 (10) 0 (10) 0 1 (-50) 2 (40) 0 (10) 0 (10) 0 ... Total rewards:  210
States (+rewards): 0 (10) 0 (10) 0 (10) 0 (10) 0 (10) 0 1 (-50) 2 (40) 0 (10) 0 (10) ... Total rewards:  220
States (+rewards): 0 (10) 0 (10) 0 (10) 0 1 (-50) 2 (40) 0 (10) 0 (10) 0 (10) 0 (10) ... Total rewards:  -40
States (+rewards): 0 (10) 0 (10) 0 1 (-50) 2 (40) 0 (10) 0 (10) 0 (10) 0 (10) 0 ... Total rewards:  210
Summary: mean=119.0, std=132.643875, min=-280, max=470

policy_random
States (+rewards): 0 0 0 1 (-50) 2 (40) 0 0 0 (10) 0 (10) 0 ... Total rewards:  -40
States (+rewards): 0 0 1 1 (-50) 2 1 (-50) 2 (40) 0 0 0 ... Total rewards:  -60
States (+rewards): 0 0 (10) 0 0 0 0 (10) 0 0 0 0 ... Total rewards:  0
States (+rewards): 0 0 (10) 0 0 (10) 0 1 (-50) 2 (40) 0 0 0 (10) ... Total rewards:  110
States (+rewards): 0 0 1 (-50) 2 (40) 0 0 0 1 1 (-50) 2 (40

In [None]:
# Q-learning
n_states = 3
n_actions = 3
n_steps = 20000
alpha = 0.01  # weight given to each new estimate in the update below
gamma = 0.99  # factor applied to the next state's best Q-value

exploration_policy = policy_random
# Start every (state, action) pair at -inf, then set the possible actions to
# 0 so that np.max over a state's row only ever picks a possible action.
q_values = np.full((n_states, n_actions), -np.inf)
for state, actions in enumerate(possible_actions):
    q_values[state][actions] = 0
    
env = MDPEnvironment()
for step in range(n_steps):
    action = exploration_policy(env.state)
    state = env.state  # remember the state the action was taken in
    next_state, reward = env.step(action)
    next_value = np.max(q_values[next_state])
    # Blend the old estimate with the new one (reward + discounted best next value).
    q_values[state, action] = (1 - alpha) * q_values[state, action] + alpha * (reward + gamma * next_value)