# Reinforcement Learning

In [None]:
# Common imports
import gym
import numpy as np

# Visualization imports
import matplotlib
# to enable interactive figures in a live notebook session
matplotlib.use("nbagg")
import matplotlib.animation as animation
import matplotlib.pyplot as plt
# Use larger fonts for axis labels and tick labels in every figure
plt.rcParams.update({
    "axes.labelsize": 14,
    "xtick.labelsize": 12,
    "ytick.labelsize": 12,
})


In [None]:
# To make notebook's output stable across runs
def reset_graph(seed=42):
    """Reset the default TensorFlow graph and re-seed the random generators.

    Seeds NumPy's global generator with ``seed``, clears the default
    TensorFlow graph and then sets the TensorFlow random seed to the same
    value.

    ``tf`` is resolved when the function is called, so TensorFlow must have
    been imported as ``tf`` before the first call.
    """
    np.random.seed(seed)
    tf.reset_default_graph()
    tf.set_random_seed(seed)

# Helpers for rendering animation of the environments
def update_scene(num, frames, patch):
    """Show frame number ``num`` on ``patch``.

    Calls ``patch.set_data`` with ``frames[num]`` and returns a one-element
    tuple containing ``patch``.
    """
    current_frame = frames[num]
    patch.set_data(current_frame)
    return (patch,)

def plot_animation(frames, repeat=False, interval=40):
    """Create an animation that plays ``frames`` in a new figure.

    Args:
        frames: sequence of images; ``frames[0]`` is drawn first and the
            remaining ones are shown through ``update_scene``.
        repeat: forwarded to ``FuncAnimation``.
        interval: forwarded to ``FuncAnimation``.

    Returns:
        The ``animation.FuncAnimation`` object.
    """
    # Close the current figure first, or else nbagg sometimes plots in the
    # previous cell
    plt.close()
    fig = plt.figure()
    patch = plt.imshow(frames[0])
    plt.axis('off')
    animation_options = dict(
        fargs=(frames, patch),
        frames=len(frames),
        repeat=repeat,
        interval=interval,
    )
    return animation.FuncAnimation(fig, update_scene, **animation_options)

## Attributes of an environment
Space objects - describe the valid actions and observations
Discrete space - allows a fixed range of non-negative integers as actions
Box space - allows an n-dimensional box of valid observations

In [None]:
# Create the environment and look up its two spaces once
env = gym.make("CartPole-v0")
action_space = env.action_space
observation_space = env.observation_space

# Report the action space, the observation space and the observation bounds
print("Actions :", action_space)
print("Observations :", observation_space)
print("Observations - lower limit :", observation_space.low)
print("Observations - higher limit :", observation_space.high)

# Nothing is simulated in this cell, so release the environment straight away
env.close()

## Cart pole environment where the agent performs random actions

In [None]:
# Create the environment
env = gym.make("CartPole-v0")

# Initialize the environment
obs = env.reset()

# Rendered frames, one per simulated step
frames = []

try:
    # Sample 100 steps with uniformly random actions
    for _ in range(100):
        img = env.render(mode="rgb_array")
        frames.append(img)

        obs, reward, done, info = env.step(env.action_space.sample())
        # Stepping an environment after it reported done=True is undefined
        # behaviour, so start a new episode instead of carrying on.
        if done:
            obs = env.reset()
finally:
    # Release the environment even if rendering or stepping raised
    env.close()

In [None]:
# Animate the frames recorded from the random agent and display the figure
video = plot_animation(frames)
plt.show()

## Cart pole environment with hardcoded policy

In [None]:
# Create the CartPole-v0 environment used for the hardcoded-policy run
env = gym.make("CartPole-v0")

# hardcoded policy
def basic_policy(angle):
    """Return action 0 when ``angle`` is negative and action 1 otherwise."""
    if angle < 0:
        return 0
    return 1

# Rendered frames, one per simulated step
frames = []

n_max_steps = 1000
n_change_steps = 10  # NOTE(review): not referenced anywhere in the visible code

# Initialize the environment
obs = env.reset()

try:
    for step in range(n_max_steps):
        img = env.render(mode="rgb_array")
        frames.append(img)

        # environment observations; only the angle is used by the policy
        position, velocity, angle, angular_velocity = obs
        # determine the next action using the basic policy based on the angle
        action = basic_policy(angle)
        # perform the action
        obs, reward, done, info = env.step(action)
        if done:
            break
finally:
    # Close the environment even if rendering or stepping raised
    env.close()

In [None]:
# Animate the frames recorded from the hardcoded-policy run, looping the
# playback (repeat=True), and display the figure
video = plot_animation(frames, repeat=True)
plt.show()

## Cart pole environment with an untrained neural network policy (random actions)

In [None]:
import tensorflow as tf

# Build the graph of a small policy network: 4 inputs -> 4 ELU hidden units ->
# 1 sigmoid output. No loss or training op is defined in this cell, so the
# weights keep their initial values when the network is run.
reset_graph()

# Construction Phase
# 1) Specify the number of input, hidden and output neurons
n_inputs = 4
n_hidden = 4
n_outputs = 1
# Xavier and He initialization
initializer = tf.contrib.layers.variance_scaling_initializer()

# 2) Use placeholders to represent the data
# (a batch of observations, one row of n_inputs values per observation)
X = tf.placeholder(tf.float32, shape=[None, n_inputs], name="X")

# 3) Create the different layers of the network
with tf.name_scope("nn"):
    hidden = tf.layers.dense(X, n_hidden, activation=tf.nn.elu, kernel_initializer=initializer, name="hidden")
    # The sigmoid keeps the single output between 0.0 and 1.0
    outputs = tf.layers.dense(hidden, n_outputs, activation=tf.nn.sigmoid, kernel_initializer=initializer, name="outputs")

# 4) Select a random action based on the estimated probabilities
# Column 0 holds `outputs` and column 1 holds `1 - outputs`; one index
# (0 or 1) is then sampled per row from the log of these two values.
p_left_right = tf.concat(axis=1, values=[outputs, 1 - outputs])
action = tf.multinomial(tf.log(p_left_right), num_samples=1)

# 5) Initializer
init = tf.global_variables_initializer()


In [None]:
# Create the environment
env = gym.make("CartPole-v0")

# Execution Phase
frames = []
n_max_steps = 1000

with tf.Session() as sess:
    # Give every variable of the graph its initial value
    sess.run(init)
    # Initialize the environment
    obs = env.reset()
    for _ in range(n_max_steps):
        # Record the current frame before acting
        img = env.render(mode="rgb_array")
        frames.append(img)
        # Sample an action from the neural policy for this single observation
        single_observation = obs.reshape(1, n_inputs)
        action_val = sess.run(action, feed_dict={X: single_observation})
        obs, reward, done, info = env.step(action_val[0, 0])
        if done:
            break

env.close()

In [None]:
# Animate the frames recorded while the neural policy acted, looping the
# playback (repeat=True), and display the figure
video = plot_animation(frames, repeat=True)
plt.show()

## Cart pole environment with a neural network that learns a basic policy
Basic policy: if angle < 0, move left; otherwise, move right.

In [None]:
import tensorflow as tf

# Build the graph of a policy network that is trained, with a cross-entropy
# loss, to reproduce target actions fed through the placeholder `y`.

# parameters
learning_rate = 0.01
reset_graph()

# Construction Phase
# 1) Specify the number of input, hidden and output neurons
n_inputs = 4
n_hidden = 4
n_outputs = 1
# Xavier and He initialization
initializer = tf.contrib.layers.variance_scaling_initializer()

# 2) Use placeholders to represent the observations and target actions
X = tf.placeholder(tf.float32, shape=[None, n_inputs], name="X")
y = tf.placeholder(tf.float32, shape=[None, n_outputs], name="y")

# 3) Create the different layers of the network
with tf.name_scope("nn"):
    hidden = tf.layers.dense(X, n_hidden, activation=tf.nn.elu, kernel_initializer=initializer, name="hidden")
    # The output layer produces raw logits; the logistic (sigmoid) activation
    # is applied separately to turn them into a probability from 0.0 to 1.0
    logits = tf.layers.dense(hidden, n_outputs, name="logits")
    outputs = tf.nn.sigmoid(logits)
    # To pick a random action: column 0 holds `outputs`, column 1 holds
    # `1 - outputs`, and one index (0 or 1) is sampled per row
    p_left_right = tf.concat(axis=1, values=[outputs, 1 - outputs])
    action = tf.multinomial(tf.log(p_left_right), num_samples=1)

# 4) Define the cost function, i.e. the cross entropy used to train the network;
# cross entropy penalizes a model that estimates a low probability for a target class.
# The loss is computed from the logits, not from the sigmoid output.
cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=logits)
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(cross_entropy)

# 5) Initializer and Saver
init = tf.global_variables_initializer()
saver = tf.train.Saver()

In [None]:
# Execution phase

# parameters for the environments
n_environment = 10
n_iterations = 1000

# Several environments are stepped side by side, one observation per environment
envs = [gym.make("CartPole-v0") for _ in range(n_environment)]
observations = [env.reset() for env in envs]

with tf.Session() as sess:
    sess.run(init)
    for iteration in range(n_iterations):
        # Target for each environment: 1.0 when obs[2] is negative, otherwise 0.0
        target_probas = np.array([[float(obs[2] < 0)] for obs in observations])
        # Sample one action per environment and run one training step
        action_val, _ = sess.run(
            [action, training_op],
            feed_dict={X: np.array(observations), y: target_probas},
        )

        for env_index, env in enumerate(envs):
            obs, reward, done, info = env.step(action_val[env_index, 0])
            # A finished episode is restarted so every slot keeps a live observation
            observations[env_index] = env.reset() if done else obs

    saver.save(sess, "./my_policy_net_basic.ckpt")

for env in envs:
    env.close()

In [None]:
def render_policy_net(model_path, action, X, n_max_steps=1000):
    """Play one CartPole-v0 episode with a saved policy and return its frames.

    The variables are restored from ``model_path`` through the module-level
    ``saver``, so the graph that ``action`` and ``X`` belong to (and its
    ``saver``) must already have been built.

    Args:
        model_path: checkpoint path handed to ``saver.restore``.
        action: tensor evaluated at every step to obtain the action to play.
        X: placeholder fed with the current observation as a one-row batch.
        n_max_steps: upper bound on the number of steps that are played.

    Returns:
        List of rendered frames, one per step, up to and including the step
        on which the environment reported ``done``.
    """
    frames = []
    env = gym.make("CartPole-v0")
    try:
        obs = env.reset()
        with tf.Session() as sess:
            saver.restore(sess, model_path)
            for _ in range(n_max_steps):
                img = env.render(mode="rgb_array")
                frames.append(img)
                # reshape(1, -1) builds the one-row batch without depending on
                # the global n_inputs
                action_val = action.eval(feed_dict={X: obs.reshape(1, -1)})
                obs, reward, done, info = env.step(action_val[0][0])
                if done:
                    break
    finally:
        # Close the environment even if restoring or stepping raised
        env.close()
    return frames

In [None]:
# Replay the policy saved in ./my_policy_net_basic.ckpt and collect its frames
frames = render_policy_net("./my_policy_net_basic.ckpt", action, X)

In [None]:
# Animate the frames returned by render_policy_net and display the figure
video = plot_animation(frames)
plt.show()

## Cart pole environment with a neural network that learns using policy gradients

In [None]:
import tensorflow as tf

# Build the graph used for policy-gradient training: the network samples an
# action, the gradients that would make that action more likely are exposed for
# fetching, and a separate op applies gradients fed in through placeholders.
reset_graph()

# parameters
learning_rate = 0.01

# Construction phase
# 1) Specify the number of neurons in the input, hidden and output layers
n_inputs = 4
n_hidden = 4
n_outputs = 1

# Xavier and He initializations
initializer = tf.contrib.layers.variance_scaling_initializer()

# 2) Use placeholder to represent the input observations
X = tf.placeholder(tf.float32, shape=[None, n_inputs], name="X")

# 3) Create the different layers of the network
with tf.name_scope("nn"):
    hidden = tf.layers.dense(X, n_hidden, activation=tf.nn.elu, kernel_initializer=initializer, name="hidden")
    # The output layer produces raw logits; the logistic (sigmoid) activation
    # turns them into a probability between 0.0 and 1.0
    logits = tf.layers.dense(hidden, n_outputs, name="logits")
    outputs = tf.nn.sigmoid(logits)
    # to pick a random action: column 0 holds `outputs`, column 1 holds
    # `1 - outputs`, and one index (0 or 1) is sampled per row
    p_left_right = tf.concat(axis=1, values=[outputs, 1 - outputs])
    action = tf.multinomial(tf.log(p_left_right), num_samples=1)

# assume the action taken is the best action:
# target prob 1.0 - action left 0
# target prob 0.0 - action right 1
y = 1 - tf.to_float(action)

# 4) Define the cost function i.e. the cross entropy to train the network
# cross entropy penalizes a model that predicts low probability for a target
cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=logits)
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
# Get the gradients (computed but not applied here, unlike minimize())
grads_vars = optimizer.compute_gradients(cross_entropy)
gradients = [grads for grads, var in grads_vars]
# Create one placeholder per gradient, with the same shape, so that tweaked
# gradients can be fed back in and applied to the matching variable
gradients_placeholders = []
grads_vars_feed = []

for grad, var in grads_vars:
    gradient_placeholder = tf.placeholder(tf.float32, shape=grad.get_shape())
    gradients_placeholders.append(gradient_placeholder)
    grads_vars_feed.append((gradient_placeholder, var))
# The training op applies whatever is fed into the placeholders
training_op = optimizer.apply_gradients(grads_vars_feed)

# 5) Initializer and Saver
init = tf.global_variables_initializer()
saver = tf.train.Saver()



In [None]:
def discount_rewards(rewards, discount_rate):
    """Return the discounted cumulative reward of every step.

    Entry ``i`` of the result is ``rewards[i]`` plus ``discount_rate`` times
    entry ``i + 1``; the last entry is simply the last reward.

    Args:
        rewards: sequence of per-step rewards of one episode.
        discount_rate: factor applied to the running total at each step back.

    Returns:
        A NumPy float array with one value per reward.
    """
    discounted = np.empty(len(rewards))
    running_total = 0
    # Walk the episode backwards, filling the result from its last slot
    for offset, reward in enumerate(reversed(rewards), start=1):
        running_total = reward + running_total * discount_rate
        discounted[-offset] = running_total
    return discounted

def discount_normalize_rewards(all_rewards, discount_rate):
    """Discount every episode's rewards, then standardize them jointly.

    Each reward sequence is passed through ``discount_rewards``; the mean and
    standard deviation are computed over all discounted values pooled
    together and used to normalize every episode.

    Args:
        all_rewards: list of per-episode reward sequences.
        discount_rate: discount factor forwarded to ``discount_rewards``.

    Returns:
        List of arrays, one per episode, of normalized discounted rewards.
    """
    per_episode = [discount_rewards(episode, discount_rate) for episode in all_rewards]
    pooled = np.concatenate(per_episode)
    mean, std = pooled.mean(), pooled.std()
    normalized = []
    for discounted in per_episode:
        normalized.append((discounted - mean) / std)
    return normalized

In [None]:
# Execution phase

# create the game environment
env = gym.make("CartPole-v0")
# parameters
# number of training iterations
n_iterations = 250
# max steps per episode
n_max_steps = 1000
# train the policy for every 10 games
n_games_per_update = 10
# save model for every 10 training iterations
save_iterations = 10
discount_rate = 0.95

with tf.Session() as sess:
    sess.run(init)
    for iteration in range(n_iterations):
        # "\r" moves back to the start of the line so the counter overwrites itself
        print("\rIteration: {}".format(iteration), end="")
        all_rewards = []  # all sequences of rewards for each episode
        all_gradients = []  # gradients saved at each step of each episode
        for game in range(n_games_per_update):
            current_rewards = []  # all rewards of current episode
            current_gradients = []  # all gradients of current episode
            # initialize the env
            obs = env.reset()
            for step in range(n_max_steps):
                # sample an action and fetch the gradients that would make it more likely
                action_val, gradients_val = sess.run([action, gradients], feed_dict={X: obs.reshape(1, n_inputs)})
                obs, reward, done, info = env.step(action_val[0][0])
                current_rewards.append(reward)
                current_gradients.append(gradients_val)
                if done:
                    break
            all_rewards.append(current_rewards)
            all_gradients.append(current_gradients)

        # Policy update after every n_games_per_update episodes
        all_rewards = discount_normalize_rewards(all_rewards, discount_rate)
        feed_dict = {}
        for var_index, gradient_placeholder in enumerate(gradients_placeholders):
            # Weight each step's gradient by that step's normalized discounted
            # reward, then average over every step of every game
            mean_gradients = np.mean([reward * all_gradients[game_index][step][var_index]
                                      for game_index, rewards in enumerate(all_rewards)
                                      for step, reward in enumerate(rewards)], axis=0)
            feed_dict[gradient_placeholder] = mean_gradients
        sess.run(training_op, feed_dict=feed_dict)
        if iteration % save_iterations == 0:
            saver.save(sess, "./my_policy_net_pg.ckpt")

    # The last iteration is generally not a multiple of save_iterations, so
    # save once more to keep the fully trained policy in the checkpoint
    saver.save(sess, "./my_policy_net_pg.ckpt")

In [None]:
# Release the environment that was used for policy-gradient training
env.close()

In [None]:
# Replay the policy saved in ./my_policy_net_pg.ckpt for at most 2000 steps
# and collect its frames
frames = render_policy_net("./my_policy_net_pg.ckpt", action, X, n_max_steps=2000)

In [None]:
# Animate the frames returned by render_policy_net and display the figure
video = plot_animation(frames)
plt.show()