<a href="https://colab.research.google.com/github/andyk/vanilla_policy_gradient/blob/master/PolicyGradientFromScratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [25]:
%tensorflow_version 2.x

import tensorflow as tf
import tensorflow.keras as keras
import tensorflow_probability as tfp
import numpy as np
import gym

num_epochs = 100
num_episodes_per_epoch = 10
max_steps_per_episode = 200
discount_rate = 0.9

model = keras.Sequential([keras.layers.Dense(4, activation='relu'),
                          keras.layers.Dense(1, activation='sigmoid')])
optimizer = keras.optimizers.Adam(lr=0.01)
#TODO: implement my own binary cross entropy function so 
#      that i see how the derivation of `grad E_tau~pi{theta}[r(tau)]` is implemented in code.
#      grads = sum_episodes sum_{timesteps t} grad log pi_{theta}(a_t, s_t) * r(s_t, a_t)
#      theta <- theta + alpha * theta * grads
loss_fn = keras.losses.binary_crossentropy

for epoch_num in range(num_epochs):
    #frames = []
    env = gym.make("CartPole-v1")
    grads = []
    rewards = []
    for episode_num in range(num_episodes_per_epoch):
        rewards.append([])
        grads.append([])
        obs = env.reset()
        done = False
        for step_num in range(max_steps_per_episode):
            #frames.append(env.render(mode="rgb_array"))
            with tf.GradientTape() as tape:
                leftprob = model(np.array(obs)[np.newaxis])
                action = tfp.distributions.Bernoulli(probs=leftprob).sample()[0][0].numpy() # in CartPole-v1, 0=left, 1=right
                loss = tf.reduce_mean(loss_fn(action, leftprob))
            obs, reward, done, _ = env.step(action)
            rewards[-1].append(reward)
            grads[-1].append(tape.gradient(loss, model.trainable_variables))
            if done:
                #print("Epoch {} episode {} had {} steps".format(epoch_num, episode_num, step_num+1))
                break
        env.close()
    print("Epoch {} avg ep length: {}".format(epoch_num, float(sum(([len(x) for x in rewards])) / float(num_episodes_per_epoch))))

    # Compute discounted normalized rewards
    d_rewards = None
    for reward_list in rewards:
        for i in range(len(reward_list)-2, -1, -1):
            reward_list[i] += reward_list[i+1] * discount_rate
        if d_rewards is not None:
            d_rewards = tf.concat([d_rewards, [reward_list]], axis=0)
        else:
            d_rewards = tf.ragged.constant([reward_list]) 

    # Normalize rewards
    avg_rewards = tf.math.reduce_mean(d_rewards, keepdims=False)
    std_rewards = tf.math.reduce_std(d_rewards.flat_values)
    normalized_rewards = (d_rewards - avg_rewards) / std_rewards

    # weight the loss function gradients by the normalized discounted rewards
    avg_weighted_grads = []
    for model_var_num in range(len(model.trainable_variables)):
        weighted_grads = [reward * grads[ep_num][st_num][model_var_num]
            for ep_num, rewards in enumerate(normalized_rewards)
                for st_num, reward in (enumerate(rewards))]
        avg_weighted_grads.append(tf.reduce_mean(weighted_grads, axis=0))

    optimizer.apply_gradients(zip(avg_weighted_grads, model.trainable_variables))



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

Epoch 0 avg ep length: 37.9
Epoch 1 avg ep length: 23.7
Epoch 2 avg ep length: 24.0
Epoch 3 avg ep length: 25.9
Epoch 4 avg ep length: 23.2
Epoch 5 avg ep length: 25.3
Epoch 6 avg ep length: 28.3
Epoch 7 avg ep length: 45.3
Epoch 8 avg ep length: 36.8
Epoch 9 avg ep length: 32.9
Epoch 10 avg ep length: 38.1
Epoch 11 avg ep length: 29.1
Epoch 12 avg ep length: 34.0
Epoch 13 avg ep length: 41.7
Epoch 14 avg ep length: 36.1
Epoch 15 avg ep length: 30.2
Epoch 16 avg ep length: 53.2
Epoch 17 avg ep length: 47.4
Epoch 18 avg ep length: 37.7
Epoch 19 avg ep length: 38.7
Epoch 20 avg ep length: 35.9
Epoch 21 avg ep length: 35.1
Epoch 22 avg ep length: 43.0
Epoch 23 avg ep length: 36.6
Epoch 24 avg 

In [0]:
plot_animation(frames)

In [1]:
# The following is copied & pasted from Aurelien Geron's O'Reilly book
# example code in his notebook called 18_reinforcement_learning.ipynb

##########################################################
##### SETUP NECESSARY IMPORTS
##########################################################

%tensorflow_version 2.x

# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# TensorFlow ≥2.0-preview is required
import tensorflow as tf
from tensorflow import keras
assert tf.__version__ >= "2.0"

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)
tf.random.set_seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# To get smooth animations
import matplotlib.animation as animation
mpl.rc('animation', html='jshtml')

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "rl"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

try:
    did_install
except:
    #remove " > /dev/null 2>&1" to see what is going on under the hood
    !pip install gym pyvirtualdisplay > /dev/null 2>&1
    !apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1

    !apt-get update > /dev/null 2>&1
    !apt-get install cmake > /dev/null 2>&1
    !pip install --upgrade setuptools 2>&1
    !pip install ez_setup > /dev/null 2>&1
    !pip install gym[atari] > /dev/null 2>&1
    did_install = True

import gym
from gym import logger as gymlogger
from gym.wrappers import Monitor
gymlogger.set_level(40) #error only
import tensorflow as tf
import numpy as np
import random
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import math
import glob
import io
import base64
from IPython.display import HTML

######################################################
## SETUP ANIMATIONS
######################################################

from IPython import display as ipythondisplay

from pyvirtualdisplay import Display
display = Display(visible=0, size=(1400, 900))
display.start()

try:
    import pyvirtualdisplay
    display = pyvirtualdisplay.Display(visible=0, size=(1400, 900)).start()
except ImportError:
    pass

# to show a single frame, use:
#     plot_environment(env)
#     plt.show()
def plot_environment(env, figsize=(5,4)):
    plt.figure(figsize=figsize)
    img = env.render(mode="rgb_array")
    plt.imshow(img)
    plt.axis("off")
    return img

def update_scene(num, frames, patch):
    patch.set_data(frames[num])
    return patch,

def plot_animation(frames, repeat=False, interval=40):
    fig = plt.figure()
    patch = plt.imshow(frames[0])
    plt.axis('off')
    anim = animation.FuncAnimation(
        fig, update_scene, fargs=(frames, patch),
        frames=len(frames), repeat=repeat, interval=interval)
    plt.close()
    return anim


def render_policy_net(model, n_max_steps=200, seed=42):
    frames = []
    env = gym.make("CartPole-v1")
    env.seed(seed)
    np.random.seed(seed)
    obs = env.reset()
    for step in range(n_max_steps):      
        frames.append(env.render(mode="rgb_array"))
        left_proba = model.predict(obs.reshape(1, -1))
        #action = tfp.distributions.Bernoulli(probs=leftprob_a.numpy()).sample(
        action = int(np.random.rand() > left_proba)
        obs, reward, done, info = env.step(action)
        if done:
            print("{0} steps".format(step))
            break
    env.close()
    return frames

TensorFlow 2.x selected.
Requirement already up-to-date: setuptools in /usr/local/lib/python3.6/dist-packages (42.0.1)


xdpyinfo was not found, X start can not be checked! Please install xdpyinfo!
xdpyinfo was not found, X start can not be checked! Please install xdpyinfo!
