In [1]:
import tensorflow as tf
import gym
import numpy as np
import matplotlib.pyplot as plt
from IPython import display
from time import time
%matplotlib inline

In [2]:
# --- Hyperparameters ---
alpha = 0.02      # learning rate handed to the Adam optimizer
discount = 0.95   # discount factor applied to future rewards
episodes = 5000   # maximum number of training episodes
report = 100      # progress is printed every `report` episodes

# Rebuild the graph from scratch each time this cell runs.
tf.reset_default_graph()
sess = tf.Session()

# --- Inputs (leading dimension = number of steps fed at once) ---
S_t = tf.placeholder(tf.float32, shape=[None, 8])      # states, 8 features each
expected_t = tf.placeholder(tf.float32, shape=[None])  # return used to weight each step
A_t = tf.placeholder(tf.int32, shape=[None])           # index of the action taken

# --- Policy network: two 10-unit ReLU layers followed by 4 action logits ---
hidden = tf.layers.dense(S_t, units=10, activation=tf.nn.relu)
hidden = tf.layers.dense(hidden, units=10, activation=tf.nn.relu)
logits = tf.layers.dense(hidden, units=4)

action_prob = tf.nn.softmax(logits)

# --- Loss ---
# The sparse cross-entropy against the taken action is -log pi(a|s); weighting
# it by the return and averaging gives the policy-gradient surrogate loss.
neg_log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=A_t, logits=logits)
loss = tf.reduce_mean(neg_log_prob * expected_t)

opt = tf.train.AdamOptimizer(learning_rate=alpha)
update_op = opt.minimize(loss)  # minimizing this loss maximizes expected rewards

In [3]:
def get_action(state):
    """Sample an action index from the policy's distribution for `state`.

    `state` is fed directly to the `S_t` placeholder, so it must already carry
    a leading batch dimension. The returned index is drawn at random with the
    softmax probabilities as weights (it is not the argmax).
    """
    distribution = sess.run(action_prob, feed_dict={S_t: state}).ravel()
    return np.random.choice(range(distribution.size), p=distribution)
    

In [None]:
# Training-loop constants.
EARLY_STOP_RETURN = -250   # abandon an episode once its running return drops below this
SOLVED_MEAN_RETURN = 200   # windowed mean return at which training stops
WINDOW = 100               # number of most recent episodes averaged for reporting


def _normalized_returns(rewards, gamma, eps=1e-8):
    """Return the standardized discounted return for every step of one episode.

    rewards: per-step rewards in the order they were received.
    gamma:   discount factor applied to each subsequent step.
    eps:     added to the standard deviation so an episode whose returns are
             all equal (e.g. a single step) yields zeros instead of NaN.
    """
    # Explicit float dtype: zeros_like(rewards) would inherit an integer dtype
    # from integer rewards and silently truncate the discounted sums.
    returns = np.zeros(len(rewards), dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = gamma * running + rewards[t]
        returns[t] = running
    return (returns - returns.mean()) / (returns.std() + eps)


env = gym.make('LunarLander-v2')
episode_rewards = []
losses = []
sess.run(tf.global_variables_initializer())

for e in range(episodes):
    states, actions, rewards = [], [], []
    state = env.reset()
    done = False
    episode_return = 0.0

    # Roll out one episode, recording states, actions and rewards.
    while not done:
        states.append(state)
        action = get_action(state.reshape(1, -1))
        state, reward, done, _ = env.step(action)
        actions.append(action)
        rewards.append(reward)
        # Running total instead of re-summing the whole reward list each step.
        episode_return += reward
        if episode_return < EARLY_STOP_RETURN:
            done = True

    episode_rewards.append(episode_return)
    actions = np.array(actions)
    states = np.array(states)

    # One gradient step on the whole episode, weighted by normalized returns.
    expected_rewards = _normalized_returns(rewards, discount)
    episode_loss, _ = sess.run([loss, update_op], feed_dict={S_t: states,
                                                             expected_t: expected_rewards,
                                                             A_t: actions})
    losses.append(episode_loss)

    # Average over the most recent WINDOW episodes, *including* the one that
    # just finished (slicing [e-100:e] would leave it out).
    have_window = len(episode_rewards) >= WINDOW
    recent_mean = np.mean(episode_rewards[-WINDOW:]) if have_window else None

    if e % report == 0:
        print("Episode {}, max reward: {}".format(e, max(episode_rewards)))
        if have_window:
            print("\tLast {} mean: {}".format(WINDOW, recent_mean))
    if have_window and recent_mean >= SOLVED_MEAN_RETURN:
        print("Solved at {}".format(e))
        break

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
Episode 0, max reward: -292.0062446062984
Episode 100, max reward: -34.071588424313234
	Last 100 mean: -187.1744979821303
Episode 200, max reward: 8.631536974739785
	Last 100 mean: -164.03956685931087


In [None]:
# Learning curve: total (undiscounted) reward collected in each training episode.
fig, ax = plt.subplots()
ax.plot(episode_rewards)
ax.set_xlabel("Episode")
ax.set_ylabel("Total reward")
ax.set_title("Reward per training episode")
print(max(episode_rewards))

In [None]:
# Surrogate loss recorded after each episode's update.
fig, ax = plt.subplots()
ax.plot(losses)
ax.set_xlabel("Episode")
ax.set_ylabel("Loss")
ax.set_title("Policy-gradient loss per training episode");

In [None]:
def show_state(env, step=0):
    """Draw the environment's current frame inline, replacing the previous output.

    env:  environment whose ``render(mode='rgb_array')`` result is shown as an image.
    step: step counter displayed in the figure title.
    """
    plt.figure(3)  # always draw into figure number 3
    plt.clf()      # wipe whatever the previous call drew
    frame = env.render(mode='rgb_array')
    plt.imshow(frame)
    plt.title("Step: %d" % (step))

    # wait=True defers clearing until the replacement output is available.
    display.clear_output(wait=True)
    current_figure = plt.gcf()
    display.display(current_figure)
        
# Play one episode with the greedy policy, rendering each step inline.
state = env.reset()
done = False
step = 0
while not done:
    step += 1
    show_state(env, step)  # draw the current frame before acting on it
    action_scores = sess.run(logits, feed_dict={S_t: state.reshape(1, -1)})
    action = np.argmax(action_scores[0])  # pick the highest logit, no sampling
    state, _, done, _ = env.step(action)