In [1]:
# Load IPython's autoreload extension; mode 2 re-imports edited modules before
# each cell executes, so changes to the project sources are picked up live.
%load_ext autoreload
%autoreload 2

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
import os
import tempfile

import numpy as np
import tensorflow as tf

from maddux.rl_experiments.environments import get_simple_environment
from maddux.rl_experiments.planning import Planning
from tf_rl import simulate
from tf_rl.controller import DiscreteDeepQ, HumanController
from tf_rl.models import MLP

In [4]:
# Fresh temporary directory handed to the TensorFlow summary writer below.
# The path is printed so it can be passed to `tensorboard --logdir`.
LOG_DIR = tempfile.mkdtemp()
print(LOG_DIR)

/tmp/tmpRiJijn


In [5]:
# Directory passed to game.save_path() by the training loop.
# The default is the original author's local checkout; set the MADDUX_SAVE_DIR
# environment variable to redirect the saved paths on another machine.
SAVE_DIR = os.environ.get("MADDUX_SAVE_DIR",
                          "/home/ben/Development/maddux/saved_experiments")

In [6]:
# Initial Planning game built on the simple environment. Its observation_size
# and num_actions are read below to size the Q-network and the controller.
game = Planning(get_simple_environment())

In [None]:
# Start from an empty default graph before building a new controller, then
# open the interactive session the controller will run in.
tf.reset_default_graph()
session = tf.InteractiveSession()

# Summary writer for TensorBoard. View the logs with
#      tensorboard --logdir [LOG_DIR]
journalist = tf.train.SummaryWriter(LOG_DIR)

# The "brain" maps an observation to one Q value per action. It is a
# multi-layer perceptron with 2 hidden layers of 200 units each; the layer
# sizes are paired with tanh, tanh and identity respectively.
layer_sizes = [200, 200, game.num_actions]
layer_nonlinearities = [tf.tanh, tf.tanh, tf.identity]
brain = MLP([game.observation_size,], layer_sizes, layer_nonlinearities)

# RMSProp, as recommended by the publication.
optimizer = tf.train.RMSPropOptimizer(learning_rate=1e-3, decay=0.9)

# Keyword settings for the DiscreteDeepQ controller.
dqn_settings = dict(
    discount_rate=0.90,
    exploration_period=5000,
    max_experience=10000,
    store_every_nth=4,
    train_every_nth=4,
    summary_writer=journalist,
)
current_controller = DiscreteDeepQ(
    game.observation_size,
    game.num_actions,
    brain,
    optimizer,
    session,
    **dqn_settings
)

# Initialise every variable, then run the controller's target-network update op.
init_op = tf.initialize_all_variables()
session.run(init_op)
session.run(current_controller.target_network_update)

# The graph did not exist yet when the writer was created, so attach it now.
journalist.add_graph(session.graph_def)

# Index of the next game to play; the training loop resumes from this value.
game_idx = 0

In [None]:
import time
import matplotlib

iterations_needed = []
total_rewards = []

try:
    for game_idx in range(game_idx, 1000):
        game = Planning(get_simple_environment())
        game_iterations = 0

        observation = game.observe()
        while game_iterations < 1000 and not game.is_over():
            action = current_controller.action(observation)
            reward = game.collect_reward(action)
            new_observation = game.observe()
            current_controller.store(observation, action, reward, new_observation)
            current_controller.training_step()
            observation = new_observation
            game_iterations += 1
        total_rewards.append(sum(game.collected_rewards))
        iterations_needed.append(game_iterations)
        rewards = []
        if game_idx % 5 == 0:
            print "\rGame %d: iterations before success %d." % (game_idx, game_iterations),
            print "Total Rewards: %s" % (sum(game.collected_rewards)),
            print "Last 5 rewards: {}".format(game.collected_rewards[-5:]),
            game.save_path(SAVE_DIR, game_idx)


except KeyboardInterrupt:
    print "Interrupted"

Game 635: iterations before success 28. Total Rewards: 4.86741582968 Last 5 rewards: [0.1899961688949352, 0.33462235032097531, 0.28486640293048271, 0.22076604846123782, 0.270252150307213]

In [None]:
# Total reward and iteration count for the first 500 games, on shared axes.
fig, ax = plt.subplots(figsize=(12, 8))

ax.plot(total_rewards[:500], label='Reward')
ax.plot(iterations_needed[:500], label='Iterations Needed')
ax.legend()

In [None]:
# Joint distribution of iterations needed versus total reward per game.
# jointplot() builds its own figure, so calling plt.figure() first would only
# leave an empty extra figure behind; resize the grid's own figure instead.
joint_grid = sns.jointplot(np.array(iterations_needed), np.array(total_rewards))
joint_grid.fig.set_size_inches(12, 12)