In [1]:
%matplotlib inline

import gym
import itertools
import matplotlib
import numpy as np
import sys
import tensorflow as tf
import collections
import datetime

## V1: Using the full action space
## V2: Simplified model
#from SimioEnv_v2 import SimioPickDontMoveEnv
from SimioEnv_v2p1 import SimioPickDontMoveEnv
## TF2
from FunctionApproximators_TF2 import ValueEstimator, PolicyEstimator

from IPython.core.debugger import set_trace
from gym_helpers import flatten_space_sample

if "../" not in sys.path:
  sys.path.append("../") 
#from lib.envs.cliff_walking import CliffWalkingEnv
from lib import plotting

matplotlib.style.use('ggplot')

In [2]:
## TF2

####################
## RUNNING ON GPU ##
####################
# Ask TensorFlow which physical GPU devices it can see and report the count.
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

#####################
## EAGER EXECUTION ##
#####################
# To run without eager execution, uncomment the following line:
# # tf.compat.v1.disable_eager_execution()
#
# Eager execution (https://www.tensorflow.org/guide/eager) is an imperative
# programming environment: operations are evaluated immediately and return
# concrete values instead of building a computational graph to run later.
# That makes TensorFlow easier to get started with and to debug, and it
# reduces boilerplate.
eager_active = tf.executing_eagerly()
print("TF Eager execution active:", eager_active)

Num GPUs Available:  1
TF Eager execution active: True


In [3]:
# Build the Simio pick-don't-move warehouse environment used for training.
# (Earlier experiments used gym's FrozenLake / a SimioFrozenLakeEnv instead.)
env_config = dict(
    num_locations=8,
    num_pickers=1,
    num_agvs=2,
    log_output=False,
    log_end_episode_only=False,
)
env = SimioPickDontMoveEnv(**env_config)

# Summarise the environment dimensions.
for label, value in (
    ("Num Pickers: ", env.num_pickers),
    ("Num AGVs: ", env.num_agvs),
    ("Num Warehouse Locations: ", env.num_locations),
):
    print(label, value)

# Show the action and observation spaces, each under an underlined heading.
print()
print("Action Space:", "=============", env.action_space, sep="\n")
print()
print("Observation Space:", "==================", env.observation_space, sep="\n")

Num Pickers:  1
Num AGVs:  2
Num Warehouse Locations:  8

Action Space:
MultiDiscrete([3 8 8])

Observation Space:
MultiDiscrete([11 11 11 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20])


In [4]:
def reshape_state(state):
    """Turn a state (e.g. a list of integers) into a single-row 2-D array.

    The input is copied into a NumPy array and laid out as shape ``(1, N)``,
    where ``N`` is the total number of elements. This is the layout the
    estimators in this notebook are fed.

    Args:
        state: Scalar, sequence or array of values describing one state.

    Returns:
        A NumPy array of shape ``(1, N)`` holding the elements of ``state``.
    """
    state_array = np.array(state)
    # One row, as many columns as there are elements.
    return state_array.reshape(1, -1)

In [5]:
def actor_critic(env, estimator_policy, estimator_value, num_episodes, discount_factor=1.0):
    """
    Actor Critic Algorithm. Optimizes the policy function approximator using policy gradient.

    Every environment step performs exactly one online update of the critic
    (towards the one-step TD target) and one update of the actor (using the
    TD error as the advantage estimate). ``env.finalize()`` is called once at
    the end of every episode.

    Args:
        env: Environment exposing ``reset()``, ``step(actions)`` (returning
            ``(next_state, reward, done, info)``) and ``finalize()``.
        estimator_policy: Policy function to be optimized. ``predict(state)``
            must return an iterable with one probability array per action
            component; ``update(state, advantage, actions)`` must return a
            sequence whose first element is the policy loss.
        estimator_value: Value function approximator, used as a critic.
            Needs ``predict(state)`` and ``update(state, target)``.
        num_episodes: Number of episodes to run for.
        discount_factor: Time-discount factor.

    Returns:
        An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    """

    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes))

    for i_episode in range(num_episodes):
        # Reset the environment and get the first state
        state = reshape_state(env.reset())

        # One step in the environment per iteration
        for t in itertools.count():

            # Sample one action index per action component from the
            # probabilities predicted by the policy.
            action_probs = estimator_policy.predict(state)
            action_probs_flattened = [x.flatten() for x in action_probs]
            actions = [np.random.choice(np.arange(len(prob)), p=prob) for prob in action_probs_flattened]

            next_state, reward, done, _ = env.step(actions)
            next_state = reshape_state(next_state)

            # Update statistics
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            # Calculate TD target and TD error
            # NOTE(review): value_next is bootstrapped even when `done` is
            # True; confirm whether terminal states should contribute zero.
            value_next = estimator_value.predict(next_state)
            td_target = reward + discount_factor * value_next
            td_error = td_target - estimator_value.predict(state)

            # Update the value estimator (once per step)
            value_loss = estimator_value.update(state, td_target)
            # Update the policy estimator (once per step),
            # using the td error as our advantage estimate
            policy_loss = estimator_policy.update(state, td_error, actions)
            policy_loss = policy_loss[0]

            # Blank the current console line, then print the progress on it.
            print("\r\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", end="")
            print("\r[step {}][ep {}/{}] cumulative_reward={:.2f} Vloss={} Ploss={}".format(
                 t,
                 i_episode+1,
                 num_episodes,
                 stats.episode_rewards[i_episode],
                 value_loss,
                 policy_loss), end="")

            if done:
                break

            state = next_state
        env.finalize()

    return stats

In [7]:

# Both networks are trained with the same learning rate.
LEARNING_RATE = 0.00001

# The network input size is the length of one flattened observation sample.
sample_observation = env.observation_space.sample()
input_size = len(flatten_space_sample(sample_observation))
input_shape = (input_size,)

# Actor (policy) and critic (value) function approximators.
policy_estimator = PolicyEstimator(
    input_shape,
    env.picker_actions,
    env.agv_actions,
    learning_rate=LEARNING_RATE,
)
value_estimator = ValueEstimator(input_shape, learning_rate=LEARNING_RATE)

Model: "PolicyEstimator"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 19)]         0                                            
__________________________________________________________________________________________________
dense (Dense)                   (None, 30)           600         input_1[0][0]                    
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 3)            93          dense[0][0]                      
__________________________________________________________________________________________________
dense_2 (Dense)                 (None, 8)            248         dense[0][0]                      
____________________________________________________________________________________

In [8]:
# #%load_ext tensorboard.notebook
# %reload_ext tensorboard.notebook
# logs_path = "./tensorboard_actorcriticsimio111"
# summary_writer = tf.summary.FileWriter(logdir=logs_path, graph=tf.get_default_graph()) # , graph=g
# %tensorboard --logdir tensorboard_actorcriticsimio111/

In [None]:
## TF2
def _print_banner(border, label):
    """Print `label` followed by the current timestamp, framed by `border` lines."""
    print(border)
    print(label, datetime.datetime.now())
    print(border)


tf.keras.backend.clear_session()  # For easy reset of notebook state.

_print_banner("#######################################", "started run:")
stats = actor_critic(
    env,
    policy_estimator,
    value_estimator,
    num_episodes=200,
    discount_factor=1.0,
)
_print_banner("######################################", "ended run:")

#######################################
started run: 2020-01-07 16:23:54.770285
#######################################
Instructions for updating:
Use tf.identity instead.
[step 25][ep 2/200] cumulative_reward=0.00 Vloss=0.008726099506020546 Ploss=0.251715481281280528377234

In [None]:
# varlist

In [None]:
# Window size passed to the plotting helper for smoothing the episode curves.
SMOOTHING_WINDOW = 50
plotting.plot_episode_stats(stats, smoothing_window=SMOOTHING_WINDOW)