In [1]:
from SimioEnv_v2 import SimioPickDontMoveEnv
from gym_helpers import flatten_space_sample
# Build the simulation environment used by every cell below.
env = SimioPickDontMoveEnv(
    num_locations=8,
    num_pickers=1,
    num_agvs=1,
    log_output=False,
    log_end_episode_only=False,
)

# The estimators consume a flat vector, so flatten one sampled observation
# to find out how long that vector is and derive the network input shape.
input_size = len(flatten_space_sample(env.observation_space.sample()))
input_shape = (input_size,)

In [10]:
import tensorflow as tf
import numpy as np
from FunctionApproximators_TF2 import ValueEstimator, PolicyEstimator

In [11]:
# Reset Keras' global state (including the counters behind auto-generated
# layer names) so re-running the model-building cells starts from a clean slate.
tf.keras.backend.clear_session()

In [12]:
# Build the policy network from the flat observation shape and the `nvec`
# arrays of the picker and AGV action spaces.
# NOTE(review): `nvec` is presumably the per-dimension action counts of a
# MultiDiscrete space — confirm against SimioEnv_v2.
pe = PolicyEstimator(input_shape, env.picker_action_space.nvec, env.agv_action_space.nvec)

Model: "PolicyEstimator"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 10)]         0                                            
__________________________________________________________________________________________________
dense (Dense)                   (None, 30)           330         input_1[0][0]                    
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 2)            62          dense[0][0]                      
__________________________________________________________________________________________________
dense_2 (Dense)                 (None, 8)            248         dense[0][0]                      
____________________________________________________________________________________

In [13]:
# Build the value network on the same flat observation shape that was
# passed to the policy network.
ve = ValueEstimator(input_shape)

Model: "ValueEstimator"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 10)]              0         
_________________________________________________________________
dense_4 (Dense)              (None, 30)                330       
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 31        
Total params: 361
Trainable params: 361
Non-trainable params: 0
_________________________________________________________________


## Generate Observation

In [14]:
def reshape_state(state):
    """Turn a flat state (e.g. a list of integers) into a single-row 2-D array.

    The input is copied into a NumPy array and laid out as one row, so a
    state with ``n`` values comes back with shape ``(1, n)`` — a batch of
    one sample that can be fed to the estimators.
    """
    row_vector = np.array(state).reshape(1, -1)
    return row_vector

In [15]:
# Draw a random observation from the environment's observation space,
# flatten it into a plain sequence, then shape it as a single-row batch.
observation = env.observation_space.sample()
flat_observation = flatten_space_sample(observation)
reshaped_flat_observation = reshape_state(flat_observation)

# Show the raw sample, the network-ready array, and that array's shape.
print(
    observation,
    reshaped_flat_observation,
    reshaped_flat_observation.shape,
    sep="\n",
)

(3, 10, array([35, 61, 29, 44, 94, 53,  4, 61], dtype=int64))
[[ 3 10 35 61 29 44 94 53  4 61]]
(1, 10)


## Value Estimator Predict & Update

In [16]:
# VALUE ESTIMATOR PREDICT
# Run the value network on the single-row observation batch and show the
# raw result (presumably the estimated state value — confirm in
# FunctionApproximators_TF2).
output = ve.predict(reshaped_flat_observation)
print(output)

[[21.188808]]


In [24]:
# VALUE ESTIMATOR UPDATE
# Perform one update step toward a hand-picked TD target and print whatever
# `update` returns (presumably the training loss — confirm in
# FunctionApproximators_TF2).
td_target = np.array([0.3])
loss = ve.update(reshaped_flat_observation, td_target)
print(loss)

22.647835


## Policy Estimator Predict & Update

In [18]:
# POLICY ESTIMATOR PREDICT
# The policy network returns one probability array per output head.
action_probabilities = pe.predict(reshaped_flat_observation)

print("output shapes:", [head.shape for head in action_probabilities])

# Collapse each head's output to 1-D so it can serve as a probability vector.
action_probabilities_flattened = [head.flatten() for head in action_probabilities]

# Sample one action index per head according to that head's probabilities.
chosen_actions = [
    np.random.choice(np.arange(probs.size), p=probs)
    for probs in action_probabilities_flattened
]

print(chosen_actions)

output shapes: [(1, 2), (1, 8), (1, 8)]
[0, 1, 3]


In [26]:
# POLICY ESTIMATOR UPDATE
# Perform one update step with a hand-picked TD error and the actions
# sampled in the predict cell, then print whatever `update` returns.
# NOTE(review): the recorded run printed all-zero losses — confirm that
# update() is producing a meaningful loss for these inputs.
td_error = np.array([.01])
losses = pe.update(reshaped_flat_observation, td_error, actions=chosen_actions)
print(losses)

[0.0, 0.0, 0.0, 0.0]
