In [10]:
import os
import datetime
import tensorflow as tf
import tensorflow_models as tfm
import tensorflow_probability as tfp
from preprocess_data import load_data
import numpy as np
import logging
from typing import Tuple, Optional

In [2]:
# Directory holding the saved training dataset. Set the TRAIN_DATASET_DIR
# environment variable to run the notebook on another machine; the default is
# the path this notebook was originally written against.
TRAIN_DATASET_DIR = os.environ.get(
    "TRAIN_DATASET_DIR",
    "/home/pbr-student/personal/thesis/test/PedestrianTrajectoryPrediction/model/train_dataset",
)

# `tf.data.experimental.load` is deprecated; `tf.data.Dataset.load` is the
# replacement named by the deprecation notice.
train_dataset = tf.data.Dataset.load(TRAIN_DATASET_DIR)

Instructions for updating:
Use `tf.data.Dataset.load(...)` instead.


[libprotobuf ERROR external/com_google_protobuf/src/google/protobuf/text_format.cc:337] Error parsing text-format tensorflow.data.experimental.DistributedSnapshotMetadata: 1:1: Invalid control characters encountered in text.
[libprotobuf ERROR external/com_google_protobuf/src/google/protobuf/text_format.cc:337] Error parsing text-format tensorflow.data.experimental.DistributedSnapshotMetadata: 1:3: Expected identifier, got: 18117979812700169767


In [11]:
class PreprocessLayer(tf.keras.layers.Layer):
    """Masks an input sequence and derives the prediction targets.

    The first `NUM_OBSERVED` time steps of every sample are visible, apart
    from 0 to `MAX_HIDDEN` randomly chosen ones; every later step is hidden.
    Hidden steps are zeroed in the returned inputs, and the returned targets
    hold the position values at exactly those hidden steps.
    """

    # Number of leading time steps that are visible by default.
    NUM_OBSERVED = 6
    # Upper bound (inclusive) on how many visible steps are hidden per sample.
    MAX_HIDDEN = 2
    # Factor applied to the masked inputs (the targets are left unscaled).
    SCALE_FACTOR = 100.0

    def __init__(self):
        super().__init__()

    def calc_hidden_mask(self, batch_size=32, sequence_length=15):
        """Draws a visibility mask with NumPy.

        `call` does not use this method; it draws the mask with TensorFlow
        ops so that it follows the real batch shape. This NumPy variant is
        kept for callers that want a mask outside of the layer.

        Args:
            batch_size: Number of rows in the mask.
            sequence_length: Number of time steps per row.

        Returns:
            Boolean array of shape (batch_size, sequence_length) in which
            False marks a step that needs to be predicted.
        """
        batch_size = int(batch_size)
        sequence_length = int(sequence_length)
        # Never mark more steps visible than the sequence contains.
        num_observed = min(self.NUM_OBSERVED, sequence_length)

        mask = np.zeros((batch_size, sequence_length), dtype=bool)
        mask[:, :num_observed] = True
        for row in mask:
            # Hide 0..MAX_HIDDEN of the visible steps (the whole data point).
            hidden_nr = np.random.randint(min(self.MAX_HIDDEN, num_observed) + 1)
            if hidden_nr:
                hidden_idx = np.random.choice(num_observed, hidden_nr, replace=False)
                row[hidden_idx] = False
        return mask

    def _sample_observed_mask(self, batch_size, sequence_length):
        """Draws the visibility mask with TensorFlow ops.

        Args:
            batch_size: Scalar int32 tensor, number of samples.
            sequence_length: Scalar int32 tensor, number of time steps.

        Returns:
            Boolean tensor [batch_size, sequence_length]; True = visible.
        """
        num_observed = tf.minimum(self.NUM_OBSERVED, sequence_length)

        # Rank of every visible step under a random per-sample permutation.
        noise = tf.random.uniform(tf.stack([batch_size, num_observed]))
        ranks = tf.argsort(tf.argsort(noise, axis=-1), axis=-1)

        # Per-sample count of visible steps to hide, drawn from 0..MAX_HIDDEN.
        hidden_nr = tf.random.uniform(
            tf.stack([batch_size, 1]),
            minval=0,
            maxval=self.MAX_HIDDEN + 1,
            dtype=tf.int32)

        # The `hidden_nr` lowest-ranked steps are hidden, i.e. a uniformly
        # random subset chosen without replacement.
        observed = ranks >= hidden_nr
        future = tf.zeros(
            tf.stack([batch_size, sequence_length - num_observed]),
            dtype=tf.bool)
        return tf.concat([observed, future], axis=-1)

    def call(self,
             raw_input_batch: Tuple[tf.Tensor, tf.Tensor],
             is_hidden: Optional[tf.Tensor] = None
             ) -> Tuple[Tuple[tf.Tensor, tf.Tensor], tf.Tensor, tf.Tensor]:
        """Masks and scales the inputs.

        Args:
            raw_input_batch: Pair of rank-3 tensors that share their first
                two dimensions; element 0 also supplies the targets.
            is_hidden: Accepted for interface compatibility; not used.

        Returns:
            `((masked_input_pos, masked_input_pose), mask, targets)` where
            `mask` is a boolean tensor [batch, sequence] (True = visible) and
            `targets` holds element 0 at the hidden steps, zeros elsewhere.
        """
        positions, poses = raw_input_batch[0], raw_input_batch[1]

        # Size the mask from the actual input instead of fixed defaults.
        dynamic_shape = tf.shape(positions)
        mask = self._sample_observed_mask(dynamic_shape[0], dynamic_shape[1])
        # Re-attach whatever static shape information the input carries.
        mask = tf.ensure_shape(mask, positions.shape[:2])

        # A trailing singleton axis lets the mask broadcast over features.
        feature_mask = mask[..., tf.newaxis]

        masked_input_pos = tf.where(feature_mask, positions, tf.zeros_like(positions))
        masked_input_pose = tf.where(feature_mask, poses, tf.zeros_like(poses))
        targets = tf.where(feature_mask, tf.zeros_like(positions), positions)

        # Scale the inputs only.
        masked_input_pos = tf.math.scalar_mul(self.SCALE_FACTOR, masked_input_pos)
        masked_input_pose = tf.math.scalar_mul(self.SCALE_FACTOR, masked_input_pose)

        return (masked_input_pos, masked_input_pose), mask, targets

In [170]:
""" Adapted Sinusoidal Embedding Layer from source: https://github.com/google-research/human-scene-transformer/blob/main/human_scene_transformer/model/embedding.py    """
class SinusoidalEmbeddingLayer(tf.keras.layers.Layer):
  """Sinusoidal positional embedding for xyz and time.

  Every scalar of the input is expanded into `hidden_size` values: the sine
  and the cosine of that scalar multiplied by `hidden_size // 2` frequencies.
  """

  def __init__(self, min_freq=4, max_freq=256, hidden_size=256):
    """Creates the layer.

    Args:
      min_freq: First value of the frequency table.
      max_freq: Used together with `min_freq` to set the spacing of the table.
      hidden_size: Size of the embedding axis appended to the input.

    Raises:
      ValueError: If `hidden_size` is odd.
    """
    super().__init__()
    if hidden_size % 2 != 0:
      raise ValueError(f'hidden_size ({hidden_size}) must be divisible by 2.')
    self.min_freq = float(min_freq)
    self.max_freq = float(max_freq)
    self.hidden_size = hidden_size
    # Half of the embedding holds sines, the other half cosines.
    self.num_freqs_int32 = hidden_size // 2
    self.num_freqs = tf.cast(self.num_freqs_int32, dtype=tf.float32)

  def build(self, input_shape):
    """Precomputes the frequency table `inv_freqs` of shape [num_freqs]."""
    # The maximum guards against a division by zero for a single frequency.
    log_freq_increment = (
        tf.math.log(float(self.max_freq) / float(self.min_freq)) /
        tf.maximum(1.0, self.num_freqs - 1))
    # [num_freqs]
    self.inv_freqs = self.min_freq * tf.exp(
        tf.range(self.num_freqs, dtype=tf.float32) * -log_freq_increment)

  def call(self, input_tensor):
    """Embeds `input_tensor`, appending an axis of size `hidden_size`."""
    # [..., num_freqs]: broadcasting against the frequency table gives the
    # same product as repeating the input along a new axis first.
    scaled = input_tensor[..., tf.newaxis] * self.inv_freqs

    # [..., hidden_size]
    return tf.concat([tf.sin(scaled), tf.cos(scaled)], axis=-1)

In [181]:
""" Adapted Agent Position Encoding Layer from source: https://github.com/google-research/human-scene-transformer/blob/main/human_scene_transformer/model/agent_feature_encoder.py    """
class AgentPositionEncoder(tf.keras.layers.Layer):
  """Embeds agent positions sinusoidally and projects the embedding."""

  def __init__(self, output_shape, embedding_size):
    super().__init__()

    # Sinusoidal features per coordinate:
    # [batch_size, sequence_length, feature_size, embedding_size].
    self.embedding_layer = SinusoidalEmbeddingLayer(hidden_size=embedding_size)
    # Created here but not applied in `call`.
    self.layer_norm = tf.keras.layers.LayerNormalization(axis=-1)
    # Linear projection of the last axis to `output_shape`.
    projection_kwargs = dict(
        output_shape=output_shape, bias_axes='h', activation=None)
    self.mlp = tf.keras.layers.EinsumDense('...f,fh->...h', **projection_kwargs)

  def call(self, input_batch):
    # Element 0 of the batch holds the positions; it is used as-is.
    positions = input_batch[0]
    return self.mlp(self.embedding_layer(positions))
  


class AgentTemporalEncoder(tf.keras.layers.Layer):
  """Embeds every time-step index sinusoidally and projects the embedding."""

  def __init__(self,output_shape, embedding_size, num_steps):
    super().__init__()
    # The number of steps is used as the embedding's `max_freq`.
    self.embedding_layer = SinusoidalEmbeddingLayer(
        hidden_size=embedding_size, max_freq=num_steps)

    # Linear projection of the last axis to `output_shape`.
    projection_kwargs = dict(
        output_shape=output_shape, bias_axes='h', activation=None)
    self.mlp = tf.keras.layers.EinsumDense('...f,fh->...h', **projection_kwargs)

  def _get_temporal_embedding(self, input_batch):
    # Sizes are read from the dynamic shape; per the original note this is
    # done for exporting and loading the Keras model.
    dynamic_shape = tf.shape(input_batch[0])
    batch_size, num_steps = dynamic_shape[0], dynamic_shape[1]

    # Step indices 0..num_steps-1, copied for every sample: [batch, steps].
    steps = tf.range(0, num_steps, dtype=tf.float32)
    steps = tf.tile(tf.expand_dims(steps, axis=0), [batch_size, 1])
    # A trailing singleton axis is added before the embedding is applied.
    return self.embedding_layer(tf.expand_dims(steps, axis=-1))

  def call(self, input_batch):
    temporal_embedding = self._get_temporal_embedding(input_batch)
    return self.mlp(temporal_embedding)


# Adapted Agent Keypoint Encoding Layer from source: https://github.com/google-research/human-scene-transformer/blob/main/human_scene_transformer/model/agent_feature_encoder.py
class AgentKeypointsEncoder(tf.keras.layers.Layer):
  """Projects the agent's keypoints through a ReLU-activated dense layer."""

  def __init__(self, output_shape, embedding_size):
    super().__init__()

    # `embedding_size` is accepted but not used by this encoder.
    projection_kwargs = dict(
        output_shape=output_shape, bias_axes='h', activation=tf.nn.relu)
    self.mlp1 = tf.keras.layers.EinsumDense('...f,fh->...h', **projection_kwargs)

  def call(self, input_batch, training=None):
    # Element 1 of the batch holds the keypoints.
    projected = self.mlp1(input_batch[1])
    # Insert a singleton axis in front of the channel axis.
    return tf.expand_dims(projected, axis=-2)


In [171]:
class FeatureConcatAgentEncoderLayer(tf.keras.layers.Layer):
  """Encodes each agent feature independently and fuses the encodings.

  Position, keypoint and temporal encodings are flattened, concatenated along
  the last axis and projected to `hidden_size` with a single dense layer.
  """

  def __init__(self, input_length, batch_size=32, hidden_size=128, num_heads=4, ln_eps=1e-6, transformer_ff_dim=128, drop_prob=0.1):
    """Creates the feature encoders and the fusing projection.

    Args:
      input_length: Number of time steps, forwarded to the temporal encoder.
      batch_size: Not used by this layer.
      hidden_size: Output size of the fusing projection; each feature encoder
        is built with `hidden_size - 8` output channels.
      num_heads: Not used by this layer.
      ln_eps: Not used by this layer.
      transformer_ff_dim: Not used by this layer.
      drop_prob: Rate of the `ff_dropout` layer, which is created but not
        applied in `call`.
    """
    super().__init__()

    # Projection that fuses the concatenated feature encodings.
    self.ff_layer2 = tf.keras.layers.EinsumDense(
        '...f,fh->...h',
        output_shape=hidden_size,
        bias_axes='h',
        activation=None,
    )
    self.ff_dropout = tf.keras.layers.Dropout(drop_prob)

    self.agent_feature_embedding_layers = []
    # Position Feature [batch, sequence_len, feature_size, hidden_size]
    self.agent_feature_embedding_layers.append(
        AgentPositionEncoder(output_shape=hidden_size-8, embedding_size=hidden_size))
    # Feature Embedding - keypoints [batch, sequence_len, hidden_size]
    self.agent_feature_embedding_layers.append(
        AgentKeypointsEncoder(output_shape=hidden_size-8, embedding_size=hidden_size))

    # Temporal Embedding [batch, sequence_len, 1, hidden_size]
    self.agent_feature_embedding_layers.append(
        AgentTemporalEncoder(output_shape=hidden_size-8, embedding_size=hidden_size, num_steps=input_length))

  @staticmethod
  def _merge_last_two_axes(tensor):
    """Collapses the last two axes of `tensor` into one.

    Leading dimensions are read at run time, so an unknown batch size is
    supported; the last two dimensions must be statically known.
    """
    if tensor.shape[-2] is None or tensor.shape[-1] is None:
      raise ValueError(
          'The last two dimensions must be statically known, got shape '
          f'{tensor.shape}.')
    merged_size = tensor.shape[-2] * tensor.shape[-1]
    leading_dims = tf.shape(tensor)[:-2]
    merged = tf.reshape(
        tensor, tf.concat([leading_dims, [merged_size]], axis=0))
    # Keep the static shape information that is available.
    return tf.ensure_shape(merged, tensor.shape[:-2] + [merged_size])

  def call(self, input_batch: Tuple[Tuple[tf.Tensor, tf.Tensor], tf.Tensor],
           training: Optional[bool] = None):
    """Encodes and fuses the agent features.

    Args:
      input_batch: Pair `(features, mask)`. `features` is handed to every
        feature encoder; `mask` is currently not consumed.
      training: Forwarded to the feature encoders.

    Returns:
      The output of the fusing projection.
    """
    features = input_batch[0]
    layer_embeddings = []
    for layer in self.agent_feature_embedding_layers:
      layer_embedding = layer(features, training=training)
      layer_embeddings.append(self._merge_last_two_axes(layer_embedding))
    embedding = tf.concat(layer_embeddings, axis=-1)

    return self.ff_layer2(embedding)

In [172]:
class HST(tf.keras.Model):
    """Runs preprocessing and every agent encoder, returning all results."""

    def __init__(self, input_length):
        super().__init__()

        hidden_size = 128
        # The stand-alone encoders share these constructor arguments.
        encoder_kwargs = dict(
            output_shape=hidden_size - 8, embedding_size=hidden_size)

        self.preprocess_layer = PreprocessLayer()
        self.agent_pose_encoder = AgentKeypointsEncoder(**encoder_kwargs)
        self.agent_pos_encoder = AgentPositionEncoder(**encoder_kwargs)
        self.agent_temp_encoder = AgentTemporalEncoder(
            num_steps=input_length, **encoder_kwargs)
        self.agent_encoder = FeatureConcatAgentEncoderLayer(
            input_length=input_length)

    def call(self, input_batch, training = False):
        input_1, input_2 = input_batch
        # Mask the raw inputs and obtain the matching targets.
        masked_inputs, mask, targets = self.preprocess_layer((input_1, input_2))

        # The individual encodings are returned next to the fused one.
        return {
            "masked_inputs": masked_inputs,
            "encoded_keys": self.agent_pose_encoder(masked_inputs),
            "encoded_pos": self.agent_pos_encoder(masked_inputs),
            "encoded_temp": self.agent_temp_encoder(masked_inputs),
            "encoded_agent": self.agent_encoder((masked_inputs, mask)),
            "mask": mask,
            "targets": targets,
        }

In [182]:
# Build and compile the model under a single-device strategy on the CPU.
strategy = tf.distribute.OneDeviceStrategy(device='cpu')
with strategy.scope():
    model = HST(input_length=15)
    model.compile(optimizer='rmsprop', loss='msle')

In [183]:
# Run one batch through the model in inference mode; `take(1)` yields at most
# one element, so the loop body executes at most once.
for batch_x1, batch_x2 in train_dataset.take(1):
    input_batch = (batch_x1, batch_x2)
    output = model(input_batch, training=False)

<__main__.AgentPositionEncoder object at 0x7f739636af40>
(32, 15, 360)
<__main__.AgentKeypointsEncoder object at 0x7f739630a0a0>
(32, 15, 120)
<__main__.AgentTemporalEncoder object at 0x7f739630a790>
(32, 15, 120)


[libprotobuf ERROR external/com_google_protobuf/src/google/protobuf/text_format.cc:337] Error parsing text-format tensorflow.data.experimental.DistributedSnapshotMetadata: 1:1: Invalid control characters encountered in text.
[libprotobuf ERROR external/com_google_protobuf/src/google/protobuf/text_format.cc:337] Error parsing text-format tensorflow.data.experimental.DistributedSnapshotMetadata: 1:3: Expected identifier, got: 18117979812700169767


In [189]:
# Fused agent encoding at index [0][0] (first sample, first time step).
output["encoded_agent"][0][0]

<tf.Tensor: shape=(128,), dtype=float32, numpy=
array([ -3598.3284  ,   3669.6824  ,  -1108.4233  ,   6216.9717  ,
         4478.1216  ,   6694.082   ,  11930.198   ,   1407.5459  ,
        11565.539   ,  -2656.3096  ,   5338.704   ,   -723.87555 ,
         1385.4274  ,    -70.11976 ,   3937.3447  ,   3330.6067  ,
         1776.8395  ,   3642.5796  ,  -6280.229   ,    557.0555  ,
         1392.6835  ,    792.41864 ,   3056.999   ,   5369.44    ,
         3407.309   ,   3031.4333  ,  -4531.4478  ,  -1885.7577  ,
        -4108.478   ,  -2197.484   ,  -5746.8477  ,   1387.911   ,
         4124.1777  ,   2118.2458  ,  -5028.1284  ,  -7050.427   ,
         7873.676   ,   1253.348   ,   1032.2197  ,   6468.8403  ,
        -2805.4321  ,  11784.496   ,   2987.487   ,   8209.725   ,
         -977.20355 ,  -2739.6658  ,  -3253.9353  ,   5345.1304  ,
          229.07828 ,   9424.035   ,  -2878.368   ,    986.3005  ,
        -2161.611   ,   9170.379   ,  -3237.4458  ,    410.93564 ,
        -7998.

In [108]:
# Fused agent encoding at index [0][3] (first sample, fourth time step).
output["encoded_agent"][0][3]

<tf.Tensor: shape=(128,), dtype=float32, numpy=
array([ 0.81089574,  0.79600906, -0.70223266,  0.28319982, -1.0845588 ,
       -1.1686233 , -0.36819622,  0.2499477 , -0.6736832 ,  1.7577071 ,
        0.24864085, -0.01561074, -0.9247952 ,  0.5717657 , -0.05981993,
       -0.58544475, -0.6305727 ,  0.23966281, -0.04127312, -0.07515548,
       -0.7684814 ,  0.46016762, -1.2552994 , -0.11084849,  0.38545057,
       -0.35010007, -0.65156966,  0.43060076, -1.8044866 ,  0.42750764,
        1.9555049 , -0.39100063, -0.37983632,  0.31078747,  0.23831747,
       -0.8058383 , -0.569484  , -0.13494697,  0.38479537,  0.406515  ,
        0.21507372,  1.5170391 , -0.45104837,  0.02057071,  0.25320405,
        0.03062555,  0.24210678, -1.5947713 ,  0.8438032 , -0.6528839 ,
        0.36894557, -0.13338293, -0.70126474,  0.5603812 , -0.19411625,
        1.1276221 , -0.58010834,  0.73732305, -0.47832206,  0.19454718,
        1.0612475 , -0.13222125, -0.3588956 , -0.8896779 , -0.35958642,
        0.272765