# Primitive Segmentation using ConvLSTM

**Author:** Aditya Jain <br>
**Date started:** 27th July, 2020<br>
**Description:** Predict primitive actions in a human demonstration using a ConvLSTM model.

### Setup


In [2]:
# Mount Google Drive at /content/drive; DATA_DIR (config cell) points inside this mount.
from google.colab import drive
drive.mount('/content/drive', force_remount=False)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [31]:
from tensorflow import keras
import tensorflow as tf
from tensorflow.keras import layers
import numpy as np
import pylab as plt
import os
from glob import glob
import cv2
from google.colab.patches import cv2_imshow

In [37]:
# Notebook-wide configuration.
# NOTE: N_CLASSES here is only a placeholder; it is reassigned from the number of
# primitive folders right after video_path(DATA_DIR) is called.
N_CLASSES  = 2          # no. of primitives in the TADL
PX         = 128        # no. of rows in training/test images
PY         = 128       # no. of columns in training/test images
CHANNELS   = 3          # no. of channels in the image
N_FRAMES   = 15         # no. of frames in each training/test video
BATCH_SIZE = 32         # size of the batches; NOTE(review): the training cell passes batch_size=2 instead of this constant
# dataset root on the mounted Drive; video_path() treats each sub-directory as one primitive
DATA_DIR   = "/content/drive/My Drive/TCS FullTime Work/LfD/Liquid_Pouring/TADL/"

## Building the Dataset

Reads the datafiles and builds the dataset

In [72]:
def video_path(dataset_dir):
  '''
  Collect the path of every video in the dataset and build the label mapping.

  Each sub-directory of `dataset_dir` is treated as one primitive action and
  every entry inside it as one video of that primitive.

  Args:
    dataset_dir: parent directory holding one sub-directory per primitive.

  Returns:
    (video_path_list, prim_dict): `video_path_list` is the list of video paths
    (grouped by primitive, sorted by name) and `prim_dict` maps each
    primitive's directory name to an integer label (0, 1, ...).
  '''
  # os.listdir() returns entries in arbitrary order; sorting keeps the label ids
  # and the video order stable across runs and machines
  prim_actions = sorted(
      entry for entry in os.listdir(dataset_dir)
      if os.path.isdir(os.path.join(dataset_dir, entry))
  )

  video_path_list = []
  prim_dict       = {}   # primitive name -> integer label

  for label, action in enumerate(prim_actions):
    prim_dict[action] = label
    # resolve against the argument (not the global DATA_DIR) so the function
    # works for any dataset directory it is given
    prim_path = os.path.join(dataset_dir, action)

    for video in sorted(os.listdir(prim_path)):
      video_path_list.append(os.path.join(prim_path, video))

  return video_path_list, prim_dict


# collect every video path plus the primitive-name -> label mapping
video_list, primitives_dict = video_path(DATA_DIR)
# replaces the N_CLASSES placeholder from the config cell with the number of primitive folders found
N_CLASSES                   = len(primitives_dict)


def build_dataset(vid_list, primit_dict):
    '''
    Load every video as a list of frames and one-hot encode its primitive label.

    Args:
      vid_list: paths of the video directories; each directory holds the frame
        images of one video and its parent directory name is the primitive.
      primit_dict: maps primitive directory name -> integer label.

    Returns:
      (image_data, labels): `image_data` is a list with one entry per video,
      each a list of float32 frames scaled to [0, 1] with shape (PX, PY, 3);
      `labels` is the one-hot tensor of shape (len(vid_list), len(primit_dict)).

    Raises:
      KeyError: if a video's parent directory name is not in `primit_dict`.
      ValueError: if a frame file cannot be decoded by OpenCV.
    '''
    image_data = []
    label_data = []

    for video_path in vid_list:
        # the primitive name is the directory that contains the video
        primitive = os.path.basename(os.path.dirname(os.path.normpath(video_path)))
        label     = primit_dict[primitive]

        frames = []
        # os.listdir() order is arbitrary, but the ConvLSTM needs the frames in
        # temporal order; assumes frame file names sort chronologically
        # (e.g. zero-padded indices) -- TODO confirm
        for image in sorted(os.listdir(video_path)):
            image_path = os.path.join(video_path, image)

            img = cv2.imread(image_path, 1)       # 1 = read as 3-channel colour image
            if img is None:
                # imread signals failure by returning None instead of raising
                raise ValueError("could not read frame: " + image_path)

            # cv2.resize takes dsize as (width, height) = (columns, rows), so
            # (PY, PX) yields PX rows x PY columns, matching the model input
            img = cv2.resize(img, (PY, PX))
            img = img.astype("float32") / 255     # rescale the image to 0-1
            frames.append(img)

        image_data.append(frames)
        label_data.append(label)

    return image_data, tf.one_hot(label_data, len(primit_dict))

# Read every frame into memory, then expose (video, one-hot label) pairs as a tf.data pipeline
train_data, train_label = build_dataset(vid_list=video_list, primit_dict=primitives_dict)
dataset = tf.data.Dataset.from_tensor_slices((train_data, train_label))

# Optional sanity check of the element shapes:
# for itemx, itemy in dataset.take(3):
#   print(itemx.shape, itemy.shape)

In [73]:
# Dense NumPy copies of the loaded set, used as the inputs to model.fit.
# NOTE: despite the "test_" prefix these hold the same samples as train_data / train_label.
test_data, test_label = (np.asarray(part) for part in (train_data, train_label))

# one shape per line: videos first, labels second
print(test_data.shape, test_label.shape, sep="\n")

(2, 15, 128, 128, 3)
(2, 2)


In [65]:
# Quick look at the raw build_dataset outputs.
# NOTE(review): the recorded output shows integer labels ([0, 1]) while build_dataset
# now returns a one-hot tensor; this cell's execution count (65) predates the dataset
# cell (72), so it presumably ran against an earlier version -- re-run to refresh.
print(np.shape(train_data))
print(train_label)

(2, 15, 128, 128, 3)
[0, 1]


## Build a model

We create a model which takes as input movies of shape
`(n_frames, height, width, channels)` and returns a probability
distribution over the `N_CLASSES` primitive actions.


In [71]:
print(CHANNELS)

# Classifier: one ConvLSTM layer that collapses the time axis (return_sequences=False),
# then flatten -> dropout -> softmax over the primitives.
model = keras.Sequential()
model.add(keras.Input(shape=(N_FRAMES, PX, PY, CHANNELS)))
model.add(
    layers.ConvLSTM2D(filters=64, kernel_size=(3, 3), padding="same", return_sequences=False)
)
model.add(layers.Flatten())
model.add(layers.Dropout(0.5))
model.add(layers.Dense(N_CLASSES, activation="softmax"))

model.summary()

# labels are one-hot encoded, hence categorical cross-entropy
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

3
Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv_lst_m2d_8 (ConvLSTM2D)  (None, 128, 128, 64)      154624    
_________________________________________________________________
flatten_8 (Flatten)          (None, 1048576)           0         
_________________________________________________________________
dropout_13 (Dropout)         (None, 1048576)           0         
_________________________________________________________________
dense_13 (Dense)             (None, 2)                 2097154   
Total params: 2,251,778
Trainable params: 2,251,778
Non-trainable params: 0
_________________________________________________________________


In [69]:
# Alternative classifier with a 256-unit hidden Dense layer after the ConvLSTM.
# NOTE: this cell rebinds `model`, so whichever model cell was executed last is
# the one the training cell fits.
# Input channels and output classes come from the config constants (previously
# hard-coded as 3 and 2) so this model stays in sync with the dataset.
model = keras.Sequential()
# padding is left at its default here, so the 3x3 kernel shrinks the spatial size
# (128x128 -> 126x126 in the recorded summary)
model.add(layers.ConvLSTM2D(filters = 64, kernel_size = (3, 3), return_sequences = False, data_format = "channels_last", input_shape = (N_FRAMES, PX, PY, CHANNELS)))
model.add(layers.Dropout(0.2))
model.add(layers.Flatten())
# NOTE(review): Flatten -> Dense(256) accounts for ~260M parameters in the recorded summary
model.add(layers.Dense(256, activation="relu"))
model.add(layers.Dropout(0.3))
model.add(layers.Dense(N_CLASSES, activation = "softmax"))

model.summary()
# labels are one-hot encoded, hence categorical cross-entropy
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=["accuracy"])

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv_lst_m2d_7 (ConvLSTM2D)  (None, 126, 126, 64)      154624    
_________________________________________________________________
dropout_11 (Dropout)         (None, 126, 126, 64)      0         
_________________________________________________________________
flatten_7 (Flatten)          (None, 1016064)           0         
_________________________________________________________________
dense_11 (Dense)             (None, 256)               260112640 
_________________________________________________________________
dropout_12 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_12 (Dense)             (None, 2)                 514       
Total params: 260,267,778
Trainable params: 260,267,778
Non-trainable params: 0
________________________________________

## Train the model


In [74]:
# Short fitting run on the in-memory arrays; the returned History object is the cell output
epochs = 2

model.fit(x=test_data, y=test_label, batch_size=2, epochs=epochs, verbose=2)


Epoch 1/2
1/1 - 0s - loss: 0.7667 - accuracy: 0.5000
Epoch 2/2
1/1 - 0s - loss: 9.2385 - accuracy: 0.5000


<tensorflow.python.keras.callbacks.History at 0x7fe8b4bfce48>

## Miscellaneous



```
# This is formatted as code
```

#### Test the model on one movie

Feed it with the first 7 positions and then
predict the new positions.


In [None]:
 movie_index = 1004
track = noisy_movies[movie_index][:7, ::, ::, ::]

for j in range(16):
    new_pos = seq.predict(track[np.newaxis, ::, ::, ::, ::])
    new = new_pos[::, -1, ::, ::, ::]
    track = np.concatenate((track, new), axis=0)


# And then compare the predictions
# to the ground truth
track2 = noisy_movies[movie_index][::, ::, ::, ::]
for i in range(15):
    fig = plt.figure(figsize=(10, 5))

    ax = fig.add_subplot(121)

    if i >= 7:
        ax.text(1, 3, "Predictions !", fontsize=20, color="w")
    else:
        ax.text(1, 3, "Initial trajectory", fontsize=20)

    toplot = track[i, ::, ::, 0]

    plt.imshow(toplot)
    ax = fig.add_subplot(122)
    plt.text(1, 3, "Ground truth", fontsize=20)

    toplot = track2[i, ::, ::, 0]
    if i >= 2:
        toplot = shifted_movies[movie_index][i - 1, ::, ::, 0]

    plt.imshow(toplot)
    plt.savefig("%i_animate.png" % (i + 1))


In [None]:
# Deeper classifier: three stacked ConvLSTM layers, each followed by batch normalisation.
# A ConvLSTM2D layer consumes a 5D sequence tensor, so every layer that feeds another
# ConvLSTM2D must return the full sequence (return_sequences=True). Only the last one
# collapses the time axis (return_sequences=False), which also gives Flatten/Dense a
# fully defined shape even though the input frame count is left as None.
model = keras.Sequential(
    [
        keras.Input(shape = (None, PX, PY, CHANNELS)),
        layers.ConvLSTM2D(
            filters=64, kernel_size=(3, 3), padding="same", return_sequences=True
        ),
        layers.BatchNormalization(),
        layers.ConvLSTM2D(
            filters=64, kernel_size=(3, 3), padding="same", return_sequences=True
        ),
        layers.BatchNormalization(),
        layers.ConvLSTM2D(
            filters=64, kernel_size=(3, 3), padding="same", return_sequences=False
        ),
        layers.BatchNormalization(),
        layers.Flatten(),
        layers.Dropout(0.5),
        layers.Dense(N_CLASSES, activation='softmax'),
    ]
)

model.summary()

# labels are one-hot encoded, hence categorical cross-entropy
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=["accuracy"])