# Primitive Segmentation using ConvLSTM

**Author:** Aditya Jain <br>
**Date started:** 27th July, 2020<br>
**Description:** Predict primitive actions in a human demonstration using a ConvLSTM model.

### Setup


In [None]:
# Mount Google Drive inside the Colab runtime so that files under
# /content/drive (see DATA_DIR) are readable from this notebook.
from google.colab import drive
drive.mount('/content/drive', force_remount=False)

In [None]:
from tensorflow import keras
import tensorflow as tf
from tensorflow.keras import layers
import numpy as np
import pylab as plt
import os
from glob import glob

In [None]:
# Dataset and model settings shared by the cells below.
# NOTE(review): N_FRAMES and BATCH_SIZE are not referenced by any code visible
# in this notebook (training uses a batch size of 10) - confirm they are still
# needed.
N_CLASSES  = 2          # no. of primitives in the TADL
PX         = 256        # no. of rows in training/test images
PY         = 256        # no. of columns in training/test images
CHANNELS   = 3          # no. of channels in the image
N_FRAMES   = 15         # no. of frames in each training/test video
BATCH_SIZE = 32         # size of the batches
# parent directory of the dataset on the mounted Google Drive
DATA_DIR   = "/content/drive/My Drive/TCS FullTime Work/LfD/Liquid_Pouring/TADL/"

## Building the Dataset

Reads the datafiles and builds the dataset

In [None]:
def video_path(dataset_dir):
    """Return the paths of all video entries in the dataset.

    The dataset is read as ``dataset_dir/<primitive>/<video>``: every
    sub-directory of ``dataset_dir`` is treated as one primitive action and
    every entry inside it as one video of that primitive.

    Args:
        dataset_dir: Path of the dataset's parent directory.

    Returns:
        A list of ``dataset_dir/<primitive>/<video>`` paths, ordered by
        primitive name and then by video name.
    """
    # Only directories count as primitives; stray files at the top level are
    # ignored. Sorted so the result does not depend on the arbitrary order in
    # which os.listdir reports entries.
    prim_actions = sorted(
        entry
        for entry in os.listdir(dataset_dir)
        if os.path.isdir(os.path.join(dataset_dir, entry))
    )

    video_path_list = []
    for action in prim_actions:
        # Resolve against the argument (not the global DATA_DIR) so the
        # function works for whichever directory the caller passes in.
        prim_path = os.path.join(dataset_dir, action)
        for video in sorted(os.listdir(prim_path)):
            video_path_list.append(os.path.join(prim_path, video))

    return video_path_list


# Collect the path of every video found under DATA_DIR.
video_list = video_path(DATA_DIR)


def build_dataset(vid_list):
    """Load the frames of every video together with their primitive labels.

    Each entry of ``vid_list`` is a directory holding the JPEG frames of one
    video. The label of a video is the name of the directory that contains
    the video directory (the second-to-last path component).

    Args:
        vid_list: Iterable of video directory paths.

    Returns:
        A tuple ``(image_data, label_data)`` of two lists with one entry per
        video: ``image_data[i]`` is the list of frames of video ``i`` as
        float32 tensors of shape ``(PX, PY, CHANNELS)`` scaled to [0, 1], and
        ``label_data[i]`` holds the video's label repeated once per frame.
    """
    image_data = []
    label_data = []

    for vid_dir in vid_list:
        # the primitive's name is the parent directory of the video directory
        label = tf.strings.split(vid_dir, os.sep)[-2]
        frames = []
        frame_labels = []

        # os.listdir returns entries in arbitrary order; sort so the frames
        # keep a stable sequence.
        # NOTE(review): this is a lexicographic sort - it matches temporal
        # order only if frame numbers in the file names are zero-padded.
        for frame_name in sorted(os.listdir(vid_dir)):
            frame_path = os.path.join(vid_dir, frame_name)

            # every frame of a video carries the video's label
            frame_labels.append(label)

            # load the raw data from the file as a string and decode it
            img = tf.io.read_file(frame_path)
            img = tf.image.decode_jpeg(img, channels=CHANNELS)
            # Scale uint8 -> float32 in [0, 1] BEFORE resizing: tf.image.resize
            # already returns float32 (still in 0..255), after which
            # convert_image_dtype would be a no-op and nothing would be
            # normalized.
            img = tf.image.convert_image_dtype(img, tf.float32)
            img = tf.image.resize(img, [PX, PY])
            frames.append(img)

        image_data.append(frames)
        label_data.append(frame_labels)

    return image_data, label_data

# Load every video into memory as nested lists (one list of frames and one
# list of labels per video) and wrap them in a tf.data pipeline that yields
# one (frames, labels) pair per video.
train_data, train_label = build_dataset(video_list)
# NOTE(review): from_tensor_slices has to pack the nested lists into dense
# tensors, so this presumably requires every video to have the same number of
# frames - confirm against the data. The labels are still strings at this point.
dataset = tf.data.Dataset.from_tensor_slices((train_data, train_label))

## Trying to print the data
# for itemx, itemy in dataset.take(3):
#   print(itemy.numpy())

## Build a model

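We create a model which takes as input movies of shape
`(n_frames, width, height, channels)` and returns, for every pixel of every
frame, a softmax distribution over the `N_CLASSES` primitives
(output shape `(n_frames, width, height, N_CLASSES)`).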


In [None]:
# Network: three identical ConvLSTM2D + BatchNormalization stages working on
# whole frame sequences, followed by dropout and a two-layer dense head that
# ends in a softmax over the N_CLASSES primitives.
# NOTE(review): no Flatten/pooling layer sits before the Dense layers (a
# Flatten was tried and left disabled), so they act on the last axis only and
# the output keeps the frame and spatial dimensions - confirm this is intended.
model = keras.Sequential(
    # variable-length sequences of PX x PY frames with CHANNELS channels
    [keras.Input(shape=(None, PX, PY, CHANNELS))]
    + [
        stage_layer
        for _ in range(3)
        for stage_layer in (
            layers.ConvLSTM2D(
                filters=64,
                kernel_size=(3, 3),
                padding="same",
                return_sequences=True,
            ),
            layers.BatchNormalization(),
        )
    ]
    + [
        layers.Dropout(0.5),
        layers.Dense(100, activation='relu'),
        layers.Dense(N_CLASSES, activation='softmax'),
    ]
)

model.summary()

# model compilation: categorical cross-entropy loss, Adam optimizer, and
# accuracy reported as the metric
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=["accuracy"],
)

## Train the model


In [None]:
epochs = 2

# Train on the tf.data pipeline built earlier. `dataset` yields
# (frames, labels) pairs, so Keras receives inputs and targets together; the
# previous argument `image_data` only exists as a local inside build_dataset
# and raised a NameError here. With a Dataset as input the batch size is set
# on the dataset itself, not through fit(batch_size=...).
# NOTE(review): the labels in `dataset` are still strings and per-frame, while
# the model ends in a per-pixel softmax trained with categorical_crossentropy;
# the targets need to be encoded to match the model output before this cell
# can train - confirm the intended label format.
model.fit(
    dataset.batch(10),
    epochs=epochs,
    verbose=2,
)


## Test the model on one movie

Feed it with the first 7 positions and then
predict the new positions.


In [None]:
 movie_index = 1004
track = noisy_movies[movie_index][:7, ::, ::, ::]

for j in range(16):
    new_pos = seq.predict(track[np.newaxis, ::, ::, ::, ::])
    new = new_pos[::, -1, ::, ::, ::]
    track = np.concatenate((track, new), axis=0)


# And then compare the predictions
# to the ground truth
track2 = noisy_movies[movie_index][::, ::, ::, ::]
for i in range(15):
    fig = plt.figure(figsize=(10, 5))

    ax = fig.add_subplot(121)

    if i >= 7:
        ax.text(1, 3, "Predictions !", fontsize=20, color="w")
    else:
        ax.text(1, 3, "Initial trajectory", fontsize=20)

    toplot = track[i, ::, ::, 0]

    plt.imshow(toplot)
    ax = fig.add_subplot(122)
    plt.text(1, 3, "Ground truth", fontsize=20)

    toplot = track2[i, ::, ::, 0]
    if i >= 2:
        toplot = shifted_movies[movie_index][i - 1, ::, ::, 0]

    plt.imshow(toplot)
    plt.savefig("%i_animate.png" % (i + 1))
