# Primitive Segmentation using ConvLSTM

**Author:** Aditya Jain <br>
**Date started:** 27th July, 2020<br>
**Description:** Predict primitves actions in a human demonstration using ConvLSTM model.

### Setup


In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=False)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [2]:
from tensorflow import keras
import tensorflow as tf
from tensorflow.keras import layers
import numpy as np
import math
import pylab as plt
import os
import glob
import cv2
from google.colab.patches import cv2_imshow
from sklearn.model_selection import train_test_split
from keras.callbacks import ModelCheckpoint
import datetime
import pickle as pkl

In [3]:
N_CLASSES  = 0          # no. of primitves in the TADL, initializing by zero, will be later updated
PX         = 64        # no. of rows in training/test images
PY         = 64        # no. of columns in training/test images
CHANNELS   = 3          # no. of channels in the image
N_FRAMES   = 20         # no. of frames in each training/test video
DATA_DIR   = "/content/drive/My Drive/TCS FullTime Work/LfD/Liquid_Pouring/TADL-II/v2/"
TEST_SPLIT = 0.2        # no. of test samples to draw from data

# directory for saving the trained model
WRITE_DIR  =  "/content/drive/My Drive/TCS FullTime Work/LfD/Liquid_Pouring/Trained_Models/" 

# directory for saving the trained model
SAVED_DATA_DIR  =  "/content/drive/My Drive/TCS FullTime Work/LfD/Liquid_Pouring/Data_PKL/" 

## Building the Dataset

Reads the datafiles and builds the dataset

In [4]:
def preprocessing(img, width, height):
  '''
  returns a preprocessed image suitable for training
  '''
  img = cv2.resize(img, (width, height))       # image resize
  # cv2_imshow(img)                            # optional command to visualize the read image
  img = img.astype("float32") / 255            # rescale the image from 0-1

  return img



def build_dataset(dataset_dir, n_frames):
  '''   
  takes input the parent data directory and no of frames to sample from every video;
  returns the data and label in correct format and dimensions
  '''
  # no. of primitives in the library
  prim_actions    = [dI for dI in os.listdir(dataset_dir) if os.path.isdir(os.path.join(dataset_dir,dI))]
  prim_no         = 0                               # label for every primitive
  prim_dict       = {}                              # dictionary which stores label definition

  # stores the final data to be returned
  image_data  = []
  label_data  = []

  for action in prim_actions:
    prim_dict[prim_no] = action
    prim_path          = os.path.join(DATA_DIR, action)   # gives path for each primitive  
    # prim_no            += 1

    for video_path in glob.iglob(prim_path + '/*.mp4'):  # iterate over every primitive video
      # print(video_path)

      # for storing data for the current video
      temp_image   = []
      temp_label   = []
      
      sec         = 0      
      vidcap      = cv2.VideoCapture(video_path)     
      fps         = vidcap.get(cv2.CAP_PROP_FPS)           #  FPS of the video      
      frame_count = vidcap.get(cv2.CAP_PROP_FRAME_COUNT)   #  total frame count
      total_sec   = frame_count/fps                        #  total video length in seconds
      # print("Total Time: ", total_sec, frame_count)

      TIME_SEC    = total_sec/n_frames                     # the video will be sampled after every TIME_SEC          
      i = 0
      while (sec < total_sec):
        vidcap.set(cv2.CAP_PROP_POS_MSEC,sec*1000)         # setting which frame to get
    
        sec          += TIME_SEC
        success,image = vidcap.read()
    
        if success:
          process_image = preprocessing(image, PX, PY)
          temp_image.append(process_image)
          i += 1

      # print("Total frames in video taken: ", i) 
      if len(temp_image) == n_frames:
        image_data.append(temp_image)    
        temp_label          = np.zeros(len(prim_actions)) 
        temp_label[prim_no] = 1  
        label_data.append(temp_label)  
      
    prim_no            += 1 

  return np.asarray(image_data), np.asarray(label_data), prim_dict
  # return  np.asarray(image_data),  np.asarray(tf.one_hot(label_data, prim_no)), prim_no
      

### Extract data into train and test

In [None]:
data, label, primitive_dictionary = build_dataset(DATA_DIR, N_FRAMES)
N_CLASSES                         = len(primitive_dictionary)

# train_data, test_data, train_label, test_label = train_test_split(
#     data, label, test_size=0.20, shuffle=True, random_state=0)

In [6]:
print(data.shape)
print(primitive_dictionary)
print(N_CLASSES)

(76, 20, 64, 64, 3)
{0: 'retract', 1: 'release', 2: 'reach', 3: 'grasp', 4: 'tilt'}
5


#### Save the data to disk [Optional]

In [7]:
DTSTR      = datetime.datetime.now()
DTSTR      = DTSTR.strftime("%Y-%m-%d-%H-%M") 

# data_dict  = {'traind': train_data, 'testd': test_data, 'trainl': train_label, 'testl': test_label, 'labelling': primitive_dictionary}
data_dict  = {'data': data, 'label': label, 'prim_dict': primitive_dictionary}
filename   = 'data_' + DTSTR + '.pkl'
filename   = 'data.pkl'    # temporary
filepath   = SAVED_DATA_DIR + filename

outfile = open(filepath,'wb')
pkl.dump(data_dict,outfile)
outfile.close()



## Load the Saved Data 

In [8]:
filename   = 'data_2020-08-15-10-40.pkl'
filename   = 'data.pkl'    # temporary
filepath   = SAVED_DATA_DIR + filename
infile     = open(filepath,'rb')
new_dict   = pkl.load(infile)
infile.close()

# train_data, test_data, train_label, test_label = new_dict['traind'], new_dict['testd'], new_dict['trainl'], new_dict['testl']
data, label              = new_dict['data'], new_dict['label']
primitive_dictionary     = new_dict['prim_dict']
N_CLASSES                = len(primitive_dictionary)

train_data, test_data, train_label, test_label = train_test_split(
    data, label, test_size=0.20, shuffle=True, random_state=0)

print("Train: ", train_data.shape, train_label.shape)
print("Test: ", test_data.shape, test_label.shape)
print("Dictionary: ", primitive_dictionary)

Train:  (60, 20, 64, 64, 3) (60, 5)
Test:  (16, 20, 64, 64, 3) (16, 5)
Dictionary:  {0: 'retract', 1: 'release', 2: 'reach', 3: 'grasp', 4: 'tilt'}


## Build a model

We create a model which take as input movies of shape
`(n_frames, width, height, channels)` and returns a movie
of identical shape.


In [None]:
# Working model with 1 output format

model = keras.Sequential(
    [
        keras.Input(shape = (N_FRAMES, PX, PY, CHANNELS)),  
        layers.ConvLSTM2D(
            filters=64, kernel_size=(3, 3), padding="same", return_sequences=False
        ),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Flatten(),
        # layers.Dropout(0.5),     # Enable if the model is overfitting
        # layers.Dense(100, activation='relu'),
        layers.Dense(N_CLASSES, activation='softmax'),
        
    ]
)

# print model summary
model.summary()

# model compilation
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=["accuracy"])

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv_lst_m2d_1 (ConvLSTM2D)  (None, 128, 128, 64)      154624    
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 64, 64, 64)        0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 262144)            0         
_________________________________________________________________
dense_1 (Dense)              (None, 4)                 1048580   
Total params: 1,203,204
Trainable params: 1,203,204
Non-trainable params: 0
_________________________________________________________________


## Train the model (and Test)


### Start Tensorboard

### Start Training

In [None]:
EPOCHS     = 200
BATCH_SIZE = 4
DTSTR      = datetime.datetime.now()
DTSTR      = DTSTR.strftime("%Y-%m-%d-%H-%M")

checkpoint           = ModelCheckpoint(WRITE_DIR + "best_model-" + DTSTR + ".h5", monitor='loss', verbose=1,
    save_best_only=True, mode='auto', period=1)

tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir= WRITE_DIR + DTSTR + "/logs")

model.fit(
    train_data,
    train_label,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    verbose=2,
    validation_data = (test_data, test_label),
    callbacks=[checkpoint, tensorboard_callback]
)

# The final trained model
model.save(WRITE_DIR + "final_model-" + DTSTR + ".h5")

Epoch 1/200

Epoch 00001: loss improved from inf to 7.14541, saving model to /content/drive/My Drive/TCS FullTime Work/LfD/Liquid_Pouring/Trained_Models/best_model-2020-08-17-08-35.h5
12/12 - 7s - loss: 7.1454 - accuracy: 0.1250 - val_loss: 2.9765 - val_accuracy: 0.1538
Epoch 2/200

Epoch 00002: loss improved from 7.14541 to 1.50612, saving model to /content/drive/My Drive/TCS FullTime Work/LfD/Liquid_Pouring/Trained_Models/best_model-2020-08-17-08-35.h5
12/12 - 7s - loss: 1.5061 - accuracy: 0.2500 - val_loss: 1.3832 - val_accuracy: 0.2308
Epoch 3/200

Epoch 00003: loss improved from 1.50612 to 1.11708, saving model to /content/drive/My Drive/TCS FullTime Work/LfD/Liquid_Pouring/Trained_Models/best_model-2020-08-17-08-35.h5
12/12 - 7s - loss: 1.1171 - accuracy: 0.5833 - val_loss: 1.3583 - val_accuracy: 0.4615
Epoch 4/200

Epoch 00004: loss improved from 1.11708 to 0.97061, saving model to /content/drive/My Drive/TCS FullTime Work/LfD/Liquid_Pouring/Trained_Models/best_model-2020-08-17-

## Inference on Human Demonstration

Testing the trained model on human demonstration videos

In [None]:
def build_inference_data(video_path, n_frames):
  '''
  this function is used to build dataset for inference stage
  '''
  # for storing data for the current video
  temp_image   = []
  image_data   = []
      
  sec         = 0      
  vidcap      = cv2.VideoCapture(video_path)     
  fps         = vidcap.get(cv2.CAP_PROP_FPS)           #  FPS of the video      
  frame_count = vidcap.get(cv2.CAP_PROP_FRAME_COUNT)   #  total frame count
  total_sec   = frame_count/fps                        #  total video length in seconds
  # print("Total Time: ", total_sec, frame_count)

  TIME_SEC    = total_sec/n_frames                     # the video will be sampled after every TIME_SEC          
  i = 0
  while (sec < total_sec):
    vidcap.set(cv2.CAP_PROP_POS_MSEC,sec*1000)         # setting which frame to get
    
    sec          += TIME_SEC
    success,image = vidcap.read()
    
    if success:
      process_image = preprocessing(image, PX, PY)
      temp_image.append(process_image)
      i += 1

  # print("Total frames in video taken: ", i) 

  image_data.append(temp_image)  
  return np.asarray(image_data)
  

In [None]:
model_file       = "final_model-2020-08-17-07-21.h5"
latest_model     = tf.keras.models.load_model(WRITE_DIR + model_file)

test_video_dir   = "/content/drive/My Drive/TCS FullTime Work/LfD/Liquid_Pouring/Human Demonstrations/14th August/"
test_video       = "tilt2.MOV"
test_path        = test_video_dir + test_video
test_path        = "/content/drive/My Drive/TCS FullTime Work/LfD/Liquid_Pouring/TADL-II/Tilt/tilt3.mp4"


inference_data = build_inference_data(test_path, N_FRAMES)
print(inference_data.shape)

y = latest_model.predict(inference_data)
print(y)
y = np.argmax(y)
print("Predicted label by the model: ", primitive_dictionary[y])

(1, 20, 128, 128, 3)
[[1.2384035e-01 8.7257165e-01 4.3778209e-04 3.1502831e-03]]
Predicted label by the model:  reach


In [None]:
print(primitive_dictionary)

{0: 'tilt', 1: 'reach', 2: 'grasp', 3: 'release'}


## Miscellaneous - Not to be run

In [None]:
model = keras.Sequential(
    [
        keras.Input(shape = (None, PX, PY, CHANNELS)),  
        layers.ConvLSTM2D(
            filters=64, kernel_size=(3, 3), padding="same", return_sequences=True
        ),
        layers.ConvLSTM2D(
            filters=40, kernel_size=(3, 3), padding="same", return_sequences=False
        ),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Flatten(),
        layers.Dropout(0.5),
        # layers.Dense(100, activation='relu'),
        layers.Dense(N_CLASSES, activation='softmax'),
        
    ]
)

# print model summary
model.summary()

# model compilation
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=["accuracy"])


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv_lst_m2d (ConvLSTM2D)    (None, None, 128, 128, 64 154624    
_________________________________________________________________
conv_lst_m2d_1 (ConvLSTM2D)  (None, 128, 128, 40)      149920    
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 64, 64, 40)        0         
_________________________________________________________________
flatten (Flatten)            (None, 163840)            0         
_________________________________________________________________
dropout (Dropout)            (None, 163840)            0         
_________________________________________________________________
dense (Dense)                (None, 2)                 327682    
Total params: 632,226
Trainable params: 632,226
Non-trainable params: 0
__________________________________________________

### Model Evaluation on Test Set

In [None]:
score = model.evaluate(test_data, test_label, verbose=0)
print("Test loss:", score[0])
print("Test accuracy:", score[1])


Test loss: 99.90675354003906
Test accuracy: 0.0


In [None]:
model = keras.Sequential()
model.add(layers.ConvLSTM2D(filters = 64, kernel_size = (3, 3), return_sequences = False, data_format = "channels_last", input_shape = (N_FRAMES, PX, PY, 3)))
model.add(layers.Dropout(0.2))
model.add(layers.Flatten())
model.add(layers.Dense(256, activation="relu"))
model.add(layers.Dropout(0.3))
model.add(layers.Dense(2, activation = "softmax"))

model.summary()
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=["accuracy"])

In [None]:
model = keras.Sequential(
    [
        keras.Input(shape = (None, PX, PY, CHANNELS)),  
        layers.ConvLSTM2D(
            filters=64, kernel_size=(3, 3), padding="same", return_sequences=False
        ),
        layers.BatchNormalization(),
        layers.ConvLSTM2D(
            filters=64, kernel_size=(3, 3), padding="same", return_sequences=True
        ),
        layers.BatchNormalization(),
        layers.ConvLSTM2D(
            filters=64, kernel_size=(3, 3), padding="same", return_sequences=True
        ),
        layers.BatchNormalization(),
        layers.Flatten(),
        layers.Dropout(0.5),
        
        # layers.Dense(100, activation='relu'),
        layers.Dense(N_CLASSES, activation='softmax'),
        
    ]
)

model.summary()

# model compilation
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=["accuracy"])

In [None]:
test_data = np.asarray(train_data)
test_label = np.asarray(train_label)

print(train_data.shape)
print(train_label.shape)
print(train_label)

(2, 15, 128, 128, 3)
(2, 2)
[[1. 0.]
 [0. 1.]]
