In [None]:
#Video classification using CNN-LSTM

In [1]:
#Import the required libraries.
import os
import cv2
#import pafy
import math
import random
import numpy as np
import datetime as dt
import tensorflow as tf
from collections import deque
import matplotlib.pyplot as plt
#from moviepy.editor import
%matplotlib inline
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import *
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import plot_model
from moviepy.editor import VideoFileClip

In [2]:
#!unzip /content/test.zip -d /content/
'''For this stage, you may have to upload the test zip file added in the folder'''

dataset_path = os.listdir('/content/train')

CLASSES_LIST = os.listdir('/content/test')
print (CLASSES_LIST)

['running', 'dancing', 'walking']


In [3]:
'''Here I normalize the pixels, set the image height as well
  as the legnth of each video sequence'''
max_pixel_value = 255
image_height, image_width=64,64
SEQUENCE_LENGTH=100

'''Here, i extract the frames, iterate through them, resize based on the the set
  image size and normalize i.e 0-1 rather than 0-255. Everything is added to a list'''

def frames_extraction(video_path):
  frames_list = []

  #print(" the video file path is : {}".format(video_path))
  videoObj = cv2.VideoCapture(video_path)
  #print("the video object is: {}".format(videoObj))

  """ Iterating through Video Frames """
  while True:

    # Reading a frame from the video file
    success, image = videoObj.read()
    #print("the value of success is: {}".format(success))

    if not success:
      break

    resized_frame = cv2.resize(image, (image_height, image_width))

    """Normalize the resized frame by dividing it with 255 so that
    each pixel value then lies between 0 and 1"""

    normalized_frame = resized_frame / max_pixel_value
    frames_list.append(normalized_frame)


  videoObj.release()
  return frames_list

In [4]:
def create_dataset():

  DATASET_DIR='/content/train'
  # Declared Empty Lists to store the features, labels and video file path values.
  features = []
  labels = []
  video_files_paths = []

  # Iterating through all the classes mentioned in the classes list
  for class_index, class_name in enumerate(CLASSES_LIST):
    # Display the name of the class whose data is being extracted.
    print (f'Extracting Data of Class: {class_name}')

    # Get the list of video files present in the specific class name directory.
    files_list = os.listdir (os.path.join(DATASET_DIR, class_name))

    # Iterate through all the files present in the files list.
    for file_name in files_list:
    # Get the complete video path.
      video_file_path = os.path.join(DATASET_DIR, class_name, file_name)

      # Extract the frames of the video file.
      frames = frames_extraction (video_file_path)
      frames=frames[:SEQUENCE_LENGTH]

      # Check if the extracted frames are equal to the SEQUENCE_LENGTH specified above.
      # So ignore the vides having frames less than the SEQUENCE_LENGTH.
      if len(frames) == SEQUENCE_LENGTH:
        # Append the data to their repective lists.
        features.append(frames)
        labels.append(class_index)
        video_files_paths.append(video_file_path)
  # Converting the list to numpy arrays
  features= np.asarray(features)
  labels = np.array(labels)
  # Return the frames, class index, and video file path.
  return features, labels, video_files_paths

In [5]:
features, labels, video_file_path=create_dataset()

Extracting Data of Class: running
Extracting Data of Class: dancing
Extracting Data of Class: walking


In [6]:
from tensorflow.keras.applications import VGG16
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, GlobalAveragePooling2D

In [7]:
def create_LRCN_model():
  # We will use a Sequential model for model construction.
  model=Sequential()
  # Define the Model Architecture.
  model.add(TimeDistributed(Conv2D(16, (3, 3), padding='same', activation='relu'),
                            input_shape=(SEQUENCE_LENGTH, image_height, image_width, 3)))
  model.add(TimeDistributed (MaxPooling2D((4, 4))))
  model.add(TimeDistributed (Dropout (0.25)))
  model.add(TimeDistributed (Conv2D(32, (3, 3), padding='same', activation = 'relu')))
  model.add(TimeDistributed (MaxPooling2D((4, 4))))
  model.add(TimeDistributed (Dropout (0.25)))
  model.add(TimeDistributed (Conv2D(64, (3, 3), padding='same', activation = 'relu')))
  model.add(TimeDistributed (MaxPooling2D((2, 2))))
  model.add(TimeDistributed (Dropout (0.25)))
  model.add(TimeDistributed (Conv2D(64, (3, 3), padding='same', activation = 'relu')))
  model.add(TimeDistributed (MaxPooling2D((2, 2))))
  #model.add(TimeDistributed (Dropout (0.25)))
  model.add(TimeDistributed (Flatten()))
  model.add(LSTM(32))
  model.add(Dense(len(CLASSES_LIST), activation = 'softmax'))
  # Display the models summary.
  model.summary()

  return model

In [8]:
LRCN_model=create_LRCN_model()

  super().__init__(**kwargs)


In [9]:
one_hot_encoded_labels=to_categorical(labels)

In [10]:
features_train, features_test, labels_train, labels_test = train_test_split(
    features,
    one_hot_encoded_labels,
    test_size=0.25,
    shuffle=True,
    random_state=88
)

In [11]:
features_train

array([[[[[0.00784314, 0.04705882, 0.04313725],
          [0.03921569, 0.09019608, 0.0745098 ],
          [0.05098039, 0.1254902 , 0.10588235],
          ...,
          [0.20784314, 0.61960784, 0.63529412],
          [0.18823529, 0.75686275, 0.7372549 ],
          [0.14117647, 0.70588235, 0.67058824]],

         [[0.05882353, 0.09411765, 0.09411765],
          [0.03137255, 0.08235294, 0.06666667],
          [0.06666667, 0.12941176, 0.12156863],
          ...,
          [0.08627451, 0.71764706, 0.68627451],
          [0.10588235, 0.69019608, 0.66666667],
          [0.21568627, 0.61960784, 0.65882353]],

         [[0.05490196, 0.09411765, 0.09019608],
          [0.04705882, 0.09803922, 0.08235294],
          [0.05882353, 0.13333333, 0.12156863],
          ...,
          [0.2       , 0.60784314, 0.62745098],
          [0.2       , 0.53333333, 0.55294118],
          [0.17647059, 0.49411765, 0.50196078]],

         ...,

         [[0.11764706, 0.56470588, 0.41568627],
          [0.12941176,

In [12]:
from tensorflow.keras.callbacks import EarlyStopping

# Create an instance of Early Stopping Callback
early_stopping_callback = EarlyStopping(
    monitor='val_loss',
    patience=15,
    mode='min',
    restore_best_weights=True
)

# Compile the model and specify loss function, optimizer and metrics
LRCN_model.compile(
    loss='categorical_crossentropy',
    optimizer='Adam',
    metrics=['accuracy']
)

# Start training the model
LRCN_model_training_history = LRCN_model.fit(
    x=features_train,
    y=labels_train,
    epochs=70,
    batch_size=4,
    shuffle=True,
    validation_split=0.2,
    callbacks=[early_stopping_callback]
)


Epoch 1/70
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m171s[0m 11s/step - accuracy: 0.2430 - loss: 1.1441 - val_accuracy: 0.2222 - val_loss: 1.1490
Epoch 2/70
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m125s[0m 9s/step - accuracy: 0.4970 - loss: 1.0688 - val_accuracy: 0.1111 - val_loss: 1.2380
Epoch 3/70
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 8s/step - accuracy: 0.3094 - loss: 1.1219 - val_accuracy: 0.1111 - val_loss: 1.2203
Epoch 4/70
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 8s/step - accuracy: 0.3853 - loss: 1.0587 - val_accuracy: 0.2222 - val_loss: 1.1842
Epoch 5/70
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m91s[0m 9s/step - accuracy: 0.4970 - loss: 0.9701 - val_accuracy: 0.2222 - val_loss: 1.1872
Epoch 6/70
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 8s/step - accuracy: 0.5859 - loss: 0.9196 - val_accuracy: 0.2222 - val_loss: 1.0707
Epoch 7/70
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━

In [13]:
model_evaluation_history=LRCN_model.evaluate(features_test, labels_test)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - accuracy: 0.2667 - loss: 1.1092


In [21]:
#save model
tf.saved_model.save(LRCN_model, '/content/model')

In [None]:
#get model for use
LRCN_model = tf.model.load("model", None)

In [16]:
def predict_single_action(video_file_path, SEQUENCE_LENGTH):
    """
    This function will perform single action recognition prediction on a video using the LRCN model.

    Args:
    video_file_path: The path of the video stored in the disk on which the action recognition is to be performed.
    SEQUENCE_LENGTH: The fixed number of frames of a video that can be passed to the model as one sequence.
    """

    # Initialize the VideoCapture object to read from the video file.
    video_reader = cv2.VideoCapture(video_file_path)

    # Get the width and height of the video.
    original_video_width = int(video_reader.get(cv2.CAP_PROP_FRAME_WIDTH))
    original_video_height = int(video_reader.get(cv2.CAP_PROP_FRAME_HEIGHT))

    # Declare a list to store video frames we will extract.
    frames_list = []

    # Initialize a variable to store the predicted action being performed in the video.
    predicted_class_name = ''

    # Get the number of frames in the video.
    video_frames_count = int(video_reader.get(cv2.CAP_PROP_FRAME_COUNT))

    # Calculate the interval after which frames will be added to the list.
    skip_frames_window = max(int(video_frames_count / SEQUENCE_LENGTH), 1)

    # Iterating the number of times equal to the fixed length of sequence.
    for frame_counter in range(SEQUENCE_LENGTH):
        # Set the current frame position of the video.
        video_reader.set(cv2.CAP_PROP_POS_FRAMES, frame_counter * skip_frames_window)

        # Read a frame.
        success, frame = video_reader.read()

        # Check if frame is not read properly then break the loop.
        if not success:
            break

        # Resize the frame to fixed dimensions.
        resized_frame = cv2.resize(frame, (image_height, image_width))

        # Normalize the resized frame by dividing it with 255 so that each pixel value then lies between 0 and 1.
        normalized_frame = resized_frame / 255

        # Appending the pre-processed frame into the frames list
        frames_list.append(normalized_frame)

    # Passing the pre-processed frames to the model and get the predicted probabilities.
    predicted_labels_probabilities = LRCN_model.predict(np.expand_dims(frames_list, axis=0))[0]

    # Get the index of class with highest probability.
    predicted_label = np.argmax(predicted_labels_probabilities)

    # Get the class name using the retrieved index.
    predicted_class_name = CLASSES_LIST[predicted_label]

    # Display the predicted action along with the prediction confidence.
    print(f'Action Predicted: {predicted_class_name}\nConfidence: {predicted_labels_probabilities[predicted_label]}')

    # Release the VideoCapture object.
    video_reader.release()


In [24]:
# Makes Prediction
input_video_file_path = "/content/test/walking/girl-walking-on-university-campus_b1s8vlgqr__8123088590bd8669ae28e877270ca090__P360.mp4"

# Perform single prediction on the test video.
predict_single_action(input_video_file_path, SEQUENCE_LENGTH)

# Display the input video.
VideoFileClip(input_video_file_path, audio=False, target_resolution=(300, None)).ipython_display()

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 122ms/step
Action Predicted: walking
Confidence: 0.44506508111953735
Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4



                                                               

Moviepy - Done !
Moviepy - video ready __temp__.mp4




In [23]:
import shutil
shutil.make_archive('model', 'zip', '/content/model')

'/content/model.zip'