In [17]:
!pip install -q opencv-python

import os
import tensorflow.compat.v2 as tf
import tensorflow_hub as hub
import numpy as np
import cv2
from IPython import display
import math

In [18]:
# Mount Google Drive at /content/gdrive so later cells can read the video
# files stored under /content/gdrive/MyDrive.
# NOTE(review): `google.colab` is presumably only importable inside a Colab
# runtime — confirm before running this notebook elsewhere.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [19]:
# List the top level of the mounted Drive to check that the mount succeeded
# and to locate the folders holding the test videos.
!ls /content/gdrive/MyDrive/

 3.mp4		  'Colab Notebooks'   demo    'Getting started.pdf'
 clip_test_video   darknet	      fk       sub.mp4
 Code.zip	  'Deep Learning'     frames   TestData


In [20]:
# Load the model once from TF-Hub; later cells reuse `hub_model` instead of
# reloading it. The active handle points at the MIL-NCE S3D variant; the
# commented-out line below selects the I3D variant instead.
hub_handle = 'https://tfhub.dev/deepmind/mil-nce/s3d/1'
#hub_handle = "https://tfhub.dev/deepmind/mil-nce/i3d/1"
hub_model = hub.load(hub_handle)

def generate_embeddings(model, input_frames, input_words):
  """Embed a batch of video frames and a list of text queries.

  Args:
    model: Loaded module exposing callable 'video' and 'text' entries in
      `model.signatures`.
    input_frames: Video frames normalized to [0, 1], laid out as
      Batch x T x H x W x 3. They are cast to float32 before the call.
    input_words: Text queries handed to the 'text' signature.

  Returns:
    A `(video_embedding, text_embedding)` pair, read from the
    'video_embedding' and 'text_embedding' entries of the two signature
    outputs.
  """
  # Video branch: cast to float32, then run the 'video' signature.
  frames_tensor = tf.constant(tf.cast(input_frames, dtype=tf.float32))
  video_result = model.signatures['video'](frames_tensor)

  # Text branch: the queries are passed through as a constant tensor.
  words_tensor = tf.constant(input_words)
  text_result = model.signatures['text'](words_tensor)

  return video_result['video_embedding'], text_result['text_embedding']

In [21]:
from IPython.display import HTML
from base64 import b64encode

# Preview one of the test clips inline: the file's bytes are base64-encoded
# into a data URL and embedded in an HTML <video> element.
# The file is read inside a `with` block so the handle is closed right away
# instead of being left open for the garbage collector.
with open('/content/gdrive/MyDrive/fk/yelling (2).mp4', 'rb') as video_file:
  mp4 = video_file.read()
data_url = "data:video/mp4;base64," + b64encode(mp4).decode()
# Last expression of the cell, so the notebook renders the player.
HTML("""
<video width=400 controls>
      <source src="%s" type="video/mp4">
</video>
""" % data_url)

In [22]:

def crop_center_square(frame):
  """Return the largest centered square region of an image array.

  The first two axes of `frame` are treated as (rows, columns); the side of
  the square is the smaller of the two. Any further axes (e.g. color
  channels) are kept whole.
  """
  height, width = frame.shape[:2]
  side = height if height < width else width
  half_side = side // 2
  top = height // 2 - half_side
  left = width // 2 - half_side
  return frame[top:top + side, left:left + side]

def load_video(video_url, max_frames=32, resize=(224, 224)):
  """Load a clip as a fixed-length stack of frames scaled to [0, 1].

  Frames are read in order with OpenCV. Each frame is center-cropped to a
  square, resized to `resize`, and has its channel axis reversed (OpenCV
  decodes to BGR, so this yields RGB). Reading stops once `max_frames`
  frames have been collected.

  Args:
    video_url: Source handed to `cv2.VideoCapture`.
    max_frames: Number of frames in the returned array.
    resize: Target size passed unchanged to `cv2.resize`.

  Returns:
    An array holding `max_frames` frames along axis 0, with pixel values
    divided by 255.

  Raises:
    ValueError: If not a single frame could be read from `video_url`
      (e.g. missing file or a file that is not a decodable video).
  """
  cap = cv2.VideoCapture(video_url)
  frames = []
  try:
    while True:
      ret, frame = cap.read()
      if not ret:
        break
      frame = crop_center_square(frame)
      frame = cv2.resize(frame, resize)
      # Reverse the channel axis: BGR -> RGB.
      frame = frame[:, :, [2, 1, 0]]
      frames.append(frame)

      if len(frames) == max_frames:
        break
  finally:
    # Always free the capture, even if decoding or preprocessing fails.
    cap.release()

  # Without this guard the padding step below divides by zero and surfaces
  # as a ZeroDivisionError that says nothing about the actual problem.
  if not frames:
    raise ValueError(
        f"No frames could be read from {video_url!r}; "
        "check that the path exists and points to a decodable video.")

  frames = np.array(frames)
  if len(frames) < max_frames:
    # Short clip: duplicate every frame consecutively (a, a, b, b, ...) until
    # there are at least `max_frames` frames. The slice below trims the
    # excess, which can drop trailing source frames.
    n_repeat = int(math.ceil(max_frames / float(len(frames))))
    frames = frames.repeat(n_repeat, axis=0)
  frames = frames[:max_frames]
  return frames / 255.0


In [30]:
import glob

# Candidate descriptions; every clip in the folder is scored against all of
# them and the best-scoring description is printed.
query = ['person is angry', 'people are drunk', 'operation is going on in hospital', 'shouting at someone', 'persons are fighting with each other']
for video_name in glob.glob("/content/gdrive/MyDrive/fk/*"):
  video = load_video(video_name)
  # Add the leading batch axis without hard-coding the frame count and frame
  # size, so this keeps working if load_video's arguments change.
  video = video[np.newaxis, ...]
  video_embd, text_embd = generate_embeddings(hub_model, video, query)
  # Dot product of each text embedding with the clip embedding: one row per
  # query, one column for the single clip in the batch.
  all_scores = np.dot(text_embd, tf.transpose(video_embd))
  # Index the single column before argmax so the result is a plain integer;
  # calling int() on a 1-element array is deprecated in recent NumPy.
  ind = int(np.argmax(all_scores[:, 0]))
  print(f"{video_name} => {query[ind]}")
  print(all_scores)
  

/content/gdrive/MyDrive/fk/drunk (1).mp4 => shouting at someone
[[3.8127153]
 [3.5898287]
 [2.8290124]
 [4.398959 ]
 [2.8719976]]
/content/gdrive/MyDrive/fk/anger (5).mp4 => person is angry
[[5.9871097]
 [4.541322 ]
 [2.9900815]
 [5.2741413]
 [4.544713 ]]
/content/gdrive/MyDrive/fk/anger (4).mp4 => shouting at someone
[[4.7000995]
 [3.411504 ]
 [1.3025465]
 [4.8391495]
 [4.345476 ]]
/content/gdrive/MyDrive/fk/anger (3).mp4 => shouting at someone
[[4.777812 ]
 [4.093173 ]
 [4.429241 ]
 [4.826609 ]
 [3.7901728]]
/content/gdrive/MyDrive/fk/anger (2).mp4 => shouting at someone
[[4.7281933]
 [5.251305 ]
 [3.1825361]
 [5.84426  ]
 [2.9707086]]
/content/gdrive/MyDrive/fk/anger (1) (online-video-cutter.com).mp4 => operation is going on in hospital
[[3.1816096]
 [3.8120306]
 [4.243023 ]
 [2.8883905]
 [3.1176705]]
/content/gdrive/MyDrive/fk/yelling (3).mp4 => person is angry
[[5.758766 ]
 [5.2219462]
 [3.790605 ]
 [5.384564 ]
 [5.463154 ]]
/content/gdrive/MyDrive/fk/yelling (4).mp4 => shouting a

In [None]:
# Single-clip experiment: load one video and define the queries to rank.
# The relative path is resolved against the kernel's working directory.
video_url = 'angert.mp4'
video = load_video(video_url)
# Add the leading batch axis without hard-coding the frame count and frame
# size, so this keeps working if load_video's arguments change.
video = video[np.newaxis, ...]
query = ['person is happy', 'person is sad', 'person is angry']
video.shape

(1, 32, 224, 224, 3)

In [None]:
# Embed the clip and the text queries, then score every query against the
# clip with a dot product of the embeddings.
# NOTE(review): the recorded output below has two rows, although `query` as
# defined in the preceding cell has three entries — the output looks like it
# comes from an earlier run with a different query list; re-run to confirm.
video_embd, text_embd = generate_embeddings(hub_model, video, query)
all_scores = np.dot(text_embd, tf.transpose(video_embd))
all_scores

array([[-0.098453 ],
       [ 5.0088706]], dtype=float32)

In [None]:
# Pick the query with the highest score for this clip.
# all_scores has one column (a single clip), so index that column before
# argmax to get a plain integer; calling int() on a 1-element array is
# deprecated in recent NumPy.
ind = int(np.argmax(all_scores[:, 0]))
query[ind]

'person is angry'

In [None]:
# Squash each raw score through a sigmoid, mapping it into (0, 1)
# independently of the other queries.
# NOTE(review): the values are not normalized across queries, so they do not
# sum to 1 — TODO confirm this is the intended reading of the scores.
tf.nn.sigmoid(all_scores)

<tf.Tensor: shape=(2, 1), dtype=float32, numpy=
array([[0.4754066],
       [0.9933658]], dtype=float32)>