In [1]:
# Environment setup (the /content paths below suggest Google Colab - confirm).
# ffmpeg-python provides the `ffmpeg` module used to probe video metadata.
!pip install ffmpeg-python
# The model code targets the TensorFlow 1.x API, hence the "<2" bounds and the
# matching Sonnet / TensorFlow Probability versions.
!pip install "tensorflow-gpu<2" "dm-sonnet<2" "tensorflow-probability==0.7.0"
# Fetch the kinetics-i3d repository and copy its i3d.py into the working
# directory; a later cell does `import i3d`.
# NOTE(review): `git clone` fails if the directory already exists, so this cell
# is not re-runnable as-is.
!git clone https://github.com/deepmind/kinetics-i3d
!cp /content/kinetics-i3d/i3d.py i3d.py

Collecting ffmpeg-python
  Downloading https://files.pythonhosted.org/packages/d7/0c/56be52741f75bad4dc6555991fabd2e07b432d333da82c11ad701123888a/ffmpeg_python-0.2.0-py3-none-any.whl
Installing collected packages: ffmpeg-python
Successfully installed ffmpeg-python-0.2.0
Collecting tensorflow-gpu<2
[?25l  Downloading https://files.pythonhosted.org/packages/83/b1/9c0d6640eab34fae38f4dae6b312894f8bc1025b0876b3eae1fe11745a7b/tensorflow_gpu-1.15.4-cp36-cp36m-manylinux2010_x86_64.whl (411.0MB)
[K     |████████████████████████████████| 411.0MB 41kB/s 
[?25hCollecting dm-sonnet<2
[?25l  Downloading https://files.pythonhosted.org/packages/53/14/e221b910127bf4e2c19bc6d3b3e65a4e0104b90f7e98a3d428926474ece3/dm_sonnet-1.36-py3-none-any.whl (665kB)
[K     |████████████████████████████████| 665kB 45.8MB/s 
[?25hCollecting tensorflow-probability==0.7.0
[?25l  Downloading https://files.pythonhosted.org/packages/3e/3a/c10b6c22320531c774402ac7186d1b673374e2a9d12502cbc8d811e4601c/tensorflow_probabi

In [3]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import cv2
import shutil
import json
import numpy as np
import ffmpeg  
import math
import pickle
from scipy.special import softmax
import tensorflow as tf
import os
import time
import i3d
import random
def center_standardization(img):
  """Standardise an image to zero mean and unit variance over all its values.

  Args:
    img: array-like image of any shape (for example H x W x C). It is not
      modified; a float copy is made.

  Returns:
    A new float array with the same shape holding
    ``(img - mean) / max(std, 1 / img.size)``. The ``1 / img.size`` floor keeps
    a constant image (std == 0) from dividing by zero. An empty input is
    returned as an empty float array.
  """
  img=np.asarray(img).astype('float')
  if img.size==0:
    # Nothing to normalise, and the 1/img.size floor below would divide by zero.
    return img
  mean=np.mean(img)
  std=np.std(img)
  # One vectorised expression replaces the per-element Python loops; the
  # divisor is the same for every element, so it is computed once.
  return (img-mean)/max(std,1/img.size)
def generate_i3d_feature(video_path):
  """Turn a video into 9-frame, 224x224 clips saved as .npy files.

  The video is first resampled to 3 fps with change_fps. Every frame is then
  centre-cropped to a square, resized to 224x224 and standardised with
  center_standardization. Clips of 9 consecutive frames are written with a
  stride of 3 frames, so clip k starts at frame 3*k and is saved as '<k>.npy'.

  Args:
    video_path: path of the input video file.

  Returns:
    The directory name 'i3d_features' that holds the saved clips. The
    directory is deleted and recreated on every call, and is empty when fewer
    than 9 frames could be read.
  """
  video_path,width,height=change_fps(video_path,3)
  cap = cv2.VideoCapture(video_path)
  try:
    # The container's frame count is metadata and may not match what decodes.
    frame_count = max(int(cap.get(cv2.CAP_PROP_FRAME_COUNT)),0)
    i3d_features=np.zeros((frame_count,224,224,3))
    diff=abs(width-height)
    n_read=0
    # Stop at the buffer size so an under-reported frame count cannot overflow it.
    while n_read<frame_count:
      ret,frame=cap.read()
      if not ret:
        break
      # `-diff//2` floors towards minus infinity, so together with `diff//2`
      # exactly `diff` pixels are removed even when `diff` is odd.
      if width>height:
        frame=cv2.resize(frame[:,diff//2:-diff//2],(224,224))
      elif height>width:
        frame=cv2.resize(frame[diff//2:-diff//2],(224,224))
      else:
        frame=cv2.resize(frame,(224,224))
      i3d_features[n_read]=center_standardization(frame)
      n_read+=1
  finally:
    cap.release()
  i3d_features_path='i3d_features'
  if os.path.exists(i3d_features_path):
    # Plain Python instead of the IPython-only `!rm -rf` shell escape.
    shutil.rmtree(i3d_features_path)
  os.makedirs(i3d_features_path)
  # Only build clips from frames that were actually decoded; frames that were
  # promised by the metadata but never read would otherwise be all zeros.
  for start in range(0,n_read-8,3):
    np.save(i3d_features_path+'/'+'{}.npy'.format(start//3),i3d_features[start:start+9])
  return i3d_features_path
def predict_i3d(i3d_features_path):
  """Classify every saved clip with the RGB I3D network restored from a checkpoint.

  Args:
    i3d_features_path: directory containing '<k>.npy' clips, where k is an
      integer clip index (the file stem is parsed with int()).

  Returns:
    (preds, probs, times): three parallel lists ordered by clip index.
    preds[i] is the arg-max class id, probs[i] the softmax probability of that
    class, and times[i] the integer clip index taken from the file name.
  """
  tf.reset_default_graph()
  _BATCH_SIZE = 32
  _CLIP_SIZE = 9
  _FRAME_SIZE = 224
  _CHECKPOINT_PATHS = {
      'rgb':'/content/_RGB_0.754_model-34986'
  }
  clip_holder=tf.placeholder(tf.float32,shape=(_BATCH_SIZE, _CLIP_SIZE, _FRAME_SIZE, _FRAME_SIZE, 3))
  with tf.variable_scope('RGB'):
    model = i3d.InceptionI3d()
    logits, _ = model(clip_holder, is_training=False,dropout_keep_prob=1)
    # keep_prob of 1 means nothing is dropped at inference time.
    logits_dropout = tf.nn.dropout(logits, 1)
    # 49 output units - presumably the number of classes in the label map; confirm.
    fc_out = tf.layers.dense(logits_dropout, 49, use_bias=True)
  # Restore only the variables living under the 'RGB' scope, keyed by their
  # name without the ':0' tensor suffix.
  variable_map = {}
  for variable in tf.global_variables():
    if variable.name.split('/')[0] == 'RGB':
      variable_map[variable.name.replace(':0', '')] = variable
  saver = tf.train.Saver(var_list=variable_map)
  num_step = int(np.ceil(len(os.listdir(i3d_features_path))/_BATCH_SIZE))
  gen=data_generator(i3d_features_path,_BATCH_SIZE)
  dic={}
  # The context manager closes the session even if restore or inference raises.
  with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    saver.restore(sess, _CHECKPOINT_PATHS['rgb'])
    for _ in range(num_step):
      x_t,names=next(gen)
      fc_out2= sess.run(fc_out,feed_dict={clip_holder:x_t})
      preds=softmax(fc_out2,axis=1)
      # `names` can be shorter than the batch; zip ignores the unused rows.
      # Keying by clip index also collapses clips the generator repeated to
      # fill the final batch.
      for name,pred in zip(names,preds):
        dic[int(name)]=(np.argmax(pred),max(pred))
  times=sorted(dic)
  preds=[dic[t][0] for t in times]
  probs=[dic[t][1] for t in times]
  return preds,probs,times
def data_generator(path,batch_size):
    """Endlessly yield (batch_x, names) batches built from the .npy files in `path`.

    The file list is shuffled once up front. When fewer than `batch_size`
    files remain, the leftovers are taken, the list is reshuffled, and the
    batch is topped up from the start of the new order.

    Args:
        path: directory whose entries are all loadable with np.load.
        batch_size: number of slots in each yielded batch.

    Yields:
        batch_x: float array of shape (batch_size, 9, 224, 224, C), where C is
            the last dimension of the first file in the batch. Slots beyond
            len(names) stay zero.
        names: file names of the loaded clips with the last four characters
            (the '.npy' suffix) removed, in slot order.
    """
    files=[path+'/'+entry for entry in os.listdir(path)]
    random.shuffle(files)
    cursor=0
    while True:
        end=cursor+batch_size
        if end>len(files):
            # Not enough files left: take the tail, reshuffle, wrap around.
            selected=files[cursor:]
            random.shuffle(files)
            overflow=end-len(files)
            selected+=files[:overflow]
            cursor=overflow
        else:
            selected=files[cursor:end]
            cursor=end
        channels=np.load(selected[0]).shape[-1]
        batch_x=np.zeros((batch_size,9,224,224,channels))
        names=[]
        for slot,clip_file in enumerate(selected):
            batch_x[slot]=np.load(clip_file)
            names.append(clip_file.split('/')[-1][:-4])
        yield(batch_x,names)
def check_rotation(path_video_file):
    """Map the 'rotate' tag of a video's first stream to an OpenCV rotate code.

    Args:
        path_video_file: path of the video handed to ffmpeg.probe.

    Returns:
        cv2.ROTATE_90_CLOCKWISE, cv2.ROTATE_180 or
        cv2.ROTATE_90_COUNTERCLOCKWISE for a tag of 90, 180 or 270
        respectively; None when the first stream has no 'rotate' tag or the
        tag holds any other angle. Compare the result with `is not None`
        rather than by truthiness, because a rotate code can be the integer 0.
    """
    # Only the first stream reported by ffprobe is inspected.
    first_stream = ffmpeg.probe(path_video_file)['streams'][0]
    if 'tags' not in first_stream or 'rotate' not in first_stream['tags']:
        return None
    angle_to_code = {
        90: cv2.ROTATE_90_CLOCKWISE,
        180: cv2.ROTATE_180,
        270: cv2.ROTATE_90_COUNTERCLOCKWISE,
    }
    return angle_to_code.get(int(first_stream['tags']['rotate']))
def change_fps(dir,out_fps,max_seconds=1800):
  """Re-encode a video at `out_fps` by keeping (or repeating) evenly spaced frames.

  Args:
    dir: path of the input video file (the name is kept for backward
      compatibility even though it shadows the builtin).
    out_fps: frame rate of the written video.
    max_seconds: only frames from the first `max_seconds` seconds of the
      input are considered (default 1800, i.e. 30 minutes).

  Returns:
    (out_path, width, height): the path of the written file, which is the
    input path with its extension replaced by '_changed.avi', and the frame
    size used for the writer. Width and height are swapped when the rotation
    tag is 90 or 270 degrees.

  Raises:
    ValueError: if OpenCV reports a non-positive frame rate, e.g. because the
      file could not be opened.
  """
  rotateCode = check_rotation(dir)
  cap = cv2.VideoCapture(dir)
  out = None
  try:
    fps = cap.get(cv2.CAP_PROP_FPS)
    if fps<=0:
      raise ValueError('Cannot read a valid frame rate from {!r}'.format(dir))
    width=int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height=int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    # Compare against None: cv2.ROTATE_90_CLOCKWISE is the integer 0 and would
    # be skipped by a truthiness test.
    # NOTE(review): some OpenCV builds may already apply the rotation tag while
    # decoding - confirm frames are not rotated twice.
    if rotateCode is not None and rotateCode!=cv2.ROTATE_180:
      width,height=height,width
    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    duration = frame_count/fps
    out_frames=int(duration*out_fps)
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    # splitext copes with extensions that are not exactly three letters long.
    dir2=os.path.splitext(dir)[0]+'_changed.avi'
    out = cv2.VideoWriter(dir2,fourcc, out_fps, (width,height))
    cnt=0
    i=0
    # `i<out_frames` stops once every output frame is written and also covers
    # out_frames == 0 (a clip shorter than one output frame).
    while cnt<frame_count and i<out_frames:
      if cnt/fps>=max_seconds:
        break
      ret,frame=cap.read()
      if not ret:
        break
      # Output frame i is taken from input frame i*frame_count//out_frames.
      if cnt==i*frame_count//out_frames:
        if rotateCode is not None:
          frame = cv2.rotate(frame, rotateCode)
        # When raising the frame rate several output frames map to the same
        # input frame, so keep writing until the target moves on.
        while i<out_frames and cnt==i*frame_count//out_frames:
          out.write(frame)
          i+=1
      cnt+=1
  finally:
    if out is not None:
      out.release()
    cap.release()
  return dir2,width,height
def _merge_moments(output,target_label):
  """Merge touching detections of `target_label` into [[start, end], best_prob] spans."""
  moments=[]
  for label,prob,start in output:
    if label!=target_label:
      continue
    # Each detection covers [start, start+3]; extend the previous span when
    # this one begins no later than the previous span ends.
    if moments and moments[-1][0][1]>=start:
      moments[-1][0][1]=start+3
      moments[-1][1]=max(moments[-1][1],prob)
    else:
      moments.append([[start,start+3],prob])
  return moments

def predict_rgb(video,label_map_path='/content/label_map.txt',target_label='chest pain'):
  """Run the clip classifier on a video and collect the spans of one action.

  Args:
    video: path of the input video file.
    label_map_path: pickled mapping from class id to label name. It is indexed
      with the arg-max class id of each clip.
    target_label: label whose detections are merged into `moments`.

  Returns:
    (output, moments) where `output` is a list of (label, probability,
    clip_index) tuples ordered by clip index, and `moments` is a list of
    [[start, end], probability_as_str] spans for `target_label`. When no clip
    could be extracted the function returns ('Video too short', None) instead.
  """
  # SECURITY: pickle.load can execute arbitrary code; only load a label map
  # that comes from a trusted source.
  with open(label_map_path, 'rb') as f:
    label_map = pickle.load(f)
  i3d_features_path=generate_i3d_feature(video)
  if len(os.listdir(i3d_features_path))==0:
    return 'Video too short',None
  preds,probs,times=predict_i3d(i3d_features_path)
  output=[(label_map[pred],prob,clip_index) for pred,prob,clip_index in zip(preds,probs,times)]
  print(output)
  moments=_merge_moments(output,target_label)
  print(moments)
  # Probabilities are converted to strings only after printing, as before.
  for moment in moments:
    moment[1]=str(moment[1])
  return output,moments
# Run the whole pipeline on one sample video. Nothing in the visible cells
# creates this file, so it presumably has to be uploaded beforehand - confirm.
video='/content/test_youtube.mp4'
output,moments=predict_rgb(video)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use keras.layers.Dense instead.
Instructions for updating:
Please use `layer.__call__` method instead.
INFO:tensorflow:Restoring parameters from /content/_RGB_0.754_model-34986
[('take off jacket', 0.9589779, 0), ('taking a selfie', 0.25177354, 1), ('hand waving', 0.302118, 2), ('clapping', 0.3701908, 3), ('put on glasses', 0.6117321, 4), ('take off glasses', 0.4073134, 5), ('cheer up', 0.85038304, 6), ('put on glasses', 0.3018316, 7), ('take off glasses', 0.27414715, 8), ('take off glasses', 0.33245632, 9), ('take off glasses', 0.41233128, 10), ('take off glasses', 0.77292955, 11), ('put on glasses', 0.4644094, 12

([('take off jacket', 0.9589779, 0),
  ('taking a selfie', 0.25177354, 1),
  ('hand waving', 0.302118, 2),
  ('clapping', 0.3701908, 3),
  ('put on glasses', 0.6117321, 4),
  ('take off glasses', 0.4073134, 5),
  ('cheer up', 0.85038304, 6),
  ('put on glasses', 0.3018316, 7),
  ('take off glasses', 0.27414715, 8),
  ('take off glasses', 0.33245632, 9),
  ('take off glasses', 0.41233128, 10),
  ('take off glasses', 0.77292955, 11),
  ('put on glasses', 0.4644094, 12),
  ('reach into pocket', 0.4266951, 13),
  ('reach into pocket', 0.36640224, 14),
  ('neck pain', 0.23368491, 15),
  ('neck pain', 0.21333535, 16),
  ('chest pain', 0.4038162, 17),
  ('fan self', 0.5377983, 18),
  ('take off a hat/cap', 0.32219943, 19),
  ('take off a shoe', 0.37905672, 20),
  ('take off a shoe', 0.43651325, 21),
  ('put on a shoe', 0.2982044, 22),
  ('fan self', 0.22422884, 23),
  ('take off glasses', 0.2360645, 24),
  ('back pain', 0.56258184, 25),
  ('eat meal', 0.121914044, 26),
  ('sneeze/cough', 0.22