In [14]:
import sys
import time
import os
import tqdm
import torch
import argparse
import glob
import subprocess
import warnings
import cv2
import pickle
import numpy
import pdb
import math
import python_speech_features

from scipy import signal
from shutil import rmtree
from scipy.io import wavfile
from scipy.interpolate import interp1d
from sklearn.metrics import accuracy_score, f1_score

from scenedetect.video_manager import VideoManager
from scenedetect.scene_manager import SceneManager
from scenedetect.frame_timecode import FrameTimecode
from scenedetect.stats_manager import StatsManager
from scenedetect.detectors import ContentDetector

from model.faceDetector.s3fd import S3FD
from talkNet import talkNet
warnings.filterwarnings("ignore")



In [15]:
# Pipeline configuration shared by every cell below.
# NOTE(review): videoFolder is an absolute, machine-specific path; adjust before running elsewhere.
args = {
  "videoName":"1.aa",  # base name of the input video; its extension is resolved by the glob below
  "videoFolder":"E:\\Projects\\Style-based Video Editor\\src\\server\\temp\\scenes",
  "pretrainModel":"pretrain_TalkSet.model",  # weights passed to talkNet.loadParameters

  "nDataLoaderThread":10,  # forwarded to ffmpeg as -threads
  "facedetScale":0.25,     # scale handed to the face detector
  "minTrack":10,           # shots/tracks must span more than this many frames to be kept
  "numFailedDet":10,       # largest frame gap tolerated inside one face track
  "minFaceSize":1,         # minimum mean box side for a track to be kept
  "cropScale":0.40,        # extra margin added around the face box when cropping

  "start":0,     # clip start in seconds (only used when duration != 0)
  "duration":0,  # clip length in seconds; 0 means "process the whole video"

  "colSavePath":"/data08/col"  # not read by any cell visible in this notebook
}

# Resolve the input file by base name. Fail with an actionable message instead of a bare
# IndexError when nothing matches.
_videoCandidates = glob.glob(os.path.join(args["videoFolder"], args["videoName"] + '.*'))
if not _videoCandidates:
  raise FileNotFoundError(
      "No input video matching %r" % os.path.join(args["videoFolder"], args["videoName"] + '.*'))
args["videoPath"] = _videoCandidates[0]
args["savePath"] = os.path.join(args["videoFolder"], args["videoName"])


In [16]:
def scene_detect(args):
  """CPU: split the working video into shots with PySceneDetect's ContentDetector.

  Reads ``args["videoFilePath"]`` and pickles the result to
  ``<args["pyworkPath"]>/scene.pckl``.

  Returns:
    The scene list from ``SceneManager.get_scene_list``. When no cut is
    detected, a single (base timecode, current timecode) pair covering what was
    read is returned instead of an empty list.
  """
  videoManager = VideoManager([args["videoFilePath"]])
  statsManager = StatsManager()
  sceneManager = SceneManager(statsManager)
  sceneManager.add_detector(ContentDetector())
  baseTimecode = videoManager.get_base_timecode()
  videoManager.set_downscale_factor()
  try:
    videoManager.start()
    sceneManager.detect_scenes(frame_source=videoManager)
    sceneList = sceneManager.get_scene_list(baseTimecode)
    if sceneList == []:
      # No cut found: treat the whole video as one shot
      sceneList = [(videoManager.get_base_timecode(),
                    videoManager.get_current_timecode())]
  finally:
    # Always close the underlying capture so the video file is not left open
    videoManager.release()
  savePath = os.path.join(args["pyworkPath"], 'scene.pckl')
  with open(savePath, 'wb') as fil:
    pickle.dump(sceneList, fil)
  sys.stderr.write('%s - scenes detected %d\n' % (args["videoFilePath"], len(sceneList)))
  return sceneList



In [26]:
def inference_video(args):
  """Run the S3FD face detector (constructed with device='cpu') on every extracted frame.

  Frames are the ``*.jpg`` files in ``args["pyframesPath"]``, processed in
  sorted filename order. The result is also pickled to
  ``<args["pyworkPath"]>/faces.pckl``.

  Returns:
    A list with one entry per frame; each entry is a list of dicts
    ``{'frame': frame index, 'bbox': all detector values but the last, as a list,
    'conf': last detector value}``.
  """
  detector = S3FD(device='cpu')
  framePaths = sorted(glob.glob(os.path.join(args["pyframesPath"], '*.jpg')))
  allDetections = []
  for frameIndex, framePath in enumerate(framePaths):
    # OpenCV loads BGR; the detector is given RGB
    rgbFrame = cv2.cvtColor(cv2.imread(framePath), cv2.COLOR_BGR2RGB)
    found = detector.detect_faces(
        rgbFrame, conf_th=0.9, scales=[args["facedetScale"]])
    frameDetections = [
        {'frame': frameIndex, 'bbox': (box[:-1]).tolist(), 'conf': box[-1]}
        for box in found
    ]
    allDetections.append(frameDetections)
    # '\r' keeps the progress report on a single console line
    sys.stderr.write('%s-%05d; %d dets\r' %
                     (args["videoFilePath"], frameIndex, len(frameDetections)))
  with open(os.path.join(args["pyworkPath"], 'faces.pckl'), 'wb') as fil:
    pickle.dump(allDetections, fil)
  return allDetections

In [18]:
def bb_intersection_over_union(boxA, boxB, evalCol=False):
  """CPU: overlap ratio between two boxes given as (x1, y1, x2, y2).

  Args:
    boxA: first box; indices 0..3 are read as x1, y1, x2, y2.
    boxB: second box, same layout.
    evalCol: when true, divide the intersection by the area of ``boxA`` only
      instead of by the union of both boxes.

  Returns:
    intersection / union (or intersection / area(boxA) when ``evalCol``).
    Returns 0.0 when the denominator is not positive (degenerate boxes), instead
    of dividing by zero.
  """
  xA = max(boxA[0], boxB[0])
  yA = max(boxA[1], boxB[1])
  xB = min(boxA[2], boxB[2])
  yB = min(boxA[3], boxB[3])
  # Clamp at 0 so disjoint boxes give an empty intersection rather than a negative one
  interArea = max(0, xB - xA) * max(0, yB - yA)
  boxAArea = (boxA[2] - boxA[0]) * (boxA[3] - boxA[1])
  boxBArea = (boxB[2] - boxB[0]) * (boxB[3] - boxB[1])
  if evalCol:
    denominator = float(boxAArea)
  else:
    denominator = float(boxAArea + boxBArea - interArea)
  if denominator <= 0:
    return 0.0
  return interArea / denominator


In [19]:

def track_shot(args, sceneFaces):
  """CPU: greedily link per-frame face detections of one shot into tracks.

  One track is built per pass of the ``while`` loop: the first detection still
  present in ``sceneFaces`` seeds the track, and later detections are appended
  when they are at most ``args["numFailedDet"]`` frames after the track's last
  detection and overlap it with IOU above 0.5. Detections that join a track are
  removed from ``sceneFaces``, so the caller's lists are mutated.

  Args:
    args: configuration dict; reads "numFailedDet", "minTrack", "minFaceSize".
    sceneFaces: list (one entry per frame) of lists of detection dicts with
      'frame' and 'bbox' keys.

  Returns:
    A list of ``{'frame': frameI, 'bbox': bboxesI}`` dicts, where ``frameI`` is
    every frame index from the track's first to last detection and ``bboxesI``
    holds the four box coordinates interpolated onto those frames (one row per
    frame, four columns).
  """
  # CPU: Face tracking
  iouThres = 0.5     # Minimum IOU between consecutive face detections
  tracks = []
  while True:
    track = []
    for frameFaces in sceneFaces:
      # NOTE(review): frameFaces is modified (remove) while being iterated, so the
      # detection following a removed one is skipped in this pass; it stays in the
      # list and can be picked up by a later pass of the while loop.
      for face in frameFaces:
        if track == []:
          # First remaining detection starts a new track
          track.append(face)
          frameFaces.remove(face)
        elif face['frame'] - track[-1]['frame'] <= args["numFailedDet"]:
          iou = bb_intersection_over_union(
              face['bbox'], track[-1]['bbox'])
          if iou > iouThres:
            track.append(face)
            frameFaces.remove(face)
            continue
        else:
          # Gap to the track's last detection is too large: stop scanning this
          # frame (the outer loop over frames still continues)
          break
    if track == []:
      # Nothing left to seed a track with: all detections are consumed
      break
    elif len(track) > args["minTrack"]:
      frameNum = numpy.array([f['frame'] for f in track])
      bboxes = numpy.array([numpy.array(f['bbox']) for f in track])
      # Fill frames where detection failed by interpolating each coordinate
      frameI = numpy.arange(frameNum[0], frameNum[-1] + 1)
      bboxesI = []
      for ij in range(0, 4):
        interpfn = interp1d(frameNum, bboxes[:, ij])
        bboxesI.append(interpfn(frameI))
      bboxesI = numpy.stack(bboxesI, axis=1)
      # Keep the track only if its mean width or mean height exceeds minFaceSize
      if max(numpy.mean(bboxesI[:, 2] - bboxesI[:, 0]), numpy.mean(bboxesI[:, 3] - bboxesI[:, 1])) > args["minFaceSize"]:
        tracks.append({'frame': frameI, 'bbox': bboxesI})
  return tracks


In [20]:
def crop_video(args, track, cropFile):
  """CPU: write the face clip (video + matching audio) for one track.

  Produces ``cropFile + '.avi'`` (224x224 video at 25 fps, muxed with audio) and
  ``cropFile + '.wav'`` (16 kHz mono audio cut from ``args["audioFilePath"]``).
  The intermediate ``cropFile + 't.avi'`` is deleted at the end.

  Args:
    args: configuration dict; reads "pyframesPath", "cropScale",
      "audioFilePath" and "nDataLoaderThread".
    track: dict with 'frame' (frame indices) and 'bbox' (one x1, y1, x2, y2 box
      per frame).
    cropFile: output path without extension.

  Returns:
    ``{'track': track, 'proc_track': dets}`` where ``dets`` holds the
    median-filtered half-size 's' and centre 'x' / 'y' of the box per frame.
  """
  flist = glob.glob(os.path.join(
      args["pyframesPath"], '*.jpg'))  # Read the frames
  flist.sort()
  dets = {'x': [], 'y': [], 's': []}
  for det in track['bbox']:  # Read the tracks
    dets['s'].append(max((det[3] - det[1]), (det[2] - det[0])) / 2)  # half of the longer side
    dets['y'].append((det[1] + det[3]) / 2)  # crop center y
    dets['x'].append((det[0] + det[2]) / 2)  # crop center x
  dets['s'] = signal.medfilt(dets['s'], kernel_size=13)  # Smooth detections
  dets['x'] = signal.medfilt(dets['x'], kernel_size=13)
  dets['y'] = signal.medfilt(dets['y'], kernel_size=13)
  cs = args["cropScale"]
  vOut = cv2.VideoWriter(
      cropFile + 't.avi', cv2.VideoWriter_fourcc(*'XVID'), 25, (224, 224))  # Write video
  try:
    for fidx, frame in enumerate(track['frame']):
      bs = dets['s'][fidx]   # Detection box size
      bsi = int(bs * (1 + 2 * cs))  # Pad videos by this amount
      image = cv2.imread(flist[frame])
      # Pad with a constant grey border so crops near the image edge stay in bounds
      padded = numpy.pad(image, ((bsi, bsi), (bsi, bsi), (0, 0)),
                         'constant', constant_values=(110, 110))
      my = dets['y'][fidx] + bsi  # BBox center Y
      mx = dets['x'][fidx] + bsi  # BBox center X
      face = padded[int(my - bs):int(my + bs * (1 + 2 * cs)),
                    int(mx - bs * (1 + cs)):int(mx + bs * (1 + cs))]
      vOut.write(cv2.resize(face, (224, 224)))
  finally:
    # Release the writer even when a frame fails, so the temp file is not left open
    vOut.release()
  audioTmp = cropFile + '.wav'
  # Frame index -> seconds, using the same fixed 25 fps as the writer above
  audioStart = (track['frame'][0]) / 25
  audioEnd = (track['frame'][-1] + 1) / 25
  command = ("ffmpeg -y -i \"%s\" -async 1 -ac 1 -vn -acodec pcm_s16le -ar 16000 -threads %d -ss %.3f -to %.3f \"%s\" -loglevel panic" %
             (args["audioFilePath"], args["nDataLoaderThread"], audioStart, audioEnd, audioTmp))
  subprocess.call(command, shell=True, stdout=None)  # Crop audio file
  # Reading the clip back fails fast if ffmpeg did not produce a valid wav
  wavfile.read(audioTmp)
  command = ("ffmpeg -y -i \"%st.avi\" -i \"%s\" -threads %d -c:v copy -c:a copy \"%s.avi\" -loglevel panic" %
             (cropFile, audioTmp, args["nDataLoaderThread"], cropFile))  # Combine audio and video file
  subprocess.call(command, shell=True, stdout=None)
  os.remove(cropFile + 't.avi')
  return {'track': track, 'proc_track': dets}


In [21]:
def extract_MFCC(file, outPath):
  """CPU: compute MFCC features of a wav file and save them as ``.npy``.

  Args:
    file: path of the input ``.wav`` file.
    outPath: directory that receives ``<basename of file with .wav -> .npy>``.
  """
  sr, audio = wavfile.read(file)
  # (N_frames, 13)   [1s = 100 frames]
  mfcc = python_speech_features.mfcc(audio, sr)
  # os.path.basename handles the platform's separators (splitting on '/' alone
  # left the full directory in the name for Windows-style paths)
  featuresPath = os.path.join(
      outPath, os.path.basename(file).replace('.wav', '.npy'))
  numpy.save(featuresPath, mfcc)


In [22]:
def evaluate_network(files, args):
  """Active speaker detection with the pretrained TalkNet model (tensors kept on CPU).

  For each clip, ``<name>.wav`` and ``<name>.avi`` are read from
  ``args["pycropPath"]``. The clip is scored in chunks of 1..6 seconds and the
  per-duration scores are averaged.

  Args:
    files: paths of the cropped clips; only the base name (without extension)
      is used.
    args: configuration dict; reads "pretrainModel" and "pycropPath".

  Returns:
    A list with one array per file: the scores averaged over all chunk
    durations and rounded to one decimal.
  """
  s = talkNet()
  s.loadParameters(args["pretrainModel"])
  sys.stderr.write("Model %s loaded from previous state! \r\n" %
                   args["pretrainModel"])
  s.eval()
  allScores = []
  # Chunk lengths in seconds. This is a set, so the repeated values in the
  # original literal {1,1,1,2,2,2,3,3,4,5,6} collapsed to these six; each
  # duration is therefore evaluated (and weighted) exactly once.
  durationSet = {1, 2, 3, 4, 5, 6}
  for file in tqdm.tqdm(files, total=len(files)):
    # Base name without directory or extension, independent of the path separator
    fileName = os.path.splitext(os.path.basename(file))[0]
    _, audio = wavfile.read(os.path.join(args["pycropPath"], fileName + '.wav'))
    audioFeature = python_speech_features.mfcc(
        audio, 16000, numcep=13, winlen=0.025, winstep=0.010)
    # Open the clip once, from the crop directory (not relative to the cwd)
    video = cv2.VideoCapture(os.path.join(args["pycropPath"], fileName + '.avi'))
    videoFeature = []
    try:
      while video.isOpened():
        ret, frames = video.read()
        if not ret:
          break
        face = cv2.cvtColor(frames, cv2.COLOR_BGR2GRAY)
        face = cv2.resize(face, (224, 224))
        # Keep the central 112x112 region of the 224x224 frame
        face = face[int(112 - (112 / 2)):int(112 + (112 / 2)),
                    int(112 - (112 / 2)):int(112 + (112 / 2))]
        videoFeature.append(face)
    finally:
      video.release()
    videoFeature = numpy.array(videoFeature)
    # Common length in SECONDS: audio has 100 feature frames per second, video 25
    # frames per second (the video term was previously left in frames)
    length = min(
        (audioFeature.shape[0] - audioFeature.shape[0] % 4) / 100, videoFeature.shape[0] / 25)
    audioFeature = audioFeature[:int(round(length * 100)), :]
    videoFeature = videoFeature[:int(round(length * 25)), :, :]
    allScore = []  # Evaluation use TalkNet
    for duration in durationSet:
      batchSize = int(math.ceil(length / duration))
      scores = []
      with torch.no_grad():
        for i in range(batchSize):
          inputA = torch.FloatTensor(
              audioFeature[i * duration * 100:(i + 1) * duration * 100, :]).unsqueeze(0).cpu()
          inputV = torch.FloatTensor(
              videoFeature[i * duration * 25: (i + 1) * duration * 25, :, :]).unsqueeze(0).cpu()
          embedA = s.model.forward_audio_frontend(inputA)
          embedV = s.model.forward_visual_frontend(inputV)
          embedA, embedV = s.model.forward_cross_attention(
              embedA, embedV)
          out = s.model.forward_audio_visual_backend(embedA, embedV)
          score = s.lossAV.forward(out, labels=None)
          scores.extend(score)
      allScore.append(scores)
    allScore = numpy.round(
        (numpy.mean(numpy.array(allScore), axis=0)), 1).astype(float)
    allScores.append(allScore)
  return allScores


In [23]:
def visualization(tracks, scores, args):
  """CPU: render the per-face speaking scores onto the frames and mux with audio.

  Writes ``video_only.avi`` (25 fps, annotated frames) and ``video_out.avi``
  (the same video combined with ``audio.wav``) into ``args["pyaviPath"]``.

  Args:
    tracks: list of ``{'track': ..., 'proc_track': ...}`` dicts as returned by
      ``crop_video``.
    scores: one score sequence per track, in the same order as ``tracks``.
    args: configuration dict; reads "pyframesPath", "pyaviPath" and
      "nDataLoaderThread".
  """
  flist = glob.glob(os.path.join(args["pyframesPath"], '*.jpg'))
  flist.sort()
  faces = [[] for i in range(len(flist))]
  for tidx, track in enumerate(tracks):
    score = scores[tidx]
    for fidx, frame in enumerate(track['track']['frame'].tolist()):
      # Average smoothing over up to 5 scores centred on fidx. Slicing already
      # clamps at the end of the sequence; the former "len(score) - 1" bound
      # dropped the last score and left a one-score track with an empty window.
      s = score[max(fidx - 2, 0): fidx + 3]
      s = numpy.mean(s)
      faces[frame].append({'track': tidx, 'score': float(s), 's': track['proc_track']['s']
                          [fidx], 'x': track['proc_track']['x'][fidx], 'y': track['proc_track']['y'][fidx]})
  firstImage = cv2.imread(flist[0])
  fw = firstImage.shape[1]
  fh = firstImage.shape[0]
  vOut = cv2.VideoWriter(os.path.join(args["pyaviPath"], 'video_only.avi'), cv2.VideoWriter_fourcc(*'XVID'), 25, (fw, fh))
  # Score >= 0 -> green box, otherwise red (colour tuples are in BGR order)
  colorDict = {0: 0, 1: 255}
  try:
    for fidx, fname in tqdm.tqdm(enumerate(flist), total=len(flist)):
      image = cv2.imread(fname)
      for face in faces[fidx]:
        clr = colorDict[int((face['score'] >= 0))]
        txt = round(face['score'], 1)
        cv2.rectangle(image, (int(face['x'] - face['s']), int(face['y'] - face['s'])), (int(
            face['x'] + face['s']), int(face['y'] + face['s'])), (0, clr, 255 - clr), 10)
        cv2.putText(image, '%s' % (txt), (int(face['x'] - face['s']), int(
            face['y'] - face['s'])), cv2.FONT_HERSHEY_SIMPLEX, 1.5, (0, clr, 255 - clr), 5)
      vOut.write(image)
  finally:
    # Release the writer even if a frame fails, so the output file is closed
    vOut.release()
  command = ("ffmpeg -y -i \"%s\" -i \"%s\" -threads %d -c:v copy -c:a copy \"%s\" -loglevel panic" %
             (os.path.join(args["pyaviPath"], 'video_only.avi'), os.path.join(args["pyaviPath"], 'audio.wav'),
              args["nDataLoaderThread"], os.path.join(args["pyaviPath"], 'video_out.avi')))
  subprocess.call(command, shell=True, stdout=None)


In [27]:
# Lay out the per-video working tree under savePath
args.update({
  "pyaviPath": os.path.join(args["savePath"], 'pyavi'),        # input video, input audio, output video
  "pyframesPath": os.path.join(args["savePath"], 'pyframes'),  # all the extracted video frames
  "pyworkPath": os.path.join(args["savePath"], 'pywork'),      # intermediate results saved as .pckl
  "pycropPath": os.path.join(args["savePath"], 'pycrop'),      # detected face clips (audio + video)
})

# Start from an empty tree: anything left by a previous run is deleted first
if os.path.exists(args["savePath"]):
  rmtree(args["savePath"])
os.makedirs(args["pyaviPath"], exist_ok=True)
os.makedirs(args["pyframesPath"], exist_ok=True)
os.makedirs(args["pyworkPath"], exist_ok=True)
os.makedirs(args["pycropPath"], exist_ok=True)

# Path of the re-encoded working copy of the input video
args["videoFilePath"] = os.path.join(args["pyaviPath"], 'video.avi')


In [28]:
# Re-encode the input to a 25 fps working copy. With duration == 0 the whole video is
# converted; otherwise only args["start"] .. args["start"] + args["duration"] is kept.
if args["duration"] != 0:
  command = ("ffmpeg -y -i \"%s\" -qscale:v 2 -threads %d -ss %.3f -to %.3f -async 1 -r 25 \"%s\" -loglevel panic" %
              (args["videoPath"], args["nDataLoaderThread"], args["start"], args["start"] + args["duration"], args["videoFilePath"]))
else:
  command = ("ffmpeg -y -i \"%s\" -qscale:v 2 -threads %d -async 1 -r 25 \"%s\" -loglevel panic" %
              (args["videoPath"], args["nDataLoaderThread"], args["videoFilePath"]))
subprocess.call(command, shell=True, stdout=None)
# Binding the result to x keeps the byte count returned by write() out of the cell output
x = sys.stderr.write("".join([time.strftime("%Y-%m-%d %H:%M:%S"),
                              " Extract the video and save in %s \r\n" % (args["videoFilePath"])]))


2022-02-05 15:31:57 Extract the video and save in E:\Projects\Style-based Video Editor\src\server\temp\scenes\1.aa\pyavi\video.avi 


133

In [29]:
# Pull a 16 kHz mono wav out of the working video copy
args["audioFilePath"] = os.path.join(args["pyaviPath"], 'audio.wav')
command = "ffmpeg -y -i \"%s\" -qscale:a 0 -ac 1 -vn -threads %d -ar 16000 \"%s\" -loglevel panic" % (
    args["videoFilePath"], args["nDataLoaderThread"], args["audioFilePath"])
subprocess.call(command, shell=True, stdout=None)
# Binding the result to x keeps the byte count returned by write() out of the cell output
x = sys.stderr.write("".join([time.strftime("%Y-%m-%d %H:%M:%S"),
                              " Extract the audio and save in %s \r\n" % (args["audioFilePath"])]))


2022-02-05 15:31:57 Extract the audio and save in E:\Projects\Style-based Video Editor\src\server\temp\scenes\1.aa\pyavi\audio.wav 


133

In [30]:
# Dump every frame of the working video as a numbered jpg (ffmpeg expands the %06d pattern)
command = "ffmpeg -y -i \"%s\" -qscale:v 2 -threads %d -f image2 \"%s\" -loglevel panic" % (
    args["videoFilePath"], args["nDataLoaderThread"], os.path.join(args["pyframesPath"], '%06d.jpg'))
subprocess.call(command, shell=True, stdout=None)
# Binding the result to x keeps the byte count returned by write() out of the cell output
x = sys.stderr.write("".join([time.strftime("%Y-%m-%d %H:%M:%S"),
                              " Extract the frames and save in %s \r\n" % (args["pyframesPath"])]))

2022-02-05 15:31:58 Extract the frames and save in E:\Projects\Style-based Video Editor\src\server\temp\scenes\1.aa\pyframes 


127

In [31]:
# Scene detection for the video frames
scene = scene_detect(args)
# Bind the write() result like the other pipeline cells do, so the byte count is not
# echoed as this cell's output
x = sys.stderr.write(time.strftime("%Y-%m-%d %H:%M:%S") + " Scene detection and save in %s \r\n" % (args["pyworkPath"]))

 90%|█████████ | 210/233 [00:02<00:00, 103.17frames/s]
E:\Projects\Style-based Video Editor\src\server\temp\scenes\1.aa\pyavi\video.avi - scenes detected 1
2022-02-05 15:32:00 Scene detection and save in E:\Projects\Style-based Video Editor\src\server\temp\scenes\1.aa\pywork 


122

In [32]:
# Detect faces on every extracted frame (one list of detections per frame)
faces = inference_video(args)
# Binding the result to x keeps the byte count returned by write() out of the cell output
x = sys.stderr.write("".join([time.strftime("%Y-%m-%d %H:%M:%S"),
                              " Face detection and save in %s \r\n" % (args["pyworkPath"])]))


2022-02-05 15:33:31 Face detection and save in E:\Projects\Style-based Video Editor\src\server\temp\scenes\1.aa\pywork 


121

In [33]:
# Face tracking: link the detections of each shot into tracks.
# NOTE(review): the slices passed to track_shot share their inner per-frame lists with
# `faces`, and track_shot removes the detections it consumes, so `faces` is modified by
# this cell; re-running it without re-running face detection gives different tracks.
allTracks = []
for shot in scene:
  # Skip shots spanning fewer than minTrack frames
  if shot[1].frame_num - shot[0].frame_num >= args["minTrack"]:
    # Each returned track holds 'frame' (the frame indices it covers) and 'bbox' (the face box per frame)
    allTracks.extend(track_shot(args, faces[shot[0].frame_num:shot[1].frame_num]))
x = sys.stderr.write(time.strftime("%Y-%m-%d %H:%M:%S") + " Face track and detected %d tracks \r\n" % len(allTracks))


2022-02-05 15:33:31 Face track and detected 1 tracks 


55

In [34]:
# Face clips cropping: write one clip per track into pycropPath, named by the
# zero-padded track index (00000, 00001, ...), and keep crop_video's result per track.
vidTracks = []
for ii, track in tqdm.tqdm(enumerate(allTracks), total=len(allTracks)):
  vidTracks.append(crop_video(args, track, os.path.join(args["pycropPath"], '%05d' % ii)))
# Binding the result to x keeps the byte count returned by write() out of the cell output
x =sys.stderr.write(time.strftime("%Y-%m-%d %H:%M:%S") + " Face Crop and saved in %s tracks \r\n" % args["pycropPath"])


100%|██████████| 1/1 [00:03<00:00,  3.83s/it]
2022-02-05 15:33:35 Face Crop and saved in E:\Projects\Style-based Video Editor\src\server\temp\scenes\1.aa\pycrop tracks 


In [35]:
# Score every cropped clip with TalkNet, in sorted filename order so that scores[i]
# lines up with vidTracks[i], then render the annotated output video
files = sorted(glob.glob("%s/*.avi" % args["pycropPath"]))
scores = evaluate_network(files, args)
sys.stderr.write("".join([time.strftime("%Y-%m-%d %H:%M:%S"),
                          " Scores extracted and saved in %s \r\n" % args["pyworkPath"]]))

visualization(vidTracks, scores, args)

Model pretrain_TalkSet.model loaded from previous state! 


02-05 15:33:36 Model para number = 15.01


100%|██████████| 1/1 [00:09<00:00,  9.42s/it]
2022-02-05 15:33:45 Scores extracted and saved in E:\Projects\Style-based Video Editor\src\server\temp\scenes\1.aa\pywork 
100%|██████████| 233/233 [00:05<00:00, 45.53it/s]


In [52]:
# Scratch inspection of `track`, a loop variable leaked by an earlier cell.
# NOTE(review): which cell bound it depends on execution order; the output below has the
# {'track': ..., 'proc_track': ...} layout of a vidTracks entry.
track

{'track': {'frame': array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
          13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
          26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
          39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
          52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
          65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
          78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
          91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
         104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
         117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
         130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
         143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
         156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
         169, 170, 1

In [71]:
# Tally the score of the first face in every frame that has at least one face.
# NOTE(review): this expects `faces` to be the per-frame overlay list whose entries carry
# a 'score' key (built by the visualisation cells), not the raw detector output.
scoreSum, count = 0, 0  # named scoreSum so the builtin sum() is not shadowed in the kernel
less, more = 0, 0       # frames whose first-face score is <= 0 / > 0
sores = []
for face in faces:
  if len(face) > 0:
    sores.append(face[0]["score"])
    scoreSum += face[0]["score"]
    if face[0]["score"] > 0:
      more += 1
    else:
      less += 1
    count += 1


In [72]:
# Show the tally: (frames with first-face score <= 0, frames with score > 0)
less , more

(88, 122)

In [None]:
# CPU: collect, for every frame, the smoothed score and crop geometry of each tracked face.
# NOTE(review): this repeats the first half of visualization() and rebinds the global
# `faces`, replacing the detector output held under the same name.
flist = glob.glob(os.path.join(args["pyframesPath"], '*.jpg'))
flist.sort()
faces = [[] for i in range(len(flist))]
for tidx, track in enumerate(vidTracks):
  score = scores[tidx]
  for fidx, frame in enumerate(track['track']['frame'].tolist()):
    # Average smoothing over up to 5 scores centred on fidx. Slicing already clamps at
    # the end of the sequence; the former "len(score) - 1" bound dropped the last score.
    s = score[max(fidx - 2, 0): fidx + 3]
    s = numpy.mean(s)
    faces[frame].append({'track': tidx, 'score': float(s), 's': track['proc_track']['s']
                        [fidx], 'x': track['proc_track']['x'][fidx], 'y': track['proc_track']['y'][fidx]})

In [56]:
# Overlay entries for the first frame: track index, smoothed score, half-size 's', centre 'x' / 'y'
faces[0]

[{'track': 0,
  'score': -2.1333332856496177,
  's': 168.2314338684082,
  'x': 968.2450866699219,
  'y': 272.84931564331055}]

In [45]:
# CPU: visualize the result as a video.
# NOTE(review): this is an inlined copy of visualization(vidTracks, scores, args). It is
# kept because it leaves `faces` and the other intermediates in the notebook namespace,
# and it rebinds the global `faces`, replacing the detector output held under that name.
flist = glob.glob(os.path.join(args["pyframesPath"], '*.jpg'))
flist.sort()
faces = [[] for i in range(len(flist))]
for tidx, track in enumerate(vidTracks):
  score = scores[tidx]
  for fidx, frame in enumerate(track['track']['frame'].tolist()):
    # Average smoothing over up to 5 scores centred on fidx. Slicing already clamps at
    # the end of the sequence; the former "len(score) - 1" bound dropped the last score.
    s = score[max(fidx - 2, 0): fidx + 3]
    s = numpy.mean(s)
    faces[frame].append({'track': tidx, 'score': float(s), 's': track['proc_track']['s']
                        [fidx], 'x': track['proc_track']['x'][fidx], 'y': track['proc_track']['y'][fidx]})
firstImage = cv2.imread(flist[0])
fw = firstImage.shape[1]
fh = firstImage.shape[0]
vOut = cv2.VideoWriter(os.path.join(args["pyaviPath"], 'video_only.avi'), cv2.VideoWriter_fourcc(*'XVID'), 25, (fw, fh))
# Score >= 0 -> green box, otherwise red (colour tuples are in BGR order)
colorDict = {0: 0, 1: 255}
try:
  for fidx, fname in tqdm.tqdm(enumerate(flist), total=len(flist)):
    image = cv2.imread(fname)
    for face in faces[fidx]:
      clr = colorDict[int((face['score'] >= 0))]
      txt = round(face['score'], 1)
      cv2.rectangle(image, (int(face['x'] - face['s']), int(face['y'] - face['s'])), (int(
          face['x'] + face['s']), int(face['y'] + face['s'])), (0, clr, 255 - clr), 10)
      cv2.putText(image, '%s' % (txt), (int(face['x'] - face['s']), int(
          face['y'] - face['s'])), cv2.FONT_HERSHEY_SIMPLEX, 1.5, (0, clr, 255 - clr), 5)
    vOut.write(image)
finally:
  # Release the writer even if a frame fails, so the output file is closed
  vOut.release()
# Mux the annotated video with the extracted audio track
command = ("ffmpeg -y -i \"%s\" -i \"%s\" -threads %d -c:v copy -c:a copy \"%s\" -loglevel panic" %
            (os.path.join(args["pyaviPath"], 'video_only.avi'), os.path.join(args["pyaviPath"], 'audio.wav'),
            args["nDataLoaderThread"], os.path.join(args["pyaviPath"], 'video_out.avi')))
output = subprocess.call(command, shell=True, stdout=None)

100%|██████████| 233/233 [00:05<00:00, 45.74it/s]
