In [1]:
import os
import random
import datetime
import argparse
import time
import numpy as np
from torchvision import models
import torch
import torch.nn as nn
from facenet_pytorch import InceptionResnetV1, MTCNN
import random
import dlib
import cv2
import imutils
from imutils.video import VideoStream
from imutils import face_utils
from moviepy.editor import *

## Crosscut Class 구현

In [20]:
class Crosscut:
    # 교차 편집에 필요한 변수 초기화
    def __init__(self, dist_obj, video_path, output_path):
        self.videos_path = video_path
        self.output_path = output_path #mp4 파일
        self.min_time = 1000.0
        video_num = len(os.listdir(self.videos_path))
        self.start_times = [0]*video_num
        self.window_time = 10
        self.padded_time = 4
        self.dist_obj = dist_obj
        self.audioclip = None
        self.extracted_clips_array = []
        self.con_clips = []
    
    # 사용할 무대영상의 시작점 정렬하기
    def video_alignment(self):
        for i in range(len(os.listdir(self.videos_path))):
            video_path = os.path.join(self.videos_path, sorted(os.listdir(self.videos_path))[i])
            clip = VideoFileClip(video_path)
            clip = clip.subclip(self.start_times[i], clip.duration)
            if self.min_time > clip.duration:
                self.audioclip = clip.audio
                self.min_time = clip.duration
            self.extracted_clips_array.append(clip)
        print("LOGGER-- {} Video Will Be Mixed".format(len(self.extracted_clips_array)))
            

    # 다음 영상 선택하기
    def select_next_clip(self, t, current_idx):
        # 거리 측정에 필요한 변수 초기화하기
        cur_t = t
        next_t = min(t + self.window_time, self.min_time)
        
        reference_clip = self.extraced_clips_array[current_idx].subclip(cur_t, next_t)
        d = float("Inf")
        cur_clip = None
        min_idx = (current_idx+1)%len(self.extracted_clips_array)
        
        # 비교 영상들과 연재 영상의 거리 측정하기
        for video_idx in range(len(self.extracted_clips_array)):
            if video_idx == current_idx:
                continue
            clip = self.extracted_clips_array[video_idx].subclip(cur_t, next_t)
            cur_d, plus_frame = self.dist_obj.distance(reference_clip, clip)
            print(current_idx, video_idx, cur_d, cur_t + plus_frame)
            if d > cur_d:
                d = cur_d
                min_idx = video_idx
                next_t = cur_t + plus_frame
                cur_clip = reference_clip.subclip(0, plus_frame)
                
        # 다음 교차편집 지점 전까지 현재 영상 저장하기
        if cur_clip:
            clip = cur_clip
        else:
            clip = reference_clip
        self.con_clips.append(clip)
        
        t = next_t
        return t, min_idx
            
    # 선택한 다음 영상의 padding 추가하기
    def add_padding(self, t, next_idx):
        print("idx : {}".format(next_idx))
        pad_clip = self.extracted_clips_array[next_idx].subclip(t, min(self.min_tiem, t+self.padded_time))
        return t, next_idx
    
    # 교차편집 결과물 저장하기
    def write_video(self):
        final_clip = concatenate_videoclips(self.con_clips)
        if self.audioclip != None:
            print("Not None")
            final_clip.audio = self.audioclip
        final_clip.write_videofile(self.output_path)
        return final_clip
    
    # 교차편집 만들기(메인 함수)
    def generate_video(self):
        # 영상 전처리하기
        self.video_alignment()
        t = 3
        current_idx = 0
        self.con_clips.append(self.extracted_clips_array[current_idx].subclip(0, min(t, int(self.min_time))))
        # 노래 끝까지 교차 편집 만들기
        while t < int(self.min_time):
            t, min_idx = self.select_next_clip(t, current_idx)
            t, current_idx = self.add_padding(t,min_idx)
        # 교차 편집 결과 영상 저장하기
        final_clip = self.write_video()
        return final_clip

## Distance Class 구현

### RandomDistance Class

In [14]:
class RandomDistance:
    def distance(self, reference_clip, compare_clip):
        dur_end = min(reference_clip.duration, compare_clip.duration)
        return random.randrange(1,100), min(dur_end, random.randrange(3,7))

### FaceDistance Class

In [15]:
class FaceDistance:
    def __init__(self, shape_predictor_path, face_embedding_penalty=None):
        self.skip_frame_rate = 4 # 거리를 계산할 frame 주기
        self.minimax_frames = 5 # 정해진 편집점 주변 중 가장 자연러운 지점을 찾기 위해 둘러볼 주변 프레임 
        self.shape_predictor = shape_predictor_path # openCV model
        self.face_embedding_pealty = face_embedding_penalty # 얼굴이 다를 때 더해지는 penalty
        
    def extract_lamdmark(self, reference_clip, compare_clip):
        # 영상 저장 및 face landmark detect model 불러오기
        self.clips = [reference_clip, compare_clip]
        detector = dlib.get_frontal_face_detector()
        predictor = dlib.shape_predictor(self.shape_predictor)
        clips_frame_info = []
        for clip in self.clips:
            # 각 영상의 정보를 저장하기 위해 loop마다 초기화 하기
            i = 0
            every_frame_info=[]
            while True:
                # 각 영상에서 face Landmark 얻기
                frame = clip.get_frame(i*1.0/clip.fps)
                i += self.skip_frame_rate
                if (i*1.0/clip.fps) > clip.duration:
                    break
                frame = imutils.resize(frame, width=800)
                gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
                rects = detector(gray, 0)
                # 얻은 face landmark를 가공해서 every_frame_info에 저장하기
                if len(rects) > 0:
                    max_width = 0
                    max_rect = None
                    for rect in rects:
                        if int(rects[0].width()) > max_width:
                            max_rect = rect
                    shape = predictor(gray, max_rect)
                    shape = face_utils.shape_to_np(shape)
                    every_frame_info.append(shape)
                else:
                    every_frame_info.append([])
            # 영상 frame별 landmark 정보를 clips_frame_info에 저장하기
            clips_frame_info.append(np.array(every_frame_info))
        cv2.destroyAllWindows()
        return clips_frame_info
        
    def embedding_cosine_distance(self, reference_frame, compare_frame):
        face_detector = MTCNN(select_largest=True)
        embed_model = InceptionResnetV1(pretrained='vggface2').eval()
        
        reference_frame = np.array(reference_frame)
        compare_frame = np.array(compare_frame)
        
        try:
            reference_frame_detected = face_detector(reference_frame)
            compare_frame_detected = face_detector(compare_frame)
        except:
            cosine_dist = 1
            return cosine_dist
        
        reference_frame_embed = embed_model(reference_frame_detected.unsqueeze(0)).detach().numpy()
        compare_frame_embed = embed_model(compare_frame_detected.unsqueeze(0)).detach().numpy()
        reference_frame_embed = np.squeeze(compare_frame_embed)
        cosine_dist = 1 - np.dot(reference_frame_embed, compare_frame_embed) / (np.linalg.norm(reference_frame_embed)) * (np.linalg.norm(compare_frame_embed))
        
        return cosine_dist
    
    def get_all_frame_distance(self, clips_frame_info, min_size):
        dist_arr = []
        for i in range(min_size-1):
            if len(clips_frame_info[0][i] > 0 and len(clips_frame_info[1][i+1]) > 0):
                l = 36
                r = 45
                left_eye = ((clips_frame_info[0][i][l][0] - clips_frame_info[1][i+1][l][0])**2
                             + (clips_frame_info[0][i][l][1] - clips_frame_info[1][i+1][l][1])**2)**0.5
                right_eye = ((clips_frame_info[0][i][r][0] - clips_frame_info[1][i+1][r][0])**2 
                             + (clips_frame_info[0][i][r][1] - clips_frame_info[1][i+1][r][1])**2)**0.5
                total_diff = left_eye + right_eye
                dist_arr.append(total_diff)
            else:
                dist_arr.append(None)
        return dist_arr
    
    def distance(self, reference_clip, compare_clip):
        # 거리 계산에 필요한 정보들 먼저 얻기
        clips_frame_info = self.extract_lamdmark(reference_clip, compare_clip)
        min_size = min(len(clips_frame_info[0]), len(clips_frame_info[1]))
        dist_arr = self.get_all_frame_distance(clips_frame_info, min_size)
        clips = [reference_clip, compare_clip]
        minimax_frames = self.minimax_frames
        min_diff = np.float('Inf')
        min_idx = 0
        
        # 최소 거리가 되는 영상과 시간 찾기
        for i in range(min_size - (minimax_frames-1)):
            start_minmax_idx = 0 if (i - minimax_frames)<0 else i - minimax_frames
            
            if (None not in dist_arr[start_minmax_idx:i+minimax_frames]):
                tmp_max = np.max(dist_arr[start_minmax_idx:i+minimax_frames])
                if min_diff > tmp_max:
                    min_diff = tmp_max
                    min_idx = i
                    
        # Face Embedding Penalty 추가하기
        if self.face_embedding_penalty != None and min_diff < np.float("Inf"):
            ref_frame = reference_clip.gget_frame(min_idx*1.0/reference_clip.fps)
            frame = compare_clip.get_frame(min_idx * 1.0/compare_clip.fps)
            cosine_dist = self.embedding_cosine_distance(ref_frame, frame)
            min_diff += consine_dist * self.face_embedding_penalty
            
        # 두 영상 간의 최소 거리 정보 Return
        return min_diff, (min_idx*self.skip_frame_rate)/self.clips[0].fps

### PoseDistance Class

In [16]:
class PoseDistance:
    def __init__(self):
        self.SKIP_FRAME_RATE = 10
        self.MINIMAX_FRAME = 4
        self. model = models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
        self.model.eval()
        os.environ['KMP_DUPLICATE_LIB_OK'] = "True"

    # 가수 위치를 파악하는 함수
    def extract_boxes(self, reference_clip, compare_clip):
        #변수 초기화
        self.clips = [reference_clip, compare_clip]
        clips_frame_info = []

        for clip in self.clips:
            # 각 영상의 정보를 저장하기 위해 loop마다 초기화하기
            i = 0
            every_frame_info = []
            while True:
                # Faster RCNN을 이용해 물체 판별하기
                i+=self.SKIP_FRAME_RATE
                if (i*1.0/clip.fps) > clip.duration:
                    break

                frame = clip.get_frame(i*1.0/clip.fps)
                frame = imutils.resize(frame, width=640)
                frame = frame/255
                frame = np.transpose(frame, (2,0,1))
                x = [torch.from_numpy(frame).float()]
                predictions = self.model(x)
                prediction = predictions[0]
                # 판별정보 재가공하기
                each_box_list = zip(prediction['boxes'].tolis(), prediction['labels'].tolist(), prediction['scores'].tolist())
                filtered_box_list = filter(lambda x: x[1]==1 and x[2] >= 0.95, each_box_list)
                filtered_center_dot_list = list(map(lambda x:[(x[0][0]+x[0][2])/2, (x[0][1]+x[0][3])/2], filtered_box_list))
                sorted_dot_list = sorted(filtered_center_dot_list, key = lambda x:x[0])

                # 재가공한 정보 every_frame_info에 저장하기
                every_frame_info.append(sorted_dot_list)
            # 영상 frame별 landmark 정보 clips_frame_info에 저장하기
            clips_frame_info.append(np.array(every_frame_info))
        return clips_frame_info

    # 두 영상의 frame 거리를 계산하는 함수
    def get_all_frame_distance(self, clips_frame_info, min_size):
        dist_arr = list()
        for i in range(min_size):
            if len(clips_frame_info[0][i]) > 0 and len(clips_frame_info[1][i]) > 0:
                ref_frame_dots = clips_frame_info[0][i]
                compare_frame_dots = clips_frame_info[1][i]
                min_dot_num = min(len(ref_frame_dots), len(compare_frame_dots))
                dot_num_diff = abs(len(ref_frame_dots) - len(compare_frame_dots))
                panalty = ((self.clips[0].w **2 + self.clips[0].h**2)**0.5) * abs(len(res_frame_dots) - len(compare_frame_dots))
                total_diff = penalty*dot_num_diff

                for dot_idx in range(min_dot_num):
                    total_diff += ((ref_frame_dots[dot_idx][0] - compare_frame_dots[dot_idx][0])**2 + (ref_frame_dots[dot_idx][1] - compare_frame_dots[dot_idx][1])**2)**0.5

                dist_arr.append(total_diff)
            else:
                dist_arr.append(None)
        return dist_arr

    # 거리 측정 함수
    def distance(self, reference_clip, compare_clip):

        # 거리 계산에 필요한 정보들 얻기
        clips_frame_info = self.extract_boxes(reference_clip, compare_clip)
        min_size = min(len(clips_frame_info[0]), len(clips_frame_info[1]))
        dist_arr = self.get_all_frame_distance(clips_frame_info, min_size)
        min_diff = np.float('Inf')
        min_idx = 0

        for i in range(min_size-(self.MINIMAX_FRAME-1)):
            start_minmax_idx = 0 if (i - self.MINIMAX_FRAME) < 0 else i - self.MINIMAX_FRAME
            if (None not in dist_arr[start_minmax_idx : i + self.MINIMAX_FRAME]):
                tmp_max = np.max(dist_arr[i:i+self.MINIMAX_FRAME])
            if min_diff > tmp_max:
                min_diff = tmp_max
                min_idx = i
        return min_diff, (min_idx*self.SKIP_FRAME_RATE)/reference_clip.fps

## 교차 편집 실행 코드

In [24]:
method = 'face'
video_path = '../data/fifth_season'
output_path = '../data/my_stagemix.mp4'
shape_predictor_path = '../data/shape_predictor_68_face_landmarks.dat'
face_embedding_panalty = 100

print(output_path)
if method == 'random':
    distance = RandomDistance()
    
elif method == 'face':
    distance = FaceDistance(shape_predictor_path, face_embedding_panalty)

elif method == 'pose':
    distance = PoseDistance()
    
cross_cut = Crosscut(distance, video_path, output_path)
cross_cut.generate_video()

../data/my_stagemix.mp4
LOGGER-- 0 Video Will Be Mixed


IndexError: list index out of range