In [7]:
import cv2
import dlib
import numpy as np
import os
import time

In [None]:
def get_rotation_matrix(landmarks):
    """Build the rotation that levels the line through landmarks 48 and 54.

    Args:
        landmarks: object exposing ``part(i)`` whose result has ``x`` and ``y``
            attributes. Presumably a dlib 68-point shape, where points 48 and
            54 would be the mouth corners -- confirm against the predictor used.

    Returns:
        A ``(matrix, center)`` pair: the result of ``cv2.getRotationMatrix2D``
        (scale 1.0) for the angle of the 48 -> 54 line, and the integer
        ``(x, y)`` midpoint of the two points, which is the rotation centre.
    """
    # Row 0 is point 48, row 1 is point 54; columns are (x, y).
    corners = np.array([
        [landmarks.part(48).x, landmarks.part(48).y],
        [landmarks.part(54).x, landmarks.part(54).y],
    ])

    dx, dy = corners[1] - corners[0]
    angle = np.degrees(np.arctan2(dy, dx))

    # Floor-divided midpoint, converted to plain Python ints for OpenCV.
    mid_x, mid_y = (corners[0] + corners[1]) // 2
    center = (int(mid_x), int(mid_y))

    rotation = cv2.getRotationMatrix2D(center, angle, 1.0)
    return rotation, center

In [None]:
def _crop_centered(image, center, size):
    """Cut a ``size`` x ``size`` window centred on ``center`` out of a 2-D image.

    Any part of the window that falls outside ``image`` is left as zeros, so the
    result always has shape ``(size, size)`` and dtype ``uint8``.
    """
    x_min = center[0] - size // 2
    y_min = center[1] - size // 2

    window = np.zeros((size, size), dtype=np.uint8)

    # Clamp against the image that is actually sliced, not some other frame.
    image_height, image_width = image.shape[:2]
    src_x_min = max(x_min, 0)
    src_y_min = max(y_min, 0)
    src_x_max = min(x_min + size, image_width)
    src_y_max = min(y_min + size, image_height)

    # Window entirely outside the image: nothing to copy, return the black frame.
    if src_x_max <= src_x_min or src_y_max <= src_y_min:
        return window

    # Offset inside the window where the visible part of the image starts.
    dst_x_min = src_x_min - x_min
    dst_y_min = src_y_min - y_min
    dst_x_max = dst_x_min + (src_x_max - src_x_min)
    dst_y_max = dst_y_min + (src_y_max - src_y_min)

    window[dst_y_min:dst_y_max, dst_x_min:dst_x_max] = image[src_y_min:src_y_max, src_x_min:src_x_max]
    return window


def process_vid(face_detector, landmark_predictor, input_video_path, output_video_path, fps, width, height, output_size):
    """Write a grayscale, rotation-aligned, square crop video from an input video.

    Every frame is converted to grayscale and passed to ``face_detector``. For
    each detected face, the frame is rotated with the matrix from
    ``get_rotation_matrix`` and an ``output_size`` x ``output_size`` window
    centred on the rotation centre is written to the output video.

    Note: one output frame is written per detected face, and frames with no
    detection are skipped, so the output frame count can differ from the input.

    Args:
        face_detector: callable taking a grayscale frame and returning an
            iterable of face regions (e.g. ``dlib.get_frontal_face_detector()``).
        landmark_predictor: callable taking ``(gray_frame, face)`` and returning
            the landmarks object expected by ``get_rotation_matrix``.
        input_video_path: path of the video to read.
        output_video_path: path of the video to write; its parent directory is
            created if missing.
        fps: frame rate given to the video writer.
        width: width of the rotated frame produced by ``cv2.warpAffine``.
        height: height of the rotated frame produced by ``cv2.warpAffine``.
        output_size: side length in pixels of the square output frames.
    """
    output_dir = os.path.dirname(output_video_path)
    # dirname is '' for a bare filename; os.makedirs('') would raise.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    vid_capture = cv2.VideoCapture(input_video_path)
    out = None
    try:
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(output_video_path, fourcc, fps, (output_size, output_size), isColor=False)

        while vid_capture.isOpened():
            ret, frame = vid_capture.read()
            if not ret:
                break

            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

            for face in face_detector(gray):
                landmarks = landmark_predictor(gray, face)
                M, center = get_rotation_matrix(landmarks)

                # The rotated frame has shape (height, width), which may differ
                # from gray.shape; the crop helper clamps against the rotated frame.
                gray_aligned = cv2.warpAffine(gray, M, (width, height))

                out.write(_crop_centered(gray_aligned, center, output_size))
    finally:
        # Release the OS handles even if detection or writing raised.
        vid_capture.release()
        if out is not None:
            out.release()

In [None]:
# Attempt to normalize mouth size. The result doesn't look great, but it may be worth revisiting later.
# This would go right after gray_aligned is created in process_vid.
# lip_points = np.array([[landmarks.part(i).x, landmarks.part(i).y] for i in range(48, 61)])
# lip_points = np.hstack([lip_points, np.ones((lip_points.shape[0], 1))])
# rotated_lip_points = M.dot(lip_points.T).T
# x_min, y_min = np.min(rotated_lip_points, axis=0)[:2].astype(int)
# x_max, y_max = np.max(rotated_lip_points, axis=0)[:2].astype(int)

# width_lips = x_max - x_min
# height_lips = y_max - y_min

# scale_x = (output_size * 0.5) / width_lips
# scale_y = (output_size * 0.25) / height_lips

# new_crop_width = output_size // scale_x
# new_crop_height = output_size // scale_y

# x_min = int(center[0] - new_crop_width // 2)
# y_min = int(center[1] - new_crop_height // 2)
# x_max = int(x_min + new_crop_width)
# y_max = int(y_min + new_crop_height)


# cropped_frame = gray_aligned[y_min:y_max, x_min:x_max]
# resized_frame = cv2.resize(cropped_frame, (output_size, output_size))

# out.write(resized_frame)

In [None]:
# Dataset locations: raw clips are read from source_root and the processed
# clips are written under output_root.
source_root, output_root = 'selected_mp4_files', 'processed_selected_mp4_files'

# Word, split and single clip used by the helper functions below.
source_word = 'ABOUT'
source_folder = 'train'
input_video = 'ABOUT_00003.mp4'

# dlib models shared by every processing call; the landmark model file is
# looked up relative to the current working directory.
face_detector = dlib.get_frontal_face_detector()
landmark_predictor = dlib.shape_predictor('shape_predictor_68_face_landmarks.dat')

In [None]:
def test_ABOUT_00001():
    """Preprocess a single clip: the module-level ``input_video`` of ``source_word``.

    Despite the function's name, the clip processed is whatever ``input_video``
    is set to. The clip is read from the 'train' folder under ``source_root``
    and written to the matching location under ``output_root``.
    """
    # Same relative location on both the input and the output side.
    relative_path = os.path.join(source_word, 'train', input_video)
    process_vid(
        face_detector,
        landmark_predictor,
        os.path.join(source_root, relative_path),
        os.path.join(output_root, relative_path),
        fps=25,
        width=256,
        height=256,
        output_size=64,
    )

def test_ABOUT_whole():
    """Preprocess every .mp4 in the 'train' folder of ``source_word`` and report the time.

    Clips are handled in sorted filename order and written under ``output_root``
    with the same word/'train' layout. The elapsed wall-clock time is printed
    when the folder is done.
    """
    started = time.time()
    in_dir = os.path.join(source_root, source_word, 'train')
    out_dir = os.path.join(output_root, source_word, 'train')

    clip_names = [name for name in os.listdir(in_dir) if name.endswith(".mp4")]
    clip_names.sort()
    for clip_name in clip_names:
        process_vid(
            face_detector,
            landmark_predictor,
            os.path.join(in_dir, clip_name),
            os.path.join(out_dir, clip_name),
            fps=25,
            width=256,
            height=256,
            output_size=64,
        )

    elapsed = time.time() - started
    print(f"Finished Processing ABOUT folder in {elapsed: .2f} seconds")
        

def run_all():
    """Preprocess the 'train' clips of every word folder under ``source_root``.

    Word folders are visited in sorted order. Within each, every .mp4 in its
    'train' subfolder is processed in sorted order and written to the matching
    location under ``output_root``. The total elapsed time is printed at the end.

    Only directories directly under ``source_root`` count as word folders, so
    stray files such as '.DS_Store' or 'Thumbs.db' are ignored instead of
    crashing the run. A word folder without a 'train' subfolder still raises
    from ``os.listdir``.
    """
    start_time = time.time()
    source_folders = [
        folder for folder in sorted(os.listdir(source_root))
        if folder != '.DS_Store' and os.path.isdir(os.path.join(source_root, folder))
    ]

    for word in source_folders:
        word_source_folder = os.path.join(source_root, word, 'train')
        word_output_folder = os.path.join(output_root, word, 'train')
        videos = sorted(file for file in os.listdir(word_source_folder) if file.endswith(".mp4"))
        for video in videos:
            input_video_path = os.path.join(word_source_folder, video)
            output_video_path = os.path.join(word_output_folder, video)
            process_vid(face_detector, landmark_predictor, input_video_path, output_video_path, fps=25, width=256, height=256, output_size=64)

    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f'Done in {elapsed_time: .2f} seconds')

In [None]:
# This notebook preprocesses the mp4 files.
# It 1) centers and 2) horizontally aligns each clip around the center of the lips,
# 3) converts it to grayscale, and 4) crops it to a 64x64 mp4.
# The results are written to a new folder called "processed_selected_mp4_files" with the same
# structure as the lipread_mp4 data.
# I ran it on a Mac, so it might behave differently on Windows.
# test_ABOUT_00001 preprocesses just the single clip named by input_video (currently ABOUT_00003.mp4, despite the function's name)
# test_ABOUT_whole preprocesses the entire ABOUT train data
# run_all preprocesses the train set of every word