In [1]:
import os
import joblib
import cv2
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Dataset root. Built with os.path.join so the separator matches the host OS:
# a literal backslash is only a path separator on Windows, so r'data\DFD'
# would name a single file called "data\DFD" anywhere else.
data_dir = os.path.join('data', 'DFD')
# Size every extracted frame is resized to.
img_size = (224,224)
# Number of frames sampled from each video.
num_frames = 16

In [3]:
# Sanity check: the frame size with a trailing 3 appended (the blank-frame shape).
(*img_size, 3)

(224, 224, 3)

In [4]:
def extract_frames(video_path, num_frames=5):
    """Sample up to `num_frames` evenly spaced frames from a video.

    Frames are read at indices ``i * (total_frames // num_frames)`` (step of at
    least 1), resized to the module-level ``img_size`` and stacked. Reading
    stops at the first frame that cannot be decoded; whatever is missing is
    padded with all-zero frames so the result always has `num_frames` entries.
    No colour conversion is applied, so the channel order is whatever
    ``cv2.VideoCapture.read`` returns.

    Args:
        video_path: Path of the video file to open. It is printed as a
            progress trace.
        num_frames: Number of frames to return for the video.

    Returns:
        ``np.ndarray`` of dtype ``uint8`` with ``num_frames`` frames of shape
        ``(height, width, 3)`` each (assuming the decoder yields 3-channel
        frames), where ``img_size`` is interpreted as ``(width, height)``.
    """
    import warnings  # local import: only needed on the failure path below

    print(video_path)
    # cv2.resize takes dsize as (width, height) while NumPy shapes are
    # (height, width, channels); unpack once so real and blank frames agree
    # even when img_size is not square.
    width, height = img_size
    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        if not cap.isOpened():
            # Keep the best-effort behaviour (an all-blank clip) but make the
            # failure visible instead of silently producing zeros.
            warnings.warn(f"Could not open video, returning blank frames: {video_path}")
        else:
            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            frame_interval = max(total_frames // num_frames, 1)
            for i in range(num_frames):
                cap.set(cv2.CAP_PROP_POS_FRAMES, i * frame_interval)
                ret, frame = cap.read()
                # break the loop if end of the video is reached
                if not ret:
                    break
                frames.append(cv2.resize(frame, (width, height)))
    finally:
        # Release the capture handle even if decoding or resizing raises.
        cap.release()
    # fill any missing frames with blank frames
    blank = np.zeros((height, width, 3), np.uint8)
    frames.extend([blank] * (num_frames - len(frames)))
    return np.array(frames)

In [5]:
def split_data(class_name, split_ratio=(0.7, 0.15, 0.15)):
    """Split one class's videos into train/val/test and extract their frames.

    Every entry of ``<data_dir>/<class_name>`` is treated as a video. The
    entries are split at file level with ``train_test_split`` (fixed
    ``random_state=42``), then each file is passed to ``extract_frames`` with
    the module-level ``num_frames``.

    Args:
        class_name: Name of the sub-directory of ``data_dir`` to read.
        split_ratio: ``(train, val, test)`` fractions. The first is the train
            share of all files; the remainder is divided between val and test
            in proportion to the last two values.

    Returns:
        Dict with keys ``'train'``, ``'val'`` and ``'test'``, each a list
        holding one ``extract_frames`` result per video.
    """
    source = os.path.join(data_dir, class_name)
    # os.listdir returns entries in arbitrary order; sort them so the fixed
    # random_state really yields the same split on every machine and run.
    all_files = sorted(os.listdir(source))

    train_ratio, val_ratio, test_ratio = split_ratio
    train_files, val_test_files = train_test_split(
        all_files, train_size=train_ratio, random_state=42)
    # Share of the held-out files that goes to test; 0.5 for the default ratios.
    test_share = test_ratio / (val_ratio + test_ratio)
    val_files, test_files = train_test_split(
        val_test_files, test_size=test_share, random_state=42)

    data = {'train':[], 'val':[], 'test':[]}
    for split, files in zip(['train', 'val', 'test'],
                            [train_files, val_files, test_files]):
        print(f"{split.upper()} set: {len(files)} videos")
        for file in files:
            data[split].append(extract_frames(os.path.join(source, file), num_frames=num_frames))
    return data

In [6]:
# Accumulate per-video frame stacks (X_*) and their class-name labels (y_*)
# across both classes, keeping the three splits separate.
X_train, y_train = [], []
X_val, y_val = [], []
X_test, y_test = [], []

for cls in ('real', 'fake'):
    cls_data = split_data(cls)
    train_videos = cls_data['train']
    val_videos = cls_data['val']
    test_videos = cls_data['test']

    X_train += train_videos
    X_val += val_videos
    X_test += test_videos

    # One label per video: repeat the class name to match each split's size.
    y_train += [cls] * len(train_videos)
    y_val += [cls] * len(val_videos)
    y_test += [cls] * len(test_videos)

# Convert the lists to NumPy arrays one at a time, so each source list can be
# dropped as soon as its array exists.
X_train = np.asarray(X_train)
X_val = np.asarray(X_val)
X_test = np.asarray(X_test)
y_train = np.asarray(y_train)
y_val = np.asarray(y_val)
y_test = np.asarray(y_test)

TRAIN set: 140 videos
data\FF++\real\13__walk_down_hall_angry.mp4
data\FF++\real\07__walk_down_hall_angry.mp4
data\FF++\real\02__walk_down_hall_angry.mp4
data\FF++\real\01__walking_down_indoor_hall_disgust.mp4
data\FF++\real\03__kitchen_still.mp4
data\FF++\real\10__kitchen_still.mp4
data\FF++\real\03__talking_angry_couch.mp4
data\FF++\real\15__outside_talking_pan_laughing.mp4
data\FF++\real\07__outside_talking_still_laughing.mp4
data\FF++\real\11__talking_angry_couch.mp4
data\FF++\real\04__outside_talking_pan_laughing.mp4
data\FF++\real\10__walk_down_hall_angry.mp4
data\FF++\real\13__secret_conversation.mp4
data\FF++\real\03__talking_against_wall.mp4
data\FF++\real\10__kitchen_pan.mp4
data\FF++\real\09__talking_against_wall.mp4
data\FF++\real\02__talking_angry_couch.mp4
data\FF++\real\11__walk_down_hall_angry.mp4
data\FF++\real\08__kitchen_still.mp4
data\FF++\real\09__outside_talking_still_laughing.mp4
data\FF++\real\01__kitchen_pan.mp4
data\FF++\real\06__podium_speech_happy.mp4
data\F

In [7]:
# Bundle every split into one dict so the whole dataset is stored in one file.
split_3d_data = {
    'X_train':X_train,
    'X_val':X_val,
    'X_test':X_test,
    'y_train':y_train,
    'y_val':y_val,
    'y_test':y_test
}
# joblib.dump does not create missing parent directories; make sure the target
# folder exists so a fresh checkout does not fail with FileNotFoundError.
os.makedirs('artifacts', exist_ok=True)
joblib.dump(split_3d_data, 'artifacts/split_3d_data.pkl')

['artifacts/split_3d_data.pkl']

In [4]:
# Shapes of the train, test and val feature arrays, in that order.
tuple(split.shape for split in (X_train, X_test, X_val))

((280, 16, 224, 224, 3), (60, 16, 224, 224, 3), (60, 16, 224, 224, 3))

In [5]:
# Shapes of the train, test and val label arrays, in that order.
tuple(labels.shape for labels in (y_train, y_test, y_val))

((280,), (60,), (60,))

In [6]:
# Display the validation labels (class-name strings appended class by class).
y_val

array(['real', 'real', 'real', 'real', 'real', 'real', 'real', 'real',
       'real', 'real', 'real', 'real', 'real', 'real', 'real', 'real',
       'real', 'real', 'real', 'real', 'real', 'real', 'real', 'real',
       'real', 'real', 'real', 'real', 'real', 'real', 'fake', 'fake',
       'fake', 'fake', 'fake', 'fake', 'fake', 'fake', 'fake', 'fake',
       'fake', 'fake', 'fake', 'fake', 'fake', 'fake', 'fake', 'fake',
       'fake', 'fake', 'fake', 'fake', 'fake', 'fake', 'fake', 'fake',
       'fake', 'fake', 'fake', 'fake'], dtype='<U4')