# Building a Deepfake Detector using Deep Learning Models
This notebook demonstrates the development of a deepfake detection system using multiple pre-trained **CNN (Convolutional Neural Network)** models, such as **ResNet50**, **EfficientNetV2B0** and **Xception**, combined with **LSTM (Long Short-Term Memory)** networks for temporal analysis. The datasets used are **FaceForensics++**, **DFDC** and **Celeb-DF (v2)**. To ensure unbiased testing, the **Celeb-DF (v2)** dataset consists of completely unseen videos that are exclusively reserved for testing and are not included in the training or validation processes. `OpenCV` is utilized for video frame extraction and preprocessing while `dlib` is used for face detection and cropping.

## GPU Configuration and Verification with TensorFlow
To ensure TensorFlow is configured to effectively utilize the GPU for deep learning tasks, optimize memory usage and verify GPU support.

In [1]:
import tensorflow as tf

# Check if TensorFlow is built with CUDA support and list GPUs
print("TensorFlow CUDA Support:", tf.test.is_built_with_cuda())
physical_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available:", len(physical_devices))

if physical_devices:
    try:
        for i, gpu in enumerate(physical_devices):
            # Enable memory growth for each GPU
            tf.config.experimental.set_memory_growth(gpu, True)
            print(f"Enabled memory growth for GPU {i}: {tf.config.experimental.get_device_details(gpu)['device_name']}")
    except RuntimeError as e:
        print("Error enabling GPU memory growth:", e)
else:
    print("No GPU detected. Ensure proper GPU setup.")

TensorFlow CUDA Support: True
Num GPUs Available: 1
Enabled memory growth for GPU 0: NVIDIA GeForce 940MX


## Importing Libraries and Setup
Importing all necessary libraries at the top to ensure better organization, easy debugging and smooth execution of the entire pipeline.

In [2]:
import os
import sys
import numpy as np
import shutil
import cv2
import dlib
from sklearn.model_selection import train_test_split

## 1. Dataset Preparation
Preparing the dataset for video frame extraction, face detection and cropping followed by organizing the data into structured train and validation directories.

### 1.1 Defining Paths and Creating Directories for Training and Validation Datasets

In [3]:
# Defining base directory where the dataset resides
base_dir = os.getcwd() # Current working directory where my Jupyter Notebook is located

# Defining paths for dataset directories
real_videos_dir = os.path.join(base_dir, "Datasets", "FaceForensic++", "real")
fake_videos_dir = os.path.join(base_dir, "Datasets", "FaceForensic++", "fake")

# Defining paths for cropped faces directories
real_faces_dir = os.path.join(base_dir, "Cropped_Faces", "real")
fake_faces_dir = os.path.join(base_dir, "Cropped_Faces", "fake")

# Defining paths for training and validation directories
train_dir = os.path.join(base_dir, "Cropped_Faces", "train")
val_dir = os.path.join(base_dir, "Cropped_Faces", "val")

# Creating necessary directories if they don’t already exist
os.makedirs(real_faces_dir, exist_ok=True)
os.makedirs(fake_faces_dir, exist_ok=True)
os.makedirs(os.path.join(train_dir, "real"), exist_ok=True)
os.makedirs(os.path.join(train_dir, "fake"), exist_ok=True)
os.makedirs(os.path.join(val_dir, "real"), exist_ok=True)
os.makedirs(os.path.join(val_dir, "fake"), exist_ok=True)

print(f"Directories for processing and output:")
print(f"Real Videos: {real_videos_dir}")
print(f"Fake Videos: {fake_videos_dir}")
print(f"Real Faces: {real_faces_dir}")
print(f"Fake Faces: {fake_faces_dir}")
print(f"Train Directory: {train_dir}")
print(f"Validation Directory: {val_dir}")

Directories for processing and output:
Real Videos: C:\Users\atul\Datasets\FaceForensic++\real
Fake Videos: C:\Users\atul\Datasets\FaceForensic++\fake
Real Faces: C:\Users\atul\Cropped_Faces\real
Fake Faces: C:\Users\atul\Cropped_Faces\fake
Train Directory: C:\Users\atul\Cropped_Faces\train
Validation Directory: C:\Users\atul\Cropped_Faces\val


### 1.2 Face Detection and Cropping for Training and Validation Datasets

In [4]:
# Initializing the face detector
detector = dlib.get_frontal_face_detector()

def crop_faces(input_dir, output_dir, dataset_name, face_size=(224, 224), is_real=True):
    """
    Detects and crops faces from videos in the input directory.
    Cropped faces are saved in specific folders with unique names in the output directory.

    Args:
    - input_dir (str): Path to the directory containing videos.
    - output_dir (str): Path to the directory to save cropped face images.
    - dataset_name (str): Prefix for naming folders and files (e.g., dataset name).
    - face_size (tuple): Dimensions to resize each face (width, height).
    - is_real (bool): Indicates whether the videos are from the "real" or "fake" category.
    """
    # Check if input and output directories exist
    if not os.path.exists(input_dir):
        print(f"Input directory {input_dir} does not exist. Skipping.")
        return
    os.makedirs(output_dir, exist_ok=True)

    # Supported video formats
    supported_formats = (".mp4", ".avi", ".mkv", ".mov")

    # Initialize folder counter
    folder_counter = 0

    # Looping through each file in the input directory
    for file in os.listdir(input_dir):
        if file.lower().endswith(supported_formats):  # Process only supported video files
            video_name = os.path.splitext(file)[0]  # Extract the video name (without extension)

            # Generate a unique folder name based on the dataset name and category
            category = "real" if is_real else "fake"
            folder_name = f"{dataset_name}_{category}{folder_counter}"
            folder_counter += 1

            # Create the unique folder
            folder_path = os.path.join(output_dir, folder_name)

            # Skip already processed videos
            if os.path.exists(folder_path) and len(os.listdir(folder_path)) > 0:
                print(f"Skipping already processed video: {file}")
                continue

            os.makedirs(folder_path, exist_ok=True)

            video_path = os.path.join(input_dir, file)
            cap = cv2.VideoCapture(video_path)  # Open the video file

            if not cap.isOpened():
                print(f"Failed to open video {file}. Skipping.")
                continue

            frame_count = 0
            cropped_count = 0  # Counter for cropped faces

            # Looping through frames in the video
            while cap.isOpened():
                ret, frame = cap.read()  # Read a frame
                if not ret:  # Exit when no more frames
                    break

                frame_count += 1
                gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)  # Convert frame to grayscale
                faces = detector(gray)  # Detect faces in the frame

                # Save each detected face
                for i, face in enumerate(faces):
                    x, y, w, h = face.left(), face.top(), face.width(), face.height()

                    # Validate face coordinates to ensure they are within the frame bounds
                    x = max(0, x)
                    y = max(0, y)
                    w = min(frame.shape[1] - x, w)
                    h = min(frame.shape[0] - y, h)

                    if w <= 0 or h <= 0:  # Check if the cropped region is valid
                        print(f"Invalid face region in frame {frame_count}, video {file}. Skipping.")
                        continue

                    # Crop the face from the frame
                    cropped_face = frame[y:y+h, x:x+w]

                    # Resize the cropped face to the specified size
                    cropped_face = cv2.resize(cropped_face, face_size)

                    # Generate a unique filename for the cropped face
                    file_name = f"{folder_name}_frame{frame_count}_face{i}.jpg"
                    save_path = os.path.join(folder_path, file_name)

                    # Save the cropped face
                    cv2.imwrite(save_path, cropped_face)
                    cropped_count += 1  # Increment the cropped face counter

            cap.release()  # Release the video capture object
            print(f"Processed {file}: {cropped_count} face(s) cropped into {folder_name}.")
    print("--- Face cropping complete ---")

# Process real videos
print("--- Processing Real videos from FaceForensic++ dataset ---")
crop_faces(real_videos_dir, real_faces_dir, "FF", is_real=True)

# Process fake videos
print("\n--- Processing Fake videos from FaceForensic++ dataset ---")
crop_faces(fake_videos_dir, fake_faces_dir, "FF", is_real=False)

--- Processing Real videos from FaceForensic++ dataset ---
--- Face cropping complete ---

--- Processing Fake videos from FaceForensic++ dataset ---
--- Face cropping complete ---


### 1.3 Organizing the Dataset into Training and Validation Sets

In [5]:
# Prevent accidental re-run by checking for a flag file
flag_file = "dataset_preparation_done.flag"

if os.path.exists(flag_file):
    print("--- Dataset Preparation Already Completed. Skipping Step. ---\n")
    print("Note: The 'dataset_preparation_done.flag' file is generated to prevent re-running this step.")
    print("If you want to re-run the dataset preparation, please delete the 'dataset_preparation_done.flag' file and re-run the code.")
else:
    # Defining base directory and paths for real and fake cropped faces
    print("\n--- Dataset Preparation Started ---\n")
    base_dir = os.getcwd()
    real_faces_dir = os.path.join(base_dir, "Cropped_Faces", "real")
    fake_faces_dir = os.path.join(base_dir, "Cropped_Faces", "fake")

    # Defining train and validation directories
    train_dir = os.path.join(base_dir, "Cropped_Faces", "train")
    val_dir = os.path.join(base_dir, "Cropped_Faces", "val")

    print("Ensuring that train and validation directories exist...")
    # Ensuring train and validation directories exist
    os.makedirs(os.path.join(train_dir, "real"), exist_ok=True)
    os.makedirs(os.path.join(val_dir, "real"), exist_ok=True)
    os.makedirs(os.path.join(train_dir, "fake"), exist_ok=True)
    os.makedirs(os.path.join(val_dir, "fake"), exist_ok=True)
    print("Directory structure for train and validation sets created.\n")

    # Function to recursively collect all .jpg files from subdirectories
    def collect_images_from_subfolders(root_dir):
        image_files = []
        for subdir, _, files in os.walk(root_dir):
            for file in files:
                if file.endswith(".jpg"):
                    image_files.append(os.path.join(subdir, file))
        return image_files

    # Collect real and fake face images
    print("Collecting images for real and fake faces...")
    real_faces = collect_images_from_subfolders(real_faces_dir)
    fake_faces = collect_images_from_subfolders(fake_faces_dir)

    print(f"Found {len(real_faces)} real face images.")
    print(f"Found {len(fake_faces)} fake face images.\n")

    # Check for empty datasets
    if not real_faces or not fake_faces:
        print("Error: One or more directories are empty. Please ensure face cropping is successful.")
    else:
        print("Splitting dataset into training and validation sets...")
        # Split data into train and validation sets
        real_train, real_val = train_test_split(real_faces, test_size=0.2, random_state=42)
        fake_train, fake_val = train_test_split(fake_faces, test_size=0.2, random_state=42)
        print("Dataset split complete.\n")

        # Function to move files while preserving subfolders
        def move_files_with_subfolders(file_list, target_dir, base_dir):
            for file_path in file_list:
                # Compute the relative path from the base directory
                relative_path = os.path.relpath(file_path, base_dir)
                # Create the corresponding subfolder structure in the target directory
                destination_path = os.path.join(target_dir, os.path.dirname(relative_path))
                os.makedirs(destination_path, exist_ok=True)
                # Move the file to the target directory
                shutil.copy(file_path, os.path.join(destination_path, os.path.basename(file_path)))

        # Move the split data to respective directories while preserving subfolders
        print("Moving real face images to train and validation directories (preserving subfolders)...")
        move_files_with_subfolders(real_train, os.path.join(train_dir, "real"), real_faces_dir)
        move_files_with_subfolders(real_val, os.path.join(val_dir, "real"), real_faces_dir)
        print("Real face images successfully moved.\n")

        print("Moving fake face images to train and validation directories (preserving subfolders)...")
        move_files_with_subfolders(fake_train, os.path.join(train_dir, "fake"), fake_faces_dir)
        move_files_with_subfolders(fake_val, os.path.join(val_dir, "fake"), fake_faces_dir)
        print("Fake face images successfully moved.\n")

        print("Data split and moved to train and validation directories successfully.")

        # Create flag file to indicate completion
        with open(flag_file, "w") as f:
            f.write("Dataset preparation completed.\n")
        print("\n--- Dataset Preparation Complete ---\n")

        # Explicit note for users about the flag file
        print("Note: The 'dataset_preparation_done.flag' file is generated to prevent re-running this step.")
        print("If you want to re-run the dataset preparation, please delete the 'dataset_preparation_done.flag' file and re-run the code.")

--- Dataset Preparation Already Completed. Skipping Step. ---

Note: The 'dataset_preparation_done.flag' file is generated to prevent re-running this step.
If you want to re-run the dataset preparation, please delete the 'dataset_preparation_done.flag' file and re-run the code.


### 1.4 Defining Paths and Creating Directories for the Testing Phase

In [6]:
# Defining base directory where the dataset resides
base_dir = os.getcwd() # Current working directory where my Jupyter Notebook is located

# Defining paths for dataset directories
real_videos_dir = os.path.join(base_dir, "Datasets", "CelebDFv2", "real")
fake_videos_dir = os.path.join(base_dir, "Datasets", "CelebDFv2", "fake")

# Defining paths for cropped faces directories
real_faces_dir = os.path.join(base_dir, "Cropped_Faces", "test", "real")
fake_faces_dir = os.path.join(base_dir, "Cropped_Faces", "test", "fake")

# Creating necessary directories if they don’t already exist
os.makedirs(real_faces_dir, exist_ok=True)
os.makedirs(fake_faces_dir, exist_ok=True)

print(f"Directories for Test:")
print(f"Real Videos Directory: {real_videos_dir}")
print(f"Fake Videos Directory: {fake_videos_dir}")
print(f"Real Faces Directory: {real_faces_dir}")
print(f"Fake Faces Directory: {fake_faces_dir}")

Directories for Test:
Real Videos Directory: C:\Users\atul\Datasets\CelebDFv2\real
Fake Videos Directory: C:\Users\atul\Datasets\CelebDFv2\fake
Real Faces Directory: C:\Users\atul\Cropped_Faces\test\real
Fake Faces Directory: C:\Users\atul\Cropped_Faces\test\fake


### 1.5 Face Detection and Cropping for the Testing Phase

In [7]:
# Initializing the face detector
detector = dlib.get_frontal_face_detector()

def crop_faces(input_dir, output_dir, dataset_name, face_size=(224, 224), is_real=True):
    """
    Detects and crops faces from videos in the input directory.
    Cropped faces are saved in specific folders with unique names in the output directory.

    Args:
    - input_dir (str): Path to the directory containing videos.
    - output_dir (str): Path to the directory to save cropped face images.
    - dataset_name (str): Prefix for naming folders and files (e.g., dataset name).
    - face_size (tuple): Dimensions to resize each face (width, height).
    - is_real (bool): Indicates whether the videos are from the "real" or "fake" category.
    """
    # Check if input and output directories exist
    if not os.path.exists(input_dir):
        print(f"Input directory {input_dir} does not exist. Skipping.")
        return
    os.makedirs(output_dir, exist_ok=True)

    # Supported video formats
    supported_formats = (".mp4", ".avi", ".mkv", ".mov")

    # Initialize folder counter
    folder_counter = 0

    # Looping through each file in the input directory
    for file in os.listdir(input_dir):
        if file.lower().endswith(supported_formats):  # Process only supported video files
            video_name = os.path.splitext(file)[0]  # Extract the video name (without extension)

            # Generate a unique folder name based on the dataset name and category
            category = "real" if is_real else "fake"
            folder_name = f"{dataset_name}_{category}{folder_counter}"
            folder_counter += 1

            # Create the unique folder
            folder_path = os.path.join(output_dir, folder_name)

            # Skip already processed videos
            if os.path.exists(folder_path) and len(os.listdir(folder_path)) > 0:
                print(f"Skipping already processed video: {file}")
                continue

            os.makedirs(folder_path, exist_ok=True)

            video_path = os.path.join(input_dir, file)
            cap = cv2.VideoCapture(video_path)  # Open the video file

            if not cap.isOpened():
                print(f"Failed to open video {file}. Skipping.")
                continue

            frame_count = 0
            cropped_count = 0  # Counter for cropped faces

            # Looping through frames in the video
            while cap.isOpened():
                ret, frame = cap.read()  # Read a frame
                if not ret:  # Exit when no more frames
                    break

                frame_count += 1
                gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)  # Convert frame to grayscale
                faces = detector(gray)  # Detect faces in the frame

                # Save each detected face
                for i, face in enumerate(faces):
                    x, y, w, h = face.left(), face.top(), face.width(), face.height()

                    # Validate face coordinates to ensure they are within the frame bounds
                    x = max(0, x)
                    y = max(0, y)
                    w = min(frame.shape[1] - x, w)
                    h = min(frame.shape[0] - y, h)

                    if w <= 0 or h <= 0:  # Check if the cropped region is valid
                        print(f"Invalid face region in frame {frame_count}, video {file}. Skipping.")
                        continue

                    # Crop the face from the frame
                    cropped_face = frame[y:y+h, x:x+w]

                    # Resize the cropped face to the specified size
                    cropped_face = cv2.resize(cropped_face, face_size)

                    # Generate a unique filename for the cropped face
                    file_name = f"{folder_name}_frame{frame_count}_face{i}.jpg"
                    save_path = os.path.join(folder_path, file_name)

                    # Save the cropped face
                    cv2.imwrite(save_path, cropped_face)
                    cropped_count += 1  # Increment the cropped face counter

            cap.release()  # Release the video capture object
            print(f"Processed {file}: {cropped_count} face(s) cropped into {folder_name}.")
    print("--- Face cropping complete ---")

# Process real videos
print("--- Processing Real videos from Celeb-DF (v2) dataset ---")
crop_faces(real_videos_dir, real_faces_dir, "cdfv2", is_real=True)

# Process fake videos
print("\n--- Processing Fake videos from Celeb-DF (v2) dataset ---")
crop_faces(fake_videos_dir, fake_faces_dir, "cdfv2", is_real=False)

--- Processing Real videos from Celeb-DF (v2) dataset ---
--- Face cropping complete ---

--- Processing Fake videos from Celeb-DF (v2) dataset ---
--- Face cropping complete ---


## 2. Dynamic Calculation of Training, Validation and Testing Dataset Sizes

In [8]:
def count_images_in_class(directory, class_name):
    class_dir = os.path.join(directory, class_name)
    image_extensions = ('.jpg', '.jpeg', '.png')  # Supported image formats
    count = 0
    for root, _, files in os.walk(class_dir):
        count += sum(1 for file in files if file.lower().endswith(image_extensions))
    return count

def calculate_dataset_sizes(train_dir, val_dir):
    # Count for train
    train_real_count = count_images_in_class(train_dir, "real")
    train_fake_count = count_images_in_class(train_dir, "fake")
    train_dataset_size = train_real_count + train_fake_count

    # Count for val
    val_real_count = count_images_in_class(val_dir, "real")
    val_fake_count = count_images_in_class(val_dir, "fake")
    val_dataset_size = val_real_count + val_fake_count

    return train_real_count, train_fake_count, train_dataset_size, val_real_count, val_fake_count, val_dataset_size

def calculate_test_dataset_size(test_dir):
    # Count for val
    test_real_count = count_images_in_class(test_dir, "real")
    test_fake_count = count_images_in_class(test_dir, "fake")
    test_dataset_size = test_real_count + test_fake_count

    return test_real_count, test_fake_count, test_dataset_size

# Define paths to train, validation, and test directories
base_dir = os.getcwd()
train_dir = os.path.join(base_dir, "Cropped_Faces", "train")
val_dir = os.path.join(base_dir, "Cropped_Faces", "val")
test_dir = os.path.join(base_dir, "Cropped_Faces", "test")

# Calculate sizes dynamically
train_real_count, train_fake_count, train_dataset_size, val_real_count, val_fake_count, val_dataset_size = calculate_dataset_sizes(train_dir, val_dir)

# Calculate size for test dataset
test_real_count, test_fake_count, test_dataset_size = calculate_test_dataset_size(test_dir)

# Print output in the desired format
print(f"Training Dataset Size: {train_dataset_size} (Real: {train_real_count}, Fake: {train_fake_count})")
print(f"Validation Dataset Size: {val_dataset_size} (Real: {val_real_count}, Fake: {val_fake_count})")
print(f"Testing Dataset Size: {test_dataset_size} (Real: {test_real_count}, Fake: {test_fake_count})")

Training Dataset Size: 262642 (Real: 139606, Fake: 123036)
Validation Dataset Size: 65662 (Real: 34902, Fake: 30760)
Testing Dataset Size: 176050 (Real: 89826, Fake: 86224)
