**1. Data Prep:**

*Using FaceForensics++ Dataset from Kaggle*

In [None]:
# Mount Google Drive at /content/drive; the dataset and most outputs below live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Root folder of the raw videos; the frame-extraction step reads its "real" and "fake" subfolders.
dataset_path = "/content/drive/MyDrive/ff++dataset"

In [None]:
#importing libs
import os
import cv2 as cv
import dlib
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import h5py

**1.1 Frame Extraction:**

In [None]:
#extract frames from one video
def extract_frames(video_path, output_folder, frame_rate=1, frame_prefix=""):
    """
    Extracts frames from a video, skipping frames that already exist.

    Every ``frame_interval``-th frame is sampled, where ``frame_interval`` is
    ``int(fps / frame_rate)`` clamped to at least 1. Sampled frames are written
    as ``{frame_prefix}_{index:06d}.jpg``; ``index`` counts sampled frames, so a
    given frame always maps to the same filename and an interrupted run can be
    resumed without overwriting or renumbering.

    Args:
        video_path (str): Path to the input video file.
        output_folder (str): Path to the folder where frames will be saved
            (created if missing).
        frame_rate (int): Desired frame rate (frames per second).
        frame_prefix (str): Prefix for the saved frame filenames.

    Raises:
        ValueError: If ``frame_rate`` is not positive.
    """
    if frame_rate <= 0:
        raise ValueError(f"frame_rate must be positive, got {frame_rate}")

    cap = cv.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"Error: Could not open video file {video_path}")
        return

    saved_count = 0
    try:
        os.makedirs(output_folder, exist_ok=True)

        fps = cap.get(cv.CAP_PROP_FPS)
        # Some containers report an FPS of 0, and frame_rate may exceed the real
        # FPS; both would give an interval of 0 and a modulo-by-zero below.
        frame_interval = max(1, int(fps / frame_rate)) if fps > 0 else 1

        frame_count = 0   # frames read from the video
        sample_index = 0  # frames selected for saving (drives the filename)

        while True:
            read_frame, frame_data = cap.read()
            if not read_frame:
                break

            if frame_count % frame_interval == 0:
                frame_filename = os.path.join(output_folder, f"{frame_prefix}_{sample_index:06d}.jpg")

                if os.path.exists(frame_filename):
                    print(f"Frame already exists: {frame_filename}")
                elif cv.imwrite(frame_filename, frame_data):
                    saved_count += 1
                else:
                    print(f"Error: Could not write frame {frame_filename}")

                # Advance even when the file already existed; otherwise every
                # later sample would collide with the same existing filename.
                sample_index += 1

            frame_count += 1
    finally:
        # Release the capture handle even if reading or writing raised
        cap.release()

    print(f"Extracted {saved_count} frames from {video_path}")

In [None]:
#extract frames from whole dataset
def extract_frames_from_dataset(dataset_path, output_folder, frame_rate=1):
    """
    Runs frame extraction over every entry of the "real" and "fake" class folders.

    For each class, ``<output_folder>/<class>`` is created if needed and every
    entry listed in ``<dataset_path>/<class>`` is passed to ``extract_frames``.

    Args:
        dataset_path (str): Folder that contains the "real" and "fake" subfolders.
        output_folder (str): Folder under which the per-class frame folders are written.
        frame_rate (int): Desired frame rate, forwarded to ``extract_frames``.
    """
    for label_dir in ("real", "fake"):
        source_dir = os.path.join(dataset_path, label_dir)
        target_dir = os.path.join(output_folder, label_dir)
        os.makedirs(target_dir, exist_ok=True)

        for entry in os.listdir(source_dir):
            # Everything before the first ".mp4" becomes the frame filename prefix
            stem, _, _ = entry.partition('.mp4')
            extract_frames(
                os.path.join(source_dir, entry),
                target_dir,
                frame_rate,
                frame_prefix=stem,
            )


# Run frame extraction for the whole dataset with the default frame_rate of 1.
# Input videos are read from dataset_path; frames are written per class under output_folder.
dataset_path = "/content/drive/MyDrive/ff++dataset"
output_folder = "/content/drive/MyDrive/ff++extracted_frames"
extract_frames_from_dataset(dataset_path, output_folder)

**1.2 Frame Preprocessing:**

In [None]:
# Initialize Dlib's face detector and shape predictor.
# Both are module-level objects that preprocess_image uses directly;
# the 68-landmark model file is loaded from Google Drive.
detector = dlib.get_frontal_face_detector()
predictor = dlib.shape_predictor("/content/drive/MyDrive/shape_predictor_68_face_landmarks.dat")

In [None]:
def preprocess_image(image, target_size=(224, 224)):
    """
    Turns one BGR frame into a list of aligned, resized, normalized face crops.

    Faces are located on a grayscale copy with the module-level ``detector``;
    each one is aligned via ``predictor`` landmarks and ``dlib.get_face_chip``,
    resized to ``target_size`` and scaled by 1/255.

    Args:
        image: Frame as loaded by OpenCV (BGR channel order).
        target_size (tuple): Size passed to ``cv.resize``.

    Returns:
        list: One array per usable face; empty when no face was detected or
        every aligned crop came back empty.
    """
    grayscale = cv.cvtColor(image, cv.COLOR_BGR2GRAY)
    detections = detector(grayscale)

    results = []
    for detection in detections:
        # Landmarks drive the alignment performed by get_face_chip
        shape = predictor(grayscale, detection)
        chip = np.array(dlib.get_face_chip(image, shape))

        if chip.size == 0:
            print("Warning: Aligned face is empty.")
            continue

        # Resize, then scale pixel values into [0, 1]
        results.append(cv.resize(chip, target_size) / 255.0)

    return results

In [None]:
def preprocess_dataset(input_folder, output_folder, target_size=(224, 224)):
    """
    Preprocesses every frame of the "real" and "fake" folders and saves the face crops.

    Each frame is loaded with OpenCV and passed to ``preprocess_image``. Every
    returned face is written to ``<output_folder>/<class>/<frame>_face_<i>.jpg``
    (rescaled to 0-255) and also collected in memory.

    Args:
        input_folder (str): Folder containing the "real" and "fake" frame folders.
        output_folder (str): Folder that receives the per-class face crops.
        target_size (tuple): Forwarded to ``preprocess_image``.

    Returns:
        tuple: ``(faces, labels)`` as NumPy arrays; labels are 0 for real, 1 for fake.
    """
    os.makedirs(output_folder, exist_ok=True)

    face_arrays, face_labels = [], []

    # enumerate() yields exactly the label convention: 0 -> "real", 1 -> "fake"
    for label_value, class_name in enumerate(("real", "fake")):
        src_dir = os.path.join(input_folder, class_name)
        dst_dir = os.path.join(output_folder, class_name)
        os.makedirs(dst_dir, exist_ok=True)

        for frame_name in os.listdir(src_dir):
            src_path = os.path.join(src_dir, frame_name)
            stem, _extension = os.path.splitext(frame_name)

            image = cv.imread(src_path)
            if image is None:
                print(f"Error: Could not load image {src_path}")
                continue

            faces = preprocess_image(image, target_size)
            if not faces:
                print(f"No face detected or alignment failed in {src_path}")
                continue

            for face_idx, face in enumerate(faces):
                out_path = os.path.join(dst_dir, f"{stem}_face_{face_idx}.jpg")
                # Faces are in [0, 1]; scale back to 8-bit before writing the JPEG
                cv.imwrite(out_path, (face * 255).astype(np.uint8))

                face_arrays.append(face)
                face_labels.append(label_value)

    return np.array(face_arrays), np.array(face_labels)

In [None]:
# Define input and output folders
input_folder = "/content/drive/MyDrive/ff++extracted_frames"
output_folder = "/content/drive/MyDrive/ff++preprocessed_frames"

# Preprocess the dataset and keep the returned arrays; without this assignment
# the prints below raise NameError on a fresh kernel.
preprocessed_data, labels = preprocess_dataset(input_folder, output_folder)

# Print the shapes of the preprocessed data and labels
print("Preprocessed data shape:", preprocessed_data.shape)
print("Labels shape:", labels.shape)

**1.3 Loading Preprocessed Data**

In [None]:
def load_preprocessed_data(preprocessed_folder):
    """
    Reads the saved face crops of both classes back into memory.

    Every file in ``<preprocessed_folder>/real`` and ``.../fake`` is loaded with
    OpenCV and divided by 255. Files OpenCV cannot read are reported and skipped.

    Args:
        preprocessed_folder (str): Folder containing the "real" and "fake" subfolders.

    Returns:
        tuple: ``(preprocessed_data, labels)`` as NumPy arrays, with label 0
        for real and 1 for fake, in listing order (real first).
    """
    images, targets = [], []

    # Position in the tuple doubles as the label: 0 -> "real", 1 -> "fake"
    for target, class_name in enumerate(("real", "fake")):
        class_dir = os.path.join(preprocessed_folder, class_name)

        for face_file in os.listdir(class_dir):
            face_file_path = os.path.join(class_dir, face_file)

            pixels = cv.imread(face_file_path)
            if pixels is None:
                print(f"Error: Could not load preprocessed face {face_file_path}")
                continue

            images.append(pixels / 255.0)
            targets.append(target)

            print(f"Stored image: {face_file} (Class: {class_name})")

    return np.array(images), np.array(targets)

In [None]:
# Folder written by the preprocessing step (one subfolder per class)
preprocessed_folder = "/content/drive/MyDrive/ff++preprocessed_frames"

# Load preprocessed data and labels
# (every image is held in memory at once, already divided by 255)
preprocessed_data, labels = load_preprocessed_data(preprocessed_folder)

# Print the shapes of the preprocessed data and labels
print("Preprocessed data shape:", preprocessed_data.shape)
print("Labels shape:", labels.shape)

**1.4 Saving Preprocessed Data Class-wise (Real/Fake) in .h5 Format**

In [None]:
def save_class_data(preprocessed_folder, class_name, output_path):
    """
    Writes every preprocessed face image of one class into a fresh HDF5 file.

    An existing file at ``output_path`` is deleted first, so each call rebuilds
    the file from scratch. Three gzip-compressed, resizable datasets are
    created ("preprocessed_data", "labels", "frame_names") and grown by one row
    per image. Images that cannot be read, or whose shape is not
    (224, 224, 3), are reported and skipped.

    Args:
        preprocessed_folder (str): Path to the folder containing preprocessed data.
        class_name (str): Name of the class ("real" or "fake"); "real" is
            stored as label 0, anything else as label 1.
        output_path (str): Path to save the HDF5 file.
    """
    def append_row(dataset, value):
        # Grow the dataset by one row along axis 0 and fill the new last row
        dataset.resize((dataset.shape[0] + 1), axis=0)
        dataset[-1] = value

    source_dir = os.path.join(preprocessed_folder, class_name)
    expected_shape = (224, 224, 3)
    class_label = 0 if class_name == "real" else 1

    if not os.path.exists(output_path):
        print(f"Creating new HDF5 file: {output_path}")
    else:
        os.remove(output_path)
        print(f"Overwriting existing HDF5 file: {output_path}")

    with h5py.File(output_path, "w") as h5_file:
        images_ds = h5_file.create_dataset(
            "preprocessed_data",
            shape=(0,) + expected_shape,
            maxshape=(None,) + expected_shape,
            dtype=np.uint8,
            compression="gzip",
        )
        labels_ds = h5_file.create_dataset(
            "labels",
            shape=(0,),
            maxshape=(None,),
            dtype=np.uint8,
            compression="gzip",
        )
        names_ds = h5_file.create_dataset(
            "frame_names",
            shape=(0,),
            maxshape=(None,),
            dtype=h5py.string_dtype(encoding="utf-8"),
            compression="gzip",
        )

        for face_file in os.listdir(source_dir):
            face_file_path = os.path.join(source_dir, face_file)

            face_image = cv.imread(face_file_path)
            if face_image is None:
                print(f"Error: Could not load preprocessed face {face_file_path}")
                continue

            if face_image.shape != expected_shape:
                print(f"Warning: Image {face_file} has shape {face_image.shape}, expected (224, 224, 3). Skipping.")
                continue

            # Pixels are stored as read (uint8), so no normalization happens here
            append_row(images_ds, face_image)
            append_row(labels_ds, class_label)
            append_row(names_ds, face_file.encode("utf-8"))

            print(f"Stored image: {face_file} (Class: {class_name})")

    print(f"Data for class '{class_name}' saved to: {output_path}")


In [None]:
#Saving Preprocessed Real Class in real_data.h5
# The path is set here because this cell runs before the section 1.5 cell that
# lists the HDF5 paths; without it a fresh "Run All" raises NameError.
real_output_path = "/content/drive/MyDrive/ff++preprocessed_real_data.h5"
save_class_data(preprocessed_folder, "real", real_output_path)

In [None]:
#Saving Preprocessed Fake Class in fake_data.h5
# The path is set here because this cell runs before the section 1.5 cell that
# lists the HDF5 paths; without it a fresh "Run All" raises NameError.
fake_output_path = "/content/drive/MyDrive/ff++preprocessed_fake_data.h5"
save_class_data(preprocessed_folder, "fake", fake_output_path)

**1.5 Saving both classes into single .h5 file**

In [None]:
# HDF5 locations on Drive: the merged file plus the two per-class files it is built from
combined_output_path = "/content/drive/MyDrive/ff++preprocessed_combined_data.h5"
real_output_path = "/content/drive/MyDrive/ff++preprocessed_real_data.h5"
fake_output_path = "/content/drive/MyDrive/ff++preprocessed_fake_data.h5"

In [None]:
# Create an empty combined file with correct datasets
# ("w" mode starts from an empty file, so any earlier combined file is replaced)
with h5py.File(combined_output_path, "w") as combined_file:
    combined_file.create_dataset("preprocessed_data", shape=(0, 224, 224, 3), maxshape=(None, 224, 224, 3), dtype=np.uint8, compression="gzip")
    combined_file.create_dataset("labels", shape=(0,), maxshape=(None,), dtype=np.uint8, compression="gzip")
    combined_file.create_dataset("frame_names", shape=(0,), maxshape=(None,), dtype=h5py.string_dtype(encoding="utf-8"), compression="gzip")
# Append Real Data: read the whole real-class file into memory, then extend the combined datasets
with h5py.File(real_output_path, "r") as real_file, h5py.File(combined_output_path, "a") as combined_file:
    real_data = real_file["preprocessed_data"][:]
    real_labels = real_file["labels"][:]
    real_frame_names = real_file["frame_names"][:]

    # Rows already present in the combined file; new rows are written after them
    current_len = combined_file["preprocessed_data"].shape[0]

    combined_file["preprocessed_data"].resize((current_len + real_data.shape[0]), axis=0)
    combined_file["labels"].resize((current_len + real_labels.shape[0]), axis=0)
    combined_file["frame_names"].resize((current_len + real_frame_names.shape[0]), axis=0)

    combined_file["preprocessed_data"][current_len:] = real_data
    combined_file["labels"][current_len:] = real_labels
    combined_file["frame_names"][current_len:] = real_frame_names

    # Log every frame name that was copied
    for i, frame_name in enumerate(real_frame_names):
      print(f"saved real {frame_name.decode('utf-8')} in combined file")

print(f"Real data appended to {combined_output_path}")


In [None]:
# Append Fake Data: read the whole fake-class file into memory, then extend the combined datasets.
# NOTE(review): the file is opened in "a" mode, so running this cell twice appends the fake rows twice.
with h5py.File(fake_output_path, "r") as fake_file, h5py.File(combined_output_path, "a") as combined_file:
    fake_data = fake_file["preprocessed_data"][:]
    fake_labels = fake_file["labels"][:]
    fake_frame_names = fake_file["frame_names"][:]

    # Rows already present in the combined file; new rows are written after them
    current_len = combined_file["preprocessed_data"].shape[0]

    combined_file["preprocessed_data"].resize((current_len + fake_data.shape[0]), axis=0)
    combined_file["labels"].resize((current_len + fake_labels.shape[0]), axis=0)
    combined_file["frame_names"].resize((current_len + fake_frame_names.shape[0]), axis=0)

    combined_file["preprocessed_data"][current_len:] = fake_data
    combined_file["labels"][current_len:] = fake_labels
    combined_file["frame_names"][current_len:] = fake_frame_names

    # Log every frame name that was copied
    for i, frame_name in enumerate(fake_frame_names):
      print(f"saved fake {frame_name.decode('utf-8')} in combined file")

print(f"Fake data appended to {combined_output_path}")

**2. Data Verification**

**2.1. Loading Combined Data for Verification**

In [None]:
# Open the combined HDF5 file read-only and report what it contains
with h5py.File(combined_output_path, "r") as f:
    # List all datasets in the file
    print("Datasets in the file:", list(f.keys()))

    # Access the datasets (handles only, no [:] read here).
    # NOTE(review): these rebind the notebook-level names preprocessed_data / labels
    # to objects tied to the open file - presumably unusable once the block exits; confirm.
    preprocessed_data = f["preprocessed_data"]
    labels = f["labels"]
    frame_names = f["frame_names"]

    # Print dataset shapes
    print("Combined Preprocessed data shape:", preprocessed_data.shape)
    print("combined Labels shape:", labels.shape)
    print("combined Frame names shape:", frame_names.shape)


**2.2 Verifying Real Class**

In [None]:
def verify_real_data_against_combined(real_path, combined_path):
    """
    Checks that every frame of the real-class HDF5 file is in the combined file.

    For each real frame found in the combined file (matched by frame name,
    first occurrence), the combined label must be 0 and the pixel data must be
    identical. Findings are printed; nothing is returned or raised.

    Args:
        real_path (str): Path to the real-class HDF5 file.
        combined_path (str): Path to the combined HDF5 file.
    """
    with h5py.File(real_path, "r") as real_file, h5py.File(combined_path, "r") as combined_file:
        # Load real data
        real_frame_names = [name.decode("utf-8") for name in real_file["frame_names"][:]]
        real_data = real_file["preprocessed_data"][:]

        # Load combined data
        combined_frame_names = [name.decode("utf-8") for name in combined_file["frame_names"][:]]
        combined_labels = combined_file["labels"][:]
        combined_data = combined_file["preprocessed_data"][:]

        # Map each name to its first position, as list.index() would return,
        # so lookups are O(1) instead of a list scan per frame.
        first_index = {}
        for position, name in enumerate(combined_frame_names):
            first_index.setdefault(name, position)

        # Check if all real frames are in the combined dataset
        missing_frames = [name for name in real_frame_names if name not in first_index]
        if missing_frames:
            print(f"Warning: {len(missing_frames)} real frames are missing in the combined dataset.")
            print("Missing frames:", missing_frames)
        else:
            print("All real frames are present in the combined dataset.")

        # Verify labels and data for real frames
        print("\nVerifying labels and data for real frames...")
        for i, frame_name in enumerate(real_frame_names):
            combined_index = first_index.get(frame_name)
            if combined_index is None:
                continue  # already reported as missing above

            # Check label (real data should have label 0)
            if combined_labels[combined_index] != 0:
                print(f"Label mismatch for real frame: {frame_name} (Expected: 0, Found: {combined_labels[combined_index]})")

            # Check preprocessed data
            if not np.array_equal(real_data[i], combined_data[combined_index]):
                print(f"Data mismatch for real frame: {frame_name}")

        print("Verification for real data complete.")


# Compare the real-class file against the combined file and print any discrepancies
verify_real_data_against_combined(real_output_path, combined_output_path)

**2.3 Verifying Fake Class**

In [None]:
def verify_fake_data_against_combined(fake_path, combined_path):
    """
    Checks that every frame of the fake-class HDF5 file is in the combined file.

    For each fake frame found in the combined file (matched by frame name,
    first occurrence), the combined label must be 1 and the pixel data must be
    identical. Findings are printed; nothing is returned or raised.

    Args:
        fake_path (str): Path to the fake-class HDF5 file.
        combined_path (str): Path to the combined HDF5 file.
    """
    with h5py.File(fake_path, "r") as fake_file, h5py.File(combined_path, "r") as combined_file:
        # Load fake data
        fake_frame_names = [name.decode("utf-8") for name in fake_file["frame_names"][:]]
        fake_data = fake_file["preprocessed_data"][:]

        # Load combined data
        combined_frame_names = [name.decode("utf-8") for name in combined_file["frame_names"][:]]
        combined_labels = combined_file["labels"][:]
        combined_data = combined_file["preprocessed_data"][:]

        # Map each name to its first position, as list.index() would return,
        # so lookups are O(1) instead of a list scan per frame.
        first_index = {}
        for position, name in enumerate(combined_frame_names):
            first_index.setdefault(name, position)

        # Check if all fake frames are in the combined dataset
        missing_frames = [name for name in fake_frame_names if name not in first_index]
        if missing_frames:
            print(f"Warning: {len(missing_frames)} fake frames are missing in the combined dataset.")
            print("Missing frames:", missing_frames)
        else:
            print("All fake frames are present in the combined dataset.")

        # Verify labels and data for fake frames
        print("\nVerifying labels and data for fake frames...")
        for i, frame_name in enumerate(fake_frame_names):
            combined_index = first_index.get(frame_name)
            if combined_index is None:
                continue  # already reported as missing above

            # Check label (fake data should have label 1)
            if combined_labels[combined_index] != 1:
                print(f"Label mismatch for fake frame: {frame_name} (Expected: 1, Found: {combined_labels[combined_index]})")

            # Check preprocessed data
            if not np.array_equal(fake_data[i], combined_data[combined_index]):
                print(f"Data mismatch for fake frame: {frame_name}")

        print("Verification for fake data complete.")


# Compare the fake-class file against the combined file and print any discrepancies
verify_fake_data_against_combined(fake_output_path, combined_output_path)

**2.4 Loading as a DataFrame to Check for Duplicate Frames**

In [None]:
def load_h5_to_dataframe(file_path):
    """
    Load frame names and labels from an HDF5 file into a Pandas DataFrame.

    The image data itself is not loaded.

    Args:
        file_path (str): Path to the HDF5 file; it must contain the
            "frame_names" and "labels" datasets.

    Returns:
        pd.DataFrame: One row per stored frame with the columns
        "frame_name" (str) and "label".
    """
    with h5py.File(file_path, "r") as f:
        raw_names = f["frame_names"][:]
        labels = f["labels"][:]

    # Names are stored as UTF-8; decode explicitly instead of relying on
    # numpy's implicit bytes-to-str conversion in astype(str).
    frame_names = [
        name.decode("utf-8") if isinstance(name, bytes) else str(name)
        for name in raw_names
    ]

    # Create a DataFrame
    df = pd.DataFrame({
        "frame_name": frame_names,
        "label": labels,
    })

    return df

# Frame names and labels of the combined file, used by the duplicate check
combined_df = load_h5_to_dataframe(combined_output_path)

In [None]:
def check_duplicate_frames(combined_df):
    """
    Reports rows of the combined DataFrame that share a frame name.

    Every row whose "frame_name" occurs more than once is counted and printed
    (all occurrences, not only the repeats). Nothing is returned.

    Args:
        combined_df (pd.DataFrame): DataFrame for the combined data; needs a
            "frame_name" column.
    """
    # keep=False marks every member of a duplicate group, including the first
    repeated_mask = combined_df.duplicated("frame_name", keep=False)
    repeated_rows = combined_df[repeated_mask]

    if repeated_rows.empty:
        print("No duplicate frames detected in the combined file.")
        return

    print(f"Warning: {len(repeated_rows)} duplicate frames detected in the combined file.")
    print("Duplicate frames:")
    print(repeated_rows)

# Report frame names that occur more than once in the combined file
check_duplicate_frames(combined_df)

**3. Spatial Dataset Creation**

In [None]:
import h5py
import numpy as np
import tensorflow as tf

def create_spatial_dataset(file_path):
    """
    Builds a tf.data.Dataset of individual (frame, label) pairs from an HDF5 file.

    The whole file is read into memory, frames that are entirely zero are
    dropped, and the remaining frames are converted to float32 and divided by
    255. The dataset keeps the stored order and is neither batched nor
    shuffled; only prefetching is applied.

    Args:
        file_path (str): HDF5 file with the "frame_names", "preprocessed_data"
            and "labels" datasets.

    Returns:
        tf.data.Dataset: Elements are ``(frame, label)``.
    """
    with h5py.File(file_path, "r") as h5_file:
        # Read but not used below; kept so a file lacking this dataset still fails here
        _unused_names = h5_file["frame_names"][:]
        all_frames = h5_file["preprocessed_data"][:]
        all_labels = h5_file["labels"][:]

    # A frame is kept as soon as any of its values is non-zero
    keep = [idx for idx in range(len(all_frames)) if all_frames[idx].any()]

    kept_frames = all_frames[keep].astype(np.float32) / 255.0
    kept_labels = all_labels[keep]

    pairs = tf.data.Dataset.from_tensor_slices((kept_frames, kept_labels))
    return pairs.prefetch(tf.data.experimental.AUTOTUNE)

In [None]:
# Build the per-frame dataset from the combined HDF5 file
file_path = "/content/drive/MyDrive/ff++preprocessed_combined_data.h5"
dataset = create_spatial_dataset(file_path)

In [None]:
# Sanity check: print the shapes of the first five (frame, label) elements
for frame, label in dataset.take(5):  # Check the first 5 elements
    print("Frame shape:", frame.shape)  # Shape of the frame
    print("Label shape:", label.shape)  # Shape of the label

**3.1 Spatial Dataset Split**

In [None]:
# Materialize the tf.data pipeline into a Python list of (frame, label) pairs for scikit-learn
dataset_list = list(dataset)

# Split the dataset into train and validation sets (80% train, 20% validation)
# NOTE(review): the split is per frame and not stratified, so frames of the same
# video can land in both sets - confirm this is intended for evaluation.
train_data, val_data = train_test_split(dataset_list, test_size=0.2, random_state=42)

# Print the sizes of the splits
print("Training samples:", len(train_data))
print("Validation samples:", len(val_data))

**3.2 Spatial Train and Val Saving**

In [None]:
import h5py
import numpy as np

# Convert the lists of (frame, label) tensor pairs back into stacked NumPy arrays
train_frames = np.array([frame.numpy() for frame, label in train_data])
train_labels = np.array([label.numpy() for frame, label in train_data])

val_frames = np.array([frame.numpy() for frame, label in val_data])
val_labels = np.array([label.numpy() for frame, label in val_data])

# Save the training data to an HDF5 file
# (no compression argument is passed, so the normalized frames are stored as-is)
with h5py.File('/content/drive/MyDrive/train_spatial_data.h5', 'w') as f:
    f.create_dataset('frames', data=train_frames)
    f.create_dataset('labels', data=train_labels)

# Save the validation data to an HDF5 file
with h5py.File('/content/drive/MyDrive/val_spatial_data.h5', 'w') as f:
    f.create_dataset('frames', data=val_frames)
    f.create_dataset('labels', data=val_labels)

print("Training and validation data for Spatial Model saved to HDF5 files.")

**4. Temporal Dataset Creation**

In [None]:
def create_temporal_dataset(file_path):
    """
    Creates a TensorFlow Dataset that yields one element per video stored in an HDF5 file.

    Frames are grouped by the video prefix of their name: the part before
    ``_face_`` with its trailing ``_<frame number>`` removed. For every video,
    all-zero frames are dropped, the remaining frames are ordered by frame
    name, converted to float32 and divided by 255. Videos left without any
    frame are filtered out of the dataset.

    Args:
        file_path (str): HDF5 file with the "frame_names", "preprocessed_data"
            and "labels" datasets.

    Returns:
        tf.data.Dataset: Elements are ``(frames, label, video_name)`` where
        ``frames`` is float32 with one row per frame, ``label`` is a float32
        scalar (the first frame's label) and ``video_name`` is a string scalar.
        The dataset is prefetched but not batched or shuffled.
    """
    def video_key(frame_name):
        # "<video>_<frame number>_face_<i>.jpg" -> "<video>"
        return frame_name.split('_face_')[0].rsplit('_', 1)[0]

    # Scan the frame names once and remember which rows belong to which video,
    # instead of re-reading and re-scanning the names for every video.
    with h5py.File(file_path, "r") as f:
        frame_names = [
            name.decode('utf-8') if isinstance(name, bytes) else str(name)
            for name in f["frame_names"][:]
        ]
        frame_shape = tuple(f["preprocessed_data"].shape[1:])

    indices_by_video = {}
    for index, name in enumerate(frame_names):
        indices_by_video.setdefault(video_key(name), []).append(index)

    unique_video_names = np.array(sorted(indices_by_video), dtype=str)

    def load_video_data(video_name):
        # tf.py_function passes an eager string tensor, not a Python str
        key = video_name.numpy().decode('utf-8')
        frame_indices = np.asarray(indices_by_video[key])  # ascending row numbers

        with h5py.File(file_path, "r") as f:
            frames = f["preprocessed_data"][frame_indices]
            labels = f["labels"][frame_indices]

        # Rows sit in the order the files were listed when the HDF5 file was
        # written; sort by name so the zero-padded frame number gives the sequence.
        order = sorted(
            range(len(frame_indices)),
            key=lambda pos: frame_names[frame_indices[pos]],
        )
        frames = frames[order]
        labels = labels[order]

        # Filter out empty frames (all zeros)
        non_empty_indices = [i for i, frame in enumerate(frames) if frame.any()]
        empty_count = len(frames) - len(non_empty_indices)
        if empty_count > 0:
            print(f"Video {key}: Filtered out {empty_count} empty frames.")

        # A py_function cannot return None for a declared dtype, so a video
        # without valid frames is returned with zero rows and removed by the
        # filter step of the pipeline.
        if len(non_empty_indices) == 0:
            print(f"Warning: Video {key} has no valid frames. Skipping.")
            return np.zeros((0,) + frame_shape, dtype=np.float32), np.float32(0), video_name

        frames = frames[non_empty_indices]
        labels = labels[non_empty_indices]

        # One label per video: warn if the frames disagree and use the first one
        first_label = labels[0]
        if not np.all(labels == first_label):
            print(f"Warning: Inconsistent labels in video {key}. Using the first label.")

        # Convert frames to float32 and normalize to [0, 1]
        frames = frames.astype(np.float32) / 255.0

        # The label is cast to match the float32 dtype declared to py_function
        return frames, np.float32(first_label), video_name

    def load_with_static_shapes(video_name):
        frames, label, name = tf.py_function(
            load_video_data, [video_name], [tf.float32, tf.float32, tf.string]
        )
        # py_function outputs carry no static shape; restore what is known
        frames.set_shape((None,) + frame_shape)
        label.set_shape(())
        name.set_shape(())
        return frames, label, name

    # Create a TensorFlow Dataset from the list of video names
    dataset = tf.data.Dataset.from_tensor_slices(unique_video_names)

    # Map each video name to its frames, label, and video name
    dataset = dataset.map(
        load_with_static_shapes,
        num_parallel_calls=tf.data.experimental.AUTOTUNE
    )

    # Filter out videos with no valid frames (they arrive with zero rows)
    dataset = dataset.filter(lambda frames, label, video_name: tf.shape(frames)[0] > 0)

    # Prefetch the dataset (no batching here)
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

    return dataset

# Build the per-video dataset from the combined HDF5 file
# (this rebinds `dataset`, which previously held the per-frame spatial dataset)
dataset = create_temporal_dataset(file_path)

In [None]:
# Convert the dataset to a list for splitting (one (frames, label, video_name) element per video)
dataset_list = list(dataset)

# Split the dataset into train and validation sets (80% train, 20% validation)
train_data, val_data = train_test_split(dataset_list, test_size=0.2, random_state=42)
# Print the sizes of the splits
print("Training samples:", len(train_data))
print("Validation samples:", len(val_data))

In [None]:
# Drop video_name and keep only frames and labels.
# Indexing instead of three-way unpacking keeps this cell re-runnable: once the
# lists hold (frames, label) pairs, unpacking three values would raise ValueError.
train_data = [(item[0], item[1]) for item in train_data]
val_data = [(item[0], item[1]) for item in val_data]

**4.1 Saving Temporal Train and Val**

In [None]:
# Save train_data to an HDF5 file: one group per video ("video_<i>") holding
# that video's frame stack and its single label.
# NOTE(review): the two paths below are relative, so the files land in the
# kernel's working directory rather than on Drive like the spatial files - confirm.
with h5py.File('train_temporal_data.h5', 'w') as f:
    for i, (frames, label) in enumerate(train_data):
        # Create a group for each video
        video_group = f.create_group(f'video_{i}')
        # Save frames and label for this video
        video_group.create_dataset('frames', data=frames)
        video_group.create_dataset('label', data=label)

# Save val_data to an HDF5 file (same layout as the training file)
with h5py.File('val_temporal_data.h5', 'w') as f:
    for i, (frames, label) in enumerate(val_data):
        # Create a group for each video
        video_group = f.create_group(f'video_{i}')
        # Save frames and label for this video
        video_group.create_dataset('frames', data=frames)
        video_group.create_dataset('label', data=label)