In [5]:
import os
import cv2
import random
import numpy as np
import re
import tensorflow as tf
# import keras._tf_keras.keras.models
from datetime import datetime
import shutil
from concurrent.futures import ThreadPoolExecutor, as_completed
from tensorflow.keras.preprocessing.image import img_to_array, array_to_img, ImageDataGenerator
from tensorflow.keras import layers, models
from tensorflow.keras.applications import resnet50, efficientnet

In [7]:
# Record the TensorFlow version this notebook was executed with
print(tf.__version__)

2.19.0


1. Extracting a fixed number of frames (60) from each video (original_dataset_even_frames folder)

In [1]:
def _save_frame_pair(frame, index, output_dir, gray_output_dir):
    """
    Write one frame and its grayscale conversion as ``frame{index:04d}.jpg``.

    :param frame: Frame as returned by ``cv2.VideoCapture.read`` (converted with COLOR_BGR2GRAY).
    :param index: 1-based frame number used in the file name.
    :param output_dir: Directory receiving the colour frame.
    :param gray_output_dir: Directory receiving the grayscale frame.
    :raises IOError: If OpenCV reports that either image could not be written.
    """
    filename = f"frame{index:04d}.jpg"
    gray_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    targets = (
        (os.path.join(output_dir, filename), frame),
        (os.path.join(gray_output_dir, filename), gray_frame),
    )
    for path, image in targets:
        # cv2.imwrite signals failure through its return value, not an exception
        if not cv2.imwrite(path, image):
            raise IOError(f"Failed to write frame image: {path}")


def extract_frames_with_priority_deletion(video_path, output_dir, gray_output_dir, target_frames=60):
    """
    Extract frames focusing on the middle section of the video and save both RGB and grayscale versions.
    If the video has fewer than ``target_frames`` frames, the last frame is repeated to fill the deficit.
    If the video has more, 25% of the extra frames are dropped from the start and 75% from the end.

    The number of frames is taken from what was actually decoded rather than from the
    CAP_PROP_FRAME_COUNT metadata, which can differ from the decodable frame count.

    :param video_path: Path to the video file.
    :param output_dir: Directory where the RGB frames will be saved.
    :param gray_output_dir: Directory where the grayscale frames will be saved.
    :param target_frames: The number of frames to extract (default 60).
    :raises ValueError: If no frame could be decoded from ``video_path``.
    :raises IOError: If a frame image could not be written.
    """
    # Decode every frame up front; always release the capture, even on error
    cap = cv2.VideoCapture(video_path)
    try:
        frames = []
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frames.append(frame)
    finally:
        cap.release()

    total_frames = len(frames)
    if total_frames == 0:
        raise ValueError(f"No frames could be decoded from {video_path}")

    # Create output directories only once we know there is something to save
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(gray_output_dir, exist_ok=True)

    if total_frames > target_frames:
        # Too long: trim the surplus, 75% from the end and the remainder from the start
        frames_to_delete = total_frames - target_frames
        delete_from_end = int(frames_to_delete * 0.75)
        delete_from_start = frames_to_delete - delete_from_end
        selected = frames[delete_from_start:total_frames - delete_from_end]
    else:
        # Too short (or exact): keep everything and pad with copies of the last frame
        selected = frames + [frames[-1]] * (target_frames - total_frames)

    for index, frame in enumerate(selected, start=1):
        _save_frame_pair(frame, index, output_dir, gray_output_dir)

def process_video(video_file, video_folder, output_folder_rgb, output_folder_gray, count):
    """
    Extract 60 frames from one video into per-video RGB and grayscale sub-folders.

    :param video_file: File name of the video inside ``video_folder``.
    :param video_folder: Directory containing the video.
    :param output_folder_rgb: Root folder for RGB frames; a sub-folder named after the video stem is used.
    :param output_folder_gray: Root folder for grayscale frames; same sub-folder naming.
    :param count: Sequence number shown in the progress message.
    """
    stem = os.path.splitext(video_file)[0]
    source_path = os.path.join(video_folder, video_file)
    rgb_dir = os.path.join(output_folder_rgb, stem)
    gray_dir = os.path.join(output_folder_gray, stem)

    print(f"{count}. Processing {video_file}...")
    extract_frames_with_priority_deletion(source_path, rgb_dir, gray_dir, target_frames=60)

def run_multithreading(video_folder, output_folder_rgb, output_folder_gray):
    """
    Run ``process_video`` for every ``.mp4`` file in ``video_folder`` on a pool of 8 threads.

    A failure in one video is printed and does not stop the remaining videos.

    :param video_folder: Directory that is scanned for files ending in ``.mp4``.
    :param output_folder_rgb: Root folder passed through for RGB frames.
    :param output_folder_gray: Root folder passed through for grayscale frames.
    """
    video_files = []
    for entry in os.listdir(video_folder):
        if entry.endswith('.mp4'):
            video_files.append(entry)

    # Adjust max_workers to the number of threads you want
    with ThreadPoolExecutor(max_workers=8) as pool:
        pending = {}
        for index, name in enumerate(video_files, start=1):
            job = pool.submit(process_video, name, video_folder, output_folder_rgb, output_folder_gray, index)
            pending[job] = name

        # Report each job as it finishes; result() re-raises any worker exception
        for job in as_completed(pending):
            name = pending[job]
            try:
                job.result()
            except Exception as exc:
                print(f'{name} generated an exception: {exc}')

# Input/output locations for frame extraction.
# NOTE(review): these are absolute, machine-specific Windows paths; adjust them before running elsewhere.
video_folder = r'D:\pro_dis\videos\HD_720p'  # Your video folder path
output_folder_rgb = r'D:\pro_dis\original_dataset_even_frames'  # Folder to save RGB frames
output_folder_gray = r'D:\pro_dis\original_gray_dataset_even_frames'  # Folder to save grayscale frames

# Extract frames from every .mp4 in video_folder using the thread pool
run_multithreading(video_folder, output_folder_rgb, output_folder_gray)

print("\nRGB and grayscale frame extraction complete!")

1. Processing ace.mp4...
2. Processing act.mp4...
3. Processing add.mp4...
4. Processing age.mp4...
5. Processing aid.mp4...
6. Processing aids.mp4...
7. Processing aim.mp4...
8. Processing air.mp4...
9. Processing all.mp4...10. Processing anal.mp4...

11. Processing and.mp4...
12. Processing app.mp4...
13. Processing approx.mp4...
14. Processing arc.mp4...
15. Processing arch.mp4...
16. Processing are.mp4...
17. Processing arm.mp4...
18. Processing armed.mp4...
19. Processing arms.mp4...
20. Processing art.mp4...
21. Processing arts.mp4...22. Processing ash.mp4...

23. Processing ask.mp4...
24. Processing ass.mp4...
25. Processing aunt.mp4...
26. Processing auto.mp4...
27. Processing babe.mp4...
28. Processing back.mp4...
29. Processing bad.mp4...
30. Processing bag.mp4...
31. Processing bags.mp4...32. Processing bald.mp4...

33. Processing ball.mp4...
34. Processing balls.mp4...
35. Processing ban.mp4...
36. Processing band.mp4...
37. Processing bang.mp4...
38. Processing bank.mp4...

2.1. Augmenting the fixed-length frame sets from the original_dataset_even_frames folder; frame counts vary after augmentation (augmented_dataset_uneven_frames folder)

In [3]:
# ImageDataGenerator for basic augmentations (horizontal flip, brightness).
# Module-level object: augment_and_save_frame calls datagen.random_transform on each frame.
datagen = ImageDataGenerator(
    horizontal_flip=True,
    brightness_range=[0.8, 1.2]  # Adjusting brightness (color jittering)
)

# Function to apply noise to an image
def add_noise(image):
    """
    Add zero-mean Gaussian noise to an image and return it as uint8.

    One noise value is drawn per pixel position (sigma 0.1, scaled by 255); for a
    3-D input the same noise plane is broadcast across every channel. The result is
    clipped to [0, 255].

    :param image: 2-D (rows, cols) or 3-D (rows, cols, channels) array.
    :return: Noisy image with the same shape as ``image``, dtype uint8.
    """
    is_flat = image.ndim == 2
    if is_flat:
        n_rows, n_cols = image.shape
    else:
        n_rows, n_cols, _ = image.shape

    noise_mean, noise_sigma = 0, 0.1  # fixed noise distribution parameters
    noise = np.random.normal(noise_mean, noise_sigma, (n_rows, n_cols))

    # Give the noise plane a trailing axis so it broadcasts over channels
    if not is_flat:
        noise = noise[:, :, np.newaxis]

    return np.clip(image + noise * 255, 0, 255).astype(np.uint8)

# Function to augment and save augmented frames
def augment_and_save_frame(image_path, output_dir, count):
    """
    Load one grayscale frame, apply random augmentation, and save it as ``frame{count:04d}.jpg``.

    The frame goes through ``datagen.random_transform`` (the module-level generator)
    and, with probability 0.3, through ``add_noise``.

    :param image_path: Path of the source frame image.
    :param output_dir: Existing directory that receives the augmented frame.
    :param count: Frame number used in the output file name.
    :return: None. Files that cannot be decoded as images are skipped with a warning.
    """
    img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)  # Load grayscale image
    if img is None:
        # cv2.imread returns None for unreadable or non-image files instead of raising
        print(f"Warning: Could not read {image_path}. Skipping.")
        return
    img = img_to_array(img)  # Convert image to array for augmentation

    # Apply horizontal flipping and brightness adjustments
    img = datagen.random_transform(img)

    # Add noise to 30% of the frames
    if random.random() < 0.3:
        img = add_noise(img)

    # scale=False keeps pixel values as they are instead of rescaling them on conversion
    aug_img = array_to_img(img, scale=False)
    aug_img.save(os.path.join(output_dir, f"frame{count:04d}.jpg"))

# Function to randomly shift frames temporally
def temporal_shift(frames):
    """
    Rotate a sequence along its first axis by a random offset between -2 and +2.

    Elements pushed past one end wrap around to the other end (``np.roll`` semantics).

    :param frames: Array-like sequence whose first axis is rolled.
    :return: Rolled array.
    """
    offset = random.randint(-2, 2)
    rolled = np.roll(frames, offset, axis=0)
    return rolled

# Function to augment frames with all techniques and store only augmented frames
def augment_frames(frame_dir, output_dir):
    """
    Augment every frame of one sequence and write the results to ``output_dir``.

    Steps:
      * the frame order is temporally shifted by a random offset (see ``temporal_shift``);
      * about 10% of the positions are randomly either dropped or duplicated;
      * every remaining frame is augmented and saved via ``augment_and_save_frame``.

    Regular frames are saved as numbers 1..N (N = number of input frames). Duplicates
    are saved as numbers N+1..2N so they can never overwrite a regular frame.

    :param frame_dir: Directory holding the source frames of one sequence.
    :param output_dir: Existing directory that receives the augmented frames.
    """
    # List all frame files in the directory
    frames = sorted(os.listdir(frame_dir))
    total = len(frames)

    # Randomly remove or duplicate 10% of frames to simulate frame corruption/duplication
    num_modify = int(0.1 * total)
    modify_indices = set(random.sample(range(total), num_modify))

    # Apply the temporal shift to the order of the file names. This shifts the
    # sequence without having to hold every decoded image in memory.
    shifted_frames = temporal_shift(np.array(frames)).tolist()

    # Save augmented frames to output directory
    for count, frame_name in enumerate(shifted_frames):
        frame_path = os.path.join(frame_dir, frame_name)

        # If the position is selected for modification (removal or duplication)
        if count in modify_indices:
            if random.random() < 0.5:  # Randomly decide whether to remove or duplicate
                # Duplicate: extra copy numbered after all regular frames (N+1..2N)
                augment_and_save_frame(frame_path, output_dir, count + 1 + total)
            else:
                # Skip the frame to simulate removal
                continue

        # Apply augmentation to the frame and save it
        augment_and_save_frame(frame_path, output_dir, count + 1)

# Function to process each word directory in parallel
def process_word_dir(word_dir, count):
    """
    Augment the frames of one word folder.

    Reads from ``original_frames_folder/word_dir`` and writes to
    ``augmented_frames_folder/word_dir`` (both roots are module-level globals).
    The output folder is created before the source is checked.

    :param word_dir: Name of the word sub-folder.
    :param count: Sequence number shown in the progress message.
    """
    source_dir = os.path.join(original_frames_folder, word_dir)
    target_dir = os.path.join(augmented_frames_folder, word_dir)  # New output directory
    os.makedirs(target_dir, exist_ok=True)

    # Nothing to augment when the source entry is not a directory
    if not os.path.isdir(source_dir):
        return

    print(f"{count}. Augmenting grayscale frames for {word_dir}...")
    augment_frames(source_dir, target_dir)

# Main function to handle multi-threading
def augment_all_words():
    """
    Augment every word folder under ``original_frames_folder`` on a pool of 8 threads.

    Each worker's result is collected so that an exception raised while processing a
    word is reported instead of being silently discarded with its future.
    """
    word_dirs = [
        word_dir
        for word_dir in os.listdir(original_frames_folder)
        if os.path.isdir(os.path.join(original_frames_folder, word_dir))
    ]

    # Use ThreadPoolExecutor for multi-threading
    with ThreadPoolExecutor(max_workers=8) as executor:  # Adjust the number of threads as needed
        future_to_word = {
            executor.submit(process_word_dir, word_dir, count): word_dir
            for count, word_dir in enumerate(word_dirs, start=1)
        }

        # result() re-raises whatever the worker raised; report it per word
        for future in as_completed(future_to_word):
            word_dir = future_to_word[future]
            try:
                future.result()
            except Exception as exc:
                print(f'{word_dir} generated an exception: {exc}')

    print("\nAll grayscale frames augmented and saved in 'augmented_gray_dataset_uneven_frames'.")

# Path where the original grayscale frames are stored
# (read as a module-level global by process_word_dir and augment_all_words)
original_frames_folder = r'D:\pro_dis\original_gray_dataset_even_frames'

# Output folder for storing only augmented grayscale frames
# (read as a module-level global by process_word_dir)
augmented_frames_folder = r'D:\pro_dis\augmented_gray_dataset_uneven_frames'

# Run the multi-threading process
augment_all_words()

1. Augmenting grayscale frames for ace...
2. Augmenting grayscale frames for act...
3. Augmenting grayscale frames for add...
4. Augmenting grayscale frames for age...
5. Augmenting grayscale frames for aid...
6. Augmenting grayscale frames for aids...
7. Augmenting grayscale frames for aim...
8. Augmenting grayscale frames for air...
9. Augmenting grayscale frames for all...
10. Augmenting grayscale frames for anal...
11. Augmenting grayscale frames for and...
12. Augmenting grayscale frames for app...
13. Augmenting grayscale frames for approx...
14. Augmenting grayscale frames for arc...
15. Augmenting grayscale frames for arch...
16. Augmenting grayscale frames for are...
17. Augmenting grayscale frames for arm...
18. Augmenting grayscale frames for armed...
19. Augmenting grayscale frames for arms...
20. Augmenting grayscale frames for art...
21. Augmenting grayscale frames for arts...
22. Augmenting grayscale frames for ash...
23. Augmenting grayscale frames for ask...
24. Augmen

2.2. The augmented_dataset_uneven_frames folders have now been created. Padding or trimming each of them to exactly 60 frames (augmented_dataset_even_frames folder)

In [39]:
# Function to ensure there are exactly 60 frames with proper naming convention
def ensure_60_frames(frame_dir, output_dir, target_frames=60):
    """
    Copy the frames of ``frame_dir`` into ``output_dir`` as exactly ``target_frames`` files.

    Frames are taken in sorted file-name order and renamed sequentially
    (``frame0001.jpg``, ``frame0002.jpg``, ...). Surplus frames beyond
    ``target_frames`` are ignored; a deficit is filled with copies of the last frame.
    The input directory is only read, never modified.

    :param frame_dir: Directory holding the source frames.
    :param output_dir: Directory receiving the renumbered frames (created if missing).
    :param target_frames: Number of frames to produce (default 60).
    :return: None. An empty ``frame_dir`` is skipped with a warning.
    """
    # List all frame files in the directory
    frames = sorted(os.listdir(frame_dir))
    if not frames:
        print(f"Warning: No frames found in {frame_dir}. Skipping.")
        return

    os.makedirs(output_dir, exist_ok=True)

    # Keep at most target_frames frames; extra frames at the end are simply not copied
    kept = frames[:max(target_frames, 0)]
    for count, frame in enumerate(kept, start=1):
        input_frame_path = os.path.join(frame_dir, frame)
        output_frame_path = os.path.join(output_dir, f"frame{count:04d}.jpg")
        shutil.copy(input_frame_path, output_frame_path)

    # If fewer than target_frames, repeat the last frame until the target is reached
    if kept:
        last_frame_path = os.path.join(frame_dir, kept[-1])
        for count in range(len(kept) + 1, target_frames + 1):
            shutil.copy(last_frame_path, os.path.join(output_dir, f"frame{count:04d}.jpg"))

# Path where the augmented uneven grayscale frames are stored
uneven_frames_folder = r'D:\pro_dis\augmented_gray_dataset_uneven_frames'

# Output folder for storing frames with exactly 60 frames in grayscale
even_frames_folder = r'D:\pro_dis\augmented_gray_dataset_even_frames'

# Ensure all folders in "augmented_gray_dataset_uneven_frames" have exactly 60 frames and save them to "augmented_gray_dataset_even_frames"
# NOTE(review): the output directory is created before the isdir check, so a stray
# file in uneven_frames_folder still produces an (empty) output folder.
for word_dir in os.listdir(uneven_frames_folder):
    word_frame_dir = os.path.join(uneven_frames_folder, word_dir)
    output_word_dir = os.path.join(even_frames_folder, word_dir)
    os.makedirs(output_word_dir, exist_ok=True)  # Create output directory for each word

    if os.path.isdir(word_frame_dir):
        print(f"Ensuring 60 frames for {word_dir}...")
        ensure_60_frames(word_frame_dir, output_word_dir)

print("\nAll folders processed and saved with exactly 60 frames in 'augmented_gray_dataset_even_frames'.")

Ensuring 60 frames for Bat...
Ensuring 60 frames for Big...
Ensuring 60 frames for Book...
Ensuring 60 frames for Car...
Ensuring 60 frames for Cat...
Ensuring 60 frames for Cup...
Ensuring 60 frames for Dog...
Ensuring 60 frames for Drop...
Ensuring 60 frames for Eat...
Ensuring 60 frames for Fast...
Ensuring 60 frames for Fish...
Ensuring 60 frames for Fun...
Ensuring 60 frames for Go...
Ensuring 60 frames for Hat...
Ensuring 60 frames for Hot...
Ensuring 60 frames for Jump...
Ensuring 60 frames for Kick...
Ensuring 60 frames for Leg...
Ensuring 60 frames for Milk...
Ensuring 60 frames for No...
Ensuring 60 frames for Pen...
Ensuring 60 frames for Pick...
Ensuring 60 frames for Red...
Ensuring 60 frames for Run...
Ensuring 60 frames for Sit...
Ensuring 60 frames for Stop...
Ensuring 60 frames for Sun...
Ensuring 60 frames for Top...
Ensuring 60 frames for Win...
Ensuring 60 frames for Yes...

All folders processed and saved with exactly 60 frames in 'augmented_gray_dataset_even_frame

3. Cropping the mouth region with basic algorithm and saving it in different folders (original_dataset_mouth_cropped, augmented_dataset_mouth_cropped folders)

In [40]:
# Haar cascade classifier for frontal-face detection, loaded from the cascade file
# located under cv2.data.haarcascades. Used as a module-level global by crop_mouth_region.
face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')

# Helper function for natural sorting
def natural_sort_key(s, _nsre=re.compile('([0-9]+)')):
    """
    Build a sort key that orders embedded numbers numerically ("human" order).

    The string is split into digit and non-digit runs; digit runs become ints and
    the rest is lower-cased. Example: frame2.jpg sorts before frame10.jpg.

    :param s: String to build the key for.
    :param _nsre: Pre-compiled splitting pattern (implementation detail).
    :return: List mixing lower-cased text pieces and ints.
    """
    key = []
    for chunk in _nsre.split(s):
        if chunk.isdigit():
            key.append(int(chunk))
        else:
            key.append(chunk.lower())
    return key

# Updated function to crop the mouth region with new parameters
def crop_mouth_region(image, height_ratio=0.8, width_ratio=0.8, scale_factor=0.5):
    """
    Crop the lower-face (mouth) area of the first face detected in a grayscale frame.

    :param image: The input frame (already grayscale, so no colour conversion is done).
    :param height_ratio: Crop height as a proportion of the detected face height (default 0.8).
    :param width_ratio: Crop width as a proportion of the detected face width (default 0.8).
    :param scale_factor: How far down the face box the crop starts, as a proportion of
                         the face height (default 0.5 = halfway down).
    :return: The cropped region, or, when no face is found, an all-black uint8 image
             sized by applying the two ratios to the whole frame.
    """
    detections = face_cascade.detectMultiScale(image, 1.3, 5)

    if len(detections) == 0:
        # No face: hand back a black placeholder derived from the frame size
        frame_h, frame_w = image.shape
        return np.zeros((int(frame_h * height_ratio), int(frame_w * width_ratio)), dtype=np.uint8)

    # Only the first detection is used
    face_x, face_y, face_w, face_h = detections[0]

    target_w = int(face_w * width_ratio)
    target_h = int(face_h * height_ratio)

    # Horizontally centred inside the face box, starting scale_factor of the way down
    left = max(0, face_x + (face_w - target_w) // 2)
    top = max(0, int(face_y + face_h * scale_factor))

    # Shrink the crop if it would run past the right or bottom edge of the frame
    target_w = min(target_w, image.shape[1] - left)
    target_h = min(target_h, image.shape[0] - top)

    return image[top:top + target_h, left:left + target_w]

# Function for resizing and normalization
def resize_and_normalize(image, target_size=(128, 128)):
    """
    Resize an image with ``cv2.resize`` and divide its pixel values by 255.

    :param image: Cropped mouth region.
    :param target_size: Size passed to ``cv2.resize`` (default is 128x128).
    :return: Resized image divided by 255.0.
    """
    scaled = cv2.resize(image, target_size)
    return scaled / 255.0

# Function to process frames, crop the mouth region, resize, and normalize
def process_frames_for_feature_extraction(frame_dir, output_dir, target_size=(128, 128)):
    """
    Process frames by cropping the mouth region, resizing, and normalizing.

    Every readable frame produces exactly one output image named after its position
    in the naturally sorted listing. If no usable mouth crop is available, a black
    placeholder of ``target_size`` is written so the sequence keeps its length.

    :param frame_dir: Directory where the frames are stored.
    :param output_dir: Output directory for the processed frames (created if missing).
    :param target_size: The size for resizing the cropped image.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Apply natural sorting to the frames
    frames = sorted(os.listdir(frame_dir), key=natural_sort_key)

    for count, frame_name in enumerate(frames):
        frame_path = os.path.join(frame_dir, frame_name)
        frame = cv2.imread(frame_path, cv2.IMREAD_GRAYSCALE)  # Directly read in grayscale mode

        if frame is None:
            print(f"Warning: Could not read {frame_path}. Skipping.")
            continue

        # Step 1: Crop the mouth region
        mouth = crop_mouth_region(frame)

        if mouth is None or mouth.size == 0:
            # A zero-size crop cannot be resized; actually write the placeholder
            # that the warning announces. target_size is used as (width, height),
            # matching how it is passed to the resize step.
            print(f"Warning: No mouth detected in {frame_name}. Using placeholder.")
            output_image = np.zeros((target_size[1], target_size[0]), dtype=np.uint8)
        else:
            # Step 2: Resize and normalize the cropped mouth region
            resized_normalized_mouth = resize_and_normalize(mouth, target_size)
            # Round before casting so float error in the /255 -> *255 round trip
            # cannot truncate a pixel value down by one
            output_image = np.rint(resized_normalized_mouth * 255).astype(np.uint8)

        # Save the frame as an image with 0-255 pixel values
        save_path = os.path.join(output_dir, f"frame{count + 1:04d}.jpg")
        cv2.imwrite(save_path, output_image)

# Paths for original and augmented grayscale datasets
original_gray_frames_folder = r'D:\pro_dis\original_gray_dataset_even_frames'  # Original grayscale frames
augmented_gray_frames_folder = r'D:\pro_dis\augmented_gray_dataset_even_frames'  # Augmented grayscale frames

# Output folders for processed frames
original_gray_output_folder = r'D:\pro_dis\original_gray_dataset_mouth_cropped'  # Mouth-cropped original grayscale dataset
augmented_gray_output_folder = r'D:\pro_dis\augmented_gray_dataset_mouth_cropped'  # Mouth-cropped augmented grayscale dataset

# Process the original grayscale dataset (crop mouth, resize, normalize)
# NOTE(review): entries are not checked with os.path.isdir, so every entry of the
# dataset folder is passed on as if it were a frame directory.
for word_dir in os.listdir(original_gray_frames_folder):
    word_frame_dir = os.path.join(original_gray_frames_folder, word_dir)
    output_dir = os.path.join(original_gray_output_folder, word_dir)  # Output folder for original grayscale dataset
    print(f"Processing {word_dir} from original grayscale dataset...")
    process_frames_for_feature_extraction(word_frame_dir, output_dir, target_size=(128, 128))

print("\nMouth cropping, resizing (128x128), and normalization complete for original datasets!\n")

# Process the augmented grayscale dataset (crop mouth, resize, normalize)
for word_dir in os.listdir(augmented_gray_frames_folder):
    word_frame_dir = os.path.join(augmented_gray_frames_folder, word_dir)
    output_dir = os.path.join(augmented_gray_output_folder, word_dir)  # Output folder for augmented grayscale dataset
    print(f"Processing {word_dir} from augmented grayscale dataset...")
    process_frames_for_feature_extraction(word_frame_dir, output_dir, target_size=(128, 128))

print("\nMouth cropping, resizing (128x128), and normalization complete for augmented datasets!")

Processing Bat from original grayscale dataset...
Processing Big from original grayscale dataset...
Processing Book from original grayscale dataset...
Processing Car from original grayscale dataset...
Processing Cat from original grayscale dataset...
Processing Cup from original grayscale dataset...
Processing Dog from original grayscale dataset...
Processing Drop from original grayscale dataset...
Processing Eat from original grayscale dataset...
Processing Fast from original grayscale dataset...
Processing Fish from original grayscale dataset...
Processing Fun from original grayscale dataset...
Processing Go from original grayscale dataset...
Processing Hat from original grayscale dataset...
Processing Hot from original grayscale dataset...
Processing Jump from original grayscale dataset...
Processing Kick from original grayscale dataset...
Processing Leg from original grayscale dataset...
Processing Milk from original grayscale dataset...
Processing No from original grayscale datase

4. Feature extraction using a 3D-CNN, a 2D-CNN and a 2D ResNet. The new 2D-CNN connects the output of the 3D-CNN to the input of the 2D ResNet.

In [29]:

# Define the 3D-CNN layer for spatio-temporal feature extraction
def build_3d_cnn(input_shape):
    """
    Build the 3D-CNN used for spatio-temporal feature extraction.

    Three Conv3D (3x3x3, ReLU) + MaxPooling3D (2x2x2) stages with 32, 64 and 128
    filters, followed by a Flatten layer.

    :param input_shape: Shape of one input sample, without the batch dimension.
    :return: Keras functional model mapping the input to the flattened features.
    """
    input_3d = layers.Input(shape=input_shape)

    features = input_3d
    for n_filters in (32, 64, 128):
        features = layers.Conv3D(n_filters, (3, 3, 3), activation='relu')(features)
        features = layers.MaxPooling3D(pool_size=(2, 2, 2))(features)

    flat = layers.Flatten()(features)
    return models.Model(inputs=input_3d, outputs=flat)

# Define the 2D ResNet model for spatial refinement
def build_2d_resnet(input_shape):
    """
    Create a frozen, headless ResNet50 with ImageNet weights.

    Only the first two entries of ``input_shape`` (height, width) are used; the
    channel count is always set to 3.

    :param input_shape: Sequence whose first two items are height and width.
    :return: ResNet50 model with every layer marked non-trainable.
    """
    height, width = input_shape[0], input_shape[1]
    backbone = resnet50.ResNet50(include_top=False, weights='imagenet', input_shape=(height, width, 3))

    # Freeze each layer individually
    for frozen_layer in backbone.layers:
        frozen_layer.trainable = False

    return backbone

# Helper function to find factors
def find_factors(n):
    """
    Return the factor pair ``(a, b)`` of ``n`` with ``a <= b`` whose members are closest together.

    :param n: Number to factor.
    :return: Tuple ``(a, n // a)`` where ``a`` is the largest divisor not above ``int(n**0.5)``.
    :raises IndexError: If no divisor exists in that range (e.g. ``n == 0``).
    """
    # Walk down from the square root; the first divisor found is the largest one
    for small in range(int(n**0.5), 0, -1):
        if n % small == 0:
            return (small, n // small)
    # Same failure as indexing the last item of an empty list
    raise IndexError("list index out of range")

# Define the combined end-to-end model (3D-CNN + ResNet)
def build_combined_model(input_shape_3d, input_shape_2d, num_classes=30):
    """
    Build and compile the end-to-end 3D-CNN + ResNet50 classifier.

    The flattened 3D-CNN output is reshaped into a single-channel 2-D grid (sides
    chosen by ``find_factors``), projected to 3 channels by a 1x1 convolution, passed
    through the frozen ResNet50, average-pooled and classified with softmax.

    :param input_shape_3d: Shape of the frame-sequence input, without the batch dimension.
    :param input_shape_2d: Not used by this function.
    :param num_classes: Number of output classes (default 30).
    :return: Compiled Keras model (Adam, categorical cross-entropy, accuracy metric).
    """
    input_3d = layers.Input(shape=input_shape_3d)
    flat_features = build_3d_cnn(input_shape_3d)(input_3d)

    # Choose a 2-D grid whose area equals the flattened feature length
    grid_h, grid_w = find_factors(flat_features.shape[1])

    # -1 lets Reshape infer the remaining (channel) dimension
    feature_map = layers.Reshape((grid_h, grid_w, -1))(flat_features)

    # 1x1 convolution maps the grid to the 3 channels the ResNet expects
    feature_map = layers.Conv2D(3, (1, 1), padding='same', activation='relu')(feature_map)

    backbone = build_2d_resnet((grid_h, grid_w, 3))
    refined = backbone(feature_map)

    pooled = layers.GlobalAveragePooling2D()(refined)
    probabilities = layers.Dense(num_classes, activation='softmax')(pooled)

    combined = models.Model(inputs=input_3d, outputs=probabilities)
    combined.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return combined

# Example usage:
input_shape_3d = (60, 128, 128, 1)  # 60 frames, 128x128, 1 channel (grayscale)
# NOTE(review): the ResNet variant of build_combined_model accepts input_shape_2d but never reads it
input_shape_2d = (128, 128, 1)      # Each frame is 128x128, 1 channel

# Build the combined model
model = build_combined_model(input_shape_3d, input_shape_2d, num_classes=30)

# Print model summary to understand the architecture
model.summary()


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m94765736/94765736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 0us/step


4. Feature extraction using a 3D-CNN and EfficientNet.

In [33]:
# Define the 3D-CNN layer for spatio-temporal feature extraction
def build_3d_cnn(input_shape):
    """
    This function builds a 3D CNN model for capturing spatio-temporal features
    from a sequence of video frames (e.g., lip movements over time).

    The input shape is declared with an explicit ``Input`` layer rather than an
    ``input_shape`` argument on the first Conv3D layer, which is the form Keras
    recommends for Sequential models.

    :param input_shape: Shape of one input sample, without the batch dimension.
    :return: Sequential model ending in a Flatten layer.
    """
    model = models.Sequential([
        # Explicit input declaration for the frame sequence
        layers.Input(shape=input_shape),

        # 3D convolutional layer with 32 filters, (3x3x3) kernel size
        layers.Conv3D(32, (3, 3, 3), activation='relu'),
        # MaxPooling3D layer to downsample the input after the convolution
        layers.MaxPooling3D(pool_size=(2, 2, 2)),

        # Second 3D convolutional layer with 64 filters
        layers.Conv3D(64, (3, 3, 3), activation='relu'),
        # Another MaxPooling3D layer to further downsample the features
        layers.MaxPooling3D(pool_size=(2, 2, 2)),

        # Third 3D convolutional layer with 128 filters
        layers.Conv3D(128, (3, 3, 3), activation='relu'),
        # MaxPooling again to reduce the spatial and temporal dimensions
        layers.MaxPooling3D(pool_size=(2, 2, 2)),

        # Flatten the output to feed it into the next stage (EfficientNet)
        layers.Flatten(),
    ])

    return model

# Define the EfficientNet model for spatial refinement
def build_efficientnet(input_shape):
    """
    Create a frozen, headless EfficientNetB0 with ImageNet weights.

    :param input_shape: Input shape handed to EfficientNetB0.
    :return: EfficientNetB0 model (no classification top) with every layer marked non-trainable.
    """
    backbone = efficientnet.EfficientNetB0(
        include_top=False, weights='imagenet', input_shape=input_shape
    )

    # Start fully frozen; layers can be unfrozen later for fine-tuning
    for frozen_layer in backbone.layers:
        frozen_layer.trainable = False

    return backbone

# Define the combined end-to-end model (3D-CNN + EfficientNet)
def build_combined_model(input_shape_3d, input_shape_2d, num_classes):
    """
    Build and compile the end-to-end 3D-CNN + EfficientNetB0 classifier.

    The flattened 3D-CNN output is projected by a Dense layer to exactly as many
    values as ``input_shape_2d`` holds, reshaped to that shape, passed through the
    frozen EfficientNetB0, average-pooled and classified with softmax.

    :param input_shape_3d: Shape of the frame-sequence input, without the batch dimension.
    :param input_shape_2d: (height, width, channels) shape fed to EfficientNet.
    :param num_classes: Number of output classes.
    :return: Compiled Keras model (Adam, categorical cross-entropy, accuracy metric).
    """
    # Sequence-of-frames input and its spatio-temporal features
    input_3d = layers.Input(shape=input_shape_3d)
    flat_features = build_3d_cnn(input_shape_3d)(input_3d)

    # Project to height * width * channels values so the result can be reshaped
    flat_units = input_shape_2d[0] * input_shape_2d[1] * input_shape_2d[2]
    projected = layers.Dense(flat_units, activation='relu')(flat_features)
    feature_map = layers.Reshape(input_shape_2d)(projected)

    # Spatial refinement with the frozen EfficientNet backbone
    backbone = build_efficientnet(input_shape_2d)
    refined = backbone(feature_map)

    # Average over the spatial grid, then classify
    pooled = layers.GlobalAveragePooling2D()(refined)
    probabilities = layers.Dense(num_classes, activation='softmax')(pooled)

    combined = models.Model(inputs=input_3d, outputs=probabilities)
    combined.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return combined

# Example usage:
# 3D input shape: 60 frames, each frame is 128x128, with 1 channel (grayscale)
input_shape_3d = (60, 128, 128, 1)

# 2D input shape: shape the 3D-CNN features are projected and reshaped to before
# being fed to EfficientNet (32x32 with 3 channels) - this is not a resized video frame
input_shape_2d = (32, 32, 3)

# Build the combined model with EfficientNet
model = build_combined_model(input_shape_3d, input_shape_2d, num_classes=30)

# Print model summary to understand the architecture
model.summary()


Downloading data from https://storage.googleapis.com/keras-applications/efficientnetb0_notop.h5
[1m16705208/16705208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 0us/step


4. Saving the last model in .h5 format

In [37]:
# Save the model as a .h5 file.
# `model` is whichever model-building cell was executed most recently.
model.save('feature_extraction_model.h5')



4. Data preparation: building the training, validation and test datasets