In [1]:
import os
import cv2
import dlib
import json
import shutil
import numpy as np
import pandas as pd
import mediapipe as mp

from PIL import Image
from concurrent.futures import ThreadPoolExecutor
from albumentations import Compose, RandomBrightnessContrast
from transformers import T5Tokenizer, T5ForConditionalGeneration
from tensorflow.keras.models import Sequential, Model, save_model, load_model

  check_for_updates()


In [2]:
print(mp.__version__)

0.10.14


In [4]:
# Define paths
base_dir = r'D:\PycharmProjects\source project files\collected_test_data'
extracted_frames_dir = r'D:\PycharmProjects\source project files\collected_test_data\extracted_frames_one'
processed_frames_dir = r'D:\PycharmProjects\source project files\collected_test_data\processed_frames_two'
cropped_frames_dir = r'D:\PycharmProjects\source project files\collected_test_data\cropped_frames_three'

**frame extraction functions**

In [5]:
def extract_frame(frame, output_dir, frame_index):
    frame_filename = os.path.join(output_dir, f"{frame_index:02d}.png")  # Format frame index with leading zeros
    cv2.imwrite(frame_filename, frame)
    return frame_filename

def extract_frames_with_priority_deletion(video_path, output_dir, target_frames=60):
    # Create base directory for storing extracted frames
    os.makedirs(output_dir, exist_ok=True)

    cap = cv2.VideoCapture(video_path)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    frames = []

    while True:
        ret, frame = cap.read()
        if not ret:
            break
        frames.append(frame)

    # Handle case when the video has fewer than the target number of frames
    if total_frames < target_frames:
        for i in range(total_frames):
            extract_frame(frames[i], output_dir, i + 1)  # Start frame index from 1

        # Copy the last frame to fill the deficit until the target number is reached
        last_frame = frames[-1]
        for i in range(total_frames, target_frames):
            extract_frame(last_frame, output_dir, i + 1)

        print(f"Copied last frame to fill the deficit for {video_path}.")
        cap.release()
        return

    # Handle case when the video has more than the target number of frames
    if total_frames > target_frames:
        frames_to_delete = total_frames - target_frames
        delete_from_end = int(frames_to_delete * 0.9)  # 50% of frames to delete from the end
        delete_from_start = frames_to_delete - delete_from_end  # 20% from the start

        # Retain the middle portion after deleting the required frames
        frames = frames[delete_from_start:total_frames - delete_from_end]

    # Extract frames after deletion logic or for target-sized videos
    for i in range(len(frames)):
        extract_frame(frames[i], output_dir, i + 1)

    cap.release()
    print(f"Extracted {len(frames)} frames saved at: {output_dir}")

**extracted image processing**

In [6]:
# Function for sharpening the image
def sharpen_image(image):
    """Apply sharpening to the image."""
    kernel = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]])
    return cv2.filter2D(image, -1, kernel)

# Albumentations pipeline for brightness and contrast adjustment
def get_brightness_contrast_augmentation():
    """
    Returns a pipeline to adjust brightness and contrast without probability.
    """
    return Compose([
        RandomBrightnessContrast(
            brightness_limit=(0.05, 0.05),  # Fixed Brightness adjustment range: +5%
            contrast_limit=(0.05, 0.05),    # Fixed Contrast adjustment range: +5%
            p=1.0                         # Always apply
        )
    ])

# Unified function to apply sharpening and brightness/contrast adjustment
def process_extracted_frames(input_folder, output_folder):
    """
    Applies sharpening and brightness/contrast adjustment to all images in a folder.
    :param input_folder: Path to the folder containing images.
    :param output_folder: Path to save the processed images.
    """
    os.makedirs(output_folder, exist_ok=True)

    # Get the brightness/contrast augmentation pipeline
    augmentation_pipeline = get_brightness_contrast_augmentation()

    for file_name in sorted(os.listdir(input_folder)):
        if file_name.endswith(".png"):
            input_path = os.path.join(input_folder, file_name)
            output_path = os.path.join(output_folder, file_name)

            # Read the image
            image = cv2.imread(input_path)
            if image is None:
                print(f"Error reading image: {input_path}")
                continue

            # Step 1: Apply sharpening
            sharpened_image = sharpen_image(image)

            # Step 2: Apply brightness and contrast adjustment
            augmented = augmentation_pipeline(image=sharpened_image)
            final_image = augmented['image']

            # Save the processed image
            cv2.imwrite(output_path, final_image)

    print(f"Processed extracted frames and saved at: {output_folder}")

**frames cropping function and parameters**

In [7]:
# Load the detector and predictor (dlib models)
detector = dlib.get_frontal_face_detector()
predictor = dlib.shape_predictor(r"D:\PycharmProjects\source project files\models\shape_predictor_68_face_landmarks.dat")

# Mouth crop dimensions
LIP_HEIGHT = 80
LIP_WIDTH = 112

def process_frame(frame_file, path, output_path):
    """
    Save a single frame as a .png image in the specified directory.
    Ensures complete image save before moving to the next frame.
    """
    frame_path = os.path.join(path, frame_file)
    try:
        # Load the frame
        frame = cv2.imread(frame_path)
        if frame is None:
            print(f"Warning: Could not read frame {frame_path}. Skipping.")
            return False

        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

        # Detect faces in the grayscale image
        faces = detector(gray)

        if not faces:
            return False

        # Only process if a face is detected
        for face in faces:
            landmarks = predictor(gray, face)

            # Extract the mouth region by iterating over the landmarks (48 to 67)
            mouth_points = [(landmarks.part(n).x, landmarks.part(n).y) for n in range(48, 68)]
            mouth_points_np = np.array(mouth_points)

            # Find the bounding rectangle around the mouth points
            x, y, w, h = cv2.boundingRect(mouth_points_np)

            # Calculate padding to fit the target dimensions
            width_diff = LIP_WIDTH - w
            height_diff = LIP_HEIGHT - h
            pad_left = max(width_diff // 2, 0)
            pad_right = max(width_diff - pad_left, 0)
            pad_top = max(height_diff // 2, 0)
            pad_bottom = max(height_diff - pad_top, 0)

            # Adjust padding to ensure it doesn’t exceed image boundaries
            pad_left = min(pad_left, x)
            pad_right = min(pad_right, frame.shape[1] - (x + w))
            pad_top = min(pad_top, y)
            pad_bottom = min(pad_bottom, frame.shape[0] - (y + h))

            # Crop and resize the mouth region
            lip_frame = frame[y - pad_top:y + h + pad_bottom, x - pad_left:x + w + pad_right]
            lip_frame = cv2.resize(lip_frame, (LIP_WIDTH, LIP_HEIGHT))

            # Save the cropped mouth region to the output directory
            output_frame_path = os.path.join(output_path, frame_file)
            cv2.imwrite(output_frame_path, lip_frame)

            return True  # Exit after processing the first detected face
    except Exception as e:
        print(f"Error processing frame {frame_path}: {e}")
        return False

**combining cropped frames**

In [8]:
def combine_images(input_path, output_path, output_filename="combined_frames.png"):
    # List all frame files and sort them to maintain order
    frame_files = [f for f in os.listdir(input_path) if f.endswith('.png')]
    frame_files.sort()  # Ensure the frames are in order

    # Check that there are exactly 60 frames
    if len(frame_files) != 60:
        print(f"Warning: {input_path} does not contain exactly 60 frames. Skipping.")
        return

    # Load the first image to get dimensions
    first_image = cv2.imread(os.path.join(input_path, frame_files[0]))
    if first_image is None:
        print(f"Error: Could not read {frame_files[0]}.")
        return

    img_height, img_width, channels = first_image.shape

    # Create an empty array for the combined image (10 rows × 6 columns)
    combined_image = np.zeros((img_height * 10, img_width * 6, channels), dtype=np.uint8)

    # Place each frame into the correct position in the combined image
    for idx, frame_file in enumerate(frame_files):
        img = cv2.imread(os.path.join(input_path, frame_file))
        if img is None:
            print(f"Error: Could not read {frame_file}.")
            continue

        row = idx // 6
        col = idx % 6

        # Place the image in the combined image array
        combined_image[row * img_height:(row + 1) * img_height, col * img_width:(col + 1) * img_width] = img

    # Save the combined image
    cv2.imwrite(os.path.join(output_path, output_filename), combined_image)

    print(f"\nCombined image saved at: {os.path.join(output_path, output_filename)}")

**models loading**

In [9]:
# Load the model from the .h5 file
lip_model = load_model(r'D:\PycharmProjects\source project files\Saved models\lip detection\model2811_361_21_d130_GOOD.h5')
lip_model.summary()

# Load the saved class labels
with open(r'D:\PycharmProjects\source project files\Saved models\class_labels_cl10.json', 'r') as f:
    class_labels = json.load(f)



In [10]:
# class_labels

In [11]:
# Load T5 model and tokenizer for sentence generation
load_directory = r"D:\PycharmProjects\source project files\Saved models\text generation\t5_fine_tuned_local"
tokenizer = T5Tokenizer.from_pretrained(load_directory)
txt_model = T5ForConditionalGeneration.from_pretrained(load_directory)

**integration with T5 model**

In [12]:
# Initialize MediaPipe Hands and Drawing modules
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils

# Initialize video capture
cap = cv2.VideoCapture(0, cv2.CAP_DSHOW)

# Set the resolution to the maximum supported by your camera
cap.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)  # Set width (e.g., 1280 for 720p)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)  # Set height (e.g., 720 for 720p)

# Directories checking
os.makedirs(base_dir, exist_ok=True)
os.makedirs(extracted_frames_dir, exist_ok=True)
os.makedirs(cropped_frames_dir, exist_ok=True)
combined_image_path = os.path.join(base_dir, "combined_frames.png")

# Variables to handle recording
recording = False
video_writer = None
executor = ThreadPoolExecutor(max_workers=1)  # Thread pool for frame extraction

# Set video codec and file format
fps = int(cap.get(cv2.CAP_PROP_FPS)) or 30  # Default to 30 if unable to read fps
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

# ---------------------------------------------------------------------------------------------------------------------------------------
# while updating the code just paste the previous block which includes is_open_hand(), is_closed_fist() and process_video_in_background()
# ---------------------------------------------------------------------------------------------------------------------------------------
def is_open_hand(hand_landmarks):
    """ Check if all fingers are extended (open hand) """
    for finger_tip, finger_pip in [
        (mp_hands.HandLandmark.INDEX_FINGER_TIP, mp_hands.HandLandmark.INDEX_FINGER_PIP),
        (mp_hands.HandLandmark.MIDDLE_FINGER_TIP, mp_hands.HandLandmark.MIDDLE_FINGER_PIP),
        (mp_hands.HandLandmark.RING_FINGER_TIP, mp_hands.HandLandmark.RING_FINGER_PIP),
        (mp_hands.HandLandmark.PINKY_TIP, mp_hands.HandLandmark.PINKY_PIP)
    ]:
        if hand_landmarks.landmark[finger_tip].y > hand_landmarks.landmark[finger_pip].y:
            return False  # A finger is not extended
    return True  # All fingers are extended

def is_closed_fist(hand_landmarks):
    """ Check if all fingers are folded (closed fist) """
    for finger_tip, finger_pip in [
        (mp_hands.HandLandmark.INDEX_FINGER_TIP, mp_hands.HandLandmark.INDEX_FINGER_PIP),
        (mp_hands.HandLandmark.MIDDLE_FINGER_TIP, mp_hands.HandLandmark.MIDDLE_FINGER_PIP),
        (mp_hands.HandLandmark.RING_FINGER_TIP, mp_hands.HandLandmark.RING_FINGER_PIP),
        (mp_hands.HandLandmark.PINKY_TIP, mp_hands.HandLandmark.PINKY_PIP)
    ]:
        if hand_landmarks.landmark[finger_tip].y < hand_landmarks.landmark[finger_pip].y:
            return False  # A finger is extended
    return True  # All fingers are folded

def process_video_in_background(video_path, extracted_frames_dir, processed_frames_dir, cropped_frames_dir, combined_frames_dir, target_frames=60):
    """Run frame extraction and cropping in a separate thread"""
    print(f"\nProcessing video...")

    # Call the frame extraction function
    extract_frames_with_priority_deletion(video_path, extracted_frames_dir, target_frames)

    # Process extracted frames
    print(f"\nSharpening and adjusting brightness/contrast of extracted frames...")
    process_extracted_frames(extracted_frames_dir, processed_frames_dir)

    # Perform cropping on the processed extracted frames
    frame_files = [f for f in os.listdir(extracted_frames_dir) if f.endswith('.png')]
    cropped_count = 0

    for frame_file in frame_files:
        if process_frame(frame_file, extracted_frames_dir, cropped_frames_dir):
            cropped_count += 1

    print(f"\nFinished processing all frames. Images cropped -> {cropped_count}")

    # Combine cropped frames into a single image
    combine_images(cropped_frames_dir, combined_frames_dir)
    print(f"All frames combined and saved in {combined_frames_dir}.")

    # Perform word prediction from the combined image
    if os.path.exists(combined_image_path):
        img = Image.open(combined_image_path).resize((224, 224), Image.LANCZOS)
        img_array = np.array(img) / 255.0  # Normalize pixel values
        img_array = np.expand_dims(img_array, axis=0)  # Add batch dimension

        predictions = lip_model.predict(img_array)
        predicted_class_index = np.argmax(predictions, axis=1)[0]
        predicted_word = class_labels[str(predicted_class_index)]
        print(f"\nPredicted word: {predicted_word}")
        print(predictions)

        # Generate a sentence from the predicted word
        input_text = f"Generate a sentence for {predicted_word}:"
        input_ids = tokenizer(input_text, return_tensors="pt").input_ids

        outputs = txt_model.generate(
            input_ids,
            max_length=20,
            do_sample=True,
            top_k=50,
            top_p=0.9,
            temperature=0.9,
            num_return_sequences=1
        )

        generated_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)
        print(f"\nGenerated Sentence: {generated_sentence}")

    print('----------------------------------------------------------------------')
# ---------------------------------------------------------------------------------------------------------------------------------------
# while updating the code just paste the previous block which includes is_open_hand(), is_closed_fist() and process_video_in_background()
# ---------------------------------------------------------------------------------------------------------------------------------------

with mp_hands.Hands(min_detection_confidence=0.8, min_tracking_confidence=0.5) as hands:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        image_height, image_width, _ = frame.shape

        # Convert BGR to RGB and flip for mirror effect
        image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        image = cv2.flip(image, 1)
        image.flags.writeable = False  # Set to False for faster processing

        # Process the image and find hands
        results = hands.process(image)
        image.flags.writeable = True  # Set to True for drawing
        image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

        # Check if hands are detected
        if results.multi_hand_landmarks:
            for num, hand_landmarks in enumerate(results.multi_hand_landmarks):
                mp_drawing.draw_landmarks(
                    image, hand_landmarks, mp_hands.HAND_CONNECTIONS,
                    mp_drawing.DrawingSpec(color=(121, 22, 76), thickness=2, circle_radius=4),
                    mp_drawing.DrawingSpec(color=(121, 44, 250), thickness=2, circle_radius=2)
                )

                # Gesture recognition logic
                if is_open_hand(hand_landmarks):
                    cv2.putText(image, "Open Hand Detected", (10, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
                    if not recording:
                        recording = True
                        video_name = "recording.mp4"  # Change this if needed for unique names
                        output_file = os.path.join(base_dir, video_name)
                        video_writer = cv2.VideoWriter(
                            output_file,
                            cv2.VideoWriter_fourcc(*'mp4v'),  # Codec for .mp4 format
                            fps,
                            (frame_width, frame_height)
                        )
                        if not video_writer.isOpened():
                            print("Error: Video writer failed to open.")
                            recording = False
                        else:
                            print("\nRecording started...")

                elif is_closed_fist(hand_landmarks):
                    cv2.putText(image, "Closed Fist Detected", (10, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
                    if recording:
                        recording = False
                        if video_writer is not None:
                            video_writer.release()
                            print(f"Recording stopped and saved at: {output_file}")
                        video_writer = None  # Reset the writer

                        # Start frame extraction, cropping, and sentence generation in a separate thread
                        executor.submit(process_video_in_background, output_file, extracted_frames_dir, processed_frames_dir, cropped_frames_dir, base_dir)

        # Write frame to the video if recording
        if recording and video_writer is not None:
            video_writer.write(frame)

        # Display the image with results
        cv2.imshow('Hand Gesture Recognition', image)

        if cv2.waitKey(10) & 0xFF == ord('q'):
            break

# Release resources
cap.release()
if video_writer is not None:
    video_writer.release()
executor.shutdown(wait=True)  # Ensure all background tasks complete
cv2.destroyAllWindows()

KeyboardInterrupt: 

In [2]:
cap.release()
cv2.destroyAllWindows()

NameError: name 'cap' is not defined

In [None]:
# --------------------------------------------------------------------------------------------------------------
# --------------------------------------------------------------------------------------------------------------
# --------------------------------------------------------------------------------------------------------------
# --------------------------------------------------------------------------------------------------------------

**loading dictionaries for LSTM model**

In [9]:
# Load dictionaries from JSON files
with open(r"C:\Users\avikd\OneDrive - Sheffield Hallam University\Desktop\Project & Dissertation\8. Saved models\text generation\LSTM_json\input_word_to_index.json", "r") as f:
    input_word_to_index = json.load(f)

with open(r"C:\Users\avikd\OneDrive - Sheffield Hallam University\Desktop\Project & Dissertation\8. Saved models\text generation\LSTM_json\output_index_word.json", "r") as f:
    output_index_word = json.load(f)

# Reverse output_index_word dictionary for lookup
output_index_word = {int(k): v for k, v in output_index_word.items()}  # Ensure integer keys
print("Dictionaries loaded successfully.")

Dictionaries loaded successfully.


In [10]:
# Path to your saved model
model_path = r'C:\Users\avikd\OneDrive - Sheffield Hallam University\Desktop\Project & Dissertation\8. Saved models\text generation\model0212_512.h5'

# Load the model
text_model = load_model(model_path)

# Verify the model summary
text_model.summary()



In [11]:
def hybrid_sampling(prob_dist, top_k=50, top_p=0.9, temperature=1.0):
    prob_dist = np.log(prob_dist + 1e-8) / temperature
    prob_dist = np.exp(prob_dist)
    prob_dist /= np.sum(prob_dist)

    top_k_indices = np.argsort(prob_dist)[-top_k:]
    top_k_probs = prob_dist[top_k_indices]

    sorted_indices = np.argsort(top_k_probs)[::-1]
    sorted_probs = top_k_probs[sorted_indices]
    cumulative_probs = np.cumsum(sorted_probs)
    cutoff = np.argmax(cumulative_probs >= top_p)
    top_p_indices = top_k_indices[sorted_indices[:cutoff + 1]]
    top_p_probs = prob_dist[top_p_indices]
    top_p_probs /= np.sum(top_p_probs)

    return np.random.choice(top_p_indices, p=top_p_probs)

def generate_sentence_with_attention(model, input_sequence, output_index_word, max_seq_len=20, top_k=10, top_p=0.9, temperature=0.8):
    decoder_input = np.zeros((1, 1))
    decoder_input[0, 0] = output_index_word.get("<start>", 1)

    generated_tokens = []

    for _ in range(max_seq_len):
        predictions = model.predict([input_sequence, decoder_input])
        prob_dist = predictions[0, -1, :]
        next_token = hybrid_sampling(prob_dist, top_k=top_k, top_p=top_p, temperature=temperature)

        if next_token == 0:
            break
        generated_tokens.append(next_token)
        decoder_input = np.hstack([decoder_input, [[next_token]]])

    generated_sentence = " ".join(output_index_word.get(token, "<unk>") for token in generated_tokens)
    return generated_sentence

**integration with LSTM model**

In [14]:
# Initialize MediaPipe Hands and Drawing modules
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils

# Initialize video capture
cap = cv2.VideoCapture(0, cv2.CAP_DSHOW)

# Set the resolution to the maximum supported by your camera
cap.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)  # Set width (e.g., 1280 for 720p)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)  # Set height (e.g., 720 for 720p)

# Directories checking
os.makedirs(base_dir, exist_ok=True)
os.makedirs(extracted_frames_dir, exist_ok=True)
os.makedirs(cropped_frames_dir, exist_ok=True)
combined_image_path = os.path.join(base_dir, "combined_frames.png")

# Variables to handle recording
recording = False
video_writer = None
executor = ThreadPoolExecutor(max_workers=1)  # Thread pool for frame extraction

# Set video codec and file format
fps = int(cap.get(cv2.CAP_PROP_FPS)) or 30  # Default to 30 if unable to read fps
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

def is_open_hand(hand_landmarks):
    """ Check if all fingers are extended (open hand) """
    for finger_tip, finger_pip in [
        (mp_hands.HandLandmark.INDEX_FINGER_TIP, mp_hands.HandLandmark.INDEX_FINGER_PIP),
        (mp_hands.HandLandmark.MIDDLE_FINGER_TIP, mp_hands.HandLandmark.MIDDLE_FINGER_PIP),
        (mp_hands.HandLandmark.RING_FINGER_TIP, mp_hands.HandLandmark.RING_FINGER_PIP),
        (mp_hands.HandLandmark.PINKY_TIP, mp_hands.HandLandmark.PINKY_PIP)
    ]:
        if hand_landmarks.landmark[finger_tip].y > hand_landmarks.landmark[finger_pip].y:
            return False  # A finger is not extended
    return True  # All fingers are extended

def is_closed_fist(hand_landmarks):
    """ Check if all fingers are folded (closed fist) """
    for finger_tip, finger_pip in [
        (mp_hands.HandLandmark.INDEX_FINGER_TIP, mp_hands.HandLandmark.INDEX_FINGER_PIP),
        (mp_hands.HandLandmark.MIDDLE_FINGER_TIP, mp_hands.HandLandmark.MIDDLE_FINGER_PIP),
        (mp_hands.HandLandmark.RING_FINGER_TIP, mp_hands.HandLandmark.RING_FINGER_PIP),
        (mp_hands.HandLandmark.PINKY_TIP, mp_hands.HandLandmark.PINKY_PIP)
    ]:
        if hand_landmarks.landmark[finger_tip].y < hand_landmarks.landmark[finger_pip].y:
            return False  # A finger is extended
    return True  # All fingers are folded

def process_video_in_background(video_path, extracted_frames_dir, cropped_frames_dir, combined_frames_dir, target_frames=60):
    print(f"\nProcessing video...")

    # Call the frame extraction function
    extract_frames_with_priority_deletion(video_path, extracted_frames_dir, target_frames)

    # Perform cropping on the extracted frames
    frame_files = [f for f in os.listdir(extracted_frames_dir) if f.endswith('.png')]
    cropped_count = 0

    for frame_file in frame_files:
        if process_frame(frame_file, extracted_frames_dir, cropped_frames_dir):
            cropped_count += 1

    print(f"\nFinished processing all frames. Images cropped -> {cropped_count}")

    # Combine cropped frames into a single image
    combine_images(cropped_frames_dir, combined_frames_dir)

    # Perform word prediction from the combined image
    if os.path.exists(combined_image_path):
        img = Image.open(combined_image_path).resize((224, 224), Image.LANCZOS)
        img_array = np.array(img) / 255.0
        img_array = np.expand_dims(img_array, axis=0)

        predictions = lip_model.predict(img_array)
        predicted_class_index = np.argmax(predictions, axis=1)[0]
        predicted_word = class_labels[str(predicted_class_index)]
        print(f"\nPredicted word: {predicted_word}")
        print(predictions)

        # Generate a sentence from the predicted word
        input_sequence = np.array([[input_word_to_index[predicted_word]]])
        generated_sentence = generate_sentence_with_attention(
            text_model,
            input_sequence,
            output_index_word,
            max_seq_len=20,
            top_k=10,
            top_p=0.9,
            temperature=0.8
        )
        print(f"Generated Sentence: {generated_sentence}")

    print('----------------------------------------------------------------------')

with mp_hands.Hands(min_detection_confidence=0.8, min_tracking_confidence=0.5) as hands:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        image_height, image_width, _ = frame.shape

        # Convert BGR to RGB and flip for mirror effect
        image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        image = cv2.flip(image, 1)
        image.flags.writeable = False

        results = hands.process(image)
        image.flags.writeable = True
        image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

        if results.multi_hand_landmarks:
            for num, hand_landmarks in enumerate(results.multi_hand_landmarks):
                mp_drawing.draw_landmarks(
                    image, hand_landmarks, mp_hands.HAND_CONNECTIONS,
                    mp_drawing.DrawingSpec(color=(121, 22, 76), thickness=2, circle_radius=4),
                    mp_drawing.DrawingSpec(color=(121, 44, 250), thickness=2, circle_radius=2)
                )

                if is_open_hand(hand_landmarks):
                    cv2.putText(image, "Open Hand Detected", (10, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
                    if not recording:
                        recording = True
                        video_name = "recording.mp4"
                        output_file = os.path.join(base_dir, video_name)
                        video_writer = cv2.VideoWriter(
                            output_file,
                            cv2.VideoWriter_fourcc(*'mp4v'),
                            fps,
                            (frame_width, frame_height)
                        )
                        if not video_writer.isOpened():
                            print("Error: Video writer failed to open.")
                            recording = False
                        else:
                            print("\nRecording started...")

                elif is_closed_fist(hand_landmarks):
                    cv2.putText(image, "Closed Fist Detected", (10, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
                    if recording:
                        recording = False
                        if video_writer is not None:
                            video_writer.release()
                            print(f"Recording stopped and saved at: {output_file}")
                        video_writer = None

                        executor.submit(process_video_in_background, output_file, extracted_frames_dir, cropped_frames_dir, base_dir)

        if recording and video_writer is not None:
            video_writer.write(frame)

        cv2.imshow('Hand Gesture Recognition', image)

        if cv2.waitKey(10) & 0xFF == ord('q'):
            break

cap.release()
if video_writer is not None:
    video_writer.release()
executor.shutdown(wait=True)
cv2.destroyAllWindows()


Recording started...
Recording stopped and saved at: D:\PycharmProjects\pro_dis_2\collected_data\!test\recording.mp4

Processing video...
Extracted 60 frames saved at: D:\PycharmProjects\pro_dis_2\collected_data\!test\extracted_frames_one

Finished processing all frames. Images cropped -> 60

Combined image saved at: D:\PycharmProjects\pro_dis_2\collected_data\!test\combined_frames.png
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 291ms/step

Predicted word: cup
[[0.02816016 0.36350307 0.05042779 0.0048933  0.05065876 0.00381401
  0.06020677 0.12935103 0.30531642 0.00366876]]




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 397ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 306ms/step




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
Generated Sentence: is the cup next left in the dining table? table?
----------------------------------------------------------------------

Recording started...
Recording stopped and saved at: D:\PycharmProjects\pro_dis_2\collected_data\!test\recording.mp4

Processing video...
Copied last frame to fill the deficit for D:\PycharmProjects\pro_dis_2\co

In [15]:
cap.release()
cv2.destroyAllWindows()

In [None]:
# --------------------------------------------------------------------------------------------------------------
# --------------------------------------------------------------------------------------------------------------
# --------------------------------------------------------------------------------------------------------------
# --------------------------------------------------------------------------------------------------------------