# Report: Exploring MediaPipe Optimization Strategies for Real-Time Sign Language Recognition

# Table of Contents
- [SETTING](#setting)
- [SKELETON POSING](#skeleton-posing)
- [DATA COLLECTION](#data-collection)
- [BUILDING MODEL](#building-model)
  - [JOINT MODEL](#joint-model-tranning)
  - [BONE MODEL](#bone-model-tranning)
  - [JOINT MOTION MODEL](#joint-motion-model-tranning)
  - [BONE MOTION MODEL](#bone-motion-model-tranning)
- [DEPLOYMENT](#deployment)

## SETTING

Libraries

In [None]:
import cv2
import numpy as np
import os
from matplotlib import pyplot as plt
import mediapipe as mp

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import TensorBoard, EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import load_model

from sklearn.model_selection import train_test_split

: 

skeleton setting

In [3]:
# Module-level aliases for the MediaPipe Holistic solution and its drawing helpers.
mp_holistic = mp.solutions.holistic # Holistic model
mp_drawing = mp.solutions.drawing_utils # Drawing utilities

def mediapipe_detection(image, model):
    """Run a MediaPipe model on one BGR frame.

    Args:
        image: Frame in OpenCV's BGR channel order.
        model: Object exposing ``process(rgb_image)``; the cells in this
            notebook pass a ``Holistic`` instance.

    Returns:
        Tuple ``(bgr_image, results)``: the frame converted to RGB and back
        to BGR, and whatever ``model.process`` returned.
    """
    rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # The buffer is flagged read-only for the duration of the model call.
    rgb_frame.flags.writeable = False
    results = model.process(rgb_frame)
    rgb_frame.flags.writeable = True

    bgr_frame = cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR)
    return bgr_frame, results

def draw_landmarks(image, results):
    """Draw the face, pose and hand connections from ``results`` onto ``image``.

    Args:
        image: Frame passed to ``mp_drawing.draw_landmarks`` as the canvas.
        results: Object exposing ``face_landmarks``, ``pose_landmarks``,
            ``left_hand_landmarks`` and ``right_hand_landmarks``.

    Returns:
        None.
    """
    # Local handles to the MediaPipe modules used below
    mp_drawing = mp.solutions.drawing_utils
    mp_holistic = mp.solutions.holistic

    # One connection style per body part
    face_spec = mp_drawing.DrawingSpec(color=(80, 110, 10), thickness=1, circle_radius=1)
    pose_spec = mp_drawing.DrawingSpec(color=(80, 22, 10), thickness=2, circle_radius=4)
    hand_spec = mp_drawing.DrawingSpec(color=(121, 22, 76), thickness=2, circle_radius=4)

    # (landmarks, connection set, style) in drawing order:
    # face, pose, left hand, right hand
    layers = (
        (results.face_landmarks, mp_holistic.FACEMESH_CONTOURS, face_spec),
        (results.pose_landmarks, mp_holistic.POSE_CONNECTIONS, pose_spec),
        (results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS, hand_spec),
        (results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS, hand_spec),
    )
    for landmarks, connections, spec in layers:
        mp_drawing.draw_landmarks(image, landmarks, connections,
                                  connection_drawing_spec=spec)
    
def extract_joint(results):
    """Flatten the landmarks in ``results`` into one joint feature vector.

    The vector is the concatenation of pose (x, y, z, visibility per
    landmark), face, left-hand and right-hand (x, y, z per landmark) values.
    A part whose landmarks are missing is replaced by zeros of the size
    ``extract_bone`` expects: 132 (33*4) for the pose, 1404 (468*3) for the
    face and 63 (21*3) for each hand, so the result has 1662 values when
    every detected part has those landmark counts.

    Args:
        results: Object exposing ``pose_landmarks``, ``face_landmarks``,
            ``left_hand_landmarks`` and ``right_hand_landmarks``; each is
            either falsy or has a ``landmark`` iterable.

    Returns:
        1-D ``numpy.ndarray`` of joint values.
    """
    # (landmark container, attributes read per landmark, landmark count used
    # for the zero fallback). The pose fallback is 33 landmarks, not 34, so a
    # frame without a pose has the same length as a frame with one.
    groups = (
        (results.pose_landmarks, ("x", "y", "z", "visibility"), 33),
        (results.face_landmarks, ("x", "y", "z"), 468),
        (results.left_hand_landmarks, ("x", "y", "z"), 21),
        (results.right_hand_landmarks, ("x", "y", "z"), 21),
    )

    parts = []
    for landmarks, fields, count in groups:
        if landmarks:
            flat = np.array(
                [[getattr(point, name) for name in fields] for point in landmarks.landmark]
            ).flatten()
        else:
            flat = np.zeros(count * len(fields))
        parts.append(flat)

    joint = np.concatenate(parts)
    return joint

def extract_bone(joint):
    """Derive the bone feature vector from a flat joint vector.

    ``joint`` is read as four consecutive segments: 132 pose values
    (4 per landmark), 1404 face values, 63 left-hand values and 63
    right-hand values (3 per landmark each).

    Within a segment with ``stride`` values per landmark, output element
    ``i`` is ``segment[i + stride] - segment[i]`` (same component of the next
    landmark minus this one), except when ``(i + 1) % stride == 0``, where
    ``segment[i]`` is copied unchanged. The last ``stride`` values of each
    segment have no successor and produce no output.

    Args:
        joint: 1-D indexable sequence of joint values.

    Returns:
        1-D ``numpy.ndarray`` with the four bone segments concatenated.
    """
    # (number of values in the segment, values per landmark), in joint order:
    # pose, face, left hand, right hand
    layout = ((132, 4), (1404, 3), (63, 3), (63, 3))

    bone_parts = []
    start = 0
    for size, stride in layout:
        segment = joint[start:start + size]
        start += size
        bone_parts.append(np.array([
            segment[k] if (k + 1) % stride == 0 else segment[k + stride] - segment[k]
            for k in range(len(segment) - stride)
        ]))

    return np.concatenate(bone_parts)

def extract_keypoints(results):
    """Build the joint and bone feature vectors for one frame.

    The joint vector is the concatenation of pose (x, y, z, visibility per
    landmark), face, left-hand and right-hand (x, y, z per landmark) values.
    A part whose landmarks are missing is replaced by zeros: 33*4 for the
    pose, 468*3 for the face and 21*3 for each hand.

    The bone vector is computed per part: with ``stride`` values per
    landmark, element ``i`` is ``part[i + stride] - part[i]``, except when
    ``(i + 1) % stride == 0``, where ``part[i]`` is copied unchanged. The
    last ``stride`` values of each part produce no output.

    Args:
        results: Object exposing ``pose_landmarks``, ``face_landmarks``,
            ``left_hand_landmarks`` and ``right_hand_landmarks``; each is
            either falsy or has a ``landmark`` iterable.

    Returns:
        Tuple ``(joint, bone)`` of 1-D ``numpy.ndarray`` objects.
    """
    # (landmark container, attributes read per landmark, landmark count used
    # for the zero fallback). The pose fallback is 33 landmarks, not 34, so
    # frames with and without a detected pose have the same vector length.
    groups = (
        (results.pose_landmarks, ("x", "y", "z", "visibility"), 33),
        (results.face_landmarks, ("x", "y", "z"), 468),
        (results.left_hand_landmarks, ("x", "y", "z"), 21),
        (results.right_hand_landmarks, ("x", "y", "z"), 21),
    )

    joint_parts = []
    bone_parts = []
    for landmarks, fields, count in groups:
        stride = len(fields)
        if landmarks:
            flat = np.array(
                [[getattr(point, name) for name in fields] for point in landmarks.landmark]
            ).flatten()
        else:
            flat = np.zeros(count * stride)
        joint_parts.append(flat)

        # Calculate coordinate differences between consecutive landmarks
        bone_parts.append(np.array([
            flat[i] if (i + 1) % stride == 0 else flat[i + stride] - flat[i]
            for i in range(len(flat) - stride)
        ]))

    # Concatenate the per-part vectors to form the feature vectors
    joint = np.concatenate(joint_parts)
    bone = np.concatenate(bone_parts)
    return joint, bone

## SKELETON POSING

In [5]:
# Live preview: read frames from camera 0, run Holistic on each frame and show
# the frame with the detected landmarks drawn on it. Press 'q' to stop.
cap = cv2.VideoCapture(0)
try:
    # Set mediapipe model
    with mp.solutions.holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                # No frame was delivered; stop instead of passing None to cvtColor
                break

            image, results = mediapipe_detection(frame, holistic)

            draw_landmarks(image, results)

            # Show to screen
            cv2.imshow('OpenCV Feed', image)

            # Break gracefully
            if cv2.waitKey(10) & 0xFF == ord('q'):
                break
finally:
    # Release the camera and close the window even if a frame fails to process
    cap.release()
    cv2.destroyAllWindows()

## DATA COLLECTION

Label by image

In [6]:
# Convert every image under ./Indian/<label>/ into a joint vector saved as
# Data/<label>/<sequence>/<frame>.npy, with frames 0-30 in each sequence.
# Set mediapipe model
with mp.solutions.holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    # Iterate through the label folders in the Indian directory
    for folder in os.listdir('./Indian'):
        os.makedirs(os.path.join('Data', folder), exist_ok=True)

        # The number of existing entries is used as the next sequence index
        len_i = len(os.listdir(f"./Data/{folder}"))
        i_frame = 0

        for image in os.listdir(f"./Indian/{folder}"):
            fig = cv2.imread(f"./Indian/{folder}/{image}")
            if fig is None:
                # cv2.imread returns None for files it cannot decode; skip them
                continue

            fig, results = mediapipe_detection(fig, holistic)
            joint = extract_joint(results)

            # Create the sequence folder only when its first frame is written,
            # so no empty sequence folder is left behind after the last image
            sequence_dir = os.path.join('Data', folder, str(len_i))
            if i_frame == 0:
                os.makedirs(sequence_dir)

            npy_path = os.path.join(sequence_dir, str(i_frame))
            np.save(npy_path, joint)

            if i_frame == 30:
                len_i += 1
                i_frame = 0
            else:
                i_frame += 1

KeyboardInterrupt: 

Label by video

In [3]:
# Convert every ./Video/<action>.mp4 into joint vectors saved as
# Data/<action>/<sequence>/<frame>.npy, with frames 0-30 in each sequence.
# NOTE(review): the outer loop processes the whole Video directory three times,
# appending new sequences on each pass; presumably intentional, confirm.
for i in range(3):
    # Set mediapipe model
    with mp.solutions.holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        # Iterate through video files in the Video directory
        for video_file in os.listdir('./Video'):
            if not video_file.endswith('.mp4'):
                continue

            video_path = os.path.join('./Video', video_file)
            action = os.path.splitext(video_file)[0]

            os.makedirs(os.path.join('Data', action), exist_ok=True)

            # The number of existing entries is used as the next sequence index
            len_i = len(os.listdir(f"./Data/{action}"))
            i_frame = 0

            cap = cv2.VideoCapture(video_path)
            try:
                while cap.isOpened():
                    ret, frame = cap.read()
                    if not ret or frame is None:
                        # End of the video (or an unreadable frame): next file
                        break

                    image, results = mediapipe_detection(frame, holistic)

                    # Draw landmarks on the image
                    draw_landmarks(image, results)
                    # cv2.imshow('OpenCV Feed', image)

                    joint = extract_joint(results)

                    # Create the sequence folder only when its first frame is
                    # written, so no empty folder is left at the end of a video
                    sequence_dir = os.path.join('Data', action, str(len_i))
                    if i_frame == 0:
                        os.makedirs(sequence_dir)

                    npy_path = os.path.join(sequence_dir, str(i_frame))
                    np.save(npy_path, joint)
                    if i_frame == 30:
                        len_i += 1
                        i_frame = 0
                    else:
                        i_frame += 1

                    # Break gracefully
                    if cv2.waitKey(10) & 0xFF == ord('q'):
                        break
            finally:
                # Release every capture, not only the last one opened
                cap.release()

cv2.destroyAllWindows()

Label by camera

In [11]:
# Record `number` new sequences of 31 frames (0-30) per action from camera 0
# and save each frame's joint vector as Data/<action>/<sequence>/<frame>.npy.
# Press 'q' to stop the whole collection.

# Actions that we try to detect
actions = np.array(['hi'])
cap = cv2.VideoCapture(0)

for action in actions:
    os.makedirs(os.path.join('./Data', action), exist_ok=True)

number = 30
# Set when the user presses 'q' or the camera stops delivering frames, so that
# every enclosing loop ends instead of only the frame loop
stop_requested = False

try:
    # Set mediapipe model
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        for action in actions:
            # The number of existing entries is used as the first new sequence index
            len_i = len(os.listdir(f"./Data/{action}"))
            for sequence in range(len_i, len_i + number):
                os.makedirs(os.path.join('Data', os.path.join(action, f'{sequence}')))
                for i_frame in range(31):
                    ret, frame = cap.read()
                    if not ret:
                        # No frame was delivered; stop instead of passing None on
                        stop_requested = True
                        break

                    image, results = mediapipe_detection(frame, holistic)
                    # Draw landmarks on the image
                    draw_landmarks(image, results)
                    if i_frame == 0:
                        cv2.putText(image, 'STARTING COLLECTION', (120,200),
                                    cv2.FONT_HERSHEY_SIMPLEX, 1, (0,255, 0), 4, cv2.LINE_AA)
                        # Show the sequence number here (the frame index is always 0)
                        cv2.putText(image, f"Collecting frames for {action} Video Number {sequence}", (15,12),
                                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1, cv2.LINE_AA)
                        # Show to screen
                        cv2.imshow('OpenCV Feed', image)
                        # Two-second pause before each sequence starts
                        cv2.waitKey(2000)
                    else:
                        cv2.putText(image, f"Collecting frames for {action} Video Number {sequence} Frame {i_frame}", (15,12),
                                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1, cv2.LINE_AA)
                        # Show to screen
                        cv2.imshow('OpenCV Feed', image)

                    joint = extract_joint(results)

                    npy_path = os.path.join('Data', action, str(sequence), str(i_frame))
                    np.save(npy_path, joint)

                    # Break gracefully
                    if cv2.waitKey(10) & 0xFF == ord('q'):
                        stop_requested = True
                        break

                if stop_requested:
                    break
            if stop_requested:
                break
finally:
    # Release the camera and close the window even if collection fails
    cap.release()
    cv2.destroyAllWindows()

In [10]:
# Manual cleanup cell: release the current capture and close any OpenCV windows.
cap.release()
cv2.destroyAllWindows()

Crawl video

In [16]:
from msedge.selenium_tools import EdgeOptions, Edge
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By



from selenium import webdriver
from selenium.webdriver.edge.options import Options

# Point Selenium at a specific Microsoft Edge executable.
options = Options()
options.binary_location = r"C:\Program Files (x86)\Microsoft\Edge\Application\msedge.exe"

# Start the Edge WebDriver session used by download_video.
# NOTE(review): the recorded output of this cell is
# "TypeError: __init__() got an unexpected keyword argument 'options'" -
# presumably the installed Selenium release does not accept `options` here; confirm.
driver = webdriver.Edge(options = options)

# Page that is scanned for <video> elements.
url = ('https://qipedc.moet.gov.vn/dictionary')

def download_video(url, output_directory):
    """Download the source of every ``<video>`` element found on a page.

    Opens ``url`` in the module-level ``driver``, and for each ``<video>``
    element with a ``src`` attribute calls ``download_video_from_url`` to
    save it as ``video_<index>.mp4`` inside ``output_directory``. Progress
    and errors are printed; exceptions are caught and reported, not raised.

    The module-level ``driver`` is quit when the function finishes, so a new
    driver is needed before calling it again.

    Args:
        url: Address of the page to scan.
        output_directory: Directory that receives the downloaded files.

    Returns:
        None.
    """
    try:
        # Open the web page
        driver.get(url)

        # Find all <video> elements on the page. find_elements(By..., ...) is
        # used because find_elements_by_tag_name was removed in Selenium 4.
        video_elements = driver.find_elements(By.TAG_NAME, 'video')

        if video_elements:
            for i, video_element in enumerate(video_elements):
                video_url = video_element.get_attribute('src')
                if video_url:
                    # Build the file name for this video
                    video_filename = f"video_{i}.mp4"

                    # Build the full path of the video file
                    video_path = os.path.join(output_directory, video_filename)

                    # Download the video to disk
                    download_video_from_url(video_url, video_path)
                    print(f"Video {i} đã được tải xuống: {video_path}")
                else:
                    print(f"Không tìm thấy liên kết video cho video {i}.")
        else:
            print("Không tìm thấy video trên trang web.")
    except Exception as e:
        print(f"Lỗi: {e}")
    finally:
        # Close the browser once finished
        driver.quit()

def download_video_from_url(url, output_file):
    """Stream the content at ``url`` into ``output_file``.

    Nothing is written unless the server answers with status 200.

    Args:
        url: Address of the file to download.
        output_file: Path of the file to create or overwrite.

    Returns:
        ``True`` when the response status was 200 and the body was written,
        ``False`` otherwise.
    """
    # Imported here because the notebook does not import requests at the top
    import requests

    # The context manager returns the connection to the pool when done, and
    # the timeout keeps a stalled server from blocking the notebook forever.
    with requests.get(url, stream=True, timeout=30) as response:
        if response.status_code != 200:
            return False
        with open(output_file, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
    return True
                
# Download every <video> found at `url` into the current working directory.
download_video(url, '.')


TypeError: __init__() got an unexpected keyword argument 'options'

In [3]:
from selenium import webdriver
from selenium.webdriver.edge.options import Options

# Same driver setup as the previous cell, but pointing at the "Edge Dev" executable.
options = Options()
options.binary_location = r"C:\Program Files (x86)\Microsoft\Edge Dev\Application\msedge.exe"

# Start the Edge WebDriver session with those options.
driver = webdriver.Edge(options = options)

## BUILDING MODEL

### Joint Model Tranning

Demo joint model

In [1]:
# Live demo of the joint model: classify the last 30 frames of joint vectors
# from camera 0 and overlay the predicted action. Press 'q' to stop.
model_joint = load_model("joint.h5")
joint_window = []

# Class names are the entries of ./Data
# NOTE(review): os.listdir order is arbitrary; presumably it has to match the
# label order used when the model was trained - confirm.
actions = os.listdir('./Data')

label_map = {label: num for num, label in enumerate(actions)}

cap = cv2.VideoCapture(0)
try:
    # Set mediapipe model
    with mp.solutions.holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                # No frame was delivered; stop instead of passing None to cvtColor
                break

            image, results = mediapipe_detection(frame, holistic)

            draw_landmarks(image, results)

            joint, bone = extract_keypoints(results)

            # Keep only the 30 most recent frames
            joint_window.append(joint)
            joint_window = joint_window[-30:]

            if len(joint_window) == 30:
                # verbose=0 suppresses the per-call progress output
                J = model_joint.predict(np.expand_dims(joint_window, axis=0), verbose=0)[0]
                action = actions[np.argmax(J)]
                label_text = f"Action: {action}"
                cv2.putText(image, label_text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

            # Show to screen
            cv2.imshow('OpenCV Feed', image)

            # Break gracefully
            if cv2.waitKey(10) & 0xFF == ord('q'):
                break
finally:
    # Release the camera and close the window even if a frame fails to process
    cap.release()
    cv2.destroyAllWindows()

NameError: name 'load_model' is not defined

### Bone Model Tranning

Demo bone model

In [8]:
# Live demo of the bone model: classify the last 30 frames of bone vectors
# from camera 0 and overlay the predicted action. Press 'q' to stop.
model_bone = load_model("bone.h5")
bone_window = []

# Class names are the entries of ./Data
# NOTE(review): os.listdir order is arbitrary; presumably it has to match the
# label order used when the model was trained - confirm.
actions = os.listdir('./Data')

label_map = {label: num for num, label in enumerate(actions)}

cap = cv2.VideoCapture(0)
try:
    # Set mediapipe model
    with mp.solutions.holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                # No frame was delivered; stop instead of passing None to cvtColor
                break

            image, results = mediapipe_detection(frame, holistic)

            draw_landmarks(image, results)

            joint, bone = extract_keypoints(results)

            # Keep only the 30 most recent frames
            bone_window.append(bone)
            bone_window = bone_window[-30:]

            if len(bone_window) == 30:
                # verbose=0 suppresses the per-call progress output
                B = model_bone.predict(np.expand_dims(bone_window, axis=0), verbose=0)[0]
                action = actions[np.argmax(B)]
                label_text = f"Action: {action}"
                cv2.putText(image, label_text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

            # Show to screen
            cv2.imshow('OpenCV Feed', image)

            # Break gracefully
            if cv2.waitKey(10) & 0xFF == ord('q'):
                break
finally:
    # Release the camera and close the window even if a frame fails to process
    cap.release()
    cv2.destroyAllWindows()
    



KeyboardInterrupt: 

### Joint Motion Model Tranning

Demo joint motion model

In [7]:
# Live demo of the joint-motion model: classify the last 30 frame-to-frame
# joint differences from camera 0 and overlay the predicted action.
# Press 'q' to stop.
model_joint_motion = load_model("joint_motion.h5")
joint_motions = []
joint_window = []

# Class names are the entries of ./Data
# NOTE(review): os.listdir order is arbitrary; presumably it has to match the
# label order used when the model was trained - confirm.
actions = os.listdir('./Data')

label_map = {label: num for num, label in enumerate(actions)}

cap = cv2.VideoCapture(0)
try:
    # Set mediapipe model
    with mp.solutions.holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                # No frame was delivered; stop instead of passing None to cvtColor
                break

            image, results = mediapipe_detection(frame, holistic)

            draw_landmarks(image, results)

            joint, bone = extract_keypoints(results)

            if joint_window:
                # Motion = current joints minus the previous frame's joints
                JM = joint - joint_window[-1]

                joint_motions.append(JM)
                joint_motions = joint_motions[-30:]

            joint_window.append(joint)
            joint_window = joint_window[-30:]

            if len(joint_motions) == 30:
                # verbose=0 suppresses the per-call progress output
                JM = model_joint_motion.predict(np.expand_dims(joint_motions, axis=0), verbose=0)[0]
                action = actions[np.argmax(JM)]
                label_text = f"Action: {action}"
                cv2.putText(image, label_text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

            # Show to screen
            cv2.imshow('OpenCV Feed', image)

            # Break gracefully
            if cv2.waitKey(10) & 0xFF == ord('q'):
                break
finally:
    # Release the camera and close the window even if a frame fails to process
    cap.release()
    cv2.destroyAllWindows()



### Bone Motion Model Tranning

Demo bone motion model

In [6]:
# Live demo of the bone-motion model: classify the last 30 frame-to-frame
# bone differences from camera 0 and overlay the predicted action.
# Press 'q' to stop.
model_bone_motion = load_model("bone_motion.h5")
bone_motions = []
bone_window = []

# Class names are the entries of ./Data
# NOTE(review): os.listdir order is arbitrary; presumably it has to match the
# label order used when the model was trained - confirm.
actions = os.listdir('./Data')

label_map = {label: num for num, label in enumerate(actions)}

cap = cv2.VideoCapture(0)
try:
    # Set mediapipe model
    with mp.solutions.holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                # No frame was delivered; stop instead of passing None to cvtColor
                break

            image, results = mediapipe_detection(frame, holistic)

            draw_landmarks(image, results)

            joint, bone = extract_keypoints(results)

            if bone_window:
                # Motion = current bones minus the previous frame's bones
                BM = bone - bone_window[-1]

                bone_motions.append(BM)
                bone_motions = bone_motions[-30:]

            bone_window.append(bone)
            bone_window = bone_window[-30:]

            if len(bone_motions) == 30:
                # verbose=0 suppresses the per-call progress output
                BM = model_bone_motion.predict(np.expand_dims(bone_motions, axis=0), verbose=0)[0]
                action = actions[np.argmax(BM)]
                label_text = f"Action: {action}"
                cv2.putText(image, label_text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

            # Show to screen
            cv2.imshow('OpenCV Feed', image)

            # Break gracefully
            if cv2.waitKey(10) & 0xFF == ord('q'):
                break
finally:
    # Release the camera and close the window even if a frame fails to process
    cap.release()
    cv2.destroyAllWindows()



## DEPLOYMENT

In [9]:
# Deployment demo: run the joint, bone, joint-motion and bone-motion models on
# the last 30 frames from camera 0, combine their outputs with equal weights
# and overlay the resulting action. Press 'q' to stop.
model_joint = load_model("joint.h5")
model_bone = load_model("bone.h5")
model_joint_motion = load_model("joint_motion.h5")
model_bone_motion = load_model("bone_motion.h5")
joint_motions = []
bone_motions = []
joint_window = []  # Joint window for storing joint data every frame
bone_window = []

# Class names are the entries of ./Data
# NOTE(review): os.listdir order is arbitrary; presumably it has to match the
# label order used when the models were trained - confirm.
actions = os.listdir('./Data')

label_map = {label: num for num, label in enumerate(actions)}

cap = cv2.VideoCapture(0)
try:
    # Set mediapipe model
    with mp.solutions.holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                # No frame was delivered; stop instead of passing None to cvtColor
                break

            image, results = mediapipe_detection(frame, holistic)

            draw_landmarks(image, results)

            joint, bone = extract_keypoints(results)

            # Both windows are appended together below, so a non-empty joint
            # window means the bone window has a previous frame as well
            if joint_window:
                JM = joint - joint_window[-1]
                BM = bone - bone_window[-1]

                joint_motions.append(JM)
                joint_motions = joint_motions[-30:]

                bone_motions.append(BM)
                bone_motions = bone_motions[-30:]

            joint_window.append(joint)
            joint_window = joint_window[-30:]

            bone_window.append(bone)
            bone_window = bone_window[-30:]

            # The motion lists fill one frame later than the raw windows, so
            # all four inputs hold 30 frames once the motions do
            if len(joint_motions) == 30:
                # verbose=0 suppresses the per-call progress output
                J = model_joint.predict(np.expand_dims(joint_window, axis=0), verbose=0)[0]

                B = model_bone.predict(np.expand_dims(bone_window, axis=0), verbose=0)[0]

                JM = model_joint_motion.predict(np.expand_dims(joint_motions, axis=0), verbose=0)[0]

                BM = model_bone_motion.predict(np.expand_dims(bone_motions, axis=0), verbose=0)[0]

                # Weight given to each model's output, in the order J, B, JM, BM
                weights = np.array([0.25, 0.25, 0.25, 0.25])

                # Weighted sum of the four prediction vectors
                ensemble_prob = np.dot(weights, [J, B, JM, BM])

                action = actions[np.argmax(ensemble_prob)]
                label_text = f"Action: {action}"
                cv2.putText(image, label_text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

            # Show to screen
            cv2.imshow('OpenCV Feed', image)

            # Break gracefully
            if cv2.waitKey(10) & 0xFF == ord('q'):
                break
finally:
    # Release the camera and close the window even if a frame fails to process
    cap.release()
    cv2.destroyAllWindows()

