In [19]:
import cv2
import json
from xgboost import XGBClassifier

import numpy as np
import pickle
import mediapipe as mp
from sklearn.preprocessing import LabelEncoder  # Corrected import for LabelEncoder

# Paths
# Hard-coded absolute Windows paths to the pickled model, the label-map JSON
# file and the input video; edit these to run on another machine.
MODEL_PATH = r"D:\data_ISL\new_project\models\xgboost_xgboost.pickle.dat"
LABEL_MAP_PATH = r"D:\data_ISL\new_project\label_maps\label_map.json"
VIDEO_PATH = r"D:\data_ISL\new_project\data_all\Rulers\a2_v2.mp4"

# Load the label map into a module-level global.
# NOTE(review): inference_pipeline() re-reads the label map from its own
# `label_map_path` argument and does not use this global.
with open(LABEL_MAP_PATH, "r") as f:
        label_map = json.load(f)

# Updated: Flatten and Pad Function
def flatten_and_pad(keypoints, max_seq_len=200):
    """
    Pad or truncate a keypoint sequence along its first (time) axis, then flatten.

    Args:
        keypoints: Array-like whose first axis indexes frames, e.g. shape
            (num_frames, num_landmarks, 2). Any number of trailing axes is
            accepted; only the first axis is padded or truncated.
        max_seq_len: Number of frames kept in the result before flattening.

    Returns:
        A 1-D numpy array. Sequences shorter than ``max_seq_len`` are
        zero-padded at the end of the time axis; longer sequences are cut to
        their first ``max_seq_len`` frames.
    """
    keypoints = np.array(keypoints)
    num_frames = keypoints.shape[0]

    if num_frames < max_seq_len:
        # Build the pad spec from the actual rank instead of assuming 3 axes:
        # pad only at the end of the time axis, leave all other axes untouched.
        pad_width = [(0, max_seq_len - num_frames)] + [(0, 0)] * (keypoints.ndim - 1)
        padded_keypoints = np.pad(keypoints, pad_width, mode="constant")
    else:
        padded_keypoints = keypoints[:max_seq_len]  # Truncate to max_seq_len

    return padded_keypoints.flatten()


# Updated: Process Video Function
def extract_keypoints_from_video(video_path, max_seq_len=200):
    """
    Extract per-frame pose and hand keypoints from a video with MediaPipe Holistic.

    Every frame contributes 75 ``[x, y]`` pairs in a fixed order: 33 pose
    landmarks, 21 left-hand landmarks, 21 right-hand landmarks. A landmark
    group that MediaPipe does not report for a frame is filled with zeros.

    Args:
        video_path: Path of the video file to read with OpenCV.
        max_seq_len: Number of frames the sequence is padded/truncated to
            (forwarded to ``flatten_and_pad``).

    Returns:
        The flattened, padded/truncated keypoint array produced by
        ``flatten_and_pad``.

    Raises:
        ValueError: If no frame could be read from ``video_path`` (missing or
            unreadable file), or if the collected per-frame keypoints do not
            have shape ``(75, 2)``.
    """
    mp_holistic = mp.solutions.holistic
    keypoints = []

    cap = cv2.VideoCapture(video_path)
    try:
        # The context manager closes the Holistic graph even if a frame fails.
        with mp_holistic.Holistic(
            static_image_mode=False, min_detection_confidence=0.5
        ) as holistic:
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break

                # MediaPipe expects RGB; OpenCV decodes frames as BGR.
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                results = holistic.process(frame_rgb)

                # Fixed order: pose, left hand, right hand (zeros when missing).
                combined = (
                    _landmark_xy(results.pose_landmarks, 33)
                    + _landmark_xy(results.left_hand_landmarks, 21)
                    + _landmark_xy(results.right_hand_landmarks, 21)
                )
                keypoints.append(combined)
    finally:
        # Always release the capture handle, including on errors.
        cap.release()

    if not keypoints:
        # Previously surfaced as a confusing "Expected shape (75, 2), but got ()".
        raise ValueError(f"No frames could be read from video: {video_path}")

    # Ensure keypoints array is consistent
    keypoints = np.array(keypoints)

    # Check for consistency
    if keypoints.shape[1:] != (75, 2):
        raise ValueError(f"Expected shape (75, 2), but got {keypoints.shape[1:]}")

    # Flatten and pad keypoints
    return flatten_and_pad(keypoints, max_seq_len)


def _landmark_xy(landmark_list, count):
    """Return ``[[x, y], ...]`` for a MediaPipe landmark list, or ``count`` zero pairs if it is missing."""
    if not landmark_list:
        return [[0, 0]] * count
    return [[landmark.x, landmark.y] for landmark in landmark_list.landmark]

# Inference Pipeline
def inference_pipeline(video_path, model_path, label_map_path):
    """
    Full inference pipeline for extracting keypoints and making predictions.

    Args:
        video_path: Path of the video to classify.
        model_path: Path of the pickled model; the loaded object must expose
            ``predict``.
        label_map_path: Path of a JSON file mapping label names to the class
            ids the model predicts.

    Returns:
        The label-map key whose value equals the model's prediction for the video.

    Raises:
        KeyError: If the predicted class id is not a value in the label map.
        ValueError: Propagated from ``extract_keypoints_from_video`` when the
            extracted keypoints do not have the expected shape.
    """
    # Step 1: Load the model
    print("Loading model...")
    # SECURITY: pickle.load can execute arbitrary code embedded in the file;
    # only point model_path at model files you trust.
    with open(model_path, "rb") as f:
        model = pickle.load(f)

    # Step 2: Load the label map
    print("Loading label map...")
    with open(label_map_path, "r") as f:
        # Parse as JSON rather than eval(): eval executes file contents as
        # Python code and fails on JSON literals such as true/false/null.
        label_map = json.load(f)

    # Reverse the label map for decoding predictions
    reverse_label_map = {v: k for k, v in label_map.items()}

    # Step 3: Extract keypoints from the video
    print(f"Extracting keypoints from video: {video_path}")
    keypoints = extract_keypoints_from_video(video_path)
    keypoints = keypoints.reshape(1, -1)  # Single-sample batch for model input

    # Step 4: Model inference
    print("Running inference...")
    prediction = model.predict(keypoints)[0]

    # Step 5: Decode the prediction
    predicted_word = reverse_label_map[prediction]

    return predicted_word

# Run the pipeline
# Entry point: classify the video at VIDEO_PATH using the model at MODEL_PATH
# and the label map at LABEL_MAP_PATH, then print the decoded label.
if __name__ == "__main__":
    predicted_word = inference_pipeline(VIDEO_PATH, MODEL_PATH, LABEL_MAP_PATH)
    print(f"Predicted word: {predicted_word}")


Loading model...
Loading label map...
Extracting keypoints from video: D:\data_ISL\new_project\data_all\Rulers\a2_v2.mp4
Running inference...
Predicted word: rulers



    E.g. tree_method = "hist", device = "cuda"



In [1]:
# Environment check: print the installed MediaPipe version.
import mediapipe as mp
print(mp.__version__)


0.10.18
