In [1]:
import cv2
import numpy as np
import mediapipe as mp
import json
import coremltools as ct

TensorFlow version 2.13.0 has not been tested with coremltools. You may run into unexpected errors. TensorFlow 2.12.0 is the most recent version that has been tested.
Torch version 2.4.1+cu118 has not been tested with coremltools. You may run into unexpected errors. Torch 2.4.0 is the most recent version that has been tested.
Fail to import BlobReader from libmilstoragepython. No module named 'coremltools.libmilstoragepython'
Failed to load _MLModelProxy: No module named 'coremltools.libcoremlpython'
Fail to import BlobWriter from libmilstoragepython. No module named 'coremltools.libmilstoragepython'


In [3]:
class ASLRecognizerCoreML:
    """Real-time ASL hand-sign recognizer.

    Pipeline per frame: MediaPipe extracts the landmarks of one hand, the
    flattened (x, y, z) coordinates are min-max scaled with stored scaler
    parameters, and a CoreML model classifies the resulting (1, 21, 3) tensor.
    The predicted label and its score are drawn onto the frame.
    """

    def __init__(self,
                 model_path='asl_recognition_model811.mlpackage',
                 scaler_params_path='scaler_params.json',
                 mappings_path='label_mappings.json'):
        """Load the model, scaler parameters and label map, then start MediaPipe.

        Args:
            model_path: Path of the CoreML model package to load.
            scaler_params_path: JSON file providing 'data_min_' and
                'data_range_' lists (one value per flattened landmark coordinate).
            mappings_path: JSON file with a 'reverse_label_map' object mapping
                class indices (as strings) to labels.

        Raises:
            Whatever coremltools / open() / json raise when a resource cannot
            be loaded.
        """
        # Load everything that can fail *before* creating the MediaPipe graph,
        # so a missing file or an unloadable model does not leak a Hands
        # instance that nobody will ever close.
        self.model = ct.models.MLModel(model_path)
        print("Model input description:", self.model.input_description)
        print("Model output description:", self.model.output_description)

        # Load scaler parameters
        with open(scaler_params_path, 'r') as f:
            self.scaler_params = json.load(f)

        # Load label mappings; JSON object keys are strings, the model yields
        # integer class indices, so convert the keys once here.
        with open(mappings_path, 'r') as f:
            mappings = json.load(f)
        self.reverse_label_map = {int(k): v for k, v in mappings['reverse_label_map'].items()}

        # Initialize display parameters (colors are BGR tuples for OpenCV)
        self.font = cv2.FONT_HERSHEY_SIMPLEX
        self.colors = {
            'blue': (255, 0, 0),
            'green': (0, 255, 0),
            'red': (0, 0, 255),
            'white': (255, 255, 255)
        }

        # Initialize MediaPipe last (see note above)
        self.mp_hands = mp.solutions.hands
        self.mp_drawing = mp.solutions.drawing_utils
        self.mp_drawing_styles = mp.solutions.drawing_styles
        self.hands = self.mp_hands.Hands(
            static_image_mode=False,
            max_num_hands=1,
            min_detection_confidence=0.7,
            min_tracking_confidence=0.5
        )

    def scale_landmarks(self, landmarks):
        """Min-max scale the landmarks using the stored scaler parameters.

        Computes ``(landmarks - data_min_) / data_range_`` element-wise.

        Args:
            landmarks: Array-like of flattened landmark coordinates, same
                length as the stored 'data_min_' / 'data_range_' lists.

        Returns:
            A float64 numpy array of scaled values.
        """
        data_min = np.asarray(self.scaler_params['data_min_'], dtype=np.float64)
        data_range = np.asarray(self.scaler_params['data_range_'], dtype=np.float64)
        # A feature that was constant when the scaler was fitted has a range of
        # zero; dividing by it would feed NaN/inf to the model. Treat such a
        # range as 1 (presumably the parameters come from a scikit-learn
        # MinMaxScaler, which applies the same convention -- confirm).
        safe_range = np.where(data_range == 0, 1.0, data_range)
        return (np.asarray(landmarks, dtype=np.float64) - data_min) / safe_range

    def extract_landmarks(self, image):
        """Detect a hand in a BGR image and return its landmarks.

        The detected hand skeleton is drawn onto ``image`` in place.

        Args:
            image: BGR image (as delivered by OpenCV).

        Returns:
            Tuple ``(landmarks, image)`` where ``landmarks`` is a flat numpy
            array of x, y, z per landmark for the first detected hand, or
            ``None`` when no hand was found; ``image`` is the same (possibly
            annotated) array that was passed in.
        """
        # MediaPipe expects RGB input, OpenCV delivers BGR.
        image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        results = self.hands.process(image_rgb)

        landmarks = None
        if results.multi_hand_landmarks:
            # Get landmarks of the first hand
            hand_landmarks = results.multi_hand_landmarks[0]
            # Convert to a flat array: x0, y0, z0, x1, y1, z1, ...
            landmarks = np.array([[lm.x, lm.y, lm.z] for lm in hand_landmarks.landmark]).flatten()

            # Draw landmarks on the image
            self.mp_drawing.draw_landmarks(
                image,
                hand_landmarks,
                self.mp_hands.HAND_CONNECTIONS,
                self.mp_drawing_styles.get_default_hand_landmarks_style(),
                self.mp_drawing_styles.get_default_hand_connections_style()
            )

        return landmarks, image

    def predict(self, landmarks):
        """Classify one set of hand landmarks with the CoreML model.

        Args:
            landmarks: Flat array of 63 values (21 landmarks x 3 coordinates).

        Returns:
            Tuple ``(label, confidence)``: the label mapped from the highest
            scoring class index and that score as a Python float.

        Raises:
            ValueError: If ``landmarks`` cannot be reshaped to (1, 21, 3).
            KeyError: If the predicted class index is not in the label map.
        """
        # Scale, then shape for the model: (batch_size, 21, 3) as float32.
        scaled_landmarks = self.scale_landmarks(landmarks)
        input_data = np.asarray(scaled_landmarks, dtype=np.float32).reshape(1, 21, 3)

        # Use the model's first declared input / output feature. Iterating the
        # description yields feature names; `.keys()[0]` does not work here
        # (a keys view is not subscriptable in Python 3).
        input_feature_name = next(iter(self.model.input_description))
        output_feature_name = next(iter(self.model.output_description))

        # Make prediction
        prediction = self.model.predict({input_feature_name: input_data})
        prediction_array = np.asarray(prediction[output_feature_name])

        # Plain Python scalars keep dict lookups and formatting predictable.
        predicted_class = int(np.argmax(prediction_array))
        confidence = float(np.max(prediction_array))

        return self.reverse_label_map[predicted_class], confidence

    def add_prediction_text(self, image, prediction, confidence):
        """Draw "<LABEL>: <confidence>" on a filled box in the top-left corner.

        Args:
            image: Image to draw on (modified in place).
            prediction: Predicted label; converted with ``str`` before
                upper-casing so non-string labels do not raise.
            confidence: Score shown with two decimals.
        """
        text = f"{str(prediction).upper()}: {confidence:.2f}"
        text_size = cv2.getTextSize(text, self.font, 1, 2)[0]
        text_x = 10
        text_y = 50

        # Background rectangle sized to the text with a 5 px margin
        cv2.rectangle(image,
                      (text_x - 5, text_y - text_size[1] - 5),
                      (text_x + text_size[0] + 5, text_y + 5),
                      self.colors['blue'],
                      -1)

        # Add text
        cv2.putText(image, text,
                    (text_x, text_y),
                    self.font, 1, self.colors['white'], 2)

    def add_fps_counter(self, image, fps):
        """Draw the FPS value (one decimal) below the prediction text, in place."""
        cv2.putText(image, f"FPS: {fps:.1f}",
                    (10, 90),
                    self.font, 0.7, self.colors['green'], 2)

    def process_frame(self, frame):
        """Annotate a single frame with hand landmarks and the prediction.

        Prediction errors are reported on stdout and otherwise ignored so that
        one bad frame does not stop the video loop.

        Args:
            frame: BGR image; annotated in place.

        Returns:
            The annotated frame.
        """
        # Extract landmarks
        landmarks, annotated_frame = self.extract_landmarks(frame)

        if landmarks is not None:
            try:
                # Make prediction
                prediction, confidence = self.predict(landmarks)
                # Add prediction text
                self.add_prediction_text(annotated_frame, prediction, confidence)
            except Exception as e:
                print(f"Prediction error: {str(e)}")

        return annotated_frame

    def run_webcam(self):
        """Run real-time recognition on webcam 0 until 'q' is pressed.

        The camera, the OpenCV windows and the MediaPipe Hands instance are
        released when the loop ends, including when it ends with an exception
        or a keyboard interrupt. Because ``self.hands`` is closed here,
        presumably a fresh instance is needed for another session.
        """
        cap = cv2.VideoCapture(0)
        if not cap.isOpened():
            # Previously this case fell through silently with no window shown.
            print("Could not open webcam")

        # FPS calculation variables
        tick_frequency = cv2.getTickFrequency()
        prev_time = cv2.getTickCount()

        try:
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    print("Failed to grab frame")
                    break

                # Flip frame horizontally for selfie-view
                frame = cv2.flip(frame, 1)

                # Process frame
                output_frame = self.process_frame(frame)

                # Calculate FPS; guard against a zero interval between ticks
                curr_time = cv2.getTickCount()
                frame_time = (curr_time - prev_time) / tick_frequency
                fps = 1.0 / frame_time if frame_time > 0 else 0.0
                prev_time = curr_time

                # Add FPS counter
                self.add_fps_counter(output_frame, fps)

                # Display result
                cv2.imshow('ASL Recognition (CoreML)', output_frame)

                # Break loop on 'q' press
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break
        finally:
            # Clean up even if processing raised, so the camera is not left
            # locked by this kernel.
            cap.release()
            cv2.destroyAllWindows()
            self.hands.close()

In [7]:
# Build the recognizer and run the webcam loop. Failures are reported instead
# of raised so the cell always reaches the window cleanup below.
try:
    # Initialize and run the recognizer
    recognizer = ASLRecognizerCoreML()
    recognizer.run_webcam()
except Exception as e:
    # Include the exception type: the bare message alone (e.g. "Unable to load
    # libmodelpackage...") does not say which kind of error was raised.
    print(f"Error occurred: {type(e).__name__}: {e}")
finally:
    # Make sure no OpenCV window is left open, whatever happened above.
    cv2.destroyAllWindows()

Error occurred: Unable to load libmodelpackage. Cannot make save spec.
