In [5]:
import cv2
import numpy as np
from ultralytics import YOLO
import albumentations as A
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.applications.mobilenet_v2 import preprocess_input
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

# Function to apply bilateral filtering and CLAHE
def preprocess_image(img):
    """Denoise a BGR image and enhance its local contrast.

    The image is smoothed with an edge-preserving bilateral filter, then
    CLAHE is applied to the lightness channel only (in LAB space) so the
    colour channels are left untouched.

    Args:
        img: BGR image as accepted by ``cv2.bilateralFilter``.

    Returns:
        The filtered, contrast-enhanced image in BGR colour space.
    """
    # Edge-preserving noise reduction (d=9, sigmaColor=75, sigmaSpace=75).
    denoised = cv2.bilateralFilter(img, 9, 75, 75)

    # Work in LAB so that only lightness is equalised.
    lightness, chan_a, chan_b = cv2.split(cv2.cvtColor(denoised, cv2.COLOR_BGR2LAB))

    equalizer = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    enhanced_lab = cv2.merge((equalizer.apply(lightness), chan_a, chan_b))

    return cv2.cvtColor(enhanced_lab, cv2.COLOR_LAB2BGR)

# Fine-tuning MobileNetV2 for classification (drowsy vs awake)
def load_mobilenetv2(weights_path=None):
    """Build the MobileNetV2-based binary classifier (drowsy vs awake).

    An ImageNet-pretrained MobileNetV2 backbone (without its top) is followed
    by global average pooling and a single sigmoid unit. The backbone layers
    are frozen and the model is compiled with Adam (lr=1e-4) and binary
    cross-entropy.

    Args:
        weights_path: Optional path to previously saved weights for this
            architecture. When omitted (the default) the classification head
            keeps its random initialisation, so predictions are not
            meaningful until the model has been trained.

    Returns:
        The compiled Keras model.
    """
    base_model = MobileNetV2(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

    # Custom classification head: pooled features -> one sigmoid probability.
    pooled = GlobalAveragePooling2D()(base_model.output)
    predictions = Dense(1, activation='sigmoid')(pooled)

    model = Model(inputs=base_model.input, outputs=predictions)

    # Freeze the backbone so only the new head is trainable.
    for layer in base_model.layers:
        layer.trainable = False

    model.compile(optimizer=Adam(learning_rate=0.0001), loss='binary_crossentropy', metrics=['accuracy'])

    # Restore fine-tuned weights when the caller provides them; without this
    # the head is untrained.
    if weights_path is not None:
        model.load_weights(weights_path)

    return model

# Load the trained YOLO detector and build the MobileNetV2 classifier
yolo_model = YOLO(r"G:\My Drive\finalyolo\runs\train_results (2)\weights\best.pt")  # Replace with your YOLOv8 model path
# NOTE(review): load_mobilenetv2() attaches a freshly initialised Dense head and
# no fine-tuned weights are loaded in this file, so the classifier output is
# presumably not meaningful until the head has been trained -- confirm.
mobilenet_model = load_mobilenetv2()

# Augmentation pipeline applied to each detected region before classification:
# a random resized crop producing a 224x224 image. Bounding boxes are expected
# in YOLO format together with a 'class_labels' field.
augmentation_pipeline = A.Compose([
    A.RandomResizedCrop(width=224, height=224, scale=(0.8, 1.0), ratio=(0.9, 1.1), p=1.0)
], bbox_params=A.BboxParams(format='yolo', label_fields=['class_labels']))

def process_real_time_video():
    """Run detection and classification on the default webcam until 'q' is pressed.

    Each frame is enhanced with ``preprocess_image`` and passed to the YOLO
    model. Every detected box is clamped to the frame, cropped, run through
    ``augmentation_pipeline`` and classified with ``classify_region``. Boxes
    and labels are drawn on the original frame, which is shown in an OpenCV
    window. The loop also ends when the capture stops returning frames.

    The capture device and all windows are released even if an error occurs
    while processing a frame.
    """
    # Open video capture (0 for the default webcam)
    cap = cv2.VideoCapture(0)

    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # Preprocess the frame
            img_preprocessed = preprocess_image(frame)
            h, w = img_preprocessed.shape[:2]

            # Perform inference with YOLOv8 to detect faces or regions of interest
            results = yolo_model(img_preprocessed)

            for result in results:
                boxes = result.boxes.xyxy.cpu().numpy().astype(np.int64)
                for box in boxes:
                    x1, y1, x2, y2 = (int(coord) for coord in box)

                    # Clamp the box to the frame so the slice below is valid.
                    x1 = max(0, min(x1, w))
                    y1 = max(0, min(y1, h))
                    x2 = max(0, min(x2, w))
                    y2 = max(0, min(y2, h))

                    # Skip degenerate boxes: an empty crop (and a zero-sized
                    # bbox) can make the augmentation pipeline raise before
                    # classify_region gets the chance to report 'Unknown'.
                    if x2 <= x1 or y2 <= y1:
                        continue

                    region = img_preprocessed[y1:y2, x1:x2]

                    # Normalize the bounding box coordinates for YOLO format (0 to 1 range)
                    bbox_yolo_format = [
                        (x1 + x2) / 2 / w,  # center x
                        (y1 + y2) / 2 / h,  # center y
                        (x2 - x1) / w,      # width
                        (y2 - y1) / h       # height
                    ]

                    # Apply augmentation before classification. Only the
                    # transformed image is used; the returned bboxes are ignored.
                    transformed = augmentation_pipeline(
                        image=region,
                        bboxes=[bbox_yolo_format],
                        class_labels=[0]
                    )

                    # Classify the region using MobileNetV2
                    state = classify_region(transformed['image'], mobilenet_model)

                    # Draw bounding box and label on the frame
                    if state != 'Unknown':
                        cv2.rectangle(frame, (x1, y1), (x2, y2), (255, 0, 0), 2)
                        # Keep the label on screen when the box touches the
                        # top edge by drawing it just inside the box instead.
                        label_y = y1 - 10 if y1 - 10 >= 20 else y1 + 25
                        cv2.putText(frame, state, (x1, label_y), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)

            # Display the frame with annotations
            cv2.imshow('Drowsiness Detection', frame)

            # Break the loop if 'q' is pressed
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # Release the video capture and close windows, even on error.
        cap.release()
        cv2.destroyAllWindows()

# Function to classify eye/mouth state using MobileNetV2
def classify_region(region, model):
    """Classify a cropped image region as 'Drowsy' or 'Awake'.

    Args:
        region: Image array for the cropped region (height x width x channels).
        model: Keras model whose ``predict`` returns one sigmoid probability
            per input image.

    Returns:
        'Unknown' when ``region`` is ``None`` or empty, 'Drowsy' when the
        predicted probability is above 0.5, otherwise 'Awake'.
    """
    # size == 0 already covers a zero height or zero width.
    if region is None or region.size == 0:
        return 'Unknown'

    # NOTE(review): frames coming from OpenCV are BGR, while the ImageNet
    # MobileNetV2 weights presumably expect RGB -- confirm against how the
    # classifier was trained before adding a colour conversion here.
    region_resized = cv2.resize(region, (224, 224))  # Resize to 224x224
    region_array = img_to_array(region_resized)
    region_preprocessed = preprocess_input(np.expand_dims(region_array, axis=0))

    # verbose=0 avoids printing a progress bar for every region of every frame.
    prediction = model.predict(region_preprocessed, verbose=0)

    # predict returns a (1, 1) array; compare an explicit scalar rather than
    # relying on the truth value of an array.
    probability = float(np.asarray(prediction).reshape(-1)[0])

    return 'Drowsy' if probability > 0.5 else 'Awake'

# Start real-time video processing; the call blocks until 'q' is pressed
# or the capture stops returning frames.
process_real_time_video()


0: 480x640 (no detections), 18.9ms
Speed: 1.0ms preprocess, 18.9ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 15.3ms
Speed: 1.1ms preprocess, 15.3ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 15.9ms
Speed: 1.5ms preprocess, 15.9ms inference, 1.3ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 9.3ms
Speed: 1.0ms preprocess, 9.3ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 16.2ms
Speed: 1.0ms preprocess, 16.2ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 15.6ms
Speed: 2.0ms preprocess, 15.6ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 12.2ms
Speed: 2.0ms preprocess, 12.2ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 6.0ms
Speed: 1.7ms preprocess, 6.0ms infer