In [4]:
from ultralytics import YOLO
import cv2
import numpy as np
import torch
from torchvision import models, transforms
from collections import deque

# Load the pretrained YOLOv8m model
model_yolo = YOLO('yolov8m.pt')

# Load a pre-trained 3D-CNN model for action recognition (e.g., ResNet3D-18)
model_i3d = models.video.r3d_18(pretrained=True)  # Using ResNet3D-18 as an example
model_i3d.eval()

# Action labels for action recognition (replace with your specific labels)
action_labels = {0: 'fighting', 1: 'running', 2: 'stealing'}  # Example action classes

# Initialize video capture
video_source = "/data/BigBuckBunny_320x180.mp4"  # Replace with your video source
cap = cv2.VideoCapture(video_source)

# Initialize a deque to store a sequence of frames for action recognition
frame_sequence = deque(maxlen=16)  # Example using 16 frames for temporal context

# Define transformation for 3D-CNN input
transform = transforms.Compose([
    transforms.Resize((112, 112)),
    transforms.ToTensor()
])

def process_frames(frames):
    """
    Prepare a batch of frames for the 3D-CNN action recognition model.
    """
    # Apply the transformations and stack them
    batch = torch.stack([transform(frame) for frame in frames], dim=1)  # Shape: (3, 16, 112, 112)
    return batch.unsqueeze(0)  # Shape: (1, 3, 16, 112, 112)

while True:
    # Read a frame from the video source
    ret, frame = cap.read()
    
    if not ret:
        print("End of video stream or error.")
        break

    # Perform YOLO detection on the current frame
    results = model_yolo(frame)
    annotated_frame = results[0].plot()

    # Print detected objects and their confidence scores
    detected_objects = results[0].boxes.data.cpu().numpy()  # Get detected boxes
    for obj in detected_objects:
        class_id = int(obj[5])  # Get class ID of detected object
        confidence = obj[4]  # Confidence score of the detection
        object_name = model_yolo.names[class_id]  # Convert class ID to object name using YOLO model's class names
        print(f"Detected: {object_name} with confidence {confidence:.2f}")

    # Display YOLO object detection result
    cv2.imshow('YOLO Object Detection', annotated_frame)

    # Resize and convert the frame to RGB for 3D-CNN input
    rgb_frame = cv2.cvtColor(cv2.resize(frame, (112, 112)), cv2.COLOR_BGR2RGB)
    frame_sequence.append(rgb_frame)

    # Check if we have enough frames for action recognition
    if len(frame_sequence) == 16:
        # Process the frames and prepare a batch for 3D-CNN
        input_batch = process_frames(list(frame_sequence))

        # Run the 3D-CNN model to predict actions
        with torch.no_grad():
            action_scores = model_i3d(input_batch)

        # Get the predicted action
        predicted_action = torch.argmax(action_scores, dim=1).item()
        
        # Print the detected activity in human-readable form
        print(f"Predicted Action: {action_labels.get(predicted_action, 'Unknown')}")

        # Reset the frame sequence after processing
        frame_sequence.clear()

    # Press 'q' to exit the stream
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# Release the video capture object and close all OpenCV windows
cap.release()
cv2.destroyAllWindows()


End of video stream or error.


OpenCV: Couldn't read video stream from file "BigBuckBunny_320x180.mp4"
