In [1]:
from transformers import pipeline
import cv2
from PIL import Image
import hashlib

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
def process_video_with_thresholds(video_path, output_path, candidate_labels, box_threshold=0.3, text_threshold=0.25):
    """
    Process 8 frames from every second of video with Grounding DINO
    Creates output video with 8 FPS
    """
    # Create detector
    detector = pipeline(
        "zero-shot-object-detection",
        model="IDEA-Research/grounding-dino-base"
    )
    
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"Error: Could not open video file {video_path}")
        return
    
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    duration = total_frames / fps
    
    print(f"Input video: {fps} FPS, {total_frames} frames, {duration:.1f} seconds")
    print(f"Processing with thresholds - Box: {box_threshold}, Text: {text_threshold}")
    print(f"Looking for: {candidate_labels}")
    
    # Output video will be 8 FPS
    output_fps = 8
    frame_interval = fps // output_fps
    if frame_interval == 0:
        frame_interval = 1
    
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_path, fourcc, output_fps, (width, height))
    
    print(f"Sampling every {frame_interval} frames -> {output_fps} FPS output")
    
    frame_count = 0
    processed_count = 0
    
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        
        # Process frames at regular intervals to get 8 FPS
        if frame_count % frame_interval == 0:
            # Convert to PIL and process
            pil_image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            detections = detector(pil_image, candidate_labels=candidate_labels, threshold=box_threshold)
            
            # Draw detections
            for detection in detections:
                if detection['score'] >= text_threshold:
                    box = detection['box']
                    label = detection['label']
                    score = detection['score']
                    
                    # Generate consistent color from label
                    color_hash = hashlib.md5(label.encode()).hexdigest()[:6]
                    color = tuple(int(color_hash[i:i+2], 16) for i in (0, 2, 4))
                    
                    # Coordinates
                    x1, y1, x2, y2 = int(box['xmin']), int(box['ymin']), int(box['xmax']), int(box['ymax'])
                    
                    # Draw box and label
                    cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)
                    cv2.putText(frame, f"{label}: {score:.2f}", (x1, y1-10), 
                               cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)
            
            # Save frame
            out.write(frame)
            processed_count += 1
            
            if processed_count % 30 == 0:
                print(f"Processed {processed_count} frames (input frame {frame_count})")
        
        frame_count += 1
    
    cap.release()
    out.release()
    
    output_duration = processed_count / output_fps
    print(f"Output saved to: {output_path}")
    print(f"Input: {frame_count} frames at {fps} FPS ({duration:.1f}s)")
    print(f"Output: {processed_count} frames at {output_fps} FPS ({output_duration:.1f}s)")

In [12]:
def process_video_with_thresholds(video_path, output_path, candidate_labels, box_threshold=0.3, text_threshold=0.25):
    """
    Process 8 frames from every second of video with Grounding DINO
    Creates output video with 8 FPS
    """
    # Create detector
    detector = pipeline(
        "zero-shot-object-detection",
        model="IDEA-Research/grounding-dino-base"
    )
    
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"Error: Could not open video file {video_path}")
        return
    
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    duration = total_frames / fps
    
    print(f"Input video: {fps} FPS, {total_frames} frames, {duration:.1f} seconds")
    print(f"Processing with thresholds - Box: {box_threshold}, Text: {text_threshold}")
    print(f"Looking for: {candidate_labels}")
    
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    processed_count = 0
    
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        
        pil_image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        detections = detector(pil_image, candidate_labels=candidate_labels, threshold=box_threshold)
        
        # Draw detections
        for detection in detections:
            if detection['score'] >= text_threshold:
                box = detection['box']
                label = detection['label']
                score = detection['score']
                
                # Generate consistent color from label
                color_hash = hashlib.md5(label.encode()).hexdigest()[:6]
                color = tuple(int(color_hash[i:i+2], 16) for i in (0, 2, 4))
                
                # Coordinates
                x1, y1, x2, y2 = int(box['xmin']), int(box['ymin']), int(box['xmax']), int(box['ymax'])
                
                # Draw box and label
                cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)
                cv2.putText(frame, f"{label}: {score:.2f}", (x1, y1-10), 
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)
        
        # Save frame
        out.write(frame)
        processed_count += 1
        
        if processed_count % 30 == 0:
            print(f"Processed {processed_count} frames")
    
    cap.release()
    out.release()
    
    print(f"Output saved to: {output_path}")
    print(f"Input: {total_frames} frames at {fps} FPS ({duration:.1f}s)")

Видос от Тиграна

In [None]:
# process_video_with_thresholds(
#     "custom_dataset/video1.mp4",
#     "custom_dataset/video1_output.mp4", 
#     ["pull-up", "push-up", "person"],
#     box_threshold=0.3,
#     text_threshold=0.25
# )

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Device set to use cuda:0


Input video: 29 FPS, 13418 frames, 462.7 seconds
Processing with thresholds - Box: 0.3, Text: 0.25
Looking for: ['pull-up', 'push-up', 'person']
Sampling every 3 frames -> 8 FPS output


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Processed 30 frames (input frame 87)
Processed 60 frames (input frame 177)
Processed 90 frames (input frame 267)
Processed 120 frames (input frame 357)
Processed 150 frames (input frame 447)
Processed 180 frames (input frame 537)
Processed 210 frames (input frame 627)
Processed 240 frames (input frame 717)
Processed 270 frames (input frame 807)
Processed 300 frames (input frame 897)
Processed 330 frames (input frame 987)
Processed 360 frames (input frame 1077)
Processed 390 frames (input frame 1167)
Processed 420 frames (input frame 1257)
Processed 450 frames (input frame 1347)
Processed 480 frames (input frame 1437)
Processed 510 frames (input frame 1527)
Processed 540 frames (input frame 1617)
Processed 570 frames (input frame 1707)
Processed 600 frames (input frame 1797)
Processed 630 frames (input frame 1887)
Processed 660 frames (input frame 1977)
Processed 690 frames (input frame 2067)
Processed 720 frames (input frame 2157)
Processed 750 frames (input frame 2247)
Processed 780 f

Видос с кагла

In [13]:
process_video_with_thresholds(
    "kaggle_dataset/push-up_1.mp4",
    "kaggle_dataset/output4.mp4", 
    ["push-up ending"],
    box_threshold=0.3,
    text_threshold=0.25
)

Device set to use cuda:0


Input video: 29 FPS, 150 frames, 5.2 seconds
Processing with thresholds - Box: 0.3, Text: 0.25
Looking for: ['push-up ending']
Processed 30 frames
Processed 60 frames
Processed 90 frames
Processed 120 frames
Processed 150 frames
Output saved to: kaggle_dataset/output4.mp4
Input: 150 frames at 29 FPS (5.2s)
