In [2]:
from transformers import AutoImageProcessor, AutoModelForObjectDetection

# Load the image processor and the detection model from the same Hub
# checkpoint id; later cells use both objects as notebook-level globals.
processor, model = (
    AutoImageProcessor.from_pretrained("sansh2356/DETR_finetune"),
    AutoModelForObjectDetection.from_pretrained("sansh2356/DETR_finetune"),
)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import os

import torch
from PIL import Image, ImageDraw, ImageFont
from transformers import DetrImageProcessor, DetrForObjectDetection

# --- Single-image sanity check --------------------------------------------
# Runs the detector on one screenshot, prints the raw detections, draws them
# on the image, then shows and saves the annotated result.
image_path = "./Screenshot (760).png"
image = Image.open(image_path).convert("RGB")

inputs = processor(images=image, return_tensors="pt")
# Inference only: disable autograd so no graph is built and the returned
# tensors do not carry a grad_fn.
with torch.no_grad():
    outputs = model(**inputs)

# PIL's Image.size is (width, height); reverse it to (height, width) for the
# post-processing step, which rescales boxes to the original image size.
target_sizes = torch.tensor([image.size[::-1]])
results = processor.post_process_object_detection(outputs, target_sizes=target_sizes, threshold=0.9)[0]
print(type(results), results)
draw = ImageDraw.Draw(image)
font = ImageFont.load_default()

# Print every box coordinate at full float precision, one value per line.
for detected_box in results["boxes"]:
    for coordinate in detected_box:
        print(float(coordinate))

for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
    box = [round(coordinate, 2) for coordinate in box.tolist()]

    draw.rectangle(box, outline="red", width=3)

    label_text = f"{model.config.id2label[label.item()]}: {round(score.item(), 3)}"

    # Place the caption just above the box, clamped so it stays inside the
    # image when the box touches the top edge.
    draw.text((box[0], max(box[1] - 10, 0)), label_text, fill="red", font=font)

image.show()
# Image.save does not create missing directories, so make sure the target
# folder exists first.
os.makedirs("./test", exist_ok=True)
image.save("./test/annotated_image.png")


<class 'dict'> {'scores': tensor([0.9112], grad_fn=<IndexBackward0>), 'labels': tensor([1]), 'boxes': tensor([[ 857.2190,  493.8048, 1017.9589,  640.5778]],
       grad_fn=<IndexBackward0>)}
857.218994140625
493.8048400878906
1017.9588623046875
640.5777587890625


In [5]:
import os
import cv2
from PIL import Image, ImageDraw, ImageFont
import torch
import json
import numpy as np
import time

def process_video(video_path, output_video_path, detections_path="./output.jsonl", threshold=0.9):
    """Run object detection on every frame of a video and save the annotated copy.

    Each frame is passed through the notebook-level ``processor`` and ``model``.
    Detections scoring above ``threshold`` are drawn onto the frame and appended
    to ``detections_path`` as one JSON object per line. The annotated frame is
    shown in a preview window and written to ``output_video_path``. Pressing
    ``q`` in the preview window stops processing early.

    Args:
        video_path: File path or stream URL accepted by ``cv2.VideoCapture``.
        output_video_path: Destination of the annotated ``mp4v`` video. Its
            parent directory is created when missing.
        detections_path: JSON-lines file that detection records are appended
            to. Frames without detections get a single record whose ``box``,
            ``label`` and ``predicted_confidence_score`` are the string
            ``"None"``.
        threshold: Minimum confidence passed to the post-processing step.

    Returns:
        None. If the input or the output video cannot be opened, an error is
        printed and the function returns without processing any frame.
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print("Error: Unable to open video file.")
        return

    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    if not (fps > 0):
        # The capture may not report a usable frame rate (0 or NaN), e.g. for
        # network streams; fall back to a nominal value so the writer gets a
        # valid rate.
        fps = 30.0

    # The writer does not create missing directories itself.
    output_dir = os.path.dirname(output_video_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))
    if not out.isOpened():
        # Without this check every out.write() would be silently dropped.
        print("Error: Unable to open video writer.")
        cap.release()
        out.release()
        return

    # Loop-invariant resource: load the font once instead of once per frame.
    font = ImageFont.load_default()

    frame_count = 0
    try:
        # Open the detections log once for the whole run rather than per frame.
        with open(detections_path, "a") as outfile:
            while True:
                ret, frame = cap.read()
                if not ret:
                    break

                # OpenCV delivers BGR arrays; PIL and the processor expect RGB.
                image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

                inputs = processor(images=image, return_tensors="pt")
                # Inference only: skip autograd bookkeeping for every frame.
                with torch.no_grad():
                    outputs = model(**inputs)

                # PIL's size is (width, height); reverse to (height, width).
                target_sizes = torch.tensor([image.size[::-1]])
                results = processor.post_process_object_detection(
                    outputs, target_sizes=target_sizes, threshold=threshold
                )[0]

                draw = ImageDraw.Draw(image)

                if len(results["scores"]) == 0:
                    # Keep one placeholder record per empty frame so every
                    # frame number appears in the log.
                    json.dump({
                        "box": "None",
                        "frame_no": frame_count,
                        "video_name": video_path,
                        "label": "None",
                        "predicted_confidence_score": "None"
                    }, outfile)
                    outfile.write('\n')

                for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
                    box = [round(coordinate, 2) for coordinate in box.tolist()]
                    predicted_label = label.item()
                    predicted_confidence_score = score.item()

                    json.dump({
                        "box": box,
                        "frame_no": frame_count,
                        "video_name": video_path,
                        "label": predicted_label,
                        "predicted_confidence_score": predicted_confidence_score
                    }, outfile)
                    outfile.write('\n')

                    draw.rectangle(box, outline="red", width=3)
                    label_text = f"{model.config.id2label[predicted_label]}: {round(predicted_confidence_score, 3)}"
                    # Clamp the caption so it stays inside the frame for boxes
                    # that touch the top edge.
                    draw.text((box[0], max(box[1] - 10, 0)), label_text, fill="red", font=font)

                # Make this frame's records visible on disk immediately, as
                # they were when the file was reopened for every frame.
                outfile.flush()

                frame_with_detections = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
                cv2.imshow('Processed Video', frame_with_detections)

                out.write(frame_with_detections)
                print(f"Frame {frame_count} processed.")
                frame_count += 1

                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break

    finally:
        # Always free the capture, writer and preview window, even on error.
        cap.release()
        out.release()
        cv2.destroyAllWindows()
        print(f"Video saved as {output_video_path}")

# Annotate the MJPEG stream at this URL and record the result to an mp4 file.
process_video(
    video_path='http://192.168.1.13:8000/stream.mjpg',
    output_video_path="./rpi_capture/capture_1.mp4",
)


Frame 0 processed.
Frame 1 processed.
Frame 2 processed.
Frame 3 processed.
Frame 4 processed.
Frame 5 processed.
Frame 6 processed.
Frame 7 processed.
Frame 8 processed.
Frame 9 processed.
Frame 10 processed.
Video saved as ./rpi_capture/capture_1.mp4
