# In [1]:
from ultralytics.vit import RTDETR
import torch
import numpy as np
import cv2
import time 
import supervision as sv
import pandas as pd

class DETRClass:
    """Run an RT-DETR detector on a live video capture and display the result.

    Instances are callable: calling one opens the capture device, runs the
    model on every frame, draws the detections plus an FPS counter, and
    shows the annotated frames in an OpenCV window titled "DETR" until Esc
    is pressed, the window is closed, or no more frames can be read.
    """

    def __init__(self, capture_index):
        """Load the model and prepare the box annotator.

        Args:
            capture_index: Value passed unchanged to ``cv2.VideoCapture``
                when the instance is called.
        """
        self.capture_index = capture_index

        # The selected device is only stored and reported here; this class
        # does not pass it to the model explicitly.
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print("Using device: ", self.device)

        self.model = RTDETR("rtdetr-l.pt")

        # Mapping from integer class id to the display name used in labels.
        self.CLASS_NAMES_DICT = self.model.model.names
        print("Class names: ", self.CLASS_NAMES_DICT)

        self.box_annotator = sv.BoxAnnotator(
            sv.ColorPalette.default(),
            thickness=3,
            text_thickness=3,
            text_scale=1.5,
        )

    @staticmethod
    def _format_labels(class_names, class_ids, confidences):
        """Return one ``"<name> <confidence>"`` label per detection.

        Args:
            class_names: Mapping from class id to display name.
            class_ids: Iterable of class ids, one per detection.
            confidences: Iterable of confidence scores, parallel to
                ``class_ids``.

        Returns:
            A list of strings with the confidence formatted to two decimals.
        """
        return [
            f"{class_names[cid]} {score:.2f}"
            for cid, score in zip(class_ids, confidences)
        ]

    def plot_bboxes(self, results, frame):
        """Draw the detections of the first result onto ``frame``.

        Args:
            results: Sequence returned by the model's ``predict``; only
                ``results[0]`` is used.
            frame: Image the boxes and labels are drawn on.

        Returns:
            The frame returned by the box annotator.
        """
        # Extract the results as NumPy arrays on the CPU.
        boxes = results[0].boxes.cpu().numpy()
        class_id = boxes.cls.astype(np.int32)
        conf = boxes.conf
        xyxy = boxes.xyxy

        detections = sv.Detections(xyxy=xyxy, class_id=class_id, confidence=conf)

        # Build the labels from the arrays directly instead of unpacking the
        # tuples yielded by iterating ``detections``: the arity of those
        # tuples is an implementation detail of the supervision version.
        self.labels = self._format_labels(self.CLASS_NAMES_DICT, class_id, conf)

        return self.box_annotator.annotate(frame, detections, self.labels)

    def __call__(self):
        """Capture frames, run detection and show them until the user quits.

        Raises:
            AssertionError: If the capture device cannot be opened.
        """
        cap = cv2.VideoCapture(self.capture_index)
        if not cap.isOpened():
            cap.release()
            # Raised explicitly (rather than via ``assert``) so the check
            # still runs under ``python -O``; the exception type is kept.
            raise AssertionError(
                f"Could not open video capture {self.capture_index!r}"
            )

        try:
            cap.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)
            cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)

            while cap.isOpened():
                start_time = time.perf_counter()

                ret, frame = cap.read()
                if not ret:
                    # No frame was delivered (device unplugged, stream
                    # ended); stop instead of feeding None to the model.
                    break

                # NOTE(review): the run log captured with this cell prints a
                # "Results saved to ..." line for every frame, which suggests
                # predict() writes each frame to disk; passing save=False may
                # be desirable -- confirm against the installed version.
                results = self.model.predict(frame)

                frame = self.plot_bboxes(results, frame)

                elapsed = time.perf_counter() - start_time
                # Guard against a zero-length interval on coarse clocks.
                fps = 1 / elapsed if elapsed > 0 else float("inf")

                cv2.putText(frame, f"FPS: {fps:.2f}", (20, 70),
                            cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

                cv2.imshow("DETR", frame)

                # Esc (key code 27) or closing the window ends the loop.
                if cv2.waitKey(1) == 27 or cv2.getWindowProperty("DETR", cv2.WND_PROP_VISIBLE) < 1:
                    break
        finally:
            # Always free the camera and close the window, even when
            # inference or drawing raises.
            cap.release()
            cv2.destroyAllWindows()
        
        
# Build a detector bound to capture device 0 and start its
# capture / detect / display loop.
transformer_detector = DETRClass(capture_index=0)
transformer_detector()
            
    



Using device:  cpu
Class names:  {0: '0', 1: '1', 2: '2', 3: '3', 4: '4', 5: '5', 6: '6', 7: '7', 8: '8', 9: '9', 10: '10', 11: '11', 12: '12', 13: '13', 14: '14', 15: '15', 16: '16', 17: '17', 18: '18', 19: '19', 20: '20', 21: '21', 22: '22', 23: '23', 24: '24', 25: '25', 26: '26', 27: '27', 28: '28', 29: '29', 30: '30', 31: '31', 32: '32', 33: '33', 34: '34', 35: '35', 36: '36', 37: '37', 38: '38', 39: '39', 40: '40', 41: '41', 42: '42', 43: '43', 44: '44', 45: '45', 46: '46', 47: '47', 48: '48', 49: '49', 50: '50', 51: '51', 52: '52', 53: '53', 54: '54', 55: '55', 56: '56', 57: '57', 58: '58', 59: '59', 60: '60', 61: '61', 62: '62', 63: '63', 64: '64', 65: '65', 66: '66', 67: '67', 68: '68', 69: '69', 70: '70', 71: '71', 72: '72', 73: '73', 74: '74', 75: '75', 76: '76', 77: '77', 78: '78', 79: '79'}


Ultralytics YOLOv8.0.132  Python-3.11.4 torch-2.0.1+cpu CPU
rt-detr-l summary: 494 layers, 32148140 parameters, 0 gradients

0: 640x640 1 0, 2363.5ms
Speed: 14.1ms preprocess, 2363.5ms inference, 3.0ms postprocess per image at shape (1, 3, 640, 640)
Results saved to runs\detect\predict

0: 640x640 1 0, 2 67s, 1644.7ms
Speed: 7.5ms preprocess, 1644.7ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)
Results saved to runs\detect\predict

0: 640x640 1 0, 2 67s, 1509.9ms
Speed: 6.6ms preprocess, 1509.9ms inference, 0.0ms postprocess per image at shape (1, 3, 640, 640)
Results saved to runs\detect\predict

0: 640x640 1 0, 1 67, 1397.4ms
Speed: 5.8ms preprocess, 1397.4ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)
Results saved to runs\detect\predict

0: 640x640 1 0, 1 67, 1290.4ms
Speed: 7.2ms preprocess, 1290.4ms inference, 0.0ms postprocess per image at shape (1, 3, 640, 640)
Results saved to runs\detect\predict