In [1]:
#-- Install ultralytics for YOLO  --------------------------------------------------------------------------------
!pip install ultralytics

from IPython import display
display.clear_output()

import ultralytics
ultralytics.checks()
#---------------------------------------------------------------------------------------------------------------

Ultralytics YOLOv8.2.42 🚀 Python-3.10.13 torch-2.1.2 CUDA:0 (Tesla T4, 15102MiB)
Setup complete ✅ (4 CPUs, 31.4 GB RAM, 5689.3/8062.4 GB disk)


In [2]:
#-- Install GroundingDINO  ----------------------------------------------------------------------------------------
%cd /kaggle/working/  

!git clone https://github.com/IDEA-Research/GroundingDINO.git

%cd GroundingDINO/
!pip install -e .

!mkdir weights
%cd weights
!wget -q https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth

%cd /kaggle/working/GroundingDINO    

#-- clear output --
from IPython import display
display.clear_output()  

!python -c "import groundingdino" && echo "Module installed successfully" || echo "Module installation failed"
#---------------------------------------------------------------------------------------------------------------

Module installed successfully


In [3]:
#-- Import -----------------------------------------------------------------------------------------------
%cd /kaggle/working/GroundingDINO
from groundingdino.util.inference import load_model as dn_load_model
from groundingdino.util.inference import load_image as dn_load_image
from groundingdino.util.inference import predict as dn_predict
from groundingdino.util.inference import annotate as dn_annotate
import groundingdino.datasets.transforms as T
%cd /kaggle/working

from ultralytics import YOLO

import torch

import cv2
from PIL import Image

import matplotlib.pyplot as plt

import os
import shutil

import numpy as np
#---------------------------------------------------------------------------------------------------------------

/kaggle/working/GroundingDINO
/kaggle/working


In [61]:
#-- Initialize: paths, thresholds and compute device --------------------------------------------------------------
#-- Root folders. NOTE(review): 'intput_path' is a typo of 'input_path'; the name is kept unchanged here. --
intput_path = '/kaggle/input/'
out_path = '/kaggle/working/'

#-- Where the source videos are read from and where per-video results are written --
input_video_dir = f'{intput_path}sample-videos-detecting-and-matching-objs-1/'
result_video_dir = f'{out_path}result_videos/'

#-- GroundingDINO config / checkpoint inside the cloned repository --
dino_model_config_file = f'{out_path}GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py'
dino_model_weights_file = f'{out_path}GroundingDINO/weights/groundingdino_swint_ogc.pth'

#-- Weights of the custom YOLOv8 drone detector --
drone_detector_weights_file = f'{intput_path}drone-detection-yolov8-best-weights/best.pt'

#-- Thresholds passed to the GroundingDINO predict call --
DINO_BOX_THRESHOLD, DINO_TEXT_THRESHOLD = 0.25, 0.1

#-- Thresholds passed to both YOLO predict calls --
YOLO_CONF_THRESHOLD, YOLO_IOU_THRESHOLD = 0.1, 0.5

#-- MOTION_THRESHOLD: minimum mean foreground-mask value inside a box for it to count as moving --
#-- IOU_THRESHOLD: minimum IoU for two detections to be treated as the same object --
MOTION_THRESHOLD, IOU_THRESHOLD = 20, 0.5

#-- Use the GPU when one is available --
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')
print('device:' , DEVICE)
#---------------------------------------------------------------------------------------------------------------

device: cuda


In [6]:
#-- Create the directory that receives per-video results ----------------------------------------------------------
#-- exist_ok=True makes this cell safe to re-run when the directory is already there. --
os.makedirs(result_video_dir, exist_ok=True)
#-----------------------------------------------------------------------------------------------------------------

In [49]:
#-- Set labels for ZSOD Models ------------------------------------------------------------------------------------
#-- Text prompts shared by both zero-shot detectors (alternative prompt set: ['person']). --
all_labels = ['drone', 'UAV', 'Unmanned Aerial Vehicle', 'Quadcopter']

#-- YOLO-World receives the prompts as a list (same list object as all_labels). --
yolo_all_labels = all_labels

#-- GroundingDINO receives one caption string: every label followed by ', '. --
dino_all_labels = ''.join(f'{label_text}, ' for label_text in all_labels)
#-----------------------------------------------------------------------------------------------------------------

In [50]:
#-- Create and Initialize Models ----------------------------------------------------------------------------------
#-- Zero-shot detector 1: YOLO-World, restricted to the prompt labels --
model_yolo_world_zsod = YOLO('yolov8x-worldv2.pt')
model_yolo_world_zsod.set_classes(yolo_all_labels)

#-- Zero-shot detector 2: GroundingDINO, built from its config + checkpoint on DEVICE --
model_dino_zsod = dn_load_model(
    dino_model_config_file,
    dino_model_weights_file,
    device=DEVICE,
)

#-- Custom YOLOv8 model trained for drone detection --
model_drone_detector_yolov8 = YOLO(drone_detector_weights_file)

#-- MOG2 background subtractor used to produce the foreground (motion) mask --
back_sub = cv2.createBackgroundSubtractorMOG2(
    history=500,
    varThreshold=100,
    detectShadows=False,
)

#-- Drop the loader logs and report success --
display.clear_output()
print('All models loaded successfully :)')
#-----------------------------------------------------------------------------------------------------------------

All models loaded successfully :)


In [12]:
#-- ReCreate DINO LOAD Image for Loading image -------------------------------------------------------------------
def dino_load_image(input_image):
    """Prepare an in-memory image for GroundingDINO inference.

    Mirrors what the imported `dn_load_image` helper presumably does for a file
    path, but starts from an already loaded image object.

    Args:
        input_image: image object exposing `.convert("RGB")` (a PIL image in
            this notebook).

    Returns:
        A pair `(image, image_transformed)`: the RGB image as a numpy array,
        and the result of the resize / to-tensor / normalize pipeline applied
        to the RGB image.
    """
    rgb_image = input_image.convert("RGB")
    rgb_array = np.asarray(rgb_image)

    #-- resize to 800 (capped at 1333), convert to tensor, normalize per channel --
    preprocessing_steps = [
        T.RandomResize([800], max_size=1333),
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ]
    preprocess = T.Compose(preprocessing_steps)

    #-- the transforms take (image, target); there is no target here --
    model_input, _ = preprocess(rgb_image, None)
    return rgb_array, model_input
#-----------------------------------------------------------------------------------------------------------------

In [13]:
#-- calculate IOU for 2 Deteced Objects --------------------------------------------------------------------------
def calculate_iou(box1, box2):
    """Compute the intersection-over-union of two axis-aligned boxes.

    Both boxes are `(x1, y1, x2, y2)` corner tuples. Widths and heights are
    computed as `x2 - x1 + 1` / `y2 - y1 + 1`, i.e. the corners are treated as
    inclusive pixel coordinates.

    Args:
        box1: first box as (x1, y1, x2, y2).
        box2: second box as (x1, y1, x2, y2).

    Returns:
        The IoU as a float. Returns 0.0 when the union area is not positive
        (degenerate boxes), instead of raising ZeroDivisionError.
    """
    x1_1, y1_1, x2_1, y2_1 = box1
    x1_2, y1_2, x2_2, y2_2 = box2

    #-- corners of the overlapping rectangle --
    x1_intersection = max(x1_1, x1_2)
    y1_intersection = max(y1_1, y1_2)
    x2_intersection = min(x2_1, x2_2)
    y2_intersection = min(y2_1, y2_2)

    #-- a negative extent means the boxes do not overlap on that axis --
    intersection_width = max(0, x2_intersection - x1_intersection + 1)
    intersection_height = max(0, y2_intersection - y1_intersection + 1)
    intersection_area = intersection_width * intersection_height

    box1_area = (x2_1 - x1_1 + 1) * (y2_1 - y1_1 + 1)
    box2_area = (x2_2 - x1_2 + 1) * (y2_2 - y1_2 + 1)
    union_area = box1_area + box2_area - intersection_area

    #-- degenerate boxes can produce an empty union; report "no overlap" --
    if union_area <= 0:
        return 0.0

    return intersection_area / union_area
#-----------------------------------------------------------------------------------------------------------------

In [62]:
#-- Helpers for the per-video detection loop ----------------------------------------------------------------------
def _clip_box(x1, y1, x2, y2, width, height):
    """Clamp box corners so they lie inside a `width` x `height` image."""
    return (min(max(x1, 0), width),
            min(max(y1, 0), height),
            min(max(x2, 0), width),
            min(max(y2, 0), height))


def _has_motion(fg_mask, box):
    """Return True when the mean foreground-mask value inside `box` exceeds MOTION_THRESHOLD."""
    x1, y1, x2, y2 = box
    region = fg_mask[y1:y2, x1:x2]
    #-- an empty region has no defined mean; treat it as static --
    return region.size > 0 and region.mean() > MOTION_THRESHOLD


def _moving_yolo_boxes(results, fg_mask, label_for_class):
    """Collect (x1, y1, x2, y2, label) for every YOLO detection that lies on a moving region.

    `label_for_class` maps the detection's integer class id to a label string.
    """
    height, width = fg_mask.shape[:2]
    moving = []
    for result in results:
        for box in result.boxes:
            corners = _clip_box(*map(int, box.xyxy.tolist()[0]), width, height)
            if _has_motion(fg_mask, corners):
                moving.append(corners + (label_for_class(int(box.cls)),))
    return moving


def _moving_dino_boxes(boxes, phrases, fg_mask):
    """Collect (x1, y1, x2, y2, phrase) for every DINO detection that lies on a moving region.

    Each DINO box is treated as a normalized (center_x, center_y, width, height)
    tuple and is scaled to pixel corners using the mask size.
    """
    height, width = fg_mask.shape[:2]
    moving = []
    for bbox, phrase in zip(boxes, phrases):
        center_x, center_y, box_width, box_height = bbox.tolist()

        #-- convert bbox to x1,y1,x2,y2 in pixels --
        center_x_abs = center_x * width
        center_y_abs = center_y * height
        width_abs = box_width * width
        height_abs = box_height * height
        corners = _clip_box(int(center_x_abs - (width_abs / 2)),
                            int(center_y_abs - (height_abs / 2)),
                            int(center_x_abs + (width_abs / 2)),
                            int(center_y_abs + (height_abs / 2)),
                            width, height)

        if _has_motion(fg_mask, corners):
            moving.append(corners + (phrase,))
    return moving


def _absorb_overlaps(anchor_box, candidates, matched_indices):
    """Mark every unmatched candidate overlapping `anchor_box` (IoU >= IOU_THRESHOLD) as matched.

    `matched_indices` is updated in place. Returns the box of the first
    overlapping candidate, or None when nothing overlaps.
    """
    first_hit = None
    for index, candidate in enumerate(candidates):
        if index in matched_indices:
            continue
        candidate_box = candidate[0:4]
        if calculate_iou(anchor_box, candidate_box) >= IOU_THRESHOLD:
            matched_indices.add(index)
            if first_hit is None:
                first_hit = candidate_box
    return first_hit


def _union_box(box, other):
    """Return the smallest box enclosing `box` and `other`; `box` itself when `other` is None."""
    if other is None:
        return tuple(box)
    return (min(box[0], other[0]),
            min(box[1], other[1]),
            max(box[2], other[2]),
            max(box[3], other[3]))


def _show_image(image, title, figsize, save_path=None, cmap=None):
    """Display `image` in its own figure, saving the figure to `save_path` when given."""
    figure = plt.figure(figsize=figsize)
    plt.imshow(image, cmap=cmap)
    plt.axis('off')
    plt.title(title)
    if save_path is not None:
        plt.savefig(save_path)
    plt.show()
    #-- release the figure explicitly so figures do not pile up over long videos --
    plt.close(figure)


#-- Process every video: detect, keep moving objects, merge detectors, write annotated video ---------------------
for video_file in os.listdir(input_video_dir):

    if 'human' in video_file:
        continue

    #-- log --
    print(f'Processing {video_file} ==========================================================')

    #-- splitext keeps the full name when the file has no extension (rfind would chop the last character) --
    video_stem = os.path.splitext(video_file)[0]
    video_result_dir_name = 'result_for_' + video_stem
    video_result_dir_path = result_video_dir + video_result_dir_name + '/'

    #-- load video --
    video_path = os.path.join(input_video_dir, video_file)
    video = cv2.VideoCapture(video_path)
    if not video.isOpened():
        print(f'\tCould not open {video_path}; skipping.')
        video.release()
        continue

    #-- Create Folder for saving results --
    os.makedirs(video_result_dir_path, exist_ok=True)

    #-- Get number of frames and fps --
    number_of_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
    fps_exact = video.get(cv2.CAP_PROP_FPS)
    fps = int(fps_exact)
    print(f'number_of_frames: {number_of_frames}\nfps: {fps}')

    #-- Preview roughly 5 frames per video. Guard against a zero modulus for videos --
    #-- shorter than 5 frames; disable previews when the frame count is unknown.     --
    sample_every = max(1, number_of_frames // 5) if number_of_frames > 0 else None

    #-- Get the width and height of the frames --
    frame_width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))

    #-- Initialize VideoWriter to save the output video --
    #-- (un-truncated fps so e.g. 29.97 fps sources keep their playback speed) --
    result_video = cv2.VideoWriter(video_result_dir_path + video_stem + '.avi',
                                   cv2.VideoWriter_fourcc(*'XVID'),
                                   fps_exact,
                                   (frame_width, frame_height))

    #-- Fresh background model per video, configured like the notebook-level `back_sub`, --
    #-- so the background learned on one video does not leak into the next one.          --
    video_back_sub = cv2.createBackgroundSubtractorMOG2(history=back_sub.getHistory(),
                                                        varThreshold=back_sub.getVarThreshold(),
                                                        detectShadows=back_sub.getDetectShadows())

    #-- Run Object Detection Models Frame by Frame --
    frame_number = 0
    try:
        while video.isOpened():
            ret, frame = video.read()
            if not ret:
                break

            #-- untouched copy for cropping; `frame` itself gets boxes drawn on it --
            main_frame = frame.copy()
            frame_number += 1
            is_sample_frame = sample_every is not None and frame_number % sample_every == 0

            #-- log --
            print(f'\tProcessing frame {frame_number} ------------------------------')

            #-- Apply background subtraction --
            fg_mask = video_back_sub.apply(frame)

            #-- OpenCV frames are BGR; PIL, matplotlib and GroundingDINO expect RGB --
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

            #-- show some frames --
            if is_sample_frame:
                _show_image(frame_rgb,
                            f'main frame - frame number={frame_number}',
                            (5, 5),
                            save_path=video_result_dir_path + f'main_frame_{frame_number}.png')

                #-- the mask is single-channel, so render it as grayscale --
                _show_image(fg_mask,
                            f'fg_mask - frame number={frame_number}',
                            (5, 5),
                            save_path=video_result_dir_path + f'fg_mask_{frame_number}.png',
                            cmap='gray')

            #-- Detect objects by YOLO-World --
            results_yolo_world_zsod = model_yolo_world_zsod.predict(source=frame,
                                                                    conf=YOLO_CONF_THRESHOLD,
                                                                    iou=YOLO_IOU_THRESHOLD,
                                                                    show=False,
                                                                    save=False)

            #-- Detect objects by DINO (RGB input, on the configured device) --
            pil_image = Image.fromarray(frame_rgb)
            _, image = dino_load_image(pil_image)
            boxes, logits, phrases = dn_predict(model = model_dino_zsod,
                                                image = image,
                                                caption = dino_all_labels,
                                                box_threshold = DINO_BOX_THRESHOLD,
                                                text_threshold = DINO_TEXT_THRESHOLD,
                                                device = DEVICE)

            #-- Detect Drones by trained YOLO-v8 model --
            results_yolov8_drone = model_drone_detector_yolov8.predict(source=frame,
                                                                       conf=YOLO_CONF_THRESHOLD,
                                                                       iou=YOLO_IOU_THRESHOLD,
                                                                       show=False,
                                                                       save=False)

            #-- Get only moving objects from each detector --
            moving_objects_yolo_world_zsod = _moving_yolo_boxes(results_yolo_world_zsod,
                                                                fg_mask,
                                                                lambda class_id: yolo_all_labels[class_id])

            #-- The drone model has its own class ids, unrelated to yolo_all_labels; its label --
            #-- is replaced by 'drone_yolov8' below, so a fixed placeholder is enough here.     --
            moving_objects_yolov8_drone = _moving_yolo_boxes(results_yolov8_drone,
                                                             fg_mask,
                                                             lambda class_id: 'drone')

            moving_objects_dino_zsod = _moving_dino_boxes(boxes, phrases, fg_mask)

            #-- log --
            print(f'yolov8:{len(moving_objects_yolov8_drone)}\nyolo_world:{len(moving_objects_yolo_world_zsod)}\ndino:{len(moving_objects_dino_zsod)}')

            #-- Get union of moving objects --
            #-- Priority: drone detector > YOLO-World > DINO. A detection that overlaps a      --
            #-- higher-priority one is absorbed; the emitted box is the union of the anchor    --
            #-- with its first overlapping partner.                                            --
            moving_objects = []
            matched_indices_yolo_world = set()
            matched_indices_dino = set()

            for drone_obj in moving_objects_yolov8_drone:
                drone_box = drone_obj[0:4]
                drone_lbl = 'drone' + '_yolov8'

                yolo_partner = _absorb_overlaps(drone_box,
                                                moving_objects_yolo_world_zsod,
                                                matched_indices_yolo_world)
                dino_partner = _absorb_overlaps(drone_box,
                                                moving_objects_dino_zsod,
                                                matched_indices_dino)

                #-- a YOLO-World partner wins over a DINO partner --
                partner = yolo_partner if yolo_partner is not None else dino_partner
                moving_objects.append(_union_box(drone_box, partner) + (drone_lbl,))

            for j, yolo_obj in enumerate(moving_objects_yolo_world_zsod):
                if j in matched_indices_yolo_world:
                    continue

                yolo_box = yolo_obj[0:4]
                yolo_lbl = yolo_obj[-1] + '_yolo_world'

                dino_partner = _absorb_overlaps(yolo_box,
                                                moving_objects_dino_zsod,
                                                matched_indices_dino)
                moving_objects.append(_union_box(yolo_box, dino_partner) + (yolo_lbl,))

            #-- Add remaining boxes from dino_objects that were not matched --
            for i, dino_obj in enumerate(moving_objects_dino_zsod):
                if i not in matched_indices_dino:
                    moving_objects.append(tuple(dino_obj[0:4]) + (dino_obj[-1] + '_dino',))

            #-- plot bounding box for moving objects on the frame --
            for i, (x1, y1, x2, y2, label) in enumerate(moving_objects):
                if is_sample_frame:
                    #-- crop detected object from the un-annotated frame --
                    cropped_object = main_frame[y1:y2, x1:x2]

                    #-- save cropped object (imwrite expects BGR) --
                    file_name = f'frame_{frame_number}_{i}_{label}.png'
                    cv2.imwrite(video_result_dir_path + file_name, cropped_object)

                    #-- show cropped object (matplotlib expects RGB) --
                    _show_image(cv2.cvtColor(cropped_object, cv2.COLOR_BGR2RGB), label, (3, 3))

                #-- box color per source detector; tuples are BGR --
                if 'yolov8' in label:
                    color = (255, 0, 0)      #-- blue --
                elif 'yolo_world' in label:
                    color = (0, 255, 0)      #-- green --
                elif 'dino' in label:
                    color = (0, 0, 255)      #-- red --
                else:
                    color = (255, 255, 255)  #-- fallback so `color` is always defined --

                cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)
                cv2.putText(frame, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

            #-- Add frame to result video --
            result_video.write(frame)

            #-- show some frames --
            if is_sample_frame:
                _show_image(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB),
                            f'moving objcs - frame number={frame_number}',
                            (10, 10),
                            save_path=video_result_dir_path + f'moving_objcs_{frame_number}.png')
    finally:
        #-- release videos: also on errors, and BEFORE zipping so the AVI is finalized on disk --
        video.release()
        result_video.release()

    #-- zip results --
    shutil.make_archive(out_path+video_result_dir_name, 'zip', video_result_dir_path)


#-- remove folders (the zip archives written to out_path are kept) --
shutil.rmtree(result_video_dir)
display.clear_output()
print(':)')
#-----------------------------------------------------------------------------------------------------------------    

:)
