In [8]:
#-- Burglary Detection
#-- YOLO-World
#-- SMART_Dropping: K-Means _With 32 key frames 

In [9]:
#-- Install ultralytics for YOLO  --------------------------------------------------------------------------------
!pip install ultralytics

from IPython import display
display.clear_output()

import ultralytics
ultralytics.checks()
#---------------------------------------------------------------------------------------------------------------

Ultralytics 8.3.63 🚀 Python-3.10.13 torch-2.1.2 CUDA:0 (Tesla T4, 15095MiB)
Setup complete ✅ (4 CPUs, 31.4 GB RAM, 6095.9/8062.4 GB disk)


In [10]:
#-- Import libraries  ------------------------------------------------------------------------------------------
from ultralytics import YOLO
import torch
import cv2
from PIL import Image
import matplotlib.pyplot as plt
import os
import shutil
import numpy as np
import pandas as pd
import csv
import random
#---------------------------------------------------------------------------------------------------------------

In [13]:
#-- Initialize ---------------------------------------------------------------------------------------------------
burglary_samples_dir = '/kaggle/input/novin-create-binary-burglary-ds/burglary_samples/'
not_burglary_samples_dir = '/kaggle/input/novin-create-binary-burglary-ds/not_burglary_samples/'

key_frames_burglary_samples_dir = '/kaggle/input/novin-smart-dropping-kmeans-clustering-v1/key_frames_burglary_samples/'
key_frames_not_burglary_samples_dir = '/kaggle/input/novin-smart-dropping-kmeans-clustering-v1/key_frames_not_burglary_samples/' 

CONF_THRESHOLD = 0.2
IOU_THRESHOLD = 0.5

#BRUGLARY_THRESHOLD_PERCENT = 0.1

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('device:' , DEVICE)
#---------------------------------------------------------------------------------------------------------------

device: cuda


In [15]:
#-- Get path for all videos and key_frames as a list ---------------------------------------------------------------------------
#-- Burglary samples --
burglary_videos = []
for dirpath, _, filenames in os.walk(burglary_samples_dir):
    for filename in filenames:
        full_path = os.path.join(dirpath, filename)
        burglary_videos.append(full_path)

burglary_key_frames = []
for dirpath, _, filenames in os.walk(key_frames_burglary_samples_dir):
    for filename in filenames:
        full_path = os.path.join(dirpath, filename)
        burglary_key_frames.append(full_path)


#-- Not Burglary Samples --
not_burglary_videos = []
for dirpath, _, filenames in os.walk(not_burglary_samples_dir):
    for filename in filenames:
        full_path = os.path.join(dirpath, filename)
        not_burglary_videos.append(full_path)

not_urglary_key_frames = []
for dirpath, _, filenames in os.walk(key_frames_not_burglary_samples_dir):
    for filename in filenames:
        full_path = os.path.join(dirpath, filename)
        not_urglary_key_frames.append(full_path)

burglary_videos.sort()
not_burglary_videos.sort()

print(f'Burglary Samples: {len(burglary_videos)} - key-frames: {len(burglary_key_frames)}')
print(f'Not-Burglary Samples: {len(not_burglary_videos)}- key-frames: {len(burglary_key_frames)}')
#---------------------------------------------------------------------------------------------------------------

Burglary Samples: 34 - key-frames: 34
Not-Burglary Samples: 34- key-frames: 34


In [17]:
#-- Set label prompts for ZSOD Models ------------------------------------------------------------------------------------
labels = [
    "Person climbing over a fence",
    "Person climbing a wall",
    "Person breaking a lock with tools",
    "Person trying to pick a lock",
    "Person forcing a door open with strength",          
    "Person hiding behind an object",    
    "Person running away from a building",
    "Person carrying tools like a crowbar",
    "Person breaking a window with an object",
    "Person tampering with a security camera",
    "Person cutting alarm wires",   
    "Person jumping out of a window",
    "Person disabling an alarm system",
    "Person wearing a mask and avoiding detection"
] 
#-----------------------------------------------------------------------------------------------------------------

In [18]:
#-- Create and Initialize Model ----------------------------------------------------------------------------------

#-- YOLO World (Zero-Shot Model) --
model_burglary_detection = YOLO('yolov8x-worldv2.pt')
model_burglary_detection.set_classes(labels)

display.clear_output()
print('YOLO-world model was loaded successfully :)')
#-----------------------------------------------------------------------------------------------------------------

YOLO-world model was loaded successfully :)


In [20]:
#-- create an empty df for saving reults --------------------------------------------------------------------
columns = ["video_file", "true_label", "predicted_label", "all_detected_prompts", "burglary_threshold"]
df_result = pd.DataFrame(columns=columns)
print(df_result.shape)
#------------------------------------------------------------------------------------------------------------

(0, 5)


In [29]:
video_labels_dict = {}
video_all_detections = {}

for video_path in burglary_videos:    

    index = video_path.rfind('/')
    video_file = video_path[index+1:]
    index = video_file.rfind('.') 
    video_name = video_file[:index]    
    
    #-- log --
    print(f'Processing {video_file} ==========================================================') 

    #-- Find corresponding key frame file --
    key_frame_path = next((kf for kf in burglary_key_frames if video_name+'_keyframes.npy' in kf), None)
    if key_frame_path is None:
        print(f"Key frame file not found for video {video_file}")
        continue

    #-- Load key frames from .npy file --
    key_frames = np.load(key_frame_path)  # Load the array of key frame indices
    total_frames = len(key_frames)

    #-- count number of detected prompts --
    labels_count = {}
    
    for frame_idx in key_frames:  #-- Process only key frames
        cap = cv2.VideoCapture(video_path)
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)  #-- Set video to the specific frame index
        ret, frame = cap.read()
        cap.release()

        if not ret:
            print(f"Failed to read frame {frame_idx} from {video_file}")
            continue

        
        results = model_burglary_detection.predict(source=frame, 
                                                   conf=CONF_THRESHOLD,
                                                   iou=IOU_THRESHOLD,
                                                   show=False,
                                                   save=False,
                                                   stream=False)

        for r in results:  # Process predictions
            for cls_index in r.boxes.cls.int().tolist():
                if labels[cls_index] in labels_count:
                    labels_count[labels[cls_index]] += 1
                else:
                    labels_count[labels[cls_index]] = 1     

    final_labels_list = []
    all_detection_list = []
    for cls_lbl, count in labels_count.items():
        all_detection_list.append((cls_lbl, count))        
        
        # if count >= BRUGLARY_THRESHOLD:            
        final_labels_list.append((cls_lbl, count))
    
    
    video_labels_dict[video_file] = final_labels_list
    video_all_detections[video_file] = all_detection_list

    

    
    
    
    
display.clear_output()
print(':)')



0: 640x384 (no detections), 66.0ms
Speed: 2.7ms preprocess, 66.0ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 (no detections), 65.9ms
Speed: 3.1ms preprocess, 65.9ms inference, 0.9ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 (no detections), 65.9ms
Speed: 2.5ms preprocess, 65.9ms inference, 0.5ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 (no detections), 66.0ms
Speed: 3.1ms preprocess, 66.0ms inference, 0.6ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 (no detections), 66.0ms
Speed: 2.3ms preprocess, 66.0ms inference, 0.7ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 Person jumping out of a window, 66.0ms
Speed: 3.3ms preprocess, 66.0ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 (no detections), 66.0ms
Speed: 2.3ms preprocess, 66.0ms inference, 0.6ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 (no detections), 66.0ms
Speed: 2.4ms pre

In [30]:
true_label = "burglary"
for video, lbls_list in video_all_detections.items():   
    all_detected_prompts = lbls_list
    if len(video_labels_dict[video]) > 0:
        predicted_label = "burglary"
    else:
        predicted_label = "not burglary"
    
    df_result = pd.concat([df_result, pd.DataFrame([{
        "video_file": video,
        "true_label": true_label,
        "predicted_label": predicted_label,
        "all_detected_prompts": all_detected_prompts,
        "burglary_threshold": '-' 
    }])], ignore_index=True)

print(df_result)
print(df_result.shape)

       video_file true_label predicted_label  \
0  burglary_1.mp4   burglary        burglary   

                                all_detected_prompts burglary_threshold  
0  [(Person jumping out of a window, 2), (Person ...                  -  
(1, 5)


In [33]:
video_labels_dict = {}
video_all_detections = {}

for video_path in not_burglary_videos:    

    index = video_path.rfind('/')
    video_file = video_path[index+1:]
    index = video_file.rfind('.') 
    video_name = video_file[:index]    
    
    #-- log --
    print(f'Processing {video_file} ==========================================================') 

    #-- Find corresponding key frame file --
    key_frame_path = next((kf for kf in not_urglary_key_frames if video_name+'_keyframes.npy' in kf), None)
    if key_frame_path is None:
        print(f"Key frame file not found for video {video_file}")
        continue

    #-- Load key frames from .npy file --
    key_frames = np.load(key_frame_path)  # Load the array of key frame indices
    total_frames = len(key_frames)

    #-- count number of detected prompts --
    labels_count = {}
    
    for frame_idx in key_frames:  #-- Process only key frames
        cap = cv2.VideoCapture(video_path)
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)  #-- Set video to the specific frame index
        ret, frame = cap.read()
        cap.release()

        if not ret:
            print(f"Failed to read frame {frame_idx} from {video_file}")
            continue

        
        results = model_burglary_detection.predict(source=frame, 
                                                   conf=CONF_THRESHOLD,
                                                   iou=IOU_THRESHOLD,
                                                   show=False,
                                                   save=False,
                                                   stream=False)

        for r in results:  # Process predictions
            for cls_index in r.boxes.cls.int().tolist():
                if labels[cls_index] in labels_count:
                    labels_count[labels[cls_index]] += 1
                else:
                    labels_count[labels[cls_index]] = 1     

    final_labels_list = []
    all_detection_list = []
    for cls_lbl, count in labels_count.items():
        all_detection_list.append((cls_lbl, count))        
        
        # if count >= BRUGLARY_THRESHOLD:            
        final_labels_list.append((cls_lbl, count))
    
    
    video_labels_dict[video_file] = final_labels_list
    video_all_detections[video_file] = all_detection_list

    

    
    
    
    
display.clear_output()
print(':)')



0: 480x640 (no detections), 78.5ms
Speed: 2.2ms preprocess, 78.5ms inference, 0.8ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 77.8ms
Speed: 1.7ms preprocess, 77.8ms inference, 0.6ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 46.9ms
Speed: 1.7ms preprocess, 46.9ms inference, 0.5ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 47.2ms
Speed: 2.3ms preprocess, 47.2ms inference, 0.6ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 44.9ms
Speed: 1.9ms preprocess, 44.9ms inference, 0.6ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 43.9ms
Speed: 1.7ms preprocess, 43.9ms inference, 0.5ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 47.8ms
Speed: 1.7ms preprocess, 47.8ms inference, 0.5ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 43.8ms
Speed: 1.6ms preprocess, 43.8ms i

KeyboardInterrupt: 

In [32]:
true_label = "not burglary"
for video, lbls_list in video_all_detections.items():   
    all_detected_prompts = lbls_list
    if len(video_labels_dict[video]) > 0:
        predicted_label = "burglary"
    else:
        predicted_label = "not burglary"
    
    df_result = pd.concat([df_result, pd.DataFrame([{
        "video_file": video,
        "true_label": true_label,
        "predicted_label": predicted_label,
        "all_detected_prompts": all_detected_prompts,
        "burglary_threshold": '-'
    }])], ignore_index=True)

print(df_result)
print(df_result.shape)

           video_file    true_label predicted_label  \
0      burglary_1.mp4      burglary        burglary   
1  not_burglary_1.mp4  not burglary    not burglary   

                                all_detected_prompts burglary_threshold  
0  [(Person jumping out of a window, 2), (Person ...                  -  
1                                                 []                  -  
(2, 5)


In [None]:
df_result.to_csv('results.csv', index = False)