In [None]:
# pip install numpy==1.24.4 Pillow==10.3.0 Requests==2.31.0 torch torchvision accelerate jinja2>3.1 qwen-vl-utils av git+https://github.com/huggingface/transformers.git
# pip install git+https://github.com/huggingface/transformers@21fac7abba2a37fae86106f87fcf9974fd1e3830 accelerate
# pip install ultralytics supervision
# pip install pybind11
# sudo apt-get install build-essential
# sudo apt-get install python3-dev
# python -m pip install 'git+https://github.com/facebookresearch/detectron2.git'

# Step 1: Detect a person in a region of interest, e.g. the checkout area
Using YOLOv8, we can define a region of interest so that analysis starts only when a person is in that zone.

In [2]:
from IPython import display
import ultralytics
import supervision as sv
import numpy as np



In [3]:
# Load the YOLOv8 detection weights used by the Step 1 frame callback.
# NOTE: the name `model` is rebound to the Qwen2-VL model in Step 2, so this cell
# must be re-run before re-running the detection cell after Step 2 has executed.
from ultralytics import YOLO
model = YOLO('yolov8s.pt')

In [4]:
# Video location and metadata about the video file
ROOT_DIR = "/home/ubuntu"
vid_path = "video-3.mp4"

# Resolve the file against ROOT_DIR instead of relying on the kernel's current
# working directory, so the lookup works wherever the notebook is launched from.
sv.VideoInfo.from_video_path(f"{ROOT_DIR}/{vid_path}")

VideoInfo(width=1080, height=1920, fps=30, total_frames=302)

In [5]:
# Process a video frame by frame: run object detection with the YOLO model,
# annotate each frame with detection boxes and labels, and trigger a polygon zone.

# Vertices of the polygon zone
polygon = np.array([[900, 0], [1080, 0], [1080, 1920], [100, 1920]])

# Video metadata, read with supervision's VideoInfo helper from the file under ROOT_DIR
video_info = sv.VideoInfo.from_video_path(f"{ROOT_DIR}/{vid_path}")

# Polygon zone (the region of interest) built from the vertices above
zone = sv.PolygonZone(polygon=polygon)

# Annotators for the detection boxes, the text labels and the zone overlay
box_annotator = sv.BoxAnnotator(thickness=4)
label_annotator = sv.LabelAnnotator(text_thickness=4, text_scale=2)
zone_annotator = sv.PolygonZoneAnnotator(
    zone=zone,
    color=sv.Color.WHITE,
    thickness=6,
    text_thickness=6,
    text_scale=4,
)

def process_frame(frame: np.ndarray, _) -> np.ndarray:
    """Detect objects in one frame, update the zone and draw the annotations.

    Passed as the ``callback`` of ``sv.process_video``.

    Args:
        frame: The video frame to analyse and draw on.
        _: Second callback argument (presumably the frame index); unused.

    Returns:
        The frame with detection boxes, labels and the zone overlay drawn on it.
    """
    # detect — keep only class id 0 (presumably "person" for this checkpoint)
    results = model(frame, imgsz=320)[0]
    detections = sv.Detections.from_ultralytics(results)
    detections = detections[detections.class_id == 0]
    zone.trigger(detections=detections)

    # annotate
    # Read class ids and confidences from the Detections attributes instead of
    # unpacking the per-detection tuple: a fixed-length unpack breaks whenever the
    # tuple yielded by iterating Detections has a different number of fields.
    labels = [
        f"{model.names[class_id]} {confidence:0.2f}"
        for class_id, confidence in zip(detections.class_id, detections.confidence)
    ]
    frame = box_annotator.annotate(scene=frame, detections=detections)
    frame = label_annotator.annotate(scene=frame, detections=detections, labels=labels)
    frame = zone_annotator.annotate(scene=frame)

    return frame

# Read the source from ROOT_DIR as well, so input and output are resolved the same
# way and the cell does not depend on the kernel's current working directory.
sv.process_video(
    source_path=f"{ROOT_DIR}/{vid_path}",
    target_path=f"{ROOT_DIR}/out_{vid_path}",
    callback=process_frame,
)

# Clear the cell output produced while the video was being processed
from IPython import display
display.clear_output()

## Result

By defining our **polygon coordinates** we create a zone of interest (the white box in the middle)

We see that our count is zero in the left frame, where the customer has not entered the zone

And the count increases to 1 as they enter

Since we have this information programmatically, we can save the next 20 seconds of video feed to capture all the activities that happened. Only then do we need to analyze the occurrence, saving us on inference cost.

Example scene 1
<br>
<p float="left">
  <img src="media/scene1-example1.png" alt="Image 1" width="45%" style="margin-right: 5%;" />
  <img src="media/scene1-example2.png" alt="Image 2" width="45%" />
</p>

Example scene 2 
<br>
<img src="media/scene2-example.png" alt="Image 2" width="45%" />

Example scene 3
<br>
<p float="left">
  <img src="media/scene3-example1.png" alt="Image 1" width="45%" style="margin-right: 5%;" />
  <img src="media/scene3-example2.png" alt="Image 2" width="45%" />
</p>

Basically, we would need to map out the coordinates for every camera the client has

# Step 2: Ask the VLM to tell us if there is theft in the scene or not

In [6]:
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info

# Checkpoint shared by the model and its processor
QWEN_CHECKPOINT = "Qwen/Qwen2-VL-7B-Instruct"

# NOTE: this rebinds `model`, which held the YOLO detector in Step 1; the Step 1
# frame callback reads `model` at call time, so re-run the YOLO cell before
# re-running detection.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    QWEN_CHECKPOINT,
    torch_dtype="auto",
    device_map="auto",
)

processor = AutoProcessor.from_pretrained(QWEN_CHECKPOINT)

E0000 00:00:1731539786.004284   11123 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1731539786.009218   11123 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Loading checkpoint shards: 100%|██████████| 5/5 [00:04<00:00,  1.06it/s]


## Zero Shot

In zero shot, we directly ask the model to tell us if there is any shoplifting in the video or not, without providing examples

In [None]:
# Zero-shot conversation: a single user turn holding the video to analyse and the
# question, with no worked examples before it.
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "video",
                "video": "/home/ubuntu/video-3.mp4",
                # Per-video loading options; presumably consumed by qwen_vl_utils
                # when it reads the clip (pixel budget and sampling rate) — TODO confirm
                "max_pixels": 360 * 420,
                "fps": 1.0,
            },
            {
                "type": "text", 
                # The leading/trailing whitespace inside this literal is part of the prompt
                "text": """
                    Is there an indication of suspicious store activity in this video such as shoplifting or not. 
                    Reply with two fields
                    answer: Yes or No
                    reason: Your reason
                    """
            },
        ],
    }
]

# Render the conversation into the prompt text expected by the model
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

# Load the vision inputs referenced in `messages` (here the single video-3 clip)
image_inputs, video_inputs = process_vision_info(messages)

# Preparing final inputs
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
# The model is loaded with device_map="auto", so follow the device it actually
# landed on instead of assuming "cuda".
inputs = inputs.to(model.device)

# Model Inference
generated_ids = model.generate(**inputs, max_new_tokens=128)

# Drop the prompt tokens from each sequence so only the generated reply is decoded
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)

# Display the output
from IPython.display import Markdown
Markdown(output_text[0].replace('\n', '<br>'))

qwen-vl-utils using torchvision to read video.


No<br>reason: The video shows a man shopping in a store and placing items in a basket. There is no indication of shoplifting or any suspicious activity.

## Few Shot

To perform few shot prompting, the model is provided with 2 or more examples

Here, we mimic the typical user-assistant conversation flow and add the examples as the chat history. This is demonstrated below


---

<div align="right">

**User**  
*Is there an indication of suspicious store activity in this video such as shoplifting or not. 
<br>Reply with two fields
<br>answer: Yes or No
<br>reason: Your reason
<br>Video: 📽️*

</div>

---

**Assistant**  
*Yes <br> Reason: The man in the video is seen taking items from the cashier's drawer*  

---

<div align="right">

**User**  
*Is there an indication of suspicious store activity in this video such as shoplifting or not. 
<br>Reply with two fields
<br>answer: Yes or No
<br>reason: Your reason
<br>Video: 📽️*

</div>

---

**Assistant**  
*No <br> Reason: The video shows a man engaging in normal shopping activities with no signs of shoplifting*

---

<div align="right">

**User**  
*Is there an indication of suspicious store activity in this video such as shoplifting or not. 
<br>Reply with two fields
<br>answer: Yes or No
<br>reason: Your reason
<br>Video: 📽️*

</div>

---

**Assistant**  


---


With this conversation history, which includes the **processed video data** and the corresponding **manual responses** as context, the model continues its generation, leveraging the examples to align its future outputs.

In [None]:
# Few-shot conversation: two example user turns, each followed by a hand-written
# assistant reply, and then the user turn holding the video we actually want judged.
# The whitespace inside the prompt literals is part of what is sent to the model.
messages = [
    {
        # ===================== Example One =====================
        "role": "user",
        "content": [
            {
                "type": "video",
                "video": "/home/ubuntu/video-6.mp4",
                "max_pixels": 360 * 420,
                "fps": 1.0,
            },
            {
                "type": "text", 
                "text": """Is there an indication of suspicious store activity in this video such as shoplifting or not. 
                    Reply with two fields
                    answer: Yes or No
                    reason: Your reason"""
            },
        ],
    },
    # ===================== Response One (Manually Added) =====================
    {
        "role": "assistant",
        "content": [
            {
                "type": "text", 
                "text": """Yes\nReason: The video shows a man picking office supplies from a table and hiding them in his pockets in a suspicious manner"""
            },
        ],
    },
    # ===================== Example Two =====================
    {
        "role": "user",
        "content": [
            {
                "type": "video",
                "video": "/home/ubuntu/video-3.mp4",
                "max_pixels": 360 * 420,
                "fps": 1.0,
            },
            {
                "type": "text", 
                "text": """Is there an indication of suspicious store activity in this video such as shoplifting or not. 
                    Reply with two fields
                    answer: Yes or No
                    reason: Your reason"""
            },
        ],
    },
    # ===================== Response Two (Manually Added) =====================
    {
        "role": "assistant",
        "content": [
            {
                "type": "text", 
                "text": """No\nReason: The video shows one man casually shopping with a basket and another browsing items on the shelf, no strong signs of shoplifting"""
            },
        ],
    },
    # ===================== Query Video =====================
    # Note: the query prompt asks for a "strong indication", unlike the two examples
    {
        "role": "user",
        "content": [
            {
                "type": "video",
                "video": "/home/ubuntu/video-4.mp4",
                "max_pixels": 360 * 420,
                "fps": 1.0,
            },
            {
                "type": "text", 
                "text": """
                    Is there a strong indication of suspicious store activity in this video such as shoplifting or not. 
                    Reply with two fields
                    answer: Yes or No
                    reason: Your reason
                    """
            },
        ],
    }
]

# Render the whole conversation (both examples plus the query) into the prompt text
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

# Load the vision inputs for every video referenced in `messages`:
# the two example clips and the query clip
image_inputs, video_inputs = process_vision_info(messages)

# Preparing final inputs
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
# The model is loaded with device_map="auto", so follow the device it actually
# landed on instead of assuming "cuda".
inputs = inputs.to(model.device)

# Model Inference
generated_ids = model.generate(**inputs, max_new_tokens=128)

# Drop the prompt tokens from each sequence so only the generated reply is decoded
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)

# Display the output
from IPython.display import Markdown
Markdown(output_text[0].replace('\n', '<br>'))

## Evaluation

We set up an evaluation set comprising shoplifting videos from the [UCF Crime Dataset](https://www.crcv.ucf.edu/projects/real-world/) and a few other videos.

We then use the model to analyze each of the videos, storing each response for comparison with the ground truth. This helps us take a scientific approach to comparing methods.

In [None]:
# !pip install gdown
# !gdown "https://drive.google.com/uc?export=download&id=1_zFhG7g2s4qBU0bCyVHLVyHHAa3jYmqr"
# !unzip shoplifting-videos.zip

In [None]:
import os
import pandas as pd
from pathlib import Path
import json
import cv2
from moviepy.editor import VideoFileClip
import tempfile
from tqdm import tqdm

# Prompt sent with the two worked examples. The whitespace (trailing space and
# 20-space continuation indent) reproduces the original triple-quoted literal exactly.
_EXAMPLE_PROMPT = (
    "Is there an indication of suspicious store activity in this video such as shoplifting or not. \n"
    "                    Reply with two fields\n"
    "                    answer: Yes or No\n"
    "                    reason: Your reason"
)

# Prompt sent with the query video; it asks for a *strong* indication and keeps the
# leading newline / trailing indent of the original triple-quoted literal.
_QUERY_PROMPT = (
    "\n"
    "                    Is there a strong indication of suspicious store activity in this video such as shoplifting or not. \n"
    "                    Reply with two fields\n"
    "                    answer: Yes or No\n"
    "                    reason: Your reason\n"
    "                    "
)

_POSITIVE_REPLY = "Yes\nReason: The video shows a man picking office supplies from a table and hiding them in his pockets in a suspicious manner"
_NEGATIVE_REPLY = "No\nReason: The video shows one man casually shopping with a basket and another browsing items on the shelf, no strong signs of shoplifting"


def _user_turn(video_path, prompt):
    """Build one user message holding a video entry followed by a text prompt."""
    return {
        "role": "user",
        "content": [
            {
                "type": "video",
                "video": video_path,
                "max_pixels": 360 * 420,
                "fps": 1.0,
            },
            {"type": "text", "text": prompt},
        ],
    }


def _assistant_turn(reply):
    """Build one assistant message holding a hand-written text reply."""
    return {
        "role": "assistant",
        "content": [{"type": "text", "text": reply}],
    }


def prepare_messages(
    process_path,
    positive_example="/home/ubuntu/video-6.mp4",
    negative_example="/home/ubuntu/video-3.mp4",
):
    """Build the few-shot conversation used to judge one video.

    The conversation is: example video + "Yes" reply, example video + "No" reply,
    then the query video with the question and no reply.

    Args:
        process_path: Path of the video to be judged (placed in the last user turn).
        positive_example: Path of the clip paired with the "Yes" reply.
        negative_example: Path of the clip paired with the "No" reply.

    Returns:
        A new list of five message dicts (user, assistant, user, assistant, user).
    """
    return [
        _user_turn(positive_example, _EXAMPLE_PROMPT),
        _assistant_turn(_POSITIVE_REPLY),
        _user_turn(negative_example, _EXAMPLE_PROMPT),
        _assistant_turn(_NEGATIVE_REPLY),
        _user_turn(process_path, _QUERY_PROMPT),
    ]
    

def get_video_duration(video_path):
    """Get the duration of a video in seconds.

    Args:
        video_path: Path of the video file to open.

    Returns:
        The clip's ``duration`` attribute.

    Raises:
        Exception: If the clip cannot be opened or read; the original error is
            attached as ``__cause__``.
    """
    try:
        # The context manager closes the clip even if reading `duration` fails
        with VideoFileClip(video_path) as clip:
            return clip.duration
    except Exception as e:
        raise Exception(f"Error getting video duration: {str(e)}") from e

def trim_video(input_path, output_path, duration=30):
    """Trim video to specified duration in seconds.

    Writes the first ``duration`` seconds of ``input_path`` to ``output_path``
    (libx264, no audio, no progress logger). If the clip is shorter than
    ``duration`` the whole clip is written.

    Args:
        input_path: Path of the source video.
        output_path: Path of the trimmed video to write.
        duration: Maximum length of the output in seconds.

    Raises:
        Exception: If opening, cutting or writing fails; the original error is
            attached as ``__cause__``.
    """
    try:
        with VideoFileClip(input_path) as clip:
            # Never ask for a sub-clip that extends past the end of the source
            end = duration if clip.duration is None else min(duration, clip.duration)
            trimmed_clip = clip.subclip(0, end)
            trimmed_clip.write_videofile(output_path,
                                         codec='libx264',
                                         audio=False,
                                         logger=None)
    except Exception as e:
        raise Exception(f"Error trimming video: {str(e)}") from e

def sample_frames_dynamic_v1(video_path, output_path, target_frames=30):
    """Write an evenly subsampled copy of a video containing at most ``target_frames`` frames.

    Every ``max(1, total_frames // target_frames)``-th frame is kept, starting with
    the first, until ``target_frames`` frames have been written or the source ends.
    The output is written with the mp4v codec at 1 frame per second (presumably so
    that the 1 fps sampling requested in the prompt messages keeps every saved
    frame — TODO confirm).

    Args:
        video_path: Path of the source video.
        output_path: Path of the subsampled video to write.
        target_frames: Maximum number of frames to keep; must be positive.

    Raises:
        Exception: If the source or output cannot be opened, ``target_frames`` is
            not positive, or reading/writing fails; the original error is attached
            as ``__cause__``.
    """
    cap = None
    out = None
    try:
        if target_frames <= 0:
            raise ValueError("target_frames must be positive")

        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            raise IOError(f"Could not open video: {video_path}")

        fps = 1  # frame rate of the OUTPUT file, not of the source
        source_fps = cap.get(cv2.CAP_PROP_FPS)
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        # Report the real source duration; dividing by the output fps of 1 would
        # just print the frame count as if it were seconds.
        if source_fps and source_fps > 0:
            duration = total_frames / source_fps
            print(f"Video duration: {duration:.2f} seconds, Total frames: {total_frames}")
        else:
            print(f"Video duration: unknown, Total frames: {total_frames}")

        sampling_interval = max(1, int(total_frames / target_frames))
        print(f"Sampling every {sampling_interval} frames to extract {target_frames} frames.")

        fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # Codec for MP4
        out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
        if not out.isOpened():
            raise IOError(f"Could not open output video for writing: {output_path}")

        frame_count = 0
        saved_count = 0
        while saved_count < target_frames:
            ret, frame = cap.read()
            if not ret:
                break
            if frame_count % sampling_interval == 0:
                out.write(frame)
                saved_count += 1
            frame_count += 1
    except Exception as e:
        raise Exception(f"Error sampling and recreating video: {str(e)}") from e
    finally:
        # Release the handles on every path, including failures part-way through
        if cap is not None:
            cap.release()
        if out is not None:
            out.release()

def sample_frames_dynamic_v2(video_path, output_path, target_frames=30, target_duration=30, output_fps=30):
    """Resample a video to at most ``target_duration * output_fps`` frames.

    Frames are picked at a (possibly fractional) interval of
    ``max(1, total_frames / frame_budget)`` and written with the mp4v codec at
    ``output_fps``.

    Args:
        video_path: Path of the source video.
        output_path: Path of the resampled video to write.
        target_frames: Accepted for backward compatibility but not used; the frame
            budget is always ``target_duration * output_fps``.
        target_duration: Desired output length in seconds.
        output_fps: Frame rate of the output file.

    Raises:
        Exception: If the source or output cannot be opened, the frame budget is
            not positive, or reading/writing fails; the original error is attached
            as ``__cause__``.
    """
    cap = None
    out = None
    try:
        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            raise IOError(f"Could not open video: {video_path}")

        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        # The source fps is deliberately not read: it was only used for an unused
        # duration value, and dividing by it fails when the container reports 0 fps.
        frame_budget = target_duration * output_fps
        if frame_budget <= 0:
            raise ValueError("target_duration * output_fps must be positive")
        sampling_interval = max(1, total_frames / frame_budget)

        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(output_path, fourcc, output_fps, (width, height))
        if not out.isOpened():
            raise IOError(f"Could not open output video for writing: {output_path}")

        frame_count = 0
        saved_count = 0
        while saved_count < frame_budget:
            ret, frame = cap.read()
            if not ret:
                break
            # Keep the first frame at or past each multiple of the interval
            if frame_count >= saved_count * sampling_interval:
                out.write(frame)
                saved_count += 1
            frame_count += 1
    except Exception as e:
        raise Exception(f"Error sampling and recreating video: {str(e)}") from e
    finally:
        # Release the handles on every path, including failures part-way through
        if cap is not None:
            cap.release()
        if out is not None:
            out.release()

# Column order of the results table (also used when no video produced a row)
_RESULT_COLUMNS = ['filename', 'duration', 'trimmed', 'anomaly', 'reason']


def _parse_response(output_text):
    """Split a model reply into ``(answer, reason)`` without assuming an exact layout."""
    lines = [line.strip() for line in output_text.strip().splitlines() if line.strip()]
    if not lines:
        raise ValueError("Model returned an empty response")

    # First line is the answer, with or without an "answer:" label
    answer = lines[0].split(':', 1)[-1].strip()

    # Everything after it is the reason; strip a leading "reason:" label if present
    # but keep any further colons that belong to the reason text itself.
    remainder = ' '.join(lines[1:])
    label, sep, tail = remainder.partition(':')
    if sep and label.strip().lower() == 'reason':
        reason = tail.strip()
    else:
        reason = remainder
    return answer, reason


def _generate_reply(messages):
    """Run the vision-language model on one conversation and return the decoded reply."""
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the device the model was actually placed on instead of assuming "cuda"
    inputs = inputs.to(model.device)

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    # Drop the prompt tokens so only the newly generated reply is decoded
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    return processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]


def process_video_directory(video_dir, output_csv, max_duration=30):
    """Run the few-shot shoplifting check on every video in a directory.

    Videos longer than ``max_duration`` seconds are first replaced by a subsampled
    copy (``sample_frames_dynamic_v1``) written to a temporary directory. A failure
    on one video is recorded as an ``ERROR`` row and does not stop the run.

    Args:
        video_dir: Directory containing ``.mp4`` / ``.avi`` / ``.mov`` files
            (extension matched case-insensitively).
        output_csv: Path of the CSV file the results are written to.
        max_duration: Length in seconds above which a video is subsampled.

    Returns:
        A DataFrame with columns filename, duration, trimmed, anomaly, reason —
        one row per video, in sorted filename order.
    """
    results = []
    temp_dir = tempfile.mkdtemp()
    try:
        # Sorted so the row order is reproducible between runs
        video_files = sorted(
            f for f in os.listdir(video_dir)
            if f.lower().endswith(('.mp4', '.avi', '.mov'))
        )

        for video_file in tqdm(video_files):
            try:
                video_path = os.path.join(video_dir, video_file)
                print(f"\nProcessing {video_file}...")

                duration = get_video_duration(video_path)
                print(f"Video duration: {duration:.2f} seconds")

                if duration > max_duration:
                    sampled_frames_dir = os.path.join(temp_dir, f"resampled_{(video_file)}")
                    sample_frames_dynamic_v1(video_path, sampled_frames_dir)
                    process_path = sampled_frames_dir
                    print("Frame sampling complete")
                else:
                    process_path = video_path
                # Alternative for long videos: cut them instead of subsampling, with
                # trim_video(video_path, <path inside temp_dir>, max_duration).

                messages = prepare_messages(process_path)
                output_text = _generate_reply(messages)
                answer, reason = _parse_response(output_text)

                results.append({
                    'filename': video_file,
                    'duration': duration,
                    'trimmed': duration > max_duration,
                    'anomaly': answer,
                    'reason': reason,
                })

                print(f"Analysis complete - Anomaly detected: {answer}")

            except Exception as e:
                print(f"Error processing {video_file}: {str(e)}")
                results.append({
                    'filename': video_file,
                    'duration': -1,
                    'trimmed': False,
                    'anomaly': 'ERROR',
                    'reason': str(e)
                })
    finally:
        # Always remove the temporary directory, even if listing the directory
        # fails or the run is interrupted.
        try:
            import shutil
            shutil.rmtree(temp_dir)
            print(f"\nCleaned up temporary directory: {temp_dir}")
        except Exception as e:
            print(f"Error cleaning up temporary directory: {str(e)}")

    # Explicit columns keep the table usable when no video was found
    df = pd.DataFrame(results, columns=_RESULT_COLUMNS)
    df.to_csv(output_csv, index=False)
    print(f"\nResults saved to {output_csv}")
    return df


# Evaluation settings: input directory, results file and the length (seconds)
# above which a video is subsampled before analysis
VIDEO_DIR = "/home/ubuntu/content/drive/MyDrive/shoplifting-videos"
OUTPUT_CSV = "qwen_results.csv"
MAX_DURATION = 30

results_df = process_video_directory(VIDEO_DIR, OUTPUT_CSV, MAX_DURATION)

# Tally the summary figures first, then report them
total_count = len(results_df)
trimmed_count = len(results_df[results_df['trimmed']])
anomaly_count = len(results_df[results_df['anomaly'] == 'Yes'])
error_count = len(results_df[results_df['anomaly'] == 'ERROR'])

print("\nAnalysis Summary:")
print(f"Total videos processed: {total_count}")
print(f"Videos trimmed: {trimmed_count}")
print(f"Anomalies detected: {anomaly_count}")
print(f"Processing errors: {error_count}")

In [None]:
!python evaluate.py --csv result.csv

This calculates the prediction accuracy, precision and recall etc.