In [None]:
# pip install numpy==1.24.4 Pillow==10.3.0 Requests==2.31.0 torch torchvision opencv-python 'numpy<2' 'moviepy==1.0' accelerate 'jinja2>3.1' qwen-vl-utils av git+https://github.com/huggingface/transformers.git
# pip install git+https://github.com/huggingface/transformers@21fac7abba2a37fae86106f87fcf9974fd1e3830 accelerate
# pip install ultralytics supervision
# pip install pybind11
# sudo apt-get install build-essential
# sudo apt-get install python3-dev
# python -m pip install 'git+https://github.com/facebookresearch/detectron2.git'

# Step 1: Detect a person in region of interest e.g. checkout area
Using YOLOv8, we can define a region of interest, so analysis can start when a person is in that zone

In [None]:
from IPython import display
import ultralytics
import supervision as sv
import numpy as np

In [3]:
from ultralytics import YOLO
model = YOLO('yolov8s.pt')

In [4]:
#Video processing and metedata about the video file
ROOT_DIR = "/home/ubuntu" 
vid_path = "video-3.mp4"

sv.VideoInfo.from_video_path(vid_path)

VideoInfo(width=1080, height=1920, fps=30, total_frames=302)

In [5]:
#process a video frame by frame, 
#perform object detection using the YOLO model,&
#annotate the frames with detection boxes, labels, and trigger actions based on a polygon zone.

# initiate polygon zone
polygon = np.array([(900, 0), (1080, 0), (1080, 1920), (100, 1920)])

video_info = sv.VideoInfo.from_video_path(f"{ROOT_DIR}/{vid_path}")   # This uses the VideoInfo class from the supervision library (sv) to retrieve metadata from the video file located at the path specified 
zone = sv.PolygonZone(polygon=polygon) # initializes a Polygon Zone, which is a region of interest (ROI) in the video frame defined by the previously created polygon.

# initiate annotators
box_annotator = sv.BoxAnnotator(thickness=4)
label_annotator = sv.LabelAnnotator(text_thickness=4, text_scale=2)
zone_annotator = sv.PolygonZoneAnnotator(zone=zone, color=sv.Color.WHITE, thickness=6, text_thickness=6, text_scale=4)

def process_frame(frame: np.ndarray, _) -> np.ndarray:
    # detect
    results = model(frame, imgsz=320)[0]
    detections = sv.Detections.from_ultralytics(results)
    detections = detections[detections.class_id == 0]
    zone.trigger(detections=detections)

    # annotate
    labels = [f"{model.names[class_id]} {confidence:0.2f}" for _, _, confidence, class_id, _, _ in detections]
    frame = box_annotator.annotate(scene=frame, detections=detections)
    frame = label_annotator.annotate(scene=frame, detections=detections, labels=labels)
    frame = zone_annotator.annotate(scene=frame)

    return frame

sv.process_video(source_path=vid_path, target_path=f"{ROOT_DIR}/out_{vid_path}", callback=process_frame)

from IPython import display
display.clear_output()

## Result

By defining our **polygon coordinates** we create a zone of interest (the white box in the middle)

We see that our count is zero in the left frame, where the customer has not entered the zone

And the count increases to 1 as they enter

Since we have this information programmtically, we can save the next 20 seconds of video feed to capture all the activities that happened. Only then do we need to analyze the occurrence, saving us on inference cost.

Example scene 1
<br>
<p float="left">
  <img src="media/scene1-example1.png" alt="Image 1" width="45%" style="margin-right: 5%;" />
  <img src="media/scene1-example2.png" alt="Image 2" width="45%" />
</p>

Example scene 2 
<br>
<img src="media/scene2-example.png" alt="Image 2" width="45%" />

Example scene 3
<br>
<p float="left">
  <img src="media/scene3-example1.png" alt="Image 1" width="45%" style="margin-right: 5%;" />
  <img src="media/scene3-example2.png" alt="Image 2" width="45%" />
</p>

Basically, we would need to map out the coordinates for every camera the client has

# Step 2: Ask the VLM to tell us if there is theft in the scene or not

In [3]:
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info

model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-7B-Instruct", torch_dtype="auto", device_map="auto"
)

processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")

`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46
Loading checkpoint shards: 100%|██████████| 5/5 [00:04<00:00,  1.24it/s]


## Zero Shot

In zero shot, we directly ask the model to tell us if there is any shoplifting in the video or not, without providing examples

In [None]:
for _ in range(5):
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "video",
                    "video": "/home/ubuntu/shopping-activitity-detection/media/video-2.mp4",
                    "max_pixels": 360 * 420,
                    "fps": 1.0,
                },
                {
                    "type": "text", 
                    "text": """
                        Is there an indication of suspicious store activity in this video such as shoplifting or not. 
                        Reply with two fields
                        answer: Yes or No
                        reason: Your reason
                        """
                },
            ],
        }
    ]

    # Prepare input for inference, only including video-3
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Processing Vision inputs. Process only the user video (video-6) by passing the relevant part of `messages` directly
    image_inputs, video_inputs = process_vision_info(messages)

    # Preparing final inputs
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    # Model Inference
    generated_ids = model.generate(**inputs, max_new_tokens=128)

    #processing the output
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )

    # Display the output
    from IPython.display import Markdown
    # Markdown(output_text[0].replace('\n', '<br>'))
    print(output_text[0])
    print()

Yes
Reason: The video shows a man stealing a wallet from a woman's purse in a store.

Yes
Reason: The video shows a man stealing a wallet from a woman's purse in a store.

Yes
Reason: The video shows a man stealing a wallet from a woman's purse in a store.

Yes
Reason: The video shows a man stealing a wallet from a woman's purse in a store.

Yes
Reason: The video shows a man stealing a wallet from a woman's purse in a store.



## Few Shot

To perform few shot prompting, the model is provided with 2 or more examples

Here, we mimic the typical user-assistant conversation flow and add the examples as the chat history. This is demonstrated below


---

<div align="right">

**User**  
*Is there an indication of suspicious store activity in this video such as shoplifting or not. 
<br>Reply with two fields
<br>answer: Yes or No
<br>reason: Your reason
<br>Video: 📽️*

</div>

---

**Assistant**  
*Yes <br> Reason: The man in the video is seen taking items from the cashier's drawer*  

---

<div align="right">

**User**  
*Is there an indication of suspicious store activity in this video such as shoplifting or not. 
<br>Reply with two fields
<br>answer: Yes or No
<br>reason: Your reason
<br>Video: 📽️*

</div>

---

**Assistant**  
*No <br> Reason: The video shows a man engaging in normal shopping activities with no signs of shoplifting*

---

<div align="right">

**User**  
*Is there an indication of suspicious store activity in this video such as shoplifting or not. 
<br>Reply with two fields
<br>answer: Yes or No
<br>reason: Your reason
<br>Video: 📽️*

</div>

---

**Assistant**  


---


With this history of the conversation that includes the **processed video data** and corresponding **manual responses** provided as context, the model continues its generation, leveraging on the examples to align its future outputs.

In [None]:
for _ in range(5):
    messages = [
        {
            # ===================== Example One =====================
            "role": "user",
            "content": [
                {
                    "type": "video",
                    "video": "/home/ubuntu/shopping-activitity-detection/media/video-6.mp4",
                    "max_pixels": 360 * 420,
                    "fps": 1.0,
                },
                {
                    "type": "text", 
                    "text": """Is there an indication of suspicious store activity in this video such as shoplifting or not. 
                        Reply with two fields
                        answer: Yes or No
                        reason: Your reason"""
                },
            ],
        },
        # ===================== Response One (Manually Added) =====================
        {
            "role": "assistant",
            "content": [
                {
                    "type": "text", 
                    "text": """Yes\nReason: The video shows a man picking office supplies from a table and hiding them in his pockets in a suspicious manner"""
                },
            ],
        },
        # ===================== Example Two =====================
        {
            "role": "user",
            "content": [
                {
                    "type": "video",
                    "video": "/home/ubuntu/shopping-activitity-detection/media/video-3.mp4",
                    "max_pixels": 360 * 420,
                    "fps": 1.0,
                },
                {
                    "type": "text", 
                    "text": """Is there an indication of suspicious store activity in this video such as shoplifting or not. 
                        Reply with two fields
                        answer: Yes or No
                        reason: Your reason"""
                },
            ],
        },
        # ===================== Response Two (Manually Added) =====================
        {
            "role": "assistant",
            "content": [
                {
                    "type": "text", 
                    "text": """No\nReason: The video shows one man casually shopping with a basket and another browsing items on the shelf, no strong signs of shoplifting"""
                },
            ],
        },
        # ===================== Query Video =====================
        {
            "role": "user",
            "content": [
                {
                    "type": "video",
                    "video": "/home/ubuntu/shopping-activitity-detection/media/video-2.mp4",
                    "max_pixels": 360 * 420,
                    "fps": 1.0,
                },
                {
                    "type": "text", 
                    "text": """
                        Is there a strong indication of suspicious store activity in this video such as shoplifting or not. 
                        Reply with two fields
                        answer: Yes or No
                        reason: Your reason
                        """
                },
            ],
        }
    ]

    # Prepare input for inference, only including video-3
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Processing Vision inputs. Process only the user video (video-6) by passing the relevant part of `messages` directly
    image_inputs, video_inputs = process_vision_info(messages)

    # Preparing final inputs
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    # Model Inference
    generated_ids = model.generate(**inputs, max_new_tokens=128)

    #processing the output
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )

    # Display the output
    from IPython.display import Markdown
    # Markdown(output_text[0].replace('\n', '<br>'))
    print(output_text[0])
    print()

Yes
Reason: The video shows a man stealing a bag from a woman's cart and running away

Yes
Reason: The video shows a man stealing a bag from a woman's cart and running away

Yes
Reason: The video shows a man stealing a bag from a woman's cart and running away

Yes
Reason: The video shows a man stealing a bag from a woman's cart and running away

Yes
Reason: The video shows a man stealing a bag from a woman's cart and running away



## Evaluation

We setup an evaluation set comprising shoplifting videos from the [UCF Crime Dataset](https://www.crcv.ucf.edu/projects/real-world/) and few other videos

We then use the model to analyze each of videos, storing each response for comparison with the ground truth. This is to help us use a scientific approach to comparing methods

In [None]:
# pip install gdown
# gdown "https://drive.google.com/uc?export=download&id=1sHxoiUIZ66Dh0jBrAalBZ5Rp-yLUmt_3"
# unzip All-Shoplifting.zip

In [None]:
import os
import pandas as pd
from pathlib import Path
import json
import cv2
from moviepy.editor import VideoFileClip
import tempfile
from tqdm import tqdm

def prepare_messages(process_path, mode="few_shot"):
    if mode == "zero_shot":

        return [
            {
                "role": "user",
                "content": [
                    {
                        "type": "video",
                        "video": process_path,
                        "max_pixels": 360 * 420,
                        "fps": 1.0,
                    },
                    {
                        "type": "text", 
                        "text": """
                            Is there a strong indication of suspicious store activity in this video such as shoplifting or not. 
                            Reply with two fields
                            answer: Yes or No
                            reason: Your reason
                            """
                    },
                ],
            }
        ]

    else:

        return [
            {
                "role": "user",
                "content": [
                    {
                        "type": "video",
                        "video": "/home/ubuntu/shopping-activitity-detection/media/video-6.mp4",
                        "max_pixels": 360 * 420,
                        "fps": 1.0,
                    },
                    {
                        "type": "text", 
                        "text": """Is there an indication of suspicious store activity in this video such as shoplifting or not. 
                            Reply with two fields
                            answer: Yes or No
                            reason: Your reason"""
                    },
                ],
            },
            {
                "role": "assistant",
                "content": [
                    {
                        "type": "text", 
                        "text": """Yes\nReason: The video shows a man picking office supplies from a table and hiding them in his pockets in a suspicious manner"""
                    },
                ],
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "video",
                        "video": "/home/ubuntu/shopping-activitity-detection/media/video-3.mp4",
                        "max_pixels": 360 * 420,
                        "fps": 1.0,
                    },
                    {
                        "type": "text", 
                        "text": """Is there an indication of suspicious store activity in this video such as shoplifting or not. 
                            Reply with two fields
                            answer: Yes or No
                            reason: Your reason"""
                    },
                ],
            },
            {
                "role": "assistant",
                "content": [
                    {
                        "type": "text", 
                        "text": """No\nReason: The video shows one man casually shopping with a basket and another browsing items on the shelf, no strong signs of shoplifting"""
                    },
                ],
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "video",
                        "video": process_path,
                        "max_pixels": 360 * 420,
                        "fps": 1.0,
                    },
                    {
                        "type": "text", 
                        "text": """
                            Is there a strong indication of suspicious store activity in this video such as shoplifting or not. 
                            Reply with two fields
                            answer: Yes or No
                            reason: Your reason
                            """
                    },
                ],
            }
        ]
    

def get_video_duration(video_path):
    """Get the duration of a video in seconds."""
    try:
        clip = VideoFileClip(video_path)
        duration = clip.duration
        clip.close()
        return duration
    except Exception as e:
        raise Exception(f"Error getting video duration: {str(e)}")

def trim_video(input_path, output_path, duration=30):
    """Trim video to specified duration in seconds."""
    try:
        with VideoFileClip(input_path) as clip:
            trimmed_clip = clip.subclip(0, duration)
            trimmed_clip.write_videofile(output_path, 
                                       codec='libx264', 
                                       audio=False,
                                       logger=None)  
    except Exception as e:
        raise Exception(f"Error trimming video: {str(e)}")

def sample_frames_dynamic_v1(video_path, output_path, target_frames=30):
    try:
        cap = cv2.VideoCapture(video_path)
        fps = 1 #cap.get(cv2.CAP_PROP_FPS)
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        duration = total_frames / fps
        print(f"Video duration: {duration:.2f} seconds, Total frames: {total_frames}")

        sampling_interval = max(1, int(total_frames / target_frames))
        print(f"Sampling every {sampling_interval} frames to extract {target_frames} frames.")
        
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # Codec for MP4
        out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
        
        frame_count = 0
        saved_count = 0

        while True:
            ret, frame = cap.read()
            if not ret:
                break
            if frame_count % sampling_interval == 0 and saved_count < target_frames:
                out.write(frame)
                saved_count += 1
            frame_count += 1
            if saved_count >= target_frames:
                break
        cap.release()
        out.release()
    except Exception as e:
        raise Exception(f"Error sampling and recreating video: {str(e)}")

def sample_frames_dynamic_v2(video_path, output_path, target_frames=30, target_duration=30, output_fps=30):
    try:
        cap = cv2.VideoCapture(video_path)
        input_fps = cap.get(cv2.CAP_PROP_FPS)
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        
        input_duration = total_frames / input_fps      
        target_frames = target_duration * output_fps
        sampling_interval = max(1, total_frames / target_frames)
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(output_path, fourcc, output_fps, (width, height))
        frame_count = 0
        saved_count = 0
        
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            if frame_count >= saved_count * sampling_interval and saved_count < target_frames:
                out.write(frame)
                saved_count += 1
            frame_count += 1
            if saved_count >= target_frames:
                break
        
        cap.release()
        out.release()
        
    except Exception as e:
        raise Exception(f"Error sampling and recreating video: {str(e)}")

def process_video_directory(video_dir, output_csv, max_duration=30):
    results = []
    temp_dir = tempfile.mkdtemp()
    video_files = [f for f in os.listdir(video_dir) if f.endswith(('.mp4', '.avi', '.mov'))]
    
    for video_file in tqdm(video_files):
        try:
            video_path = os.path.join(video_dir, video_file)
            print(f"\nProcessing {video_file}...")
            
            duration = get_video_duration(video_path)
            print(f"Video duration: {duration:.2f} seconds")
            
            if duration > max_duration:
                sampled_frames_dir = os.path.join(temp_dir, f"resampled_{(video_file)}")
                sample_frames_dynamic_v1(video_path, sampled_frames_dir)
                process_path = sampled_frames_dir 
                print("Frame sampling complete")
            else:
                process_path = video_path

            ## uncomment for 30-second trimming

            # if duration > max_duration:
            #     print(f"Video longer than {max_duration} seconds, trimming...")
            #     temp_video_path = os.path.join(temp_dir, f"trimmed_{video_file}")
            #     trim_video(video_path, temp_video_path, max_duration)
            #     process_path = temp_video_path
            # else:
            #     process_path = video_path
            
            messages = prepare_messages(process_path, mode="zero_shot")

            text = processor.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )
            image_inputs, video_inputs = process_vision_info(messages)
            inputs = processor(
                text=[text],
                images=image_inputs,
                videos=video_inputs,
                padding=True,
                return_tensors="pt",
            )
            inputs = inputs.to("cuda")

            # Generate response
            generated_ids = model.generate(**inputs, max_new_tokens=128)
            generated_ids_trimmed = [
                out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
            ]
            output_text = processor.batch_decode(
                generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
            )[0]

            response_lines = output_text.strip().split('\n')
            
            answer = response_lines[0].split(': ')[-1].strip()
            reason = response_lines[1].split(': ')[1].strip()

            results.append({
                'filename': video_file,
                'duration': duration,
                'trimmed': duration > max_duration,
                'anomaly': answer,
                'reason': reason,
            })
            
            print(f"Analysis complete - Anomaly detected: {answer}")
            
        except Exception as e:
            print(f"Error processing {video_file}: {str(e)}")
            results.append({
                'filename': video_file,
                'duration': -1,
                'trimmed': False,
                'anomaly': 'ERROR',
                'reason': str(e)
            })

    try:
        import shutil
        shutil.rmtree(temp_dir)
        print(f"\nCleaned up temporary directory: {temp_dir}")
    except Exception as e:
        print(f"Error cleaning up temporary directory: {str(e)}")

    df = pd.DataFrame(results)
    df.to_csv(output_csv, index=False)
    print(f"\nResults saved to {output_csv}")
    return df


VIDEO_DIR = "/home/ubuntu/shopping-activitity-detection/All-Shoplifting"  
OUTPUT_CSV = "qwen_results.csv"
MAX_DURATION = 30 

results_df = process_video_directory(VIDEO_DIR, OUTPUT_CSV, MAX_DURATION)
print("\nAnalysis Summary:")
print(f"Total videos processed: {len(results_df)}")
print(f"Videos trimmed: {len(results_df[results_df['trimmed']])}")
print(f"Anomalies detected: {len(results_df[results_df['anomaly'] == 'Yes'])}")
print(f"Processing errors: {len(results_df[results_df['anomaly'] == 'ERROR'])}")

  0%|          | 0/97 [00:00<?, ?it/s]


Processing Nonshoplifting-19.mp4...
Video duration: 25.16 seconds


  1%|          | 1/97 [00:03<05:22,  3.36s/it]

Analysis complete - Anomaly detected: No

Processing Shoplifting052_x264.mp4...
Video duration: 254.88 seconds
Video duration: 7646.00 seconds, Total frames: 7646
Sampling every 254 frames to extract 30 frames.
Frame sampling complete


  2%|▏         | 2/97 [00:06<05:12,  3.29s/it]

Analysis complete - Anomaly detected: Yes

Processing Nonshoplifting-24.mp4...
Video duration: 10.32 seconds


  3%|▎         | 3/97 [00:08<04:02,  2.58s/it]

Analysis complete - Anomaly detected: No

Processing Nonshoplifting-28.mp4...
Video duration: 12.00 seconds


  4%|▍         | 4/97 [00:10<03:30,  2.27s/it]

Analysis complete - Anomaly detected: No

Processing Nonshoplifting-37.mp4...
Video duration: 18.36 seconds


  5%|▌         | 5/97 [00:12<03:43,  2.42s/it]

Analysis complete - Anomaly detected: No

Processing Shoplifting034_x264.mp4...
Video duration: 397.90 seconds
Video duration: 11937.00 seconds, Total frames: 11937
Sampling every 397 frames to extract 30 frames.
Frame sampling complete


  6%|▌         | 6/97 [00:17<04:35,  3.03s/it]

Analysis complete - Anomaly detected: Yes

Processing Nonshoplifting-39.mp4...
Video duration: 9.84 seconds


  7%|▋         | 7/97 [00:18<03:59,  2.66s/it]

Analysis complete - Anomaly detected: No

Processing Nonshoplifting-41.mp4...
Video duration: 9.96 seconds


  8%|▊         | 8/97 [00:20<03:34,  2.41s/it]

Analysis complete - Anomaly detected: No

Processing Shoplifting004_x264.mp4...
Video duration: 222.48 seconds
Video duration: 6673.00 seconds, Total frames: 6673
Sampling every 222 frames to extract 30 frames.
Frame sampling complete


  9%|▉         | 9/97 [00:23<03:45,  2.56s/it]

Analysis complete - Anomaly detected: Yes

Processing Nonshoplifting-31.mp4...
Video duration: 14.35 seconds


 10%|█         | 10/97 [00:26<03:35,  2.48s/it]

Analysis complete - Anomaly detected: No

Processing Shoplifting038_x264.mp4...
Video duration: 96.83 seconds
Video duration: 2904.00 seconds, Total frames: 2904
Sampling every 96 frames to extract 30 frames.
Frame sampling complete


 11%|█▏        | 11/97 [00:28<03:40,  2.57s/it]

Analysis complete - Anomaly detected: Yes

Processing Nonshoplifting-45.mp4...
Video duration: 25.73 seconds


 12%|█▏        | 12/97 [00:32<04:11,  2.95s/it]

Analysis complete - Anomaly detected: Yes

Processing Shoplifting016_x264.mp4...
Video duration: 49.47 seconds
Video duration: 1483.00 seconds, Total frames: 1483
Sampling every 49 frames to extract 30 frames.
Frame sampling complete


 13%|█▎        | 13/97 [00:34<03:48,  2.72s/it]

Analysis complete - Anomaly detected: Yes

Processing Nonshoplifting-47.mp4...
Video duration: 13.14 seconds


 14%|█▍        | 14/97 [00:36<03:25,  2.48s/it]

Analysis complete - Anomaly detected: No

Processing Nonshoplifting-18.mp4...
Video duration: 16.08 seconds


 15%|█▌        | 15/97 [00:38<03:13,  2.36s/it]

Analysis complete - Anomaly detected: No

Processing Shoplifting017_x264.mp4...
Video duration: 15.34 seconds


 16%|█▋        | 16/97 [00:41<03:12,  2.37s/it]

Analysis complete - Anomaly detected: Yes

Processing Shoplifting041_x264.mp4...
Video duration: 411.30 seconds
Video duration: 12335.00 seconds, Total frames: 12335
Sampling every 411 frames to extract 30 frames.
Frame sampling complete


 18%|█▊        | 17/97 [00:44<03:39,  2.74s/it]

Analysis complete - Anomaly detected: Yes

Processing video-7.mp4...
Video duration: 11.20 seconds


 19%|█▊        | 18/97 [00:46<03:11,  2.43s/it]

Analysis complete - Anomaly detected: No

Processing Shoplifting044_x264.mp4...
Video duration: 485.17 seconds
Video duration: 14555.00 seconds, Total frames: 14555
Sampling every 485 frames to extract 30 frames.
Frame sampling complete


 20%|█▉        | 19/97 [00:51<04:07,  3.17s/it]

Analysis complete - Anomaly detected: Yes

Processing Shoplifting054_x264.mp4...
Video duration: 34.23 seconds
Video duration: 1025.00 seconds, Total frames: 1025
Sampling every 34 frames to extract 30 frames.
Frame sampling complete


 21%|██        | 20/97 [00:53<03:41,  2.87s/it]

Analysis complete - Anomaly detected: Yes

Processing Shoplifting055_x264.mp4...
Video duration: 225.70 seconds
Video duration: 6770.00 seconds, Total frames: 6770
Sampling every 225 frames to extract 30 frames.
Frame sampling complete


 22%|██▏       | 21/97 [00:56<03:39,  2.89s/it]

Analysis complete - Anomaly detected: Yes

Processing Nonshoplifting-2.mp4...
Video duration: 10.72 seconds


 23%|██▎       | 22/97 [00:57<03:01,  2.42s/it]

Analysis complete - Anomaly detected: No

Processing Nonshoplifting-43.mp4...
Video duration: 11.64 seconds


 24%|██▎       | 23/97 [00:59<02:47,  2.26s/it]

Analysis complete - Anomaly detected: No

Processing Shoplifting026_x264.mp4...
Video duration: 47.09 seconds
Video duration: 1410.00 seconds, Total frames: 1410
Sampling every 47 frames to extract 30 frames.
Frame sampling complete


 25%|██▍       | 24/97 [01:01<02:36,  2.14s/it]

Analysis complete - Anomaly detected: Yes

Processing Nonshoplifting-17.mp4...
Video duration: 15.62 seconds


 26%|██▌       | 25/97 [01:03<02:33,  2.14s/it]

Analysis complete - Anomaly detected: No

Processing Shoplifting019_x264.mp4...
Video duration: 21.92 seconds


 27%|██▋       | 26/97 [01:05<02:33,  2.16s/it]

Analysis complete - Anomaly detected: Yes

Processing Nonshoplifting-23.mp4...
Video duration: 11.71 seconds


 28%|██▊       | 27/97 [01:07<02:22,  2.04s/it]

Analysis complete - Anomaly detected: No

Processing Nonshoplifting-50.mp4...
Video duration: 23.71 seconds


 29%|██▉       | 28/97 [01:12<03:10,  2.76s/it]

Analysis complete - Anomaly detected: No

Processing Nonshoplifting-25.mp4...
Video duration: 21.92 seconds


 30%|██▉       | 29/97 [01:14<03:07,  2.76s/it]

Analysis complete - Anomaly detected: No

Processing Shoplifting030_x264.mp4...
Video duration: 806.48 seconds
Video duration: 24193.00 seconds, Total frames: 24193
Sampling every 806 frames to extract 30 frames.
Frame sampling complete


 31%|███       | 30/97 [01:19<03:50,  3.45s/it]

Analysis complete - Anomaly detected: Yes

Processing Nonshoplifting-46.mp4...
Video duration: 110.90 seconds
Video duration: 1283.00 seconds, Total frames: 1283
Sampling every 42 frames to extract 30 frames.
Frame sampling complete


 32%|███▏      | 31/97 [01:23<03:56,  3.58s/it]

Analysis complete - Anomaly detected: No

Processing Nonshoplifting-32.mp4...
Video duration: 14.22 seconds


 33%|███▎      | 32/97 [01:26<03:26,  3.17s/it]

Analysis complete - Anomaly detected: No

Processing Nonshoplifting-1.mp4...
Video duration: 21.44 seconds


 34%|███▍      | 33/97 [01:28<03:16,  3.06s/it]

Analysis complete - Anomaly detected: No

Processing Nonshoplifting-48.mp4...
Video duration: 27.91 seconds


 35%|███▌      | 34/97 [01:33<03:49,  3.65s/it]

Analysis complete - Anomaly detected: No

Processing Shoplifting047_x264.mp4...
Video duration: 79.29 seconds
Video duration: 2377.00 seconds, Total frames: 2377
Sampling every 79 frames to extract 30 frames.
Frame sampling complete


 36%|███▌      | 35/97 [01:36<03:33,  3.45s/it]

Analysis complete - Anomaly detected: Yes

Processing Nonshoplifting-15.mp4...
Video duration: 10.31 seconds


 37%|███▋      | 36/97 [01:38<02:55,  2.88s/it]

Analysis complete - Anomaly detected: No

Processing Nonshoplifting-4.mp4...
Video duration: 18.72 seconds


 38%|███▊      | 37/97 [01:40<02:44,  2.74s/it]

Analysis complete - Anomaly detected: No

Processing Shoplifting031_x264.mp4...
Video duration: 14.94 seconds


 39%|███▉      | 38/97 [01:42<02:18,  2.35s/it]

Analysis complete - Anomaly detected: Yes

Processing Nonshoplifting-9.mp4...
Video duration: 13.81 seconds


 40%|████      | 39/97 [01:44<02:08,  2.22s/it]

Analysis complete - Anomaly detected: No

Processing Nonshoplifting-22.mp4...
Video duration: 15.15 seconds


 41%|████      | 40/97 [01:46<02:08,  2.25s/it]

Analysis complete - Anomaly detected: No

Processing Nonshoplifting-49.mp4...
Video duration: 17.26 seconds


 42%|████▏     | 41/97 [01:49<02:21,  2.53s/it]

Analysis complete - Anomaly detected: No

Processing Nonshoplifting-27.mp4...
Video duration: 19.64 seconds


 43%|████▎     | 42/97 [01:51<02:11,  2.40s/it]

Analysis complete - Anomaly detected: No

Processing Nonshoplifting-13.mp4...
Video duration: 15.48 seconds


 44%|████▍     | 43/97 [01:53<02:03,  2.30s/it]

Analysis complete - Anomaly detected: No

Processing Shoplifting028_x264.mp4...
Video duration: 45.28 seconds
Video duration: 1357.00 seconds, Total frames: 1357
Sampling every 45 frames to extract 30 frames.
Frame sampling complete


 45%|████▌     | 44/97 [01:55<01:54,  2.15s/it]

Analysis complete - Anomaly detected: Yes

Processing Shoplifting010_x264.mp4...
Video duration: 91.20 seconds
Video duration: 2736.00 seconds, Total frames: 2736
Sampling every 91 frames to extract 30 frames.
Frame sampling complete


 46%|████▋     | 45/97 [01:57<01:49,  2.11s/it]

Analysis complete - Anomaly detected: Yes

Processing Shoplifting042_x264.mp4...
Video duration: 170.75 seconds
Video duration: 5121.00 seconds, Total frames: 5121
Sampling every 170 frames to extract 30 frames.
Frame sampling complete


 47%|████▋     | 46/97 [02:00<02:01,  2.37s/it]

Analysis complete - Anomaly detected: Yes

Processing Shoplifting049_x264.mp4...
Video duration: 71.67 seconds
Video duration: 2149.00 seconds, Total frames: 2149
Sampling every 71 frames to extract 30 frames.
Frame sampling complete


 48%|████▊     | 47/97 [02:02<01:54,  2.29s/it]

Analysis complete - Anomaly detected: Yes

Processing Shoplifting009_x264.mp4...
Video duration: 173.37 seconds
Video duration: 5201.00 seconds, Total frames: 5201
Sampling every 173 frames to extract 30 frames.
Frame sampling complete


 49%|████▉     | 48/97 [02:05<01:55,  2.36s/it]

Analysis complete - Anomaly detected: Yes

Processing Nonshoplifting-3.mp4...
Video duration: 10.68 seconds


 51%|█████     | 49/97 [02:06<01:43,  2.15s/it]

Analysis complete - Anomaly detected: No

Processing Nonshoplifting-21.mp4...
Video duration: 25.96 seconds


 52%|█████▏    | 50/97 [02:09<01:53,  2.42s/it]

Analysis complete - Anomaly detected: No

Processing Nonshoplifting-5.mp4...
Video duration: 16.56 seconds


 53%|█████▎    | 51/97 [02:12<01:53,  2.47s/it]

Analysis complete - Anomaly detected: No

Processing Shoplifting050_x264.mp4...
Video duration: 83.63 seconds
Video duration: 2506.00 seconds, Total frames: 2506
Sampling every 83 frames to extract 30 frames.
Frame sampling complete


 54%|█████▎    | 52/97 [02:15<01:51,  2.48s/it]

Analysis complete - Anomaly detected: Yes

Processing Nonshoplifting-8.mp4...
Video duration: 9.74 seconds


 55%|█████▍    | 53/97 [02:16<01:38,  2.24s/it]

Analysis complete - Anomaly detected: No

Processing Shoplifting036_x264.mp4...
Video duration: 121.98 seconds
Video duration: 3657.00 seconds, Total frames: 3657
Sampling every 121 frames to extract 30 frames.
Frame sampling complete


 56%|█████▌    | 54/97 [02:19<01:46,  2.47s/it]

Analysis complete - Anomaly detected: Yes

Processing Shoplifting037_x264.mp4...
Video duration: 46.22 seconds
Video duration: 1386.00 seconds, Total frames: 1386
Sampling every 46 frames to extract 30 frames.
Frame sampling complete


 57%|█████▋    | 55/97 [02:21<01:40,  2.38s/it]

Analysis complete - Anomaly detected: Yes

Processing Shoplifting032_x264.mp4...
Video duration: 47.59 seconds
Video duration: 1426.00 seconds, Total frames: 1426
Sampling every 47 frames to extract 30 frames.
Frame sampling complete


 58%|█████▊    | 56/97 [02:23<01:32,  2.26s/it]

Analysis complete - Anomaly detected: Yes

Processing Shoplifting053_x264.mp4...
Video duration: 69.94 seconds
Video duration: 2097.00 seconds, Total frames: 2097
Sampling every 69 frames to extract 30 frames.
Frame sampling complete


 59%|█████▉    | 57/97 [02:26<01:33,  2.35s/it]

Analysis complete - Anomaly detected: Yes

Processing Nonshoplifting-29.mp4...
Video duration: 9.60 seconds


 60%|█████▉    | 58/97 [02:28<01:25,  2.19s/it]

Analysis complete - Anomaly detected: No

Processing video-6.mp4...
Video duration: 11.90 seconds


 61%|██████    | 59/97 [02:30<01:19,  2.09s/it]

Analysis complete - Anomaly detected: No

Processing Nonshoplifting-12.mp4...
Video duration: 12.00 seconds


 62%|██████▏   | 60/97 [02:31<01:14,  2.00s/it]

Analysis complete - Anomaly detected: No

Processing Nonshoplifting-20.mp4...
Video duration: 18.62 seconds


 63%|██████▎   | 61/97 [02:34<01:16,  2.13s/it]

Analysis complete - Anomaly detected: No

Processing Nonshoplifting-30.mp4...
Video duration: 15.72 seconds


 64%|██████▍   | 62/97 [02:36<01:14,  2.12s/it]

Analysis complete - Anomaly detected: No

Processing Nonshoplifting-14.mp4...
Video duration: 10.76 seconds


 65%|██████▍   | 63/97 [02:38<01:08,  2.03s/it]

Analysis complete - Anomaly detected: No

Processing Shoplifting025_x264.mp4...
Video duration: 60.94 seconds
Video duration: 1824.00 seconds, Total frames: 1824
Sampling every 60 frames to extract 30 frames.
Frame sampling complete


 66%|██████▌   | 64/97 [02:40<01:04,  1.96s/it]

Analysis complete - Anomaly detected: Yes

Processing Shoplifting040_x264.mp4...
Video duration: 907.05 seconds
Video duration: 27208.00 seconds, Total frames: 27208
Sampling every 906 frames to extract 30 frames.
Frame sampling complete


 67%|██████▋   | 65/97 [02:46<01:42,  3.22s/it]

Analysis complete - Anomaly detected: Yes

Processing Shoplifting043_x264.mp4...
Video duration: 317.30 seconds
Video duration: 9518.00 seconds, Total frames: 9518
Sampling every 317 frames to extract 30 frames.
Frame sampling complete


 68%|██████▊   | 66/97 [02:49<01:39,  3.20s/it]

Analysis complete - Anomaly detected: Yes

Processing Nonshoplifting-42.mp4...
Video duration: 12.44 seconds


 69%|██████▉   | 67/97 [02:51<01:24,  2.80s/it]

Analysis complete - Anomaly detected: No

Processing Shoplifting006_x264.mp4...
Video duration: 105.20 seconds
Video duration: 3156.00 seconds, Total frames: 3156
Sampling every 105 frames to extract 30 frames.
Frame sampling complete


 70%|███████   | 68/97 [02:53<01:18,  2.69s/it]

Analysis complete - Anomaly detected: Yes

Processing Shoplifting020_x264.mp4...
Video duration: 192.37 seconds
Video duration: 5770.00 seconds, Total frames: 5770
Sampling every 192 frames to extract 30 frames.
Frame sampling complete


 71%|███████   | 69/97 [02:56<01:14,  2.66s/it]

Analysis complete - Anomaly detected: Yes

Processing Shoplifting027_x264.mp4...
Video duration: 62.47 seconds
Video duration: 1873.00 seconds, Total frames: 1873
Sampling every 62 frames to extract 30 frames.
Frame sampling complete


 72%|███████▏  | 70/97 [02:58<01:07,  2.49s/it]

Analysis complete - Anomaly detected: Yes

Processing Nonshoplifting-11.mp4...
Video duration: 18.05 seconds


 73%|███████▎  | 71/97 [03:00<01:04,  2.50s/it]

Analysis complete - Anomaly detected: No

Processing Shoplifting014_x264.mp4...
Video duration: 2223.27 seconds
Video duration: 66698.00 seconds, Total frames: 66698
Sampling every 2223 frames to extract 30 frames.
Frame sampling complete


 74%|███████▍  | 72/97 [03:11<01:59,  4.80s/it]

Analysis complete - Anomaly detected: Yes

Processing Shoplifting005_x264.mp4...
Video duration: 65.57 seconds
Video duration: 1967.00 seconds, Total frames: 1967
Sampling every 65 frames to extract 30 frames.
Frame sampling complete


 75%|███████▌  | 73/97 [03:13<01:36,  4.01s/it]

Analysis complete - Anomaly detected: Yes

Processing Shoplifting001_x264.mp4...
Video duration: 144.88 seconds
Video duration: 4344.00 seconds, Total frames: 4344
Sampling every 144 frames to extract 30 frames.
Frame sampling complete


 76%|███████▋  | 74/97 [03:15<01:22,  3.57s/it]

Analysis complete - Anomaly detected: Yes

Processing Nonshoplifting-16.mp4...
Video duration: 7.92 seconds


 77%|███████▋  | 75/97 [03:17<01:05,  2.99s/it]

Analysis complete - Anomaly detected: No

Processing Shoplifting051_x264.mp4...
Video duration: 240.64 seconds
Video duration: 7218.00 seconds, Total frames: 7218
Sampling every 240 frames to extract 30 frames.
Frame sampling complete


 78%|███████▊  | 76/97 [03:20<01:02,  2.99s/it]

Analysis complete - Anomaly detected: Yes

Processing Nonshoplifting-34.mp4...
Video duration: 25.58 seconds


 79%|███████▉  | 77/97 [03:23<01:01,  3.05s/it]

Analysis complete - Anomaly detected: No

Processing Nonshoplifting-33.mp4...
Video duration: 15.31 seconds


 80%|████████  | 78/97 [03:25<00:50,  2.65s/it]

Analysis complete - Anomaly detected: No

Processing video-2.mp4...
Video duration: 21.90 seconds


 81%|████████▏ | 79/97 [03:30<01:02,  3.47s/it]

Analysis complete - Anomaly detected: Yes

Processing Shoplifting033_x264.mp4...
Video duration: 30.01 seconds
Video duration: 899.00 seconds, Total frames: 899
Sampling every 29 frames to extract 30 frames.
Frame sampling complete


 82%|████████▏ | 80/97 [03:32<00:50,  2.97s/it]

Analysis complete - Anomaly detected: Yes

Processing video-3.mp4...
Video duration: 10.02 seconds


 84%|████████▎ | 81/97 [03:37<00:55,  3.49s/it]

Analysis complete - Anomaly detected: No

Processing Nonshoplifting-36.mp4...
Video duration: 10.04 seconds


 85%|████████▍ | 82/97 [03:39<00:45,  3.04s/it]

Analysis complete - Anomaly detected: No

Processing Nonshoplifting-6.mp4...
Video duration: 20.08 seconds


 86%|████████▌ | 83/97 [03:41<00:40,  2.87s/it]

Analysis complete - Anomaly detected: No

Processing Nonshoplifting-26.mp4...
Video duration: 18.60 seconds


 87%|████████▋ | 84/97 [03:44<00:35,  2.75s/it]

Analysis complete - Anomaly detected: No

Processing Shoplifting045_x264.mp4...
Video duration: 54.73 seconds
Video duration: 1640.00 seconds, Total frames: 1640
Sampling every 54 frames to extract 30 frames.
Frame sampling complete


 88%|████████▊ | 85/97 [03:46<00:30,  2.58s/it]

Analysis complete - Anomaly detected: Yes

Processing Shoplifting048_x264.mp4...
Video duration: 103.69 seconds
Video duration: 3108.00 seconds, Total frames: 3108
Sampling every 103 frames to extract 30 frames.
Frame sampling complete


 89%|████████▊ | 86/97 [03:48<00:27,  2.53s/it]

Analysis complete - Anomaly detected: Yes

Processing Shoplifting029_x264.mp4...
Video duration: 72.56 seconds
Video duration: 2176.00 seconds, Total frames: 2176
Sampling every 72 frames to extract 30 frames.
Frame sampling complete


 90%|████████▉ | 87/97 [03:51<00:24,  2.48s/it]

Analysis complete - Anomaly detected: Yes

Processing Shoplifting039_x264.mp4...
Video duration: 93.46 seconds
Video duration: 2803.00 seconds, Total frames: 2803
Sampling every 93 frames to extract 30 frames.
Frame sampling complete


 91%|█████████ | 88/97 [03:53<00:23,  2.58s/it]

Analysis complete - Anomaly detected: Yes

Processing Shoplifting018_x264.mp4...
Video duration: 71.16 seconds
Video duration: 2132.00 seconds, Total frames: 2132
Sampling every 71 frames to extract 30 frames.
Frame sampling complete


 92%|█████████▏| 89/97 [03:57<00:22,  2.82s/it]

Analysis complete - Anomaly detected: Yes

Processing Shoplifting015_x264.mp4...
Video duration: 75.20 seconds
Video duration: 2256.00 seconds, Total frames: 2256
Sampling every 75 frames to extract 30 frames.
Frame sampling complete


 93%|█████████▎| 90/97 [03:59<00:18,  2.63s/it]

Analysis complete - Anomaly detected: Yes

Processing Nonshoplifting-7.mp4...
Video duration: 13.68 seconds


 94%|█████████▍| 91/97 [04:01<00:14,  2.41s/it]

Analysis complete - Anomaly detected: No

Processing Shoplifting003_x264.mp4...
Video duration: 360.62 seconds
Video duration: 10817.00 seconds, Total frames: 10817
Sampling every 360 frames to extract 30 frames.
Frame sampling complete


 95%|█████████▍| 92/97 [04:05<00:14,  2.83s/it]

Analysis complete - Anomaly detected: Yes

Processing Nonshoplifting-38.mp4...
Video duration: 16.08 seconds


 96%|█████████▌| 93/97 [04:07<00:10,  2.74s/it]

Analysis complete - Anomaly detected: No

Processing Nonshoplifting-35.mp4...
Video duration: 5.24 seconds


 97%|█████████▋| 94/97 [04:09<00:07,  2.34s/it]

Analysis complete - Anomaly detected: No

Processing Nonshoplifting-44.mp4...
Video duration: 16.37 seconds


 98%|█████████▊| 95/97 [04:11<00:04,  2.27s/it]

Analysis complete - Anomaly detected: No

Processing video-4.mp4...
Video duration: 19.98 seconds


 99%|█████████▉| 96/97 [04:13<00:02,  2.30s/it]

Analysis complete - Anomaly detected: Yes

Processing Nonshoplifting-10.mp4...
Video duration: 14.28 seconds


100%|██████████| 97/97 [04:15<00:00,  2.63s/it]

Analysis complete - Anomaly detected: No

Cleaned up temporary directory: /tmp/tmprc7w0uj3

Results saved to qwen_results.csv

Analysis Summary:
Total videos processed: 97
Videos trimmed: 41
Anomalies detected: 46
Processing errors: 0





In [None]:
# python3 evaluate.py --csv qwen_result.csv

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Error: Could not find file - [Errno 2] No such file or directory: 'qwen_result.csv'


This calculates the prediction accuracy, precision and recall etc.