# Video Detection Demo with PytorchWildlife

This tutorial guides you on how to use PyTorchWildlife for video detection and classification. We will go through the process of setting up the environment, defining the detection and classification models, as well as performing inference and saving the results in an annotated video.

## Prerequisites
Install PytorchWildlife running the following commands:
```bash
conda create -n pytorch_wildlife python=3.8 -y
conda activate pytorch_wildlife
pip install PytorchWildlife
```
Also, make sure you have a CUDA-capable GPU if you intend to run the model on a GPU. This notebook can also run on CPU.

## Importing libraries
First, let's import the necessary libraries and modules.

In [17]:
from PIL import Image
import numpy as np
import supervision as sv
import torch
from PytorchWildlife.models import detection as pw_detection
from PytorchWildlife.models import classification as pw_classification
from PytorchWildlife.data import transforms as pw_trans
from PytorchWildlife import utils as pw_utils

In [18]:
import os
import requests
from bs4 import BeautifulSoup

url_prefix = "https://clarksonmsda.org/datafiles/video_detection/"
local_folder = "demo_data/videos_raw/"

# Create a folder to store downloaded files if it doesn't exist
os.makedirs(local_folder, exist_ok=True)
os.makedirs('demo_data/videos_processed/', exist_ok=True)

# Fetch the list of files from the server
response = requests.get(url_prefix)
links = []
if response.status_code == 200:
    soup = BeautifulSoup(response.text, 'html.parser')
    for table in soup.find_all('table'):
        for td in table.find_all("a", href = True):
            #print(td.text)
            if td.text.endswith('.mp4'):
                print(td.text)
                links.append(td.text)

    print(links)
    for link in links:
        file_url = url_prefix + link
        file_name = link.rstrip('/')
        local_path = os.path.join(local_folder, file_name)
        print(file_url)
        print(local_path)
        # Download the file
        response = requests.get(file_url)
        if response.status_code == 200:
            with open(local_path, 'wb') as file:
                file.write(response.content)
            print(f"Downloaded: {file_name} to {local_path}")
        else:
            print(f"Failed to download: {file_name}. Status code: {response.status_code}")

    print("Download process completed.")
else:
    print(f"Failed to fetch file list. Status code: {response.status_code}")

c17_23_09_01_00_00.mp4
c17_23_09_02_00_00.mp4
c17_23_10_19_00_00.mp4
c17_23_10_20_00_00.mp4
c17_23_10_21_00_00.mp4
['c17_23_09_01_00_00.mp4', 'c17_23_09_02_00_00.mp4', 'c17_23_10_19_00_00.mp4', 'c17_23_10_20_00_00.mp4', 'c17_23_10_21_00_00.mp4']
https://clarksonmsda.org/datafiles/video_detection/c17_23_09_01_00_00.mp4
demo_data/videos_raw/c17_23_09_01_00_00.mp4
Downloaded: c17_23_09_01_00_00.mp4 to demo_data/videos_raw/c17_23_09_01_00_00.mp4
https://clarksonmsda.org/datafiles/video_detection/c17_23_09_02_00_00.mp4
demo_data/videos_raw/c17_23_09_02_00_00.mp4
Downloaded: c17_23_09_02_00_00.mp4 to demo_data/videos_raw/c17_23_09_02_00_00.mp4
https://clarksonmsda.org/datafiles/video_detection/c17_23_10_19_00_00.mp4
demo_data/videos_raw/c17_23_10_19_00_00.mp4
Downloaded: c17_23_10_19_00_00.mp4 to demo_data/videos_raw/c17_23_10_19_00_00.mp4
https://clarksonmsda.org/datafiles/video_detection/c17_23_10_20_00_00.mp4
demo_data/videos_raw/c17_23_10_20_00_00.mp4
Downloaded: c17_23_10_20_00_00.mp4 t

In [19]:
import os
for vid in os.listdir(local_folder):
    if vid.endswith('.mp4') or vid.endswith('.MP4'):
        SOURCE_VIDEO_PATH = local_folder + vid
        TARGET_VIDEO_PATH = 'demo_data/videos_processed' + vid.split('.')[0]+'_processeed.MP4'
        print(SOURCE_VIDEO_PATH) 
        print(TARGET_VIDEO_PATH)
        
        
    

demo_data/videos_raw/c17_23_09_02_00_00.mp4
demo_data/videos_processedc17_23_09_02_00_00_processeed.MP4
demo_data/videos_raw/c17_23_10_21_00_00.mp4
demo_data/videos_processedc17_23_10_21_00_00_processeed.MP4
demo_data/videos_raw/c17_23_10_19_00_00.mp4
demo_data/videos_processedc17_23_10_19_00_00_processeed.MP4
demo_data/videos_raw/c17_23_10_20_00_00.mp4
demo_data/videos_processedc17_23_10_20_00_00_processeed.MP4
demo_data/videos_raw/c17_23_09_01_00_00.mp4
demo_data/videos_processedc17_23_09_01_00_00_processeed.MP4


## Setting GPU
If you are using a GPU for this exercise, please specify which GPU to use for the computations. By default, GPU number 0 is used. Adjust this as per your setup. You don't need to run this cell if you are using a CPU.

In [20]:
torch.cuda.set_device(0) # Use only if you are running on GPU

In [21]:
DEVICE = "cuda" # Use "cuda" if you are running on GPU. Use "cpu" if you are running on CPU

def callback(frame: np.ndarray, index: int) -> np.ndarray:
    results_det = detection_model.single_image_detection(trans_det(frame), frame.shape, index)
    labels = []
    for xyxy in results_det["detections"].xyxy:
        cropped_image = sv.crop_image(image=frame, xyxy=xyxy)
        results_clf = classification_model.single_image_classification(trans_clf(Image.fromarray(cropped_image)))
        labels.append("{} {:.2f}".format(results_clf["prediction"], results_clf["confidence"]))
    annotated_frame = box_annotator.annotate(scene=frame, detections=results_det["detections"], labels=labels)
    return annotated_frame 

import os
for vid in os.listdir(local_folder):
    if vid.endswith('.mp4') or vid.endswith('.MP4'):
        SOURCE_VIDEO_PATH = local_folder + vid
        TARGET_VIDEO_PATH = 'demo_data/videos_processed/' + vid.split('.')[0]+'_processeed.MP4'
        print(SOURCE_VIDEO_PATH) 
        print(TARGET_VIDEO_PATH)

        detection_model = pw_detection.MegaDetectorV5(device=DEVICE, pretrained=True)
        classification_model = pw_classification.AI4GAmazonRainforest(device=DEVICE, pretrained=True)
        trans_det = pw_trans.MegaDetector_v5_Transform(target_size=detection_model.IMAGE_SIZE,
                                               stride=detection_model.STRIDE)
        trans_clf = pw_trans.Classification_Inference_Transform(target_size=224)
        box_annotator = sv.BoxAnnotator(thickness=4, text_thickness=4, text_scale=2)

        pw_utils.process_video(source_path=SOURCE_VIDEO_PATH, target_path=TARGET_VIDEO_PATH, callback=callback, target_fps=5)

demo_data/videos_raw/c17_23_09_02_00_00.mp4
demo_data/videos_processed/c17_23_09_02_00_00_processeed.MP4


Fusing layers... 
Fusing layers... 
Model summary: 733 layers, 140054656 parameters, 0 gradients, 208.8 GFLOPs
Model summary: 733 layers, 140054656 parameters, 0 gradients, 208.8 GFLOPs
100%|█████████████████████████████████████████| 642/642 [03:03<00:00,  3.49it/s]


demo_data/videos_raw/c17_23_10_21_00_00.mp4
demo_data/videos_processed/c17_23_10_21_00_00_processeed.MP4


Fusing layers... 
Fusing layers... 
Model summary: 733 layers, 140054656 parameters, 0 gradients, 208.8 GFLOPs
Model summary: 733 layers, 140054656 parameters, 0 gradients, 208.8 GFLOPs
100%|█████████████████████████████████████████| 131/131 [00:37<00:00,  3.48it/s]


demo_data/videos_raw/c17_23_10_19_00_00.mp4
demo_data/videos_processed/c17_23_10_19_00_00_processeed.MP4


Fusing layers... 
Fusing layers... 
Model summary: 733 layers, 140054656 parameters, 0 gradients, 208.8 GFLOPs
Model summary: 733 layers, 140054656 parameters, 0 gradients, 208.8 GFLOPs
100%|█████████████████████████████████████████| 595/595 [02:49<00:00,  3.50it/s]


demo_data/videos_raw/c17_23_10_20_00_00.mp4
demo_data/videos_processed/c17_23_10_20_00_00_processeed.MP4


Fusing layers... 
Fusing layers... 
Model summary: 733 layers, 140054656 parameters, 0 gradients, 208.8 GFLOPs
Model summary: 733 layers, 140054656 parameters, 0 gradients, 208.8 GFLOPs
260it [01:13,  3.53it/s]                                                        


demo_data/videos_raw/c17_23_09_01_00_00.mp4
demo_data/videos_processed/c17_23_09_01_00_00_processeed.MP4


Fusing layers... 
Fusing layers... 
Model summary: 733 layers, 140054656 parameters, 0 gradients, 208.8 GFLOPs
Model summary: 733 layers, 140054656 parameters, 0 gradients, 208.8 GFLOPs
643it [03:06,  3.45it/s]                                                        


## Model Initialization
We'll  define the device to run the models and then we will initialize the models for both video detection and classification.

In [22]:
DEVICE = "cuda" # Use "cuda" if you are running on GPU. Use "cpu" if you are running on CPU
 
SOURCE_VIDEO_PATH = "./demo_data/videos/opossum_example.MP4"
TARGET_VIDEO_PATH = "./demo_data/videos/opossum_example_processed.MP4"
detection_model = pw_detection.MegaDetectorV5(device=DEVICE, pretrained=True)
classification_model = pw_classification.AI4GOpossum(device=DEVICE, pretrained=True)

Fusing layers... 
Fusing layers... 
Model summary: 733 layers, 140054656 parameters, 0 gradients, 208.8 GFLOPs
Model summary: 733 layers, 140054656 parameters, 0 gradients, 208.8 GFLOPs


## Transformations
Define transformations for both detection and classification. These transformations preprocess the video frames for the models.

In [23]:
trans_det = pw_trans.MegaDetector_v5_Transform(target_size=detection_model.IMAGE_SIZE,
                                               stride=detection_model.STRIDE)
trans_clf = pw_trans.Classification_Inference_Transform(target_size=224)

## Video Processing
For each frame in the video, we'll apply detection and classification, and then annotate the frame with the results. The processed video will be saved with annotated detections and classifications.

In [24]:
box_annotator = sv.BoxAnnotator(thickness=4, text_thickness=4, text_scale=2)

def callback(frame: np.ndarray, index: int) -> np.ndarray:
    results_det = detection_model.single_image_detection(trans_det(frame), frame.shape, index)
    labels = []
    for xyxy in results_det["detections"].xyxy:
        cropped_image = sv.crop_image(image=frame, xyxy=xyxy)
        results_clf = classification_model.single_image_classification(trans_clf(Image.fromarray(cropped_image)))
        labels.append("{} {:.2f}".format(results_clf["prediction"], results_clf["confidence"]))
    annotated_frame = box_annotator.annotate(scene=frame, detections=results_det["detections"], labels=labels)
    return annotated_frame 

pw_utils.process_video(source_path=SOURCE_VIDEO_PATH, target_path=TARGET_VIDEO_PATH, callback=callback, target_fps=5)

100%|█████████████████████████████████████████| 100/100 [00:27<00:00,  3.63it/s]


### Copyright (c) Microsoft Corporation. All rights reserved.
### Licensed under the MIT License.