# processing_videos.ipynb

Tutorial (Ultralytics SAM 3 documentation): https://docs.ultralytics.com/models/sam-3/

Associated code: https://drive.google.com/file/d/1mjRy6VmHke95zdkaCmR_RWvPP4Hnba7q/view

In [1]:
from ultralytics.models.sam import SAM3VideoSemanticPredictor
import glob
from icecream import ic
import cv2
import os


In [None]:
import gc
import torch

def delete_results_from_gpu_memory():
    """
    Drop the global ``results`` object and release cached GPU memory.

    Intended to be called after each image/video has been processed so that
    inference outputs do not accumulate and exhaust GPU memory.

    The function is safe to call at any time:

    * if ``results`` has not been created yet (or was already deleted) the
      deletion step is skipped instead of raising ``NameError``;
    * if CUDA is not available the cache-clearing step is skipped.
    """
    # Remove the module-level name if it exists. A bare `del results` raises
    # NameError when this runs before `results` has been assigned.
    globals().pop("results", None)
    gc.collect()  # reclaim objects that were only kept alive by reference cycles
    if torch.cuda.is_available():
        torch.cuda.empty_cache()  # Clears unoccupied cached memory

# Usage example: call once `results` is no longer needed.
# The guard keeps this cell runnable on a fresh kernel, where `results`
# has not been created yet and there is nothing to free.
if "results" in globals():
    delete_results_from_gpu_memory()


In [2]:
# Create a short video from the Efate roadside survey.
# Each selected still image becomes one frame of the output video. The work is
# skipped when the output file already exists, so re-running the cell is cheap.

output_file = 'roadside.mp4'

if not os.path.exists(output_file):

    # get paths to 100 images starting with '20251129_152106.jpg' (a good example)
    good_example = '20251129_152106.jpg'
    image_dir = '/home/aubrey/Desktop/Efate2025/original_images'
    image_paths = sorted(glob.glob(f'{image_dir}/*.jpg'))
    # Raises ValueError if the example image is not present in image_dir.
    good_example_index = image_paths.index(f'{image_dir}/{good_example}')
    ic(good_example_index)
    ic(len(image_paths))
    image_paths = image_paths[good_example_index: good_example_index+100]
    ic(len(image_paths))

    # Create a VideoWriter object to save the video
    frame_size = (1920, 1080)  # (width, height) expected by the writer
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # Specify the codec for the output video file
    video = cv2.VideoWriter(output_file, fourcc, 1.0, frame_size)

    try:
        if not video.isOpened():
            raise RuntimeError(f'Could not open {output_file} for writing')

        # Iterate over each image and write it to the video
        for image_path in image_paths:
            frame = cv2.imread(image_path)
            if frame is None:
                # cv2.imread returns None for missing or unreadable files;
                # skip the file instead of passing None to the writer.
                ic('skipping unreadable image', image_path)
                continue
            # frame.shape is (height, width, channels). The writer expects
            # frames of exactly frame_size, so resize any that differ.
            if (frame.shape[1], frame.shape[0]) != frame_size:
                frame = cv2.resize(frame, frame_size)
            video.write(frame)
    finally:
        # Release the video writer and close the video file, even on error.
        # No GUI windows are opened here, so cv2.destroyAllWindows() is not needed.
        video.release()

In [3]:
# Configure the SAM3 video predictor and launch text-prompted segmentation
overrides = {
    "conf": 0.25,        # confidence threshold
    "show_conf": False,  # confidence display switched off
    "task": "segment",   # segmentation task
    "mode": "predict",   # prediction mode
    "model": "sam3.pt",  # model weights file
    "half": True,        # FP16 for faster inference on GPU
    # "device": "cpu",
    "imgsz": 1932,       # 1920 bumped to the next multiple of 14 (stride requirement)
    "batch": 1,
}
predictor = SAM3VideoSemanticPredictor(overrides=overrides)
results = predictor(
    source=output_file,
    text=["coconut palm tree"],
    stream=True,
)
