In [2]:
import os
import torch 
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import argparse
from sam2.build_sam import build_sam2_video_predictor

# Prefer the GPU when one is visible to PyTorch; otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

if use_cuda:
    # For efficiency: enter a bfloat16 autocast context that is never exited,
    # so it stays active for every later cell run in this kernel.
    torch.autocast("cuda", dtype=torch.bfloat16).__enter__()
else:
    print("Using CPU, not recommended.")

# Change this to the directory where you cloned this repo.
# NOTE(review): machine-specific absolute path; every other path below derives from it.
BASE_PATH = r"C:\Users\syeda\OneDrive\Desktop\4th Year\COSC419\sam2-tests"

# Model checkpoint, model config, and the folder of frames to run on.
CHKPT_PATH = os.path.join(
    BASE_PATH, "checkpoints", "sam2.1_hiera_large.pt"
)
CONFIG_PATH = os.path.join(
    BASE_PATH, "sam2", "configs", "sam2.1", "sam2.1_hiera_l.yaml"
)
FRAMES_PATH = os.path.join(BASE_PATH, "sample_frames2")

In [None]:
# Release cached-but-unused blocks held by PyTorch's CUDA allocator so the
# baseline below is not inflated by the cache. Memory still occupied by live
# tensors is not freed by this call.
torch.cuda.empty_cache()

# Baseline readings (in bytes) taken before the predictor is built.
model_allocated = torch.cuda.memory_allocated(device=device)
model_reserved = torch.cuda.memory_reserved(device=device)

# The divisor is 1024**3 (GiB) even though the label printed below says "GB".
bytes_per_gib = 1024 ** 3

print("Memory before loading model:")
print(f"  Allocated: {model_allocated / bytes_per_gib:.2f} GB")
print(f"  Reserved: {model_reserved / bytes_per_gib:.2f} GB")

Memory before loading model:
  Allocated: 1.87 GB
  Reserved: 2.15 GB


In [None]:
# Build the SAM 2 video predictor from the config and checkpoint chosen in the
# setup cell, placing it on `device`.
predictor = build_sam2_video_predictor(
    CONFIG_PATH,
    CHKPT_PATH,
    device=device,
)

# Same readings as the "before" cell, taken again now that the model exists.
model_allocated = torch.cuda.memory_allocated(device=device)
model_reserved = torch.cuda.memory_reserved(device=device)

# The divisor is 1024**3 (GiB) even though the label printed below says "GB".
bytes_per_gib = 1024 ** 3

print("Memory after loading model:")
print(f"  Allocated: {model_allocated / bytes_per_gib:.2f} GB")
print(f"  Reserved: {model_reserved / bytes_per_gib:.2f} GB")

Memory after loading model:
  Allocated: 1.87 GB
  Reserved: 2.92 GB


In [11]:
# Create the per-video inference state for the frames in FRAMES_PATH.
# NOTE(review): the two flags presumably keep decoded frames in host memory and
# load them in the background; the progress bar interleaving with the print
# output is consistent with that. Confirm against the SAM 2 documentation.
inference_state = predictor.init_state(
    video_path=FRAMES_PATH,
    offload_video_to_cpu=True,
    async_loading_frames=True,
)

# Reset the freshly created state before use.
predictor.reset_state(inference_state)

# Show the frame size the state recorded, as (height, width).
frame_height = inference_state["video_height"]
frame_width = inference_state["video_width"]
print(frame_height, frame_width)

frame loading (JPEG):   4%|▎         | 13/360 [00:00<00:26, 12.94it/s]

133 53


frame loading (JPEG): 100%|██████████| 360/360 [00:11<00:00, 30.94it/s]


In [12]:
# Print the (height, width) stored in the inference state again; this repeats
# the last statement of the previous cell.
print(inference_state["video_height"], inference_state["video_width"])

133 53


In [15]:
def compute_image_dimensions(img_dir=None):
    """Print width/height statistics for the JPEG frames in a directory.

    Files whose extension is ``.jpg`` or ``.jpeg`` (case-insensitive) are
    opened with PIL and their sizes collected. Frames are visited in a
    deterministic order: names with an integer stem (e.g. ``00012.jpg``) come
    first, sorted numerically, followed by any other names sorted
    alphabetically. The "First width/height" lines therefore refer to the
    lowest-numbered frame rather than to whatever ``os.listdir`` yields first.
    NOTE(review): numeric-stem ordering is assumed to match how SAM 2 orders
    frames; confirm against its frame loader.

    Args:
        img_dir: Directory containing the frames. When omitted, the
            module-level ``FRAMES_PATH`` is read at call time, so changing it
            in the config cell takes effect without redefining this function.

    Returns:
        None. All results are reported via ``print``.

    Raises:
        OSError: If ``img_dir`` cannot be listed (propagated from
            ``os.listdir``). Failures on individual images are reported and
            skipped instead of raised.
    """
    if img_dir is None:
        img_dir = FRAMES_PATH

    def _frame_order(filename):
        # Integer stems sort numerically ("2" before "10"); anything else
        # falls back to plain name order after all numbered frames.
        stem = os.path.splitext(filename)[0]
        try:
            return (0, int(stem), filename)
        except ValueError:
            return (1, 0, filename)

    frame_names = sorted(
        (name for name in os.listdir(img_dir)
         if name.lower().endswith((".jpg", ".jpeg"))),
        key=_frame_order,
    )

    widths = []
    heights = []
    for filename in frame_names:
        img_path = os.path.join(img_dir, filename)
        try:
            with Image.open(img_path) as img:
                w, h = img.size
        except Exception as e:
            # Best effort: one unreadable frame should not abort the survey.
            print(f"Error loading image {img_path}: {e}")
            continue
        widths.append(w)
        heights.append(h)

    # widths and heights are always appended together, so one check suffices.
    if not widths:
        print("No images processed successfully.")
        return

    avg_width = np.mean(widths)
    avg_height = np.mean(heights)
    min_width = np.min(widths)
    max_width = np.max(widths)
    min_height = np.min(heights)
    max_height = np.max(heights)

    print("First width:", widths[0])
    print("First height:", heights[0])
    print("Average width:", avg_width)
    print("Average height:", avg_height)
    print("Min width:", min_width)
    print("Min height:", min_height)
    print("Max width:", max_width)
    print("Max height:", max_height)

In [16]:
# Report size statistics for the frames in the function's default directory (FRAMES_PATH).
compute_image_dimensions()

First width: 53
First height: 133
Average width: 49.28888888888889
Average height: 90.30555555555556
Min width: 18
Min height: 63
Max width: 87
Max height: 133


We observe that the inference-state dictionary sets `video_height` and `video_width` from the dimensions of the *first* frame (133 × 53 here), i.e. the model assumes that every frame in the video has the same dimensions. Our frames vary in size (widths 18–87, heights 63–133), so we need to pad all frames to the maximum height and width (133 × 87) before feeding them to the model.

In [None]:
def find_largest_directory(base_dir):
    """Print which numerically named subdirectory of ``base_dir`` holds the most files.

    Only immediate subdirectories whose names consist solely of digits are
    considered, and only files directly inside them are counted (nested
    folders are not). Subdirectories are visited in sorted name order, so when
    several share the highest count the first one in that order is reported.
    A numeric subdirectory that contains no files is still reported, with a
    count of 0.

    Args:
        base_dir: Path of the directory whose subdirectories are inspected.

    Returns:
        None. The result is reported via ``print``.

    Raises:
        OSError: If ``base_dir`` or one of its subdirectories cannot be listed.
    """
    largest_dir = None
    # Start below any real count so that an empty numeric directory still
    # registers instead of being reported as "not found".
    max_files = -1

    for subdir in sorted(os.listdir(base_dir)):
        subdir_path = os.path.join(base_dir, subdir)

        # Skip anything that is not a directory with a purely numeric name.
        if not (subdir.isdigit() and os.path.isdir(subdir_path)):
            continue

        # scandir exposes the file/directory flag without building a full
        # list of paths just to count them.
        with os.scandir(subdir_path) as entries:
            num_files = sum(1 for entry in entries if entry.is_file())

        if num_files > max_files:
            max_files = num_files
            largest_dir = subdir

    if largest_dir is not None:
        print(f"Directory with the most files: {largest_dir} ({max_files} files)")
    else:
        print("No directories with numeric names found.")

# Path to the SoccerNet "test" images directory (the literal below ends in test\images, not train).
# NOTE(review): the name `dir` shadows Python's built-in dir(); it is left unchanged here
# because other cells may read it.
dir = "C:\\Users\\syeda\\OneDrive\\Desktop\\4th Year\\COSC419\\jersey-number-pipeline\\data\\SoccerNet\\test\\images"
find_largest_directory(dir)