# Assignment 1 Coding

Abhinav Kumar
9/21/2025

Custom Torchvision dataset: 25 points
Transformations: 25 points
Visualization: 25 points


In [2]:
from pathlib import Path
import os, torch, torchvision, av
import torchvision.io as io
import torchvision.transforms.v2 as v2
from torchvision.utils import make_grid, save_image

# Workspace layout. ROOT defaults to the dev-container checkout location but can
# be overridden with the ENG_AI_AGENTS_ROOT environment variable, so the notebook
# is not tied to a single absolute path.
ROOT   = Path(os.environ.get("ENG_AI_AGENTS_ROOT", "/workspaces/eng-ai-agents")).resolve()
WS     = (ROOT / "assignments/assignment-1").resolve()

# Everything the notebook reads or writes lives under DATA_DIR.
DATA_DIR    = (WS / "data").resolve()
FRAMES_DIR  = (DATA_DIR / "frames").resolve()    # frames extracted from the video
PREVIEW_DIR = (DATA_DIR / "preview").resolve()   # preview grid image

# Input files; their presence is only reported below, not enforced here.
VIDEO_PATH = DATA_DIR / "video.mp4"
ASTRONAUT_PATH = DATA_DIR / "astronaut.jpg"

# Create the directories up front so later cells can write without checking.
for d in (DATA_DIR, FRAMES_DIR, PREVIEW_DIR):
    d.mkdir(parents=True, exist_ok=True)

print("WS         :", WS)
print("DATA_DIR   :", DATA_DIR)
print("VIDEO_PATH :", VIDEO_PATH, "exists:", VIDEO_PATH.exists())
print("ASTRONAUT  :", ASTRONAUT_PATH, "exists:", ASTRONAUT_PATH.exists())

# Record library versions next to the results for reproducibility.
print("torch:", torch.__version__)
print("torchvision:", torchvision.__version__)

WS         : /workspaces/eng-ai-agents/assignments/assignment-1
DATA_DIR   : /workspaces/eng-ai-agents/assignments/assignment-1/data
VIDEO_PATH : /workspaces/eng-ai-agents/assignments/assignment-1/data/video.mp4 exists: True
ASTRONAUT  : /workspaces/eng-ai-agents/assignments/assignment-1/data/astronaut.jpg exists: True
torch: 2.8.0+cpu
torchvision: 0.23.0+cpu


In [25]:
# Upper bound on the number of frames written to FRAMES_DIR.
MAX_FRAMES = 1000

# Pass 1: count the frames by decoding the whole video stream once.
with av.open(str(VIDEO_PATH)) as container:
    total = 0
    for _ in container.decode(video=0):
        total += 1

if total == 0:
    raise RuntimeError("No frames decoded from the video. Check VIDEO_PATH or install ffmpeg/PyAV correctly.")

# Choose `target` frame indices spread evenly over [0, total - 1].
target = min(MAX_FRAMES, total)
idxs = torch.linspace(0, total - 1, steps=target).round().to(torch.int64)
keep = set(idxs.tolist())

# Remove frames written by an earlier run. Without this, a rerun that produces
# fewer frames (shorter video, smaller MAX_FRAMES) would leave stale
# higher-numbered files behind, and the later cells that list FRAMES_DIR would
# silently include them.
for stale in FRAMES_DIR.glob("frame_*.png"):
    stale.unlink()

# Pass 2: decode again and write only the selected frames as PNG files.
saved = 0
with av.open(str(VIDEO_PATH)) as container:
    for i, frame in enumerate(container.decode(video=0)):
        if i in keep:
            arr = frame.to_ndarray(format="rgb24")          # H,W,C array
            img = torch.from_numpy(arr).permute(2, 0, 1)    # -> C,H,W
            save_image(img.float() / 255.0, str(FRAMES_DIR / f"frame_{saved:05d}.png"))
            saved += 1
            if saved >= target:
                break  # nothing left to save; skip decoding the rest

print(f"Saved {saved} frames → {FRAMES_DIR} (backend: PyAV)")

Saved 1000 frames → /workspaces/eng-ai-agents/assignments/assignment-1/data/frames (backend: PyAV)


In [26]:
# Per-channel mean and standard deviation over every image directly inside
# FRAMES_DIR, accumulated as running sums so only one image is in memory at a time.
files = sorted([os.path.join(FRAMES_DIR, f) for f in os.listdir(FRAMES_DIR)
                if f.lower().endswith((".png",".jpg",".jpeg",".bmp",".webp"))])

# Fail loudly instead of dividing by zero and producing NaN statistics.
if not files:
    raise RuntimeError(f"No image files found in {FRAMES_DIR}; run the frame-extraction cell first.")

sum_c   = torch.zeros(3, dtype=torch.float64)   # per-channel sum of pixel values
sumsq_c = torch.zeros(3, dtype=torch.float64)   # per-channel sum of squared pixel values
total_px = 0                                    # pixels per channel seen so far

for p in files:
    # Decode as RGB so every image has exactly the 3 channels the accumulators
    # expect (a 1-channel image would otherwise broadcast silently).
    x = io.read_image(p, mode=io.ImageReadMode.RGB).to(torch.float32) / 255.0  # C,H,W in [0,1]
    c, h, w = x.shape
    flat = x.reshape(c, -1)
    sum_c   += flat.sum(dim=1, dtype=torch.float64)
    sumsq_c += (flat**2).sum(dim=1, dtype=torch.float64)
    total_px += h * w

# Var[x] = E[x^2] - E[x]^2. Keep the subtraction entirely in float64; rounding
# the mean to float32 first would feed the rounding error into the difference.
mean64 = sum_c / total_px
var64  = sumsq_c / total_px - mean64**2

mean = mean64.to(torch.float32)
var  = var64.to(torch.float32)
# The clamp guards against a tiny negative variance from floating-point error.
std  = torch.sqrt(var64.clamp_min(1e-12)).to(torch.float32)

print("mean:", mean.tolist())
print("std :", std.tolist())

mean: [0.386422723531723, 0.35811859369277954, 0.3309529721736908]
std : [0.2662123143672943, 0.1989150494337082, 0.17553561925888062]


In [28]:
from torch.utils.data import Dataset

class FrameDataset(Dataset):
    """Map-style dataset over the image files found under a directory tree.

    Args:
        root: Directory that is walked recursively for image files.
        transform: Optional callable applied to each decoded image tensor.

    Each item is a ``(image, path)`` tuple, where ``path`` is the file the
    image was read from. Paths are sorted so indexing is deterministic.
    """

    def __init__(self, root, transform=None):
        self.root = root
        self.transform = transform
        # Extensions are compared case-insensitively.
        # NOTE(review): torchvision's decoder may not support every extension
        # listed here (e.g. .bmp) -- confirm against the installed version.
        exts = {".png",".jpg",".jpeg",".bmp",".webp"}
        self.paths = sorted(
            os.path.join(dp, fn)
            for dp, _, fns in os.walk(root)
            for fn in fns
            if os.path.splitext(fn.lower())[1] in exts
        )

    def __len__(self):
        """Return the number of image files discovered under ``root``."""
        return len(self.paths)

    def __getitem__(self, idx):
        """Decode the image at position ``idx`` and return ``(image, path)``."""
        path = self.paths[idx]
        # Always decode to 3-channel RGB: grayscale or alpha-channel files would
        # otherwise yield 1/2/4-channel tensors and break 3-channel transforms
        # such as per-channel normalisation.
        img = io.read_image(path, mode=io.ImageReadMode.RGB)
        if self.transform is not None:
            img = self.transform(img)
        return img, path

# Preprocessing pipeline: resize to 224x224, convert to float32 scaled to
# [0, 1], then standardise each channel with the statistics computed from the
# extracted frames.
transform = v2.Compose([
    v2.Resize((224, 224)),
    v2.ToDtype(torch.float32, scale=True),
    # Normalize is documented to take sequences of floats, so pass plain lists
    # rather than the tensors themselves.
    v2.Normalize(mean=mean.tolist(), std=std.tolist()),
])

dataset = FrameDataset(FRAMES_DIR, transform=transform)
print("dataset size:", len(dataset))

# Give a clear error instead of an IndexError from dataset[0] when no frames exist.
if len(dataset) == 0:
    raise RuntimeError(f"No frames found under {FRAMES_DIR}; run the frame-extraction cell first.")

# Smoke test: fetch one sample and report its shape and source file.
x0, p0 = dataset[0]
print("sample tensor:", x0.shape, "from:", p0)

dataset size: 1000
sample tensor: torch.Size([3, 224, 224]) from: /workspaces/eng-ai-agents/assignments/assignment-1/data/frames/frame_00000.png


In [29]:
def denorm(x, m, s):
    """Undo per-channel normalisation and clamp the result to [0, 1].

    Args:
        x: Normalised tensor whose channel axis lines up with ``m`` and ``s``
           once they are expanded with two trailing singleton dimensions.
        m: Tensor of per-channel means used for normalisation.
        s: Tensor of per-channel standard deviations used for normalisation.

    Returns:
        ``x * s + m`` (broadcast per channel), limited to the range [0, 1].
    """
    # Insert two singleton axes after the first so the statistics broadcast
    # over the spatial dimensions.
    scale = s.unsqueeze(1).unsqueeze(2)
    shift = m.unsqueeze(1).unsqueeze(2)
    restored = x * scale + shift
    return torch.clamp(restored, 0, 1)

# Preview: de-normalise the first (up to) 16 samples and tile them into a
# 4-column grid image saved under PREVIEW_DIR.
n = min(16, len(dataset))
imgs = [denorm(dataset[i][0], mean, std) for i in range(n)]

grid_path = os.path.join(PREVIEW_DIR, "grid.png")
grid = make_grid(torch.stack(imgs), nrow=4)
save_image(grid, grid_path)
grid_path

'/workspaces/eng-ai-agents/assignments/assignment-1/data/preview/grid.png'

In [30]:
# Sanity check: run the same preprocessing pipeline on a single still image.
# The path is passed as str, like every other file path in this notebook, and
# the image is decoded as RGB so it matches the 3-channel normalisation.
img_u8 = io.read_image(str(ASTRONAUT_PATH), mode=io.ImageReadMode.RGB)
img_tf = transform(img_u8)
print("astronaut raw:", img_u8.shape, img_u8.dtype)
# Values outside [0, 1] are expected here: the image is standardised with the
# video's channel statistics, not its own.
print("astronaut transformed:", img_tf.shape, img_tf.dtype,
      "min/max:", float(img_tf.min()), float(img_tf.max()))

astronaut raw: torch.Size([3, 512, 512]) torch.uint8
astronaut transformed: torch.Size([3, 224, 224]) torch.float32 min/max: -1.8853892087936401 3.811460018157959


In [3]:
import fiftyone as fo
from fiftyone import types

# FiftyOne configuration flags set before the dataset is created.
# NOTE(review): presumably these select notebook rendering and skip the
# database version check -- confirm against the installed FiftyOne version.
fo.config.notebook = True
fo.config.database_validation = False

# Best-effort cleanup of an App session left over from a previous run. Only
# ordinary errors are tolerated; KeyboardInterrupt and SystemExit still propagate.
try:
    fo.close_app()
except Exception as exc:
    print(f"fo.close_app() skipped: {exc!r}")

# Recreate the dataset from scratch so rerunning this cell does not fail on a
# name that already exists.
name = "assignment1_frames"
if name in fo.list_datasets():
    fo.delete_dataset(name)

# Import every image in FRAMES_DIR as an unlabeled sample.
ds = fo.Dataset.from_dir(
    dataset_dir=str(FRAMES_DIR),
    dataset_type=types.ImageDirectory,
    name=name,
)

# Launch the App on the dataset; the trailing expression displays the session.
session = fo.launch_app(ds)
session

 100% |███████████████| 1000/1000 [189.1ms elapsed, 0s remaining, 5.3K samples/s]     


Dataset:          assignment1_frames
Media type:       image
Num samples:      1000
Selected samples: 0
Selected labels:  0
Session URL:      http://0.0.0.0:5151/