# Assignment 1 Coding

Custom Torchvision dataset: 25 points
Transformations: 25 points
Visualization: 25 points


In [14]:
from pathlib import Path
import os, torch, torchvision
import torchvision.io as io
import torchvision.transforms.v2 as v2
from torchvision.utils import make_grid, save_image

WS = Path(os.environ.get("WORKSPACE_DIR", "/workspaces/eng-ai-agents/assignments/assignment-1"))

DATA_DIR    = (WS / "data").resolve()
FRAMES_DIR  = (DATA_DIR / "frames").resolve()
PREVIEW_DIR = (DATA_DIR / "preview").resolve()

VIDEO_PATH = DATA_DIR / "video.mp4"
ASTRONAUT_PATH = DATA_DIR / "astronaut.jpg"

for d in (DATA_DIR, FRAMES_DIR, PREVIEW_DIR):
    d.mkdir(parents=True, exist_ok=True)

print("WS         :", WS)
print("DATA_DIR   :", DATA_DIR)
print("VIDEO_PATH :", VIDEO_PATH, "exists:", VIDEO_PATH.exists())
print("ASTRONAUT  :", ASTRONAUT_PATH, "exists:", ASTRONAUT_PATH.exists())

print("torch:", torch.__version__)
print("torchvision:", torchvision.__version__)

WS         : /workspaces/eng-ai-agents
DATA_DIR   : /workspaces/eng-ai-agents/data
VIDEO_PATH : /workspaces/eng-ai-agents/data/video.mp4 exists: False
ASTRONAUT  : /workspaces/eng-ai-agents/data/astronaut.jpg exists: False
torch: 2.8.0+cpu
torchvision: 0.23.0+cpu


In [11]:
vframes, _, info = io.read_video(VIDEO_PATH, pts_unit="sec")
T = int(vframes.shape[0])

target = min(1000, T)
idxs = torch.linspace(0, T - 1, steps=target).round().to(torch.int64)

for i, t in enumerate(idxs.tolist()):
    frame = vframes[t].permute(2, 0, 1)        # to C,H,W
    save_image(frame.float() / 255.0, os.path.join(FRAMES_DIR, f"frame_{i:05d}.png"))

print(f"Saved {target} frames → {FRAMES_DIR}")



ImportError: PyAV is not installed, and is necessary for the video operations in torchvision.
See https://github.com/mikeboers/PyAV#installation for instructions on how to
install PyAV on your system.


In [None]:
files = sorted([os.path.join(FRAMES_DIR, f) for f in os.listdir(FRAMES_DIR)
                if f.lower().endswith((".png",".jpg",".jpeg",".bmp",".webp"))])

sum_c   = torch.zeros(3, dtype=torch.float64)
sumsq_c = torch.zeros(3, dtype=torch.float64)
total_px = 0

for p in files:
    x = io.read_image(p).to(torch.float32) / 255.0  # C,H,W in [0,1]
    c, h, w = x.shape
    flat = x.reshape(c, -1)
    sum_c   += flat.sum(dim=1, dtype=torch.float64)
    sumsq_c += (flat**2).sum(dim=1, dtype=torch.float64)
    total_px += h * w

mean = (sum_c / total_px).to(torch.float32)
var  = (sumsq_c / total_px - (mean.to(torch.float64)**2)).to(torch.float32)
std  = torch.sqrt(var.clamp_min(1e-12))

print("mean:", mean.tolist())
print("std :", std.tolist())

In [None]:
from torch.utils.data import Dataset

class FrameDataset(Dataset):
    def __init__(self, root, transform=None):
        self.root = root
        self.transform = transform
        exts = {".png",".jpg",".jpeg",".bmp",".webp"}
        self.paths = []
        for dp, _, fns in os.walk(root):
            for fn in fns:
                if os.path.splitext(fn.lower())[1] in exts:
                    self.paths.append(os.path.join(dp, fn))
        self.paths.sort()

    def __len__(self): return len(self.paths)

    def __getitem__(self, idx):
        path = self.paths[idx]
        img = io.read_image(path)
        if self.transform is not None:
            img = self.transform(img)
        return img, path

transform = v2.Compose([
    v2.Resize((224, 224)),
    v2.ToDtype(torch.float32, scale=True),
    v2.Normalize(mean=mean, std=std),
])

dataset = FrameDataset(FRAMES_DIR, transform=transform)
print("dataset size:", len(dataset))
x0, p0 = dataset[0]
print("sample tensor:", x0.shape, "from:", p0)

In [None]:
def denorm(x, m, s):
    return (x * s[:, None, None] + m[:, None, None]).clamp(0,1)

n = min(16, len(dataset))
imgs = []
for i in range(n):
    x, _ = dataset[i]
    imgs.append(denorm(x, mean, std))

grid = make_grid(torch.stack(imgs), nrow=4)
grid_path = os.path.join(PREVIEW_DIR, "grid.png")
save_image(grid, grid_path)
grid_path

In [None]:
img_u8 = io.read_image(ASTRONAUT_PATH)                  # uint8 CxHxW
img_tf = transform(img_u8)                              # normalized 224x224 float32
print("astronaut raw:", img_u8.shape, img_u8.dtype)
print("astronaut transformed:", img_tf.shape, img_tf.dtype,
      "min/max:", float(img_tf.min()), float(img_tf.max()))

In [None]:
import fiftyone as fo
from fiftyone import types

name = "assignment1_frames"
if name in fo.list_datasets():
    fo.delete_dataset(name)

fod = fo.Dataset.from_dir(
    dataset_dir=FRAMES_DIR,
    dataset_type=types.ImageDirectory,
    name=name,
)

session = fo.launch_app(fod)
session