Project Demo Working File

In [2]:
%matplotlib qt
# gives you a separate window on desktop


In [3]:
# === Extract frames from rear_end_1.mp4 using imageio + ffmpeg ===

import imageio.v2 as imageio
from pathlib import Path
from PIL import Image  # use PIL directly, simpler

VIDEO_PATH = Path(r"C:\Users\Andre\Documents\Machine Learning Project\rear_end_2.mp4")
FRAMES_DIR = VIDEO_PATH.with_name(VIDEO_PATH.stem + "_frames")
FRAMES_DIR.mkdir(exist_ok=True)

reader = imageio.get_reader(str(VIDEO_PATH), format="ffmpeg")
meta = reader.get_meta_data()
fps = meta.get("fps", None)
print("Video meta:", meta)
print("Video FPS:", fps)

frame_paths = []

for idx, frame in enumerate(reader):
    # frame: [H, W, C] uint8
    pil_img = Image.fromarray(frame)  # no transpose

    out_path = FRAMES_DIR / f"frame_{idx:05d}.png"
    pil_img.save(out_path)
    frame_paths.append(out_path)

reader.close()

print("Extracted frames:", len(frame_paths))
if frame_paths:
    print("Example frame:", frame_paths[0])


Video meta: {'plugin': 'ffmpeg', 'nframes': inf, 'ffmpeg_version': '7.1-essentials_build-www.gyan.dev built with gcc 14.2.0 (Rev1, Built by MSYS2 project)', 'codec': 'h264', 'pix_fmt': 'yuv420p(tv, bt709, progressive)', 'audio_codec': 'aac', 'fps': 29.97, 'source_size': (640, 360), 'size': (640, 360), 'rotate': 0, 'duration': 36.01}
Video FPS: 29.97
Extracted frames: 1079
Example frame: C:\Users\Andre\Documents\Machine Learning Project\rear_end_2_frames\frame_00000.png


In [4]:
# === Frame Dataset & DataLoader for rear_end_1_frames ===

import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
from pathlib import Path

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Same normalization as training
normalize = transforms.Normalize(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225],
)

eval_tf = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    normalize,
])

class FrameDataset(Dataset):
    def __init__(self, frames_dir, transform):
        self.frames_dir = Path(frames_dir)
        # Assuming our saved names are frame_00000.png, etc.
        self.paths = sorted(self.frames_dir.glob("frame_*.png"))
        self.transform = transform

        if not self.paths:
            raise RuntimeError(f"No frames found in {self.frames_dir}")

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, idx):
        path = self.paths[idx]
        img = Image.open(path).convert("RGB")
        x = self.transform(img)
        return x, path.name  # return filename for later reference

frames_ds = FrameDataset(FRAMES_DIR, eval_tf)
frames_loader = DataLoader(
    frames_ds,
    batch_size=64,
    shuffle=False,
    num_workers=0,
    pin_memory=(device.type == "cuda"),
)

print(f"Number of frames: {len(frames_ds)}")
print(f"Example frame path: {frames_ds.paths[0]}")


Using device: cuda
Number of frames: 1079
Example frame path: C:\Users\Andre\Documents\Machine Learning Project\rear_end_2_frames\frame_00000.png


In [5]:
# === Load FP32 ResNet18 & ResNet50 for Demo Ensemble (CUDA if available) ===

import torch
import torch.nn as nn
from torchvision import models

NUM_CLASSES = 2
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device for models:", device)

def build_resnet18_fp32(weights_path: str) -> nn.Module:
    m = models.resnet18(weights=None)
    in_feats = m.fc.in_features
    m.fc = nn.Linear(in_feats, NUM_CLASSES)

    state = torch.load(weights_path, map_location="cpu")
    m.load_state_dict(state)
    m.to(device).eval()
    return m

def build_resnet50_fp32(weights_path: str) -> nn.Module:
    m = models.resnet50(weights=None)
    in_feats = m.fc.in_features
    m.fc = nn.Linear(in_feats, NUM_CLASSES)

    state = torch.load(weights_path, map_location="cpu")
    m.load_state_dict(state)
    m.to(device).eval()
    return m

# Update these paths if your .pt files live elsewhere
R18_WEIGHTS = r"resnet18_clear_obstructed_best.pt"
R50_WEIGHTS = r"resnet50_clear_obstructed_best.pt"

ens_r18 = build_resnet18_fp32(R18_WEIGHTS)
ens_r50 = build_resnet50_fp32(R50_WEIGHTS)

ensemble_models = [ens_r18, ens_r50]
print("Loaded FP32 ResNet18 + ResNet50 ensemble for demo.")


Using device for models: cuda


  state = torch.load(weights_path, map_location="cpu")


Loaded FP32 ResNet18 + ResNet50 ensemble for demo.


  state = torch.load(weights_path, map_location="cpu")


In [6]:
# === Run Ensemble Over Frames & Find First "Obstructed" Moment ===

import torch.nn.functional as F
import numpy as np

all_probs = []     # P(obstructed) per frame
all_names = []     # frame_000xx.png

ensemble_models = [ens_r18, ens_r50]

for xb, names in frames_loader:
    xb = xb.to(device)

    with torch.no_grad():
        logits_sum = None
        for m in ensemble_models:
            out = m(xb)
            logits_sum = out if logits_sum is None else (logits_sum + out)

        logits = logits_sum / len(ensemble_models)
        probs = F.softmax(logits, dim=1)[:, 1]  # class index 1 = "obstructed"

    all_probs.extend(probs.cpu().tolist())
    all_names.extend(list(names))

all_probs = np.array(all_probs)

print(f"Ran ensemble on {len(all_probs)} frames.")

# Choose a decision threshold for "we would warn the driver"
OBSTRUCTED_THRESH = 0.95  # you can tune this later

if np.any(all_probs >= OBSTRUCTED_THRESH):
    trigger_idx = int(np.argmax(all_probs >= OBSTRUCTED_THRESH))
    trigger_frame = all_names[trigger_idx]
    trigger_time = trigger_idx / fps  # seconds, using fps from your earlier cell

    print(f"\nFirst 'obstructed' warning frame (>= {OBSTRUCTED_THRESH:.2f}):")
    print(f"  Frame index: {trigger_idx}")
    print(f"  Frame name:  {trigger_frame}")
    print(f"  Video time:  {trigger_time:.2f} s")

else:
    trigger_idx = None
    print(f"\nWARNING: P(obstructed) never crosses {OBSTRUCTED_THRESH:.2f} on this video.")

# Optional: peek at a few probabilities around the trigger
for i in range(max(0, (trigger_idx or 0) - 5), min(len(all_probs), (trigger_idx or 0) + 5)):
    print(f"frame {i:04d} ({all_names[i]}): P(obstructed) = {all_probs[i]:.3f}")


Ran ensemble on 1079 frames.

  Frame index: 612
  Frame name:  frame_00612.png
  Video time:  20.42 s
frame 0607 (frame_00607.png): P(obstructed) = 0.944
frame 0608 (frame_00608.png): P(obstructed) = 0.949
frame 0609 (frame_00609.png): P(obstructed) = 0.948
frame 0610 (frame_00610.png): P(obstructed) = 0.947
frame 0611 (frame_00611.png): P(obstructed) = 0.943
frame 0612 (frame_00612.png): P(obstructed) = 0.951
frame 0613 (frame_00613.png): P(obstructed) = 0.941
frame 0614 (frame_00614.png): P(obstructed) = 0.949
frame 0615 (frame_00615.png): P(obstructed) = 0.954
frame 0616 (frame_00616.png): P(obstructed) = 0.950


In [7]:
# === Plot P(obstructed) vs frame/time for the FP32 ResNet ensemble ===

import numpy as np
import torch
import matplotlib.pyplot as plt
from PIL import Image

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Collect frame paths
frame_paths = sorted(FRAMES_DIR.glob("frame_*.png"))
num_frames = len(frame_paths)
print(f"Found {num_frames} frames in {FRAMES_DIR}")

probs_obstructed = []

with torch.no_grad():
    for fp in frame_paths:
        img = Image.open(fp).convert("RGB")
        x = eval_tf(img).unsqueeze(0).to(device)   # [1, 3, 224, 224]

        # Ensemble logits (mean of models)
        logits_sum = None
        for m in ensemble_models:
            out = m(x)
            logits_sum = out if logits_sum is None else (logits_sum + out)
        logits = logits_sum / len(ensemble_models)

        # Class 1 = "obstructed"
        p_obst = torch.softmax(logits, dim=1)[0, 1].item()
        probs_obstructed.append(p_obst)

probs_obstructed = np.array(probs_obstructed)

# x-axis: time (seconds) if fps is known, else frame index
frames = np.arange(num_frames)
if fps is not None:
    x_vals = frames / fps
    x_label = "Time [s]"
else:
    x_vals = frames
    x_label = "Frame index"

plt.figure(figsize=(12, 4))
plt.plot(x_vals, probs_obstructed, linewidth=1.5)
plt.axhline(0.5, linestyle="--", color="red", label="0.5 threshold")
plt.xlabel(x_label)
plt.ylabel("P(obstructed)")
plt.title("ResNet18+50 FP32 Ensemble: P(obstructed) over video")
plt.grid(True, alpha=0.3)
plt.legend()
plt.tight_layout()
plt.show()


Found 1079 frames in C:\Users\Andre\Documents\Machine Learning Project\rear_end_2_frames


In [8]:
# === "Live" playback with auto-pause on obstruction ===

import time
from pathlib import Path

import torch
import matplotlib.pyplot as plt
from PIL import Image

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Use the FP32 ResNet ensemble
ensemble_models = [ens_r18, ens_r50]

# Threshold & smoothing
OBSTRUCTED_THRESH = 0.95      # tweak this (e.g., 0.90, 0.92, 0.95)
MIN_CONSEC_FRAMES = 10         # require N consecutive frames above threshold

frame_paths = sorted(FRAMES_DIR.glob("frame_*.png"))
num_frames = len(frame_paths)
print(f"Found {num_frames} frames in {FRAMES_DIR}")
print(f"Video FPS (metadata): {fps}")

# Figure setup
fig, ax = plt.subplots(figsize=(8, 4))

im_artist = None
text_artist = None

consec_count = 0
trigger_frame_idx = None

with torch.no_grad():
    for idx, fp in enumerate(frame_paths):
        frame_start = time.perf_counter()

        # Load frame
        img = Image.open(fp).convert("RGB")

        # Model input
        x = eval_tf(img).unsqueeze(0).to(device)

        # Ensemble logits (mean of both models)
        logits_sum = None
        for m in ensemble_models:
            out = m(x)
            logits_sum = out if logits_sum is None else (logits_sum + out)
        logits = logits_sum / len(ensemble_models)

        probs = torch.softmax(logits, dim=1)
        p_obst = probs[0, 1].item()

        # --- Update display ---
        if im_artist is None:
            im_artist = ax.imshow(img)
            ax.axis("off")
            text_artist = ax.text(
                0.02,
                0.95,
                f"t={idx / fps:5.2f}s  P(obstructed)={p_obst:.2f}",
                transform=ax.transAxes,
                color="yellow",
                fontsize=12,
                bbox=dict(facecolor="black", alpha=0.5),
            )
        else:
            im_artist.set_data(img)
            text_artist.set_text(
                f"t={idx / fps:5.2f}s  P(obstructed)={p_obst:.2f}"
            )

        fig.canvas.draw()

        # --- auto-tuned pause for near-real-time playback ---
        compute_dt = time.perf_counter() - frame_start      # seconds spent this loop
        target_dt = 1.0 / fps                               # desired frame interval
        delay = max(target_dt - compute_dt, 0.0)            # don't go negative
        plt.pause(delay)

        # --- Threshold logic with consecutive-frame smoothing ---
        if p_obst >= OBSTRUCTED_THRESH:
            consec_count += 1
            if consec_count >= MIN_CONSEC_FRAMES:
                trigger_frame_idx = idx - MIN_CONSEC_FRAMES + 1
                trigger_time = trigger_frame_idx / fps
                print(
                    f"\n*** WARNING: model would alert at "
                    f"frame {trigger_frame_idx} (t={trigger_time:.2f}s) ***"
                )
                break
        else:
            consec_count = 0

if trigger_frame_idx is None:
    print("\nNo obstruction warning triggered with current threshold settings.")
else:
    print("Playback paused at warning frame. You can close the window when done.")


Found 1079 frames in C:\Users\Andre\Documents\Machine Learning Project\rear_end_2_frames
Video FPS (metadata): 29.97

