In [70]:
from torchvision.io.image import decode_image, encode_jpeg
from torchvision.models.segmentation import deeplabv3_resnet101, DeepLabV3_ResNet101_Weights
from torchvision.transforms.functional import to_pil_image

import torch
import torch.nn.functional as F
import numpy as np
import os

In [None]:
# Step 0: Load and decode input image and label image
img_dir = r"C:\Users\andre\Desktop\mlcvprac\semsegpipeline\data\images"
label_dir = r"C:\Users\andre\Desktop\mlcvprac\semsegpipeline\data\labels"

for img_name in os.listdir(path=img_dir):
    img_path = os.path.join(img_dir, img_name)
    img_tensor = decode_image(img_path)
    print(f"image path: {img_path}")
    print(f"image tensor shape {img_tensor.shape}")
    break

for label_name in os.listdir(path=label_dir):
    label_path = os.path.join(label_dir, label_name)
    label_tensor = decode_image(label_path)
    print(f"label path: {label_path}")
    print(f"label tensor shape: {label_tensor.shape}")
    break

image path: C:\Users\andre\Desktop\mlcvprac\semanticsegmentation_prac\data\images\000000.png
image tensor shape torch.Size([3, 2160, 3840])
label path: C:\Users\andre\Desktop\mlcvprac\semanticsegmentation_prac\data\labels\000000.png
label tensor shape: torch.Size([3, 2160, 3840])


In [None]:
# Step 1: Initialize Model with Weights
weights = DeepLabV3_ResNet101_Weights.DEFAULT
model = deeplabv3_resnet101(weights=weights)
model.eval()

# Step 2: Initialize and Apply image preprocessing transforms to match the pretrained image format
# weights.transforms() automatically does this
preprocess = weights.transforms()
batch_img_tensor = preprocess(img_tensor).unsqueeze(0) #unsqueeze to add batch dimension

# Step 3: Inference
# Get the scores per class per pixel, convert them to probabilities. Size of [1, 21, 520, 924]
prediction = model(batch_img_tensor)["out"]
normalized_masks = prediction.softmax(dim=1)
# Get the mask for the class we are trying to segment
class_to_idx = {cls: idx for (idx, cls) in enumerate(weights.meta["categories"])}
idx_of_desired_class = class_to_idx["car"]
mask = normalized_masks[0, idx_of_desired_class]

# Step 4: Convert to Binary Mask for Evaluation
# Convert it to a binary mask of the original gt image size for evaluation
tensor_mask = mask.unsqueeze(0).unsqueeze(0) #torch operations expect (N,C,H,W) tensor
pred_resized = F.interpolate(tensor_mask, size=(2160, 3840), mode='bilinear', align_corners=False)
binary_pred_mask_resized = (pred_resized.squeeze().detach().numpy() > 0.5).astype(np.uint8) #squeeze back to (H,W)
#logic operations turn it into 0s and 1s so multiply by 255 for visualization sake
visible_pred_mask = binary_pred_mask_resized * 255
to_pil_image(visible_pred_mask).show()

In [84]:
# Step 5: Evaluate Inference Result IoU with Label Mask
gt_image = to_pil_image(label_tensor)
gt_mask = np.array(gt_image)

static_car = np.all(gt_mask == [192, 0, 192], axis=-1)
moving_car = np.all(gt_mask == [64, 0, 128], axis=-1)

binary_gt_mask = static_car.astype(np.uint8)
visible_gt_mask = binary_gt_mask * 255 
to_pil_image(visible_gt_mask).show()

intersection = np.logical_and(binary_pred_mask_resized, binary_gt_mask).sum()
union = np.logical_or(binary_pred_mask_resized, binary_gt_mask).sum()
IoU = intersection / union
print(f"Car IoU: {IoU}")

Car IoU: 0.346600336084404
