In [1]:
# Import required libraries
import torch
import torchvision.transforms as transforms
from transformers import SegformerImageProcessor, SegformerForSemanticSegmentation
from PIL import Image
import requests
import numpy as np

# Single source of truth for the checkpoint id, so the image processor and the
# model are always loaded from the same pretrained weights/configuration.
MODEL_CHECKPOINT = "nvidia/segformer-b0-finetuned-ade-512-512"

processor = SegformerImageProcessor.from_pretrained(MODEL_CHECKPOINT)
model = SegformerForSemanticSegmentation.from_pretrained(MODEL_CHECKPOINT)

print("Loaded model and processor")



Loaded model and processor


In [2]:
# Set the model to evaluation mode before running inference.
# The call is the last expression of the cell, so its return value (the model
# itself) is echoed -- that is why the full architecture is printed below.
model.eval()

SegformerForSemanticSegmentation(
  (segformer): SegformerModel(
    (encoder): SegformerEncoder(
      (patch_embeddings): ModuleList(
        (0): SegformerOverlapPatchEmbeddings(
          (proj): Conv2d(3, 32, kernel_size=(7, 7), stride=(4, 4), padding=(3, 3))
          (layer_norm): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
        )
        (1): SegformerOverlapPatchEmbeddings(
          (proj): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
          (layer_norm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        )
        (2): SegformerOverlapPatchEmbeddings(
          (proj): Conv2d(64, 160, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
          (layer_norm): LayerNorm((160,), eps=1e-05, elementwise_affine=True)
        )
        (3): SegformerOverlapPatchEmbeddings(
          (proj): Conv2d(160, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
          (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
  

In [3]:
# Define preprocessing transforms for the manual (non-processor) inference path.
# The input is resized to 1024x1024; the model predicts at 1/4 of the input
# resolution, so this produces 256x256 logits.
preprocess = transforms.Compose([
    transforms.Resize((1024, 1024)),
    transforms.ToTensor(),  # PIL image -> float tensor scaled to [0, 1]
    # Apply the same per-channel mean/std normalization as the checkpoint's
    # image processor; without it the model sees inputs in a different value
    # range than the one it was trained on.
    transforms.Normalize(mean=processor.image_mean, std=processor.image_std),
])


In [4]:
# Load and preprocess input data.
# (A local file works as well: pass its path to Image.open instead of the response stream.)
url = "http://images.cocodataset.org/val2017/000000039769.jpg"

# Bound the request time and fail loudly on HTTP errors instead of handing an
# error page to the image decoder.
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()

# Force three channels: the model's first convolution expects 3 input channels,
# so grayscale / RGBA / palette images must be converted first.
image = Image.open(response.raw).convert("RGB")

input_tensor = preprocess(image).unsqueeze(0)  # Add batch dimension

In [5]:
# Perform inference (no gradients are needed for prediction)
with torch.no_grad():
    outputs = model(input_tensor)

# Per-pixel class ids: argmax over the class dimension, then drop only the
# batch dimension (a bare squeeze() would also drop any spatial axis of size 1).
predicted_masks = torch.argmax(outputs.logits, dim=1).squeeze(0).cpu().numpy()

# Build a colour palette with one RGB row per class the model can predict, so
# every class is visible instead of only the first two. A fixed seed keeps the
# colours stable across runs.
num_classes = outputs.logits.shape[1]
palette = np.random.default_rng(0).integers(0, 256, size=(num_classes, 3), dtype=np.uint8)
# Keep the colours previously assigned to ids 0 and 1.
# NOTE(review): these ids were labelled "Background" and "Person" before, which
# does not look right for an ADE20K checkpoint -- confirm the meaning of each id
# against model.config.id2label.
palette[0] = (0, 0, 0)
palette[1] = (255, 0, 0)

# Generate colored segmentation output image. Indexing the palette with the mask
# yields an (H, W, 3) uint8 array whose size follows the mask, rather than a
# hard-coded 256x256 canvas that only matches one input resolution.
colored_mask = palette[predicted_masks]

# Save or display the output image
output_image = Image.fromarray(colored_mask)
output_image.save('output_image.jpg')
output_image.show()

In [6]:
# Preprocess the image with the checkpoint's own image processor and run the
# model on the result.
inputs = processor(images=image, return_tensors="pt")
# Inference only: skip building the autograd graph to save memory and time.
with torch.no_grad():
    outputs = model(**inputs)
logits = outputs.logits  # shape (batch_size, num_labels, height/4, width/4)

In [7]:
# Show the raw model output object (loss and logits tensor).
print(outputs)

SemanticSegmenterOutput(loss=None, logits=tensor([[[[ -4.6310,  -5.5232,  -6.2356,  ...,  -4.9868,  -4.7341,  -4.6612],
          [ -5.1921,  -6.1444,  -6.5996,  ...,  -5.1771,  -5.0288,  -5.1761],
          [ -5.4424,  -6.2790,  -6.7574,  ...,  -5.2748,  -5.1669,  -5.0999],
          ...,
          [ -8.5836,  -9.0887,  -9.5409,  ...,  -8.7190,  -8.5183,  -8.3098],
          [ -8.4320,  -8.8555,  -9.1848,  ...,  -7.7831,  -7.4822,  -7.3598],
          [ -8.3224,  -8.8764,  -9.1849,  ...,  -7.1564,  -6.8759,  -6.6428]],

         [[-12.1391, -13.3122, -13.9554,  ..., -11.8693, -11.5761, -11.3418],
          [-12.8732, -13.9352, -14.3563,  ..., -12.3348, -12.1524, -12.3176],
          [-12.9438, -13.8226, -14.2513,  ..., -12.3360, -12.3081, -12.2396],
          ...,
          [-13.9108, -14.2715, -14.6169,  ..., -13.2829, -13.3424, -13.3222],
          [-13.8718, -14.2715, -14.3808,  ..., -12.5270, -12.4334, -12.3057],
          [-13.6848, -14.2857, -14.5154,  ..., -11.8523, -11.8534, -