In [1]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision import models, transforms
from PIL import Image
import numpy as np

In [16]:
# Define your custom dataset for semantic segmentation
class CustomDataset(Dataset):
    def __init__(self, image_folder, mask_folder, transform=None):
        self.image_folder = image_folder
        self.mask_folder = mask_folder
        self.transform = transform

        self.images = [img for img in os.listdir(image_folder) if img.endswith(".jpg")]
        # print(len(self.images))
        self.masks = [mask for mask in os.listdir(mask_folder) if mask.endswith(".txt")]

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        img_path = os.path.join(self.image_folder, self.images[idx])
        mask_path = os.path.join(self.mask_folder, self.masks[idx])

        image = Image.open(img_path).convert("RGB")

        # Resize the image to a common size
        image = image.resize((320, 240), Image.ANTIALIAS)

        # Load the mask from the text file
        with open(mask_path, 'r') as file:
            lines = file.readlines()
            mask_data = [list(map(int, line.strip().split())) for line in lines]

        mask = np.array(mask_data)

        if self.transform:
            image, mask = self.transform(image, mask)

        return image, mask

In [3]:
# Define your custom transformation (you might need to adjust it based on your requirements)
class CustomTransform:
    def __init__(self):
        self.transform = transforms.Compose([
            transforms.ToTensor(),
        ])

    def __call__(self, image, mask):
        image = self.transform(image)
        mask = torch.from_numpy(mask).unsqueeze(0)  # Convert mask to PyTorch tensor

        return image, mask

In [4]:
# Define the model using ResNet18 as the backbone
class SegmentationModel(nn.Module):
    def __init__(self, num_classes):
        super(SegmentationModel, self).__init__()
        resnet18 = models.resnet18(pretrained=True)
        self.encoder = nn.Sequential(*list(resnet18.children())[:-2])
        self.decoder = nn.Conv2d(512, num_classes, kernel_size=1)

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x

In [17]:
# Set your data and model paths
image_folder = "./iccv09Data/images"
mask_folder = "./iccv09Data/labels"
num_classes = 21  # Adjust based on your dataset

# Create dataset and DataLoader
transform = CustomTransform()
dataset = CustomDataset(image_folder, mask_folder, transform=transform)
# print(len(dataset.images))
# print("WHY")
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

# Instantiate the model
model = SegmentationModel(num_classes)

# Define loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10

for epoch in range(num_epochs):
    for images, masks in dataloader:
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, masks)
        loss.backward()
        optimizer.step()

    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item()}")

# Save the trained model
torch.save(model.state_dict(), "semantic_segmentation_model.pth")

  image = image.resize((320, 240), Image.ANTIALIAS)


RuntimeError: stack expects each tensor to be equal size, but got [1, 193, 320] at entry 0 and [1, 214, 320] at entry 1

In [19]:
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, Dataset
from PIL import Image
import numpy as np

# Define the semantic segmentation model
class SegmentationModel(nn.Module):
    def __init__(self, num_classes):
        super(SegmentationModel, self).__init__()
        # Load the ResNet18 model pretrained on ImageNet
        self.encoder = models.resnet18(pretrained=True)
        # Replace the classification head to output feature maps with desired number of classes
        self.encoder.fc = nn.Conv2d(512, num_classes, kernel_size=1)
        # Define upsampling layers to increase the resolution of the feature maps
        self.decoder = nn.Sequential(
            nn.ConvTranspose2d(num_classes, num_classes, kernel_size=4, stride=2, padding=1),
            nn.ReLU(inplace=True),
            nn.ConvTranspose2d(num_classes, num_classes, kernel_size=4, stride=2, padding=1),
            nn.ReLU(inplace=True)
        )

    def forward(self, x):
        # Forward pass through the encoder
        x = self.encoder(x)
        # Forward pass through the decoder to upsample the feature maps
        x = self.decoder(x)
        return x

# Define a custom dataset class
class CustomDataset(Dataset):
    def __init__(self, img_dir, mask_dir, transform=None):
        self.img_dir = img_dir
        self.mask_dir = mask_dir
        self.transform = transform

    def __len__(self):
        return len(os.listdir(self.img_dir))

    def __getitem__(self, idx):
        img_name = os.listdir(self.img_dir)[idx]
        img_path = os.path.join(self.img_dir, img_name)
        mask_path = os.path.join(self.mask_dir, img_name.replace('.jpg', '.layers.txt'))

        image = Image.open(img_path).convert("RGB")
        mask = np.loadtxt(mask_path)

        if self.transform:
            image = self.transform(image)

        return image, torch.tensor(mask, dtype=torch.long)

# Define transformations for image preprocessing
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Define directories for your dataset
img_dir = "./iccv09Data/images"
mask_dir = "./iccv09Data/labels"

# Create instances of the dataset and dataloader
dataset = CustomDataset(img_dir, mask_dir, transform=transform)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

# Define the number of classes in your dataset
num_classes = 21  # Example, adjust based on your dataset

# Initialize the model
model = SegmentationModel(num_classes)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
for epoch in range(num_epochs):
    for images, masks in dataloader:
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, masks)
        loss.backward()
        optimizer.step()
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Save the trained model
torch.save(model.state_dict(), 'semantic_segmentation_model.pth')



RuntimeError: stack expects each tensor to be equal size, but got [213, 320] at entry 0 and [240, 320] at entry 1