Бейзлайн (U-Net и SegFormer)

In [None]:
import torch
import segmentation_models_pytorch as smp
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision.datasets import VOCSegmentation
from torch.optim import Adam
from torch.nn import CrossEntropyLoss
from sklearn.metrics import jaccard_score, accuracy_score

# Выбор устройства: GPU с поддержкой MPS, если доступно, или CPU
# -- 1a: Устройство и CIFAR10
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
print("Using device:", device_name)
device = torch.device(device_name)

# Загружаем VOC 2012 датасет
dataset_train = VOCSegmentation(
    root='./data',
    year='2012',
    image_set='train',
    download=True,
    transform=transforms.Compose([
        transforms.Resize((256, 256)),
        transforms.ToTensor(),
    ]),
    target_transform=transforms.Compose([
        transforms.Resize((256, 256)),
        transforms.ToTensor(),
    ])
)

dataset_val = VOCSegmentation(
    root='./data',
    year='2012',
    image_set='val',
    download=True,
    transform=transforms.Compose([
        transforms.Resize((256, 256)),
        transforms.ToTensor(),
    ]),
    target_transform=transforms.Compose([
        transforms.Resize((256, 256)),
        transforms.ToTensor(),
    ])
)

train_loader = DataLoader(dataset_train, batch_size=8, shuffle=True)
val_loader = DataLoader(dataset_val, batch_size=8, shuffle=False)

Using device: cuda


In [None]:
def train(model, loader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    for images, masks in loader:
        images, masks = images.to(device), masks.long().to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, masks.squeeze(1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

def evaluate(model, loader, device):
    model.eval()
    total_iou = 0
    total_accuracy = 0
    n_samples = 0
    with torch.no_grad():
        for images, masks in loader:
            images, masks = images.to(device), masks.long().to(device)
            outputs = model(images)
            preds = torch.argmax(outputs, dim=1)
            total_iou += jaccard_score(masks.cpu().numpy().reshape(-1), preds.cpu().numpy().reshape(-1), average='macro')
            total_accuracy += accuracy_score(masks.cpu().numpy().reshape(-1), preds.cpu().numpy().reshape(-1))
            n_samples += 1
    return total_iou / n_samples, total_accuracy / n_samples

def train_and_evaluate(model, train_loader, val_loader, device, num_epochs=5):
    optimizer = Adam(model.parameters(), lr=0.0001)
    criterion = CrossEntropyLoss()

    for epoch in range(num_epochs):
        train_loss = train(model, train_loader, optimizer, criterion, device)
        val_iou, val_accuracy = evaluate(model, val_loader, device)
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {train_loss:.4f}, Val mIoU: {val_iou:.4f}, Val Accuracy: {val_accuracy:.4f}')

    final_iou, final_accuracy = evaluate(model, val_loader, device)
    print(f"mIoU: {final_iou:.4f}, Accuracy: {final_accuracy:.4f}")

In [None]:
unet_model = smp.Unet(encoder_name='resnet34', classes=21, activation=None).to(device)
print("=== U-Net with ResNet34 Encoder ===")
train_and_evaluate(unet_model, train_loader, val_loader, device, num_epochs=5)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


=== U-Net with ResNet34 Encoder ===
Epoch 1/5, Loss: 1.7636, Val mIoU: 0.2217, Val Accuracy: 0.9443
Epoch 2/5, Loss: 0.6099, Val mIoU: 0.4725, Val Accuracy: 0.9449
Epoch 3/5, Loss: 0.3257, Val mIoU: 0.4725, Val Accuracy: 0.9449
Epoch 4/5, Loss: 0.2406, Val mIoU: 0.4727, Val Accuracy: 0.9449
Epoch 5/5, Loss: 0.1993, Val mIoU: 0.4830, Val Accuracy: 0.9453
mIoU: 0.4830, Accuracy: 0.9453


In [None]:
segformer_model = smp.create_model('segformer', in_channels=3, classes=21).to(device)
print("=== SegFormer ===")
train_and_evaluate(segformer_model, train_loader, val_loader, device, num_epochs=5)

=== SegFormer ===
Epoch 1/5, Loss: 1.2667, Val mIoU: 0.4612, Val Accuracy: 0.9446
Epoch 2/5, Loss: 0.2817, Val mIoU: 0.4281, Val Accuracy: 0.9441
Epoch 3/5, Loss: 0.2171, Val mIoU: 0.4828, Val Accuracy: 0.9446
Epoch 4/5, Loss: 0.1929, Val mIoU: 0.4871, Val Accuracy: 0.9445
Epoch 5/5, Loss: 0.1706, Val mIoU: 0.4815, Val Accuracy: 0.9427
mIoU: 0.4815, Accuracy: 0.9427


Улучшение бейзлайна

In [None]:
from torchvision.transforms import RandomHorizontalFlip, ColorJitter, RandomRotation

data_transforms = transforms.Compose([
    transforms.Resize((256, 256)),
    RandomHorizontalFlip(),
    ColorJitter(brightness=0.3, contrast=0.3, saturation=0.3, hue=0.2),
    RandomRotation(degrees=20),
    transforms.ToTensor(),
])

dataset_train = VOCSegmentation(
    root='./data',
    year='2012',
    image_set='train',
    download=True,
    transform=data_transforms,
    target_transform=transforms.Compose([
        transforms.Resize((256, 256)),
        transforms.ToTensor(),
    ])
)

def improved_train_and_evaluate(model, train_loader, val_loader, device, num_epochs=5, learning_rate=0.0001, batch_size=16):
    optimizer = Adam(model.parameters(), lr=learning_rate)
    criterion = CrossEntropyLoss()

    train_loader = DataLoader(dataset_train, batch_size=batch_size, shuffle=True)

    for epoch in range(num_epochs):
        train_loss = train(model, train_loader, optimizer, criterion, device)
        val_iou, val_accuracy = evaluate(model, val_loader, device)
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {train_loss:.4f}, Val mIoU: {val_iou:.4f}, Val Accuracy: {val_accuracy:.4f}')

    final_iou, final_accuracy = evaluate(model, val_loader, device)
    print(f"mIoU: {final_iou:.4f}, Accuracy: {final_accuracy:.4f}\n")

print("=== Improved Training U-Net with ResNet34 Encoder ===")
improved_unet_model = smp.Unet(encoder_name='resnet34', classes=21, activation=None).to(device)
improved_train_and_evaluate(improved_unet_model, train_loader, val_loader, device, num_epochs=5, learning_rate=0.00005, batch_size=12)

print("=== Improved Training SegFormer ===")
improved_segformer_model = smp.create_model('segformer', in_channels=3, classes=21).to(device)
improved_train_and_evaluate(improved_segformer_model, train_loader, val_loader, device, num_epochs=5, learning_rate=0.00005, batch_size=12)


=== Improved Training U-Net with ResNet34 Encoder ===
Epoch 1/5, Loss: 3.0019, Val mIoU: 0.0316, Val Accuracy: 0.6336
Epoch 2/5, Loss: 2.2800, Val mIoU: 0.0531, Val Accuracy: 0.9250
Epoch 3/5, Loss: 1.7928, Val mIoU: 0.0657, Val Accuracy: 0.9370
Epoch 4/5, Loss: 1.3273, Val mIoU: 0.1053, Val Accuracy: 0.9406
Epoch 5/5, Loss: 0.9505, Val mIoU: 0.2297, Val Accuracy: 0.9443
mIoU: 0.4997, Accuracy: 0.9443

=== Improved Training SegFormer ===
Epoch 1/5, Loss: 1.7709, Val mIoU: 0.4655, Val Accuracy: 0.9449
Epoch 2/5, Loss: 0.5293, Val mIoU: 0.4671, Val Accuracy: 0.9449
Epoch 3/5, Loss: 0.3614, Val mIoU: 0.4725, Val Accuracy: 0.9449
Epoch 4/5, Loss: 0.3010, Val mIoU: 0.4725, Val Accuracy: 0.9449
Epoch 5/5, Loss: 0.2706, Val mIoU: 0.4727, Val Accuracy: 0.9449
mIoU: 0.4727, Accuracy: 0.9449


Собственная реализация

In [12]:
import torch.nn as nn
import torch.nn.functional as F

class SimpleUNet(nn.Module):
    def __init__(self, in_channels=3, out_channels=21):
        super(SimpleUNet, self).__init__()

        def conv_block(in_dim, out_dim):
            return nn.Sequential(
                nn.Conv2d(in_dim, out_dim, kernel_size=3, padding=1),
                nn.ReLU(inplace=True),
                nn.Conv2d(out_dim, out_dim, kernel_size=3, padding=1),
                nn.ReLU(inplace=True),
            )

        def up_conv(in_dim, out_dim):
            return nn.ConvTranspose2d(in_dim, out_dim, kernel_size=2, stride=2)

        self.encoder1 = conv_block(in_channels, 64)
        self.encoder2 = conv_block(64, 128)
        self.encoder3 = conv_block(128, 256)
        self.encoder4 = conv_block(256, 512)

        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)

        self.bottleneck = conv_block(512, 1024)

        self.upconv4 = up_conv(1024, 512)
        self.decoder4 = conv_block(1024, 512)

        self.upconv3 = up_conv(512, 256)
        self.decoder3 = conv_block(512, 256)

        self.upconv2 = up_conv(256, 128)
        self.decoder2 = conv_block(256, 128)

        self.upconv1 = up_conv(128, 64)
        self.decoder1 = conv_block(128, 64)

        self.final_conv = nn.Conv2d(64, out_channels, kernel_size=1)

    def forward(self, x):
        enc1 = self.encoder1(x)
        enc2 = self.encoder2(self.pool(enc1))
        enc3 = self.encoder3(self.pool(enc2))
        enc4 = self.encoder4(self.pool(enc3))

        bottleneck = self.bottleneck(self.pool(enc4))

        dec4 = self.upconv4(bottleneck)
        dec4 = torch.cat((enc4, dec4), dim=1)
        dec4 = self.decoder4(dec4)

        dec3 = self.upconv3(dec4)
        dec3 = torch.cat((enc3, dec3), dim=1)
        dec3 = self.decoder3(dec3)

        dec2 = self.upconv2(dec3)
        dec2 = torch.cat((enc2, dec2), dim=1)
        dec2 = self.decoder2(dec2)

        dec1 = self.upconv1(dec2)
        dec1 = torch.cat((enc1, dec1), dim=1)
        dec1 = self.decoder1(dec1)

        return self.final_conv(dec1)

# Инициализируем и обучаем модель
simple_unet_model = SimpleUNet().to(device)

In [13]:
# Обучение модели SimpleUNet
train_and_evaluate(simple_unet_model, train_loader, val_loader, device, num_epochs=5)

Epoch 1/5, Loss: 0.6021, Val mIoU: 0.4725, Val Accuracy: 0.9449
Epoch 2/5, Loss: 0.2216, Val mIoU: 0.4725, Val Accuracy: 0.9449
Epoch 3/5, Loss: 0.2119, Val mIoU: 0.4725, Val Accuracy: 0.9449
Epoch 4/5, Loss: 0.2042, Val mIoU: 0.4725, Val Accuracy: 0.9449
Epoch 5/5, Loss: 0.1999, Val mIoU: 0.4725, Val Accuracy: 0.9449
mIoU: 0.4725, Accuracy: 0.9449


In [23]:
class SimpleSegFormer(nn.Module):
    def __init__(self, in_channels=3, out_channels=21):
        super(SimpleSegFormer, self).__init__()

        # Простая свертка для экстракции признаков
        self.encoder = nn.Sequential(
            nn.Conv2d(in_channels, 32, kernel_size=3, stride=2, padding=1),  # (128, 128)
            nn.ReLU(inplace=True),
            nn.Conv2d(32, 64, kernel_size=3, stride=2, padding=1),           # (64, 64)
            nn.ReLU(inplace=True),
            nn.Conv2d(64, 128, kernel_size=3, stride=2, padding=1),          # (32, 32)
            nn.ReLU(inplace=True)
        )

        # Трансформер в качестве напоминания глобальных связей
        self.transformer_block = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=128, nhead=8),
            num_layers=2
        )

        # Обратная свертка для восстановления до оригинального размера
        self.decoder = nn.Sequential(
            nn.ConvTranspose2d(128, 64, kernel_size=2, stride=2),            # (64, 64)
            nn.ReLU(inplace=True),
            nn.ConvTranspose2d(64, 32, kernel_size=2, stride=2),             # (128, 128)
            nn.ReLU(inplace=True),
            nn.ConvTranspose2d(32, 32, kernel_size=2, stride=2),             # (256, 256)
            nn.ReLU(inplace=True),
            nn.Conv2d(32, out_channels, kernel_size=1)                        # (256, 256)
        )

    def forward(self, x):
        # Извлечение признаков
        x = self.encoder(x)
        b, c, h, w = x.size()

        # Преобразование для трансформера
        x = x.flatten(2).permute(2, 0, 1)  # shape: [h*w, b, c]
        x = self.transformer_block(x)
        x = x.permute(1, 2, 0).reshape(b, c, h, w)  # shape: [b, c, h, w]

        # Декодирование в карту сегментации
        return self.decoder(x)

# Инициализация и обучение SimpleSegFormer
simple_segformer_model = SimpleSegFormer().to(device)

In [24]:
train_and_evaluate(simple_segformer_model, train_loader, val_loader, device, num_epochs=5)

Epoch 1/5, Loss: 2.1926, Val mIoU: 0.4725, Val Accuracy: 0.9449
Epoch 2/5, Loss: 0.2536, Val mIoU: 0.4725, Val Accuracy: 0.9449
Epoch 3/5, Loss: 0.2215, Val mIoU: 0.4725, Val Accuracy: 0.9449
Epoch 4/5, Loss: 0.2099, Val mIoU: 0.4725, Val Accuracy: 0.9449
Epoch 5/5, Loss: 0.2044, Val mIoU: 0.4725, Val Accuracy: 0.9449
mIoU: 0.4725, Accuracy: 0.9449


Улучшение собственной реализации

In [17]:
print('=== Improved Self implementation Unet ===')
improved_train_and_evaluate(simple_unet_model, train_loader, val_loader, device, num_epochs=5, learning_rate=0.00005, batch_size=12)


=== Improved Self implementation Unet ===
Epoch 1/10, Loss: 0.2126, Val mIoU: 0.4725, Val Accuracy: 0.9449
Epoch 2/10, Loss: 0.2122, Val mIoU: 0.4725, Val Accuracy: 0.9449
Epoch 3/10, Loss: 0.2115, Val mIoU: 0.4725, Val Accuracy: 0.9449
Epoch 4/10, Loss: 0.2111, Val mIoU: 0.4725, Val Accuracy: 0.9449
Epoch 5/10, Loss: 0.2110, Val mIoU: 0.4725, Val Accuracy: 0.9449
mIoU: 0.4725, Accuracy: 0.9449



In [25]:
print('=== Improved Self implementation SegFormer ===')
improved_train_and_evaluate(simple_segformer_model, train_loader, val_loader, device, num_epochs=5, learning_rate=0.00005, batch_size=12)

=== Improved Self implementation SegFormer ===
Epoch 1/5, Loss: 0.2123, Val mIoU: 0.4725, Val Accuracy: 0.9449
Epoch 2/5, Loss: 0.2108, Val mIoU: 0.4725, Val Accuracy: 0.9449
Epoch 3/5, Loss: 0.2098, Val mIoU: 0.4725, Val Accuracy: 0.9449
Epoch 4/5, Loss: 0.2105, Val mIoU: 0.4725, Val Accuracy: 0.9449
Epoch 5/5, Loss: 0.2097, Val mIoU: 0.4725, Val Accuracy: 0.9449
mIoU: 0.4725, Accuracy: 0.9449

