In [2]:
import torch
import torch.nn as nn
import torchvision.models
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn.functional as F

import albumentations as A
from albumentations.pytorch import ToTensorV2

from tqdm import tqdm
from PIL import Image
import cv2
import matplotlib.pyplot as plt
import numpy as np

import os
from time import time

  check_for_updates()


### Get the data

In [3]:
import gdown
url = 'https://drive.google.com/uc?id=10f1H2T-5W-BiqabHHtlZ4ASs19TZmg8R'
output = 'data.zip'
gdown.download(url, output, quiet=False)
!unzip data.zip


Downloading...
From (original): https://drive.google.com/uc?id=10f1H2T-5W-BiqabHHtlZ4ASs19TZmg8R
From (redirected): https://drive.google.com/uc?id=10f1H2T-5W-BiqabHHtlZ4ASs19TZmg8R&confirm=t&uuid=11782639-53e8-4b19-a639-1b9d366a401d
To: /content/data.zip
100%|██████████| 979M/979M [00:19<00:00, 49.5MB/s]


'data.zip'

### Utilities (0.5 point)

Complete dataset to load prepared images and masks. Don't forget to use augmentations.

Some of the images are 1 channels, so use `gray2rgb`.

In [41]:
# config.py
from dataclasses import dataclass
from typing import Tuple, Optional
import torch


@dataclass
class TrainingConfig:
    # Training parameters
    epochs: int = 15
    batch_size: int = 8
    learning_rate: float = 1e-4
    device: str = "cuda" if torch.cuda.is_available() else "cpu"

    # Dataset parameters
    input_size: Tuple[int, int] = (256, 256)
    num_classes: int = 1

    # Optimizer parameters
    weight_decay: float = 0.01
    scheduler_patience: int = 2
    scheduler_factor: float = 0.1

    # WandB parameters
    project_name: str = "bird-segmentation"
    run_name: Optional[str] = None

    # Data augmentation parameters
    normalize_mean: Tuple[float, float, float] = (0.485, 0.456, 0.406)
    normalize_std: Tuple[float, float, float] = (0.229, 0.224, 0.225)

In [42]:
# dataset.py
import cv2
import os
from torch.utils.data import Dataset
import albumentations as A
from albumentations.pytorch import ToTensorV2
from typing import Tuple, Optional
import torch


class BirdsDataset(Dataset):
    """Dataset class for bird segmentation"""

    def __init__(
        self,
        folder: str,
        config: TrainingConfig,
        transform: Optional[A.Compose] = None,
    ) -> None:
        self.image_paths, self.mask_paths = self._get_paths(folder)
        self.transform = transform or self._get_default_transforms(config)

    def _get_paths(self, folder: str) -> Tuple[list, list]:
        """Get paths for images and masks"""
        images_folder = os.path.join(folder, "images")
        gt_folder = os.path.join(folder, "gt")

        image_paths = []
        mask_paths = []

        for class_name in os.listdir(images_folder):
            class_folder = os.path.join(images_folder, class_name)
            if os.path.isdir(class_folder):
                for fname in os.listdir(class_folder):
                    image_paths.append(os.path.join(class_folder, fname))
                    mask_paths.append(
                        os.path.join(gt_folder, class_name, fname[:-3] + "png")
                    )

        return image_paths, mask_paths

    @staticmethod
    def _get_default_transforms(config: TrainingConfig) -> A.Compose:
        """Get default augmentation pipeline"""
        return A.Compose(
            [
                A.Resize(*config.input_size),
                A.Normalize(
                    mean=config.normalize_mean, std=config.normalize_std
                ),
                ToTensorV2(),
            ]
        )

    def __getitem__(self, index: int) -> Tuple[torch.Tensor, torch.Tensor]:
        """Get a sample from the dataset"""
        # Load images
        img = cv2.imread(self.image_paths[index])
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        mask = cv2.imread(self.mask_paths[index], cv2.IMREAD_GRAYSCALE)

        # Apply transforms
        transformed = self.transform(image=img, mask=mask)

        return (
            transformed["image"],
            transformed["mask"].float().unsqueeze(0) / 255.0,
        )

    def __len__(self) -> int:
        return len(self.image_paths)


### Architecture (1 point)
Your task for today is to build your own Unet to solve the segmentation problem.

As an encoder, you can use pre-trained on IMAGENET models(or parts) from torchvision. The decoder must be trained from scratch.
It is forbidden to use data not from the `data` folder.

I advise you to experiment with the number of blocks so as not to overfit on the training sample and get good quality on validation.

In [43]:
# model.py
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision


class DecoderBlock(nn.Module):
    """Decoder block for U-Net architecture"""

    def __init__(self, in_channels: int, skip_channels: int, out_channels: int):
        super().__init__()
        self.upconv = nn.ConvTranspose2d(
            in_channels, out_channels, kernel_size=2, stride=2
        )

        self.conv_block = nn.Sequential(
            nn.Conv2d(out_channels + skip_channels, out_channels, 3, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_channels, out_channels, 3, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
        )

    def forward(self, x: torch.Tensor, skip: torch.Tensor) -> torch.Tensor:
        x = self.upconv(x)
        x = self._handle_size_mismatch(x, skip)
        x = torch.cat([x, skip], dim=1)
        return self.conv_block(x)

    @staticmethod
    def _handle_size_mismatch(
        x: torch.Tensor, skip: torch.Tensor
    ) -> torch.Tensor:
        """Handle size mismatch between decoder and skip connection features"""
        if x.size() != skip.size():
            diff_h = skip.size()[2] - x.size()[2]
            diff_w = skip.size()[3] - x.size()[3]
            x = F.pad(
                x,
                [
                    diff_w // 2,
                    diff_w - diff_w // 2,
                    diff_h // 2,
                    diff_h - diff_h // 2,
                ],
            )
        return x


class UNet(nn.Module):
    """U-Net architecture with ResNet34 encoder"""

    def __init__(self, config: TrainingConfig):
        super().__init__()
        self.config = config

        # Initialize ResNet encoder
        resnet = torchvision.models.resnet34(
            weights=torchvision.models.ResNet34_Weights.DEFAULT
        )

        # Encoder
        self.encoder1 = nn.Sequential(
            nn.Conv2d(3, 64, 7, stride=2, padding=3, bias=False),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
        )
        self.pool = nn.MaxPool2d(kernel_size=2)
        self.encoder2 = resnet.layer1  # 64 channels
        self.encoder3 = resnet.layer2  # 128 channels
        self.encoder4 = resnet.layer3  # 256 channels
        self.encoder5 = resnet.layer4  # 512 channels

        # Decoder
        self.decoder4 = DecoderBlock(512, 256, 256)
        self.decoder3 = DecoderBlock(256, 128, 128)
        self.decoder2 = DecoderBlock(128, 64, 64)
        self.decoder1 = DecoderBlock(64, 64, 32)

        self.final_conv = nn.Conv2d(32, config.num_classes, kernel_size=1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Encoder
        enc1 = self.encoder1(x)
        enc2 = self.encoder2(self.pool(enc1))
        enc3 = self.encoder3(enc2)
        enc4 = self.encoder4(enc3)
        enc5 = self.encoder5(enc4)

        # Decoder
        dec4 = self.decoder4(enc5, enc4)
        dec3 = self.decoder3(dec4, enc3)
        dec2 = self.decoder2(dec3, enc2)
        dec1 = self.decoder1(dec2, enc1)

        # Final output
        out = self.final_conv(dec1)
        return F.interpolate(
            out, size=x.shape[2:], mode="bilinear", align_corners=False
        )


### Train script (0.5 point)

Complete the train and predict scripts.

In [53]:
import wandb
from torch.utils.data import DataLoader
from typing import Dict
import torch.nn as nn
from tqdm import tqdm
import torch


class Trainer:
    """Training class for U-Net model"""

    def __init__(self, model: nn.Module, config: TrainingConfig):
        self.model = model
        self.config = config
        self.device = config.device

        self.criterion = nn.BCEWithLogitsLoss()
        self.optimizer = torch.optim.AdamW(
            model.parameters(),
            lr=config.learning_rate,
            weight_decay=config.weight_decay,
        )
        self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            self.optimizer,
            mode="min",
            patience=config.scheduler_patience,
            factor=config.scheduler_factor,
        )

        self._setup_wandb()

    def _setup_wandb(self):
        """Initialize WandB run"""
        self.run = wandb.init(
            project=self.config.project_name,
            name=self.config.run_name,
            config=self.config,
        )
        wandb.watch(self.model, self.criterion, log="all", log_freq=10)

    def _init_metrics(self) -> Dict[str, float]:
        """Initialize metrics dictionary"""
        return {
            "loss": 0.0,
            "iou": 0.0,
            "count": 0,
        }

    def _update_metrics(
        self, epoch_metrics: Dict[str, float], batch_metrics: Dict[str, float]
    ):
        """Update epoch metrics with batch metrics"""
        epoch_metrics["loss"] += batch_metrics["loss"]
        epoch_metrics["iou"] += batch_metrics["iou"]
        epoch_metrics["count"] += 1

    def _finalize_metrics(
        self, epoch_metrics: Dict[str, float], num_batches: int
    ) -> Dict[str, float]:
        """Calculate final metrics for the epoch"""
        return {
            "loss": epoch_metrics["loss"] / num_batches,
            "iou": epoch_metrics["iou"] / num_batches,
        }

    def _validate_epoch(self, val_loader: DataLoader) -> Dict[str, float]:
        """Validation loop for one epoch"""
        self.model.eval()
        val_metrics = self._init_metrics()

        with torch.no_grad():
            for inputs, masks in tqdm(val_loader):
                inputs = inputs.to(self.device)
                masks = masks.to(self.device)

                outputs = self.model(inputs)
                loss = self.criterion(outputs, masks)
                iou = self._calculate_iou(outputs, masks)

                self._update_metrics(
                    val_metrics, {"loss": loss.item(), "iou": iou}
                )

        return {
            "val_loss": val_metrics["loss"] / len(val_loader),
            "val_iou": val_metrics["iou"] / len(val_loader),
        }

    def _log_metrics(
        self,
        epoch: int,
        train_metrics: Dict[str, float],
        val_metrics: Dict[str, float],
    ):
        """Log metrics to WandB"""
        metrics = {
            "epoch": epoch,
            "train_loss": train_metrics["loss"],
            "train_iou": train_metrics["iou"],
            "val_loss": val_metrics["val_loss"],
            "val_iou": val_metrics["val_iou"],
            "learning_rate": self.optimizer.param_groups[0]["lr"],
        }
        wandb.log(metrics)
        print(f"Epoch {epoch}:", metrics)

    def train(
        self, train_loader: DataLoader, val_loader: DataLoader
    ) -> nn.Module:
        """Main training loop"""
        best_val_iou = 0

        for epoch in range(self.config.epochs):
            # Training phase
            train_metrics = self._train_epoch(train_loader)

            # Validation phase
            val_metrics = self._validate_epoch(val_loader)

            # Update scheduler
            self.scheduler.step(val_metrics["val_loss"])

            # Log metrics
            self._log_metrics(epoch, train_metrics, val_metrics)

            # Save best model
            if val_metrics["val_iou"] > best_val_iou:
                best_val_loss = val_metrics["val_iou"]
                self._save_model("best_model.pth")

        self.run.finish()
        return self.model

    def _train_epoch(self, train_loader: DataLoader) -> Dict[str, float]:
        """Training loop for one epoch"""
        self.model.train()
        epoch_metrics = self._init_metrics()

        for step, (inputs, masks) in enumerate(tqdm(train_loader)):
            batch_metrics = self._train_step(inputs, masks)
            self._update_metrics(epoch_metrics, batch_metrics)

            if step % 50 == 0:
                wandb.log(
                    {
                        "train_batch_loss": batch_metrics["loss"],
                        "train_batch_iou": batch_metrics["iou"],
                        "learning_rate": self.optimizer.param_groups[0]["lr"],
                    }
                )

        return self._finalize_metrics(epoch_metrics, len(train_loader))

    def _train_step(
        self, inputs: torch.Tensor, masks: torch.Tensor
    ) -> Dict[str, float]:
        """Single training step"""
        inputs = inputs.to(self.device)
        masks = masks.to(self.device)

        self.optimizer.zero_grad()
        outputs = self.model(inputs)
        loss = self.criterion(outputs, masks)
        loss.backward()
        self.optimizer.step()

        with torch.no_grad():
            iou = self._calculate_iou(outputs, masks)

        return {"loss": loss.item(), "iou": iou}

    @staticmethod
    def _calculate_iou(outputs: torch.Tensor, masks: torch.Tensor) -> float:
        """Calculate IoU score"""
        pred = (torch.sigmoid(outputs) > 0.5)
        intersection = (pred & masks.bool()).float().sum((1, 2, 3))
        union = (pred | masks.bool()).float().sum((1, 2, 3))
        iou = (intersection + 1e-6) / (union + 1e-6)
        return iou.mean().item()

    def _save_model(self, filename: str):
        """Save model to WandB"""
        path = os.path.join(wandb.run.dir, filename)
        torch.save(self.model.state_dict(), path)
        wandb.save(path)

In [54]:
def predict(model, img_path):
    transform = A.Compose([
        A.Resize(256, 256),
        A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
        ToTensorV2()
    ])

    img = cv2.imread(img_path)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    transformed = transform(image=img)
    input_tensor = transformed['image'].unsqueeze(0)

    model.eval()
    with torch.no_grad():
        output = model(input_tensor)
        pred = torch.sigmoid(output)
        pred = (pred > 0.5).float()

    # Resize prediction back to original size
    pred = F.interpolate(pred, size=(img.shape[0], img.shape[1]), mode='bilinear', align_corners=False)
    pred = pred.squeeze().numpy()

    return pred

def get_model(path):
    model = UNet()
    model.load_state_dict(torch.load(path))
    model.eval()
    return model

In [None]:
# main.py
from torch.utils.data import DataLoader


def main():
    # Initialize configuration
    config = TrainingConfig()

    # Create datasets
    train_dataset = BirdsDataset("data/train", config)
    val_dataset = BirdsDataset("data/val", config)

    # Create dataloaders
    train_loader = DataLoader(
        train_dataset, batch_size=config.batch_size, shuffle=True
    )
    val_loader = DataLoader(
        val_dataset, batch_size=config.batch_size, shuffle=False
    )

    # Initialize model
    model = UNet(config).to(config.device)

    # Initialize trainer and train
    trainer = Trainer(model, config)
    trained_model = trainer.train(train_loader, val_loader)

    return trained_model


if __name__ == "__main__":
    model = main()


100%|██████████| 1048/1048 [03:40<00:00,  4.74it/s]
100%|██████████| 176/176 [00:24<00:00,  7.33it/s]


Epoch 0: {'epoch': 0, 'train_loss': 0.3121429327733189, 'train_iou': 0.695910915974101, 'val_loss': 0.19234024530107324, 'val_iou': 0.7366907467896288, 'learning_rate': 0.0001}


100%|██████████| 1048/1048 [03:29<00:00,  4.99it/s]
100%|██████████| 176/176 [00:21<00:00,  8.11it/s]


Epoch 1: {'epoch': 1, 'train_loss': 0.1465482073587202, 'train_iou': 0.7452046153995827, 'val_loss': 0.1216924786567688, 'val_iou': 0.7555427889932286, 'learning_rate': 0.0001}


100%|██████████| 1048/1048 [03:28<00:00,  5.02it/s]
100%|██████████| 176/176 [00:21<00:00,  8.01it/s]


Epoch 2: {'epoch': 2, 'train_loss': 0.09958001649902977, 'train_iou': 0.7503941268980048, 'val_loss': 0.1069384604184465, 'val_iou': 0.7558964450250972, 'learning_rate': 0.0001}


100%|██████████| 1048/1048 [03:26<00:00,  5.07it/s]
100%|██████████| 176/176 [00:24<00:00,  7.16it/s]


Epoch 3: {'epoch': 3, 'train_loss': 0.08068916497695196, 'train_iou': 0.757062911759806, 'val_loss': 0.08174087467010725, 'val_iou': 0.7483179725029252, 'learning_rate': 0.0001}


100%|██████████| 1048/1048 [03:25<00:00,  5.10it/s]
100%|██████████| 176/176 [00:20<00:00,  8.79it/s]


Epoch 4: {'epoch': 4, 'train_loss': 0.07106913024033526, 'train_iou': 0.7638048601287012, 'val_loss': 0.08236916187557984, 'val_iou': 0.7173599130050703, 'learning_rate': 0.0001}


 16%|█▌        | 170/1048 [00:32<06:02,  2.42it/s]

You can also experiment with models and write a small report about results. If the report will be meaningful, you will receive an extra point.

### Testing (8 points)
Your model will be tested on the new data, similar to validation, so use techniques to prevent overfitting the model.

* IoU > 0.85 — 8 points
* IoU > 0.80 — 7 points
* IoU > 0.75 — 6 points
* IoU > 0.70 — 5 points
* IoU > 0.60 — 4 points
* IoU > 0.50 — 3 points
* IoU > 0.40 — 2 points
* IoU > 0.30 — 1 points

In [None]:
model = get_model('model_14.pth').to('cuda')

In [None]:
ious, times = [], []
test_dir = 'data/val/'

for class_name in tqdm(sorted(os.listdir(os.path.join(test_dir, 'images')))):
    for img_name in sorted(os.listdir(os.path.join(test_dir, 'images', class_name))):

        t_start = time()
        pred = predict(model, os.path.join(test_dir, 'images', class_name, img_name))
        times.append(time() - t_start)

        gt_name = img_name.replace('jpg', 'png')
        gt = np.asarray(Image.open(os.path.join(test_dir, 'gt', class_name, gt_name)), dtype = np.uint8)
        if len(gt.shape) > 2:
            gt = gt[:, :, 0]

        iou = get_iou(gt==255, pred>0.5)
        ious.append(iou)

np.mean(ious), np.mean(times)

### Compression (1 point)

Try to speed up the model in any way without losing more than 1% in iou score.
For example [torch2trt](https://github.com/NVIDIA-AI-IOT/torch2trt)

In [None]:
def get_fast_model():
    # YOUR CODE HERE
    return model

In [None]:
fast_model = get_fast_model().to('cuda')

In [None]:
ious, times = [], []
test_dir = 'data/val/'

for class_name in tqdm(sorted(os.listdir(os.path.join(test_dir, 'images')))):
    for img_name in sorted(os.listdir(os.path.join(test_dir, 'images', class_name))):

        t_start = time()
        pred = predict(fast_model, os.path.join(test_dir, 'images', class_name, img_name))
        times.append(time() - t_start)

        gt_name = img_name.replace('jpg', 'png')
        gt = np.asarray(Image.open(os.path.join(test_dir, 'gt', class_name, gt_name)), dtype = np.uint8)
        if len(gt.shape) > 2:
            gt = gt[:, :, 0]

        iou = get_iou(gt==255, pred>0.5)
        ious.append(iou)

np.mean(ious), np.mean(times)

**Bonus:** For the best iou score on test(without compression) in group you will get 1.5, 1, 0.5 extra points(for 1st, 2nd, 3rd places).