In [None]:
# git clone https://github.com/facebookresearch/dinov3.git
# pip3 install torch torchvision --index-url https://download.pytorch.org/whl/cu126

In [None]:
# pip install -U 'git+https://github.com/facebookresearch/detectron2.git'
# git clone https://github.com/facebookresearch/Mask2Former.git
# pip install -r requirements.txt

In [11]:
# pretrained backbones
import torch

DINOv3_REPO = "/home/aminaasgarova/Desktop/dinov3/dinov3"
DINOv3_weights = "/home/aminaasgarova/Desktop/dinov3/dinov3/dinov3_vitl16_pretrain_sat493m-eadcf0ff.pth"

# DINOv3 ViT models pretrained on satellite imagery
dinov3_vitl16 = torch.hub.load(DINOv3_REPO, 'dinov3_vitl16', source='local', weights = DINOv3_weights)

In [None]:
import torchvision
from torchvision.transforms import v2

def make_transform(resize_size: int = 256): 
    to_tensor = v2.ToImage()
    resize = v2.Resize((resize_size, resize_size), antialias=True)
    to_float = v2.ToDtype(torch.float32, scale=True)
    normalize = v2.Normalize(
        mean=(0.430, 0.411, 0.296),
        std=(0.213, 0.156, 0.143),
    )
    return v2.Compose([to_tensor, resize, to_float, normalize])

In [None]:
# Dataset expolaration
import numpy as np

x = np.load("/home/aminaasgarova/Desktop/dinov33/data/train_images/0a0c3d101a954cb9950f476274bc70b4.npy")
print("shape:", x.shape)
print("dtype:", x.dtype)
print("min:", x.min())
print("max:", x.max())


import numpy as np
import torch

SAT_MEAN = (0.430, 0.411, 0.296)
SAT_STD  = (0.213, 0.156, 0.143)

def dinov3_input_from_npy(npy_path: str):
    x = np.load(npy_path)  # your array: (3, H, W), uint16, 0..255

    # 1) make float32 in [0, 1]
    x = x.astype(np.float32) / 255.0

    # 2) to torch (C,H,W)
    x = torch.from_numpy(x)  # float32

    # 3) normalize
    mean = torch.tensor(SAT_MEAN)[:, None, None]
    std  = torch.tensor(SAT_STD)[:, None, None]
    x = (x - mean) / std

    # 4) add batch dim -> (1,3,H,W)
    return x.unsqueeze(0)

inp = dinov3_input_from_npy("/mnt/data/0a0c3d101a954cb9950f476274bc70b4.npy")
print(inp.shape, inp.dtype)


shape: (3, 256, 256)
dtype: uint16
min: 1
max: 255


## Mask2Former head

In [None]:
# mmdetection 
# conda create -n dinov3_m2f python=3.10 -y
# conda activate dinov3_m2f

# pip install torch torchvision --index-url https://download.pytorch.org/whl/cu121
# python -m pip install --no-cache-dir torch torchvision --index-url https://download.pytorch.org/whl/cu121

# pip install -U openmim
# mim install mmengine
# mim install "mmcv==2.1.0"
# mim install "mmdet>=3.0.0"
# mim install "mmsegmentation>=1.2.0"
# pip install rasterio tifffile pillow opencv-python


# git clone https://github.com/open-mmlab/mmsegmentation.git
# cd mmsegmentation
# pip install -e .

# mkdir -p projects/dinov3_building
# touch projects/__init__.py
# touch projects/dinov3_building/__init__.py

In [None]:
#Option 2 
# pip install -U 'git+https://github.com/facebookresearch/detectron2.git'
# git clone https://github.com/facebookresearch/Mask2Former.git
# cd Mask2Former
# pip install -r requirements.txt
# export PYTHONPATH=$(pwd):$PYTHONPATH


# mkdir -p buildings_project/configs
#touch buildings_project/buildings_npy_dataset.py
#touch buildings_project/buildings_npy_mapper.py
#touch buildings_project/dinov3_backbone.py
#touch buildings_project/train_buildings.py
#touch buildings_project/infer_one_image.py
#touch buildings_project/configs/dinov3_mask2former_buildings.yaml

# export PYTHONPATH=/home/aminaasgarova/Desktop/dinov33/Mask2Former:$PYTHONPATH
# cd /home/aminaasgarova/Desktop/buildings_project
# python train_buildings.py --config-file configs/dinov3_mask2former_buildings.yaml --num-gpus 1

In [None]:
# #Option3

#1
# git clone https://github.com/facebookresearch/dinov3.git
# cd dinov3

#2
# micromamba env create -f conda.yaml
# micromamba activate dinov3

# conda create -n dinov3 python=3.11.14 -y

#3 inside of building_seg_dinov3_mask2former folder 
# pip install torch torchvision --index-url https://download.pytorch.org/whl/cu121
# python -m pip install -U pip setuptools wheel
# python -m pip install opencv-python pillow rasterio numpy

# python -m pip install "git+https://github.com/facebookresearch/detectron2.git" --no-build-isolation
# git clone https://github.com/facebookresearch/Mask2Former.git
# cd Mask2Former
# python -m pip install -e .
# cd ..

# python -m pip install -e /home/aminaasgarova/Desktop/dinov3/dinov3
# export PYTHONPATH=/home/aminaasgarova/Desktop/dinov33/dinov3:$PYTHONPATH


# Final 
export PYTHONPATH=/home/aminaasgarova/Desktop/dinov33/dinov3:$PYTHONPATH
export PYTHONPATH=$(pwd):$PYTHONPATH
python custom/train_net_buildings.py \
  --num-gpus 1 \
  --config-file custom/buildings_mask2former_dinov3.yaml

In [None]:
curl -L https://micro.mamba.pm/install.sh | bash

micromamba remove -y torch torchvision torchaudio

micromamba install -y -c pytorch -c nvidia pytorch torchvision torchaudio pytorch-cuda=11.8

python -c "import torch; print(torch.__version__); print(torch.cuda.is_available()); print(torch.cuda.device_count())"

micromamba install -y -c nvidia cuda-nvcc=11.8
which nvcc
nvcc --version
export CUDA_HOME=$CONDA_PREFIXh
echo $CUDA_HOME

python -m pip uninstall -y detectron2
python -m pip install -U "git+https://github.com/facebookresearch/detectron2.git"


In [None]:
# Transpose dataset (C,H.W) as (H, W, C)

import numpy as np
import torch

def load_npy_image(path: str):
    x = np.load(path)  # could be (C,H,W) or (H,W,C)
    if x.ndim == 3 and x.shape[0] in (1, 3, 4):  # (C,H,W)
        x = np.transpose(x, (1, 2, 0))          # -> (H,W,C)
    x = x.astype(np.float32)
    # scale to 0..1 for normalization later
    if x.max() > 1.5:
        x = x / 255.0 if x.max() <= 255 else x / 65535.0
    return x  # (H,W,C) float32

def load_npy_mask(path: str):
    m = np.load(path)  # (H,W), values 0/1 or 0..K
    if m.ndim == 3 and m.shape[0] == 1:
        m = m[0]
    return m.astype(np.int64)

img = "/home/aminaasgarova/Desktop/dinov33/data/train_images/0a0c3d101a954cb9950f476274bc70b4.npy"
load_npy_image(img)

array([[[0.40392157, 0.36078432, 0.32156864],
        [0.4117647 , 0.35686275, 0.3254902 ],
        [0.4862745 , 0.41568628, 0.38431373],
        ...,
        [0.4745098 , 0.4392157 , 0.40392157],
        [0.47058824, 0.43137255, 0.39607844],
        [0.4627451 , 0.41960785, 0.38039216]],

       [[0.4117647 , 0.3647059 , 0.3254902 ],
        [0.45882353, 0.39607844, 0.36078432],
        [0.49803922, 0.41568628, 0.3882353 ],
        ...,
        [0.5058824 , 0.47058824, 0.4392157 ],
        [0.45882353, 0.42352942, 0.39215687],
        [0.42352942, 0.3882353 , 0.35686275]],

       [[0.4392157 , 0.38039216, 0.3372549 ],
        [0.46666667, 0.39215687, 0.3529412 ],
        [0.4862745 , 0.39215687, 0.3647059 ],
        ...,
        [0.5294118 , 0.49411765, 0.46666667],
        [0.47058824, 0.4392157 , 0.4117647 ],
        [0.43529412, 0.40784314, 0.3764706 ]],

       ...,

       [[0.10196079, 0.10196079, 0.08627451],
        [0.14901961, 0.14509805, 0.12156863],
        [0.21176471, 0

In [9]:
# Inference image support 
import numpy as np

def load_image_any(path: str):
    p = path.lower()
    if p.endswith(".npy"):
        return load_npy_image(path)

    if p.endswith((".jpg", ".jpeg", ".png")):
        from PIL import Image
        x = np.array(Image.open(path).convert("RGB")).astype(np.float32) / 255.0
        return x  # (H,W,3)

    if p.endswith((".tif", ".tiff")):
        import rasterio
        with rasterio.open(path) as src:
            x = src.read()  # (C,H,W)
        if x.shape[0] >= 3:
            x = x[:3]  # keep RGB if more bands
        x = np.transpose(x, (1, 2, 0)).astype(np.float32)
        if x.max() > 1.5:
            x = x / 255.0 if x.max() <= 255 else x / 65535.0
        return x

    raise ValueError(f"Unknown format: {path}")


## Main code

In [18]:
import os
from glob import glob

print("CWD:", os.getcwd())
print("exists dataset/train/images:", os.path.exists("/home/aminaasgarova/Desktop/dinov33/tiled/train/images"))
print("exists dataset/train/masks :", os.path.exists("/home/aminaasgarova/Desktop/dinov33/tiled/train/annotations"))

imgs = sorted(glob("/home/aminaasgarova/Desktop/dinov33/tiled/train/images/*.npy"))
masks = sorted(glob("/home/aminaasgarova/Desktop/dinov33/tiled/train/annotations/*.npy"))

print("num train images:", len(imgs))
print("num train masks :", len(masks))

if len(imgs) > 0:
    print("first image file:", imgs[0])
if len(masks) > 0:
    print("first mask file:", masks[0])


CWD: /home/aminaasgarova/Desktop/dinov33
exists dataset/train/images: True
exists dataset/train/masks : True
num train images: 15979
num train masks : 15979
first image file: /home/aminaasgarova/Desktop/dinov33/tiled/train/images/0001f0479c244ad1b0bfdd284cc0dfc0.npy
first mask file: /home/aminaasgarova/Desktop/dinov33/tiled/train/annotations/0001f0479c244ad1b0bfdd284cc0dfc0.npy


In [None]:
import os
from glob import glob
import numpy as np
import torch
from torch.utils.data import Dataset
import torch.nn.functional as F

SAT_MEAN = (0.430, 0.411, 0.296)
SAT_STD  = (0.213, 0.156, 0.143)

# # def normalize_img(x: torch.Tensor) -> torch.Tensor:
#     # x: (3,H,W), float in [0,1]
#     mean = torch.tensor(SAT_MEAN, device=x.device).view(3,1,1)  # makes a tensor from  3 numbers:,reshapes it to: (3,1,1) 
#     std  = torch.tensor(SAT_STD,  device=x.device).view(3,1,1)
#     return (x - mean) / std

SAT_MEAN_T = torch.tensor(SAT_MEAN).view(3,1,1)
SAT_STD_T  = torch.tensor(SAT_STD).view(3,1,1)

def normalize_img(x: torch.Tensor) -> torch.Tensor:
    mean = SAT_MEAN_T.to(device=x.device, dtype=x.dtype)
    std  = SAT_STD_T.to(device=x.device, dtype=x.dtype)
    return (x - mean) / std


def to_chw_rgb(img_np: np.ndarray) -> torch.Tensor:
    # Handles common cases:
    # (H,W,3), (3,H,W), (H,W) grayscale
    if img_np.ndim == 2:
        img_np = np.stack([img_np, img_np, img_np], axis=-1)  # (H,W,3)
    if img_np.shape[0] == 3 and img_np.ndim == 3:
        # already (3,H,W)
        x = torch.from_numpy(img_np)
    else:
        # assume (H,W,3)
        x = torch.from_numpy(img_np).permute(2,0,1)
    return x

class NpySegDataset(Dataset):
    def __init__(self, img_dir, mask_dir, out_size=256, augment=False):
        self.img_paths = sorted(glob(os.path.join(img_dir, "*.npy")))
        self.mask_dir = mask_dir
        self.out_size = out_size
        self.augment = augment

    def __len__(self):
        return len(self.img_paths)

    def __getitem__(self, idx):
        img_path = self.img_paths[idx]
        name = os.path.splitext(os.path.basename(img_path))[0]
        mask_path = os.path.join(self.mask_dir, f"{name}.npy")

        img_np = np.load(img_path)          # image npy
        mask_np = np.load(mask_path)        # mask npy, values 0/1

        x = to_chw_rgb(img_np).float()
        # scale if needed
        if x.max() > 1.0:
            # common if uint8 [0..255]
            x = x / 255.0

        y = torch.from_numpy(mask_np).float()  # (H,W)
        if y.ndim == 3:
            y = y.squeeze()  # if (H,W,1)

        # augment (same for x and y)
        if self.augment:
            if torch.rand(()) < 0.5:
                x = torch.flip(x, dims=[2])  # horizontal flip (W)
                y = torch.flip(y, dims=[1])
            if torch.rand(()) < 0.5:
                x = torch.flip(x, dims=[1])  # vertical flip (H)
                y = torch.flip(y, dims=[0])

        # resize
        x = F.interpolate(x.unsqueeze(0), size=(self.out_size, self.out_size),
                          mode="bilinear", align_corners=False).squeeze(0)
        y = F.interpolate(y.unsqueeze(0).unsqueeze(0), size=(self.out_size, self.out_size),
                          mode="nearest").squeeze(0).squeeze(0)

        # normalize
        x = normalize_img(x)

        # shapes for training
        y = y.unsqueeze(0)  # (1,H,W) for BCE
        return x, y, name


In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class SimpleSegHead(nn.Module):
    def __init__(self, in_ch, mid_ch=256, out_ch=1):
        super().__init__()
        self.conv1 = nn.Conv2d(in_ch, mid_ch, 3, padding=1)
        self.gn1 = nn.GroupNorm(32, mid_ch)
        self.act = nn.GELU()
        self.conv2 = nn.Conv2d(mid_ch, out_ch, 1)

    def forward(self, feat, out_hw):
        x = self.conv1(feat)
        x = self.gn1(x)
        x = self.act(x)
        x = self.conv2(x)
        x = F.interpolate(x, size=out_hw, mode="bilinear", align_corners=False)
        return x

class DinoV3SegModel(nn.Module):
    def __init__(self, backbone, num_classes=1):
        super().__init__()
        self.backbone = backbone

        # try to get embedding dim safely
        in_dim = getattr(backbone, "embed_dim", None)
        if in_dim is None:
            in_dim = getattr(backbone, "num_features", None)
        if in_dim is None:
            raise ValueError("Could not find backbone embed dim. Print(backbone) and check attributes.")

        self.head = SimpleSegHead(in_ch=in_dim, out_ch=num_classes)

    def forward(self, x):
        # x: (B,3,H,W)
        B, _, H, W = x.shape

        # DINO style: get patch tokens
        # Most DINO backbones expose get_intermediate_layers.
        # If your model errors here, print(dir(backbone)) and adapt.
        layers = self.backbone.get_intermediate_layers(x, n=1, reshape=False)
        tokens = layers[0]  # (B, 1+N, C) OR (B, N, C) depending on model

        # handle CLS token if present
        if tokens.dim() == 3 and tokens.shape[1] == (H // 16) * (W // 16) + 1:
            patch_tokens = tokens[:, 1:, :]
        else:
            patch_tokens = tokens

        # reshape tokens -> feature map
        gh, gw = H // 16, W // 16
        feat = patch_tokens.transpose(1, 2).contiguous().view(B, -1, gh, gw)  # (B,C,gh,gw)

        logits = self.head(feat, out_hw=(H, W))  # (B,1,H,W)
        return logits


In [4]:
def dice_loss_with_logits(logits, targets, eps=1e-6):
    probs = torch.sigmoid(logits)
    num = 2 * (probs * targets).sum(dim=(2,3))
    den = (probs + targets).sum(dim=(2,3)) + eps
    dice = 1 - (num / den)
    return dice.mean()

def iou_from_logits(logits, targets, thresh=0.5, eps=1e-6):
    probs = torch.sigmoid(logits)
    preds = (probs > thresh).float()
    inter = (preds * targets).sum(dim=(2,3))
    union = (preds + targets - preds * targets).sum(dim=(2,3)) + eps
    return (inter / union).mean().item()


In [14]:
import torch
from torch.utils.data import DataLoader

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 1) load backbone (your code)
DINOv3_REPO = "/home/aminaasgarova/Desktop/dinov3/dinov3"
DINOv3_weights = "/home/aminaasgarova/Desktop/dinov3/dinov3/dinov3_vitl16_pretrain_sat493m-eadcf0ff.pth"
dinov3_vitl16 = torch.hub.load(DINOv3_REPO, 'dinov3_vitl16', source='local', weights = DINOv3_weights)

backbone = dinov3_vitl16.to(device)

# 2) build model
model = DinoV3SegModel(backbone=backbone, num_classes=1).to(device)

# 3) freeze backbone first
for p in model.backbone.parameters():
    p.requires_grad = False

# 4) data
train_ds = NpySegDataset(
    img_dir="/home/aminaasgarova/Desktop/dinov33/tiled/train/images",
    mask_dir="/home/aminaasgarova/Desktop/dinov33/tiled/train/annotations",
    out_size=256,
    augment=True,
)
val_ds = NpySegDataset(
    img_dir = "/home/aminaasgarova/Desktop/dinov33/tiled/val/images",
    mask_dir="/home/aminaasgarova/Desktop/dinov33/tiled/val/annotations",
    out_size=256,
    augment=False,
)

train_loader = DataLoader(train_ds, batch_size=8, shuffle=True, num_workers=4, pin_memory=True)
val_loader = DataLoader(val_ds, batch_size=8, shuffle=False, num_workers=4, pin_memory=True)

# 5) optimizer
optimizer = torch.optim.AdamW(model.head.parameters(), lr=1e-3, weight_decay=1e-2)
bce = torch.nn.BCEWithLogitsLoss()

scaler = torch.cuda.amp.GradScaler(enabled=(device.type == "cuda"))

best_iou = -1.0

for epoch in range(1, 21):
    model.train()
    train_loss = 0.0

    for x, y, _ in train_loader:
        x = x.to(device, non_blocking=True)
        y = y.to(device, non_blocking=True)

        optimizer.zero_grad(set_to_none=True)

        with torch.cuda.amp.autocast(enabled=(device.type == "cuda")):
            logits = model(x)
            loss = bce(logits, y) + 0.5 * dice_loss_with_logits(logits, y)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        train_loss += loss.item()

    # validation
    model.eval()
    val_iou = 0.0
    val_loss = 0.0
    n = 0

    with torch.no_grad():
        for x, y, _ in val_loader:
            x = x.to(device, non_blocking=True)
            y = y.to(device, non_blocking=True)

            logits = model(x)
            loss = bce(logits, y) + 0.5 * dice_loss_with_logits(logits, y)

            val_loss += loss.item()
            val_iou += iou_from_logits(logits, y)
            n += 1

    train_loss /= max(1, len(train_loader))
    val_loss /= max(1, n)
    val_iou /= max(1, n)

    print(f"Epoch {epoch:02d} | train_loss {train_loss:.4f} | val_loss {val_loss:.4f} | val_iou {val_iou:.4f}")

    if val_iou > best_iou:
        best_iou = val_iou
        torch.save(
            {"model": model.state_dict(), "epoch": epoch, "best_iou": best_iou},
            "best_building_seg.pth",
        )
        print("Saved best model")


  scaler = torch.cuda.amp.GradScaler(enabled=(device.type == "cuda"))
  with torch.cuda.amp.autocast(enabled=(device.type == "cuda")):


Epoch 01 | train_loss 0.4245 | val_loss 0.3713 | val_iou 0.5594
Saved best model
Epoch 02 | train_loss 0.4004 | val_loss 0.3774 | val_iou 0.5495
Epoch 03 | train_loss 0.3930 | val_loss 0.3573 | val_iou 0.5732
Saved best model
Epoch 04 | train_loss 0.3871 | val_loss 0.3534 | val_iou 0.5859
Saved best model
Epoch 05 | train_loss 0.3830 | val_loss 0.3462 | val_iou 0.5946
Saved best model
Epoch 06 | train_loss 0.3801 | val_loss 0.3477 | val_iou 0.5926
Epoch 07 | train_loss 0.3775 | val_loss 0.3379 | val_iou 0.5888
Epoch 08 | train_loss 0.3747 | val_loss 0.3340 | val_iou 0.5949
Saved best model
Epoch 09 | train_loss 0.3725 | val_loss 0.3368 | val_iou 0.5962
Saved best model
Epoch 10 | train_loss 0.3709 | val_loss 0.3360 | val_iou 0.6028
Saved best model
Epoch 11 | train_loss 0.3687 | val_loss 0.3323 | val_iou 0.6051
Saved best model
Epoch 12 | train_loss 0.3671 | val_loss 0.3316 | val_iou 0.6029
Epoch 13 | train_loss 0.3660 | val_loss 0.3274 | val_iou 0.6026
Epoch 14 | train_loss 0.3643 | v

In [None]:
# unfreeze backbone
for p in model.backbone.parameters():
    p.requires_grad = True

# smaller lr for backbone
optimizer = torch.optim.AdamW(
    [
        {"params": model.backbone.parameters(), "lr": 1e-5},
        {"params": model.head.parameters(), "lr": 1e-4},
    ],
    weight_decay=1e-2,
)
# freeze some layers of backbone used used in model 

In [1]:
import numpy as np
import torch
import torch.nn.functional as F
import rasterio

SAT_MEAN = (0.430, 0.411, 0.296)
SAT_STD  = (0.213, 0.156, 0.143)

def normalize_img_3ch(x: torch.Tensor) -> torch.Tensor:
    mean = torch.tensor(SAT_MEAN, device=x.device).view(3,1,1)
    std  = torch.tensor(SAT_STD,  device=x.device).view(3,1,1)
    return (x - mean) / std

def pad_to_multiple(x: torch.Tensor, mult: int = 16):
    _, _, H, W = x.shape
    pad_h = (mult - H % mult) % mult
    pad_w = (mult - W % mult) % mult
    x_pad = F.pad(x, (0, pad_w, 0, pad_h), mode="reflect")
    return x_pad, H, W

def scale_to_01(arr: np.ndarray) -> np.ndarray:
    # arr: float32 or int, shape (C,H,W)
    # simple robust scaling per band
    arr = arr.astype(np.float32)
    out = np.empty_like(arr, dtype=np.float32)
    for c in range(arr.shape[0]):
        band = arr[c]
        lo, hi = np.percentile(band, 2), np.percentile(band, 98)
        if hi <= lo:
            out[c] = 0.0
        else:
            out[c] = np.clip((band - lo) / (hi - lo), 0.0, 1.0)
    return out

@torch.no_grad()
def infer_geotiff(model, tif_path, out_mask_tif, device, out_thresh=0.5, rgb_bands=(1,2,3)):
    # rgb_bands are 1-based indices in rasterio
    with rasterio.open(tif_path) as src:
        meta = src.meta.copy()
        # read selected bands
        arr = src.read(list(rgb_bands))  # (3,H,W) usually
        H0, W0 = arr.shape[1], arr.shape[2]

    # Convert to float [0,1]
    # If it is already uint8 RGB, scaling by 255 works.
    # If it is uint16 or reflectance, use robust scaling.
    if arr.dtype == np.uint8:
        arr01 = arr.astype(np.float32) / 255.0
    else:
        arr01 = scale_to_01(arr)

    x = torch.from_numpy(arr01).to(device)           # (3,H,W)
    x = x.unsqueeze(0)                               # (1,3,H,W)
    x, Horig, Worig = pad_to_multiple(x, 16)         # (1,3,Hpad,Wpad)
    x = normalize_img_3ch(x[0]).unsqueeze(0)

    logits = model(x)
    prob = torch.sigmoid(logits)[0,0]                # (Hpad,Wpad)
    prob = prob[:Horig, :Worig]

    pred01 = (prob > out_thresh).to(torch.uint8).cpu().numpy()  # (H,W) 0/1

    # save as GeoTIFF mask with same georef
    meta.update(count=1, dtype="uint8")
    with rasterio.open(out_mask_tif, "w", **meta) as dst:
        dst.write((pred01 * 255).astype(np.uint8), 1)

    return pred01, prob.cpu().numpy()

# usage
model.to(device).eval()
pred01, prob = infer_geotiff(model, "/home/aminaasgarova/Desktop/dinov33/test_data/test_image.tif", "/home/aminaasgarova/Desktop/dinov33/test_data/pred_mask.tif", device, rgb_bands=(1,2,3))


NameError: name 'model' is not defined

In [None]:
import numpy as np
import rasterio
from rasterio.features import shapes
from shapely.geometry import shape
import geopandas as gpd

def pred_tif_to_vector(pred_tif_path, out_path, min_area=None):
    with rasterio.open(pred_tif_path) as src:
        mask = src.read(1)          # (H,W)
        transform = src.transform
        crs = src.crs

    # Make it binary (works for 0/1 or 0/255 or any >0)
    bin_mask = (mask > 0).astype(np.uint8)

    geoms = []
    for geom, val in shapes(bin_mask, mask=(bin_mask > 0), transform=transform):
        if val == 1:
            geoms.append(shape(geom))

    gdf = gpd.GeoDataFrame(geometry=geoms, crs=crs)

    if len(gdf) == 0:
        print("No polygons extracted. Check mask values and threshold.")
    else:
        # optional: remove tiny polygons
        if min_area is not None:
            gdf = gdf[gdf.geometry.area >= float(min_area)]

    if out_path.lower().endswith(".shp"):
        gdf.to_file(out_path, driver="ESRI Shapefile")
    else:
        gdf.to_file(out_path, driver="GeoJSON")

    print("polygons:", len(gdf))
    return gdf

# example
pred_tif_to_vector(
    pred_tif_path="/home/aminaasgarova/Desktop/dinov33/test_data/pred_mask.tif",
    out_path="/home/aminaasgarova/Desktop/dinov33/test_data/buildings.geojson",   # or "buildings.shp"
    min_area=None
)


In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

DINOv3_REPO = "/home/aminaasgarova/Desktop/dinov3/dinov3"
DINOv3_weights = "/home/aminaasgarova/Desktop/dinov3/dinov3/dinov3_vitl16_pretrain_sat493m-eadcf0ff.pth"
CKPT_PATH = "/home/aminaasgarova/Desktop/dinov33/best_building_seg.pth"  # change if you saved elsewhere

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class SimpleSegHead(nn.Module):
    def __init__(self, in_ch, mid_ch=256, out_ch=1):
        super().__init__()
        self.conv1 = nn.Conv2d(in_ch, mid_ch, 3, padding=1)
        self.gn1 = nn.GroupNorm(32, mid_ch)
        self.act = nn.GELU()
        self.conv2 = nn.Conv2d(mid_ch, out_ch, 1)

    def forward(self, feat, out_hw):
        x = self.conv1(feat)
        x = self.gn1(x)
        x = self.act(x)
        x = self.conv2(x)
        x = F.interpolate(x, size=out_hw, mode="bilinear", align_corners=False)
        return x

class DinoV3SegModel(nn.Module):
    def __init__(self, backbone, num_classes=1):
        super().__init__()
        self.backbone = backbone

        in_dim = getattr(backbone, "embed_dim", None)
        if in_dim is None:
            in_dim = getattr(backbone, "num_features", None)
        if in_dim is None:
            raise ValueError("Could not find backbone embed dim.")

        self.head = SimpleSegHead(in_ch=in_dim, out_ch=num_classes)

    def forward(self, x):
        B, _, H, W = x.shape
        layers = self.backbone.get_intermediate_layers(x, n=1, reshape=False)
        tokens = layers[0]

        if tokens.shape[1] == (H // 16) * (W // 16) + 1:
            patch_tokens = tokens[:, 1:, :]
        else:
            patch_tokens = tokens

        gh, gw = H // 16, W // 16
        feat = patch_tokens.transpose(1, 2).contiguous().view(B, -1, gh, gw)
        logits = self.head(feat, out_hw=(H, W))
        return logits

# 1) load backbone
backbone = torch.hub.load(DINOv3_REPO, "dinov3_vitl16", source="local", weights=DINOv3_weights).to(device)

# 2) build model (must match training)
model = DinoV3SegModel(backbone=backbone, num_classes=1).to(device)

# 3) load checkpoint
ckpt = torch.load(CKPT_PATH, map_location=device)
model.load_state_dict(ckpt["model"], strict=True)

model.eval()
print("Loaded checkpoint from epoch:", ckpt.get("epoch"), "best_iou:", ckpt.get("best_iou"))


  from .autonotebook import tqdm as notebook_tqdm


Loaded checkpoint from epoch: 19 best_iou: 0.6131070247718267


In [6]:
import numpy as np
import rasterio

SAT_MEAN = (0.430, 0.411, 0.296)
SAT_STD  = (0.213, 0.156, 0.143)

def normalize_img_3ch(x: torch.Tensor) -> torch.Tensor:
    mean = torch.tensor(SAT_MEAN, device=x.device).view(3,1,1)
    std  = torch.tensor(SAT_STD,  device=x.device).view(3,1,1)
    return (x - mean) / std

def pad_to_multiple(x: torch.Tensor, mult: int = 16):
    _, _, H, W = x.shape
    pad_h = (mult - H % mult) % mult
    pad_w = (mult - W % mult) % mult
    x_pad = F.pad(x, (0, pad_w, 0, pad_h), mode="reflect")
    return x_pad, H, W

@torch.no_grad()
def infer_geotiff_save_tif(model, tif_path, out_tif, device, thresh=0.5, rgb_bands=(1,2,3)):
    with rasterio.open(tif_path) as src:
        meta = src.meta.copy()
        arr = src.read(list(rgb_bands))  # (3,H,W)
        H0, W0 = arr.shape[1], arr.shape[2]

    # scale to 0..1
    if arr.dtype == np.uint8:
        arr01 = arr.astype(np.float32) / 255.0
    else:
        arr01 = arr.astype(np.float32)
        # quick robust scaling per band
        for c in range(3):
            lo, hi = np.percentile(arr01[c], 2), np.percentile(arr01[c], 98)
            arr01[c] = 0.0 if hi <= lo else np.clip((arr01[c] - lo) / (hi - lo), 0, 1)

    x = torch.from_numpy(arr01).to(device).unsqueeze(0)
    x, Horig, Worig = pad_to_multiple(x, 16)
    x = normalize_img_3ch(x[0]).unsqueeze(0)

    logits = model(x)
    prob = torch.sigmoid(logits)[0,0][:Horig, :Worig]
    pred01 = (prob > thresh).to(torch.uint8).cpu().numpy()

    meta.update(count=1, dtype="uint8")
    with rasterio.open(out_tif, "w", **meta) as dst:
        dst.write((pred01 * 255).astype(np.uint8), 1)

    return pred01

# example
pred01 = infer_geotiff_save_tif(model, "/home/aminaasgarova/Desktop/dinov33/test_data/test2.tif", "/home/aminaasgarova/Desktop/dinov33/test_data/pred_mask2.tif", device)


In [7]:
import numpy as np
import rasterio
from rasterio.features import shapes
from shapely.geometry import shape
import geopandas as gpd

def pred_tif_to_vector(pred_tif_path, out_path):
    with rasterio.open(pred_tif_path) as src:
        mask = src.read(1)
        transform = src.transform
        crs = src.crs

    bin_mask = (mask > 0).astype(np.uint8)

    geoms = []
    for geom, val in shapes(bin_mask, mask=(bin_mask > 0), transform=transform):
        if val == 1:
            geoms.append(shape(geom))

    gdf = gpd.GeoDataFrame(geometry=geoms, crs=crs)

    if out_path.lower().endswith(".shp"):
        gdf.to_file(out_path, driver="ESRI Shapefile")
    else:
        gdf.to_file(out_path, driver="GeoJSON")

    print("polygons:", len(gdf))
    return gdf

pred_tif_to_vector("/home/aminaasgarova/Desktop/dinov33/test_data/pred_mask2.tif", "/home/aminaasgarova/Desktop/dinov33/test_data/buildings2.geojson")


polygons: 199


Unnamed: 0,geometry
0,"POLYGON ((536509.25 4341833.75, 536509.25 4341..."
1,"POLYGON ((536571.75 4341833.75, 536571.75 4341..."
2,"POLYGON ((536513.75 4341829, 536513.75 4341828..."
3,"POLYGON ((536761.5 4341833.75, 536761.5 434183..."
4,"POLYGON ((536605 4341831.25, 536605 4341831, 5..."
...,...
194,"POLYGON ((536393.75 4341588.5, 536393.75 43415..."
195,"POLYGON ((536573.25 4341589, 536573.25 4341588..."
196,"POLYGON ((536589.5 4341589.75, 536589.5 434158..."
197,"POLYGON ((536533.5 4341592.75, 536533.5 434159..."


In [1]:
import os
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel

# config
TRAIN_IMG = "/home/aminaasgarova/Desktop/dinov33/tiled/train/images"
TRAIN_MASK = "/home/aminaasgarova/Desktop/dinov33/tiled/train/annotations"
VAL_IMG = "/home/aminaasgarova/Desktop/dinov33/tiled/val/images"
VAL_MASK = "/home/aminaasgarova/Desktop/dinov33/tiled/val/annotations"
# TEST_IMG = "dataset/test/images"
# SAVE_PRED = "predictions"

MODEL_NAME = "facebook/dinov3-vitb16-pretrain-lvd1689m"
IMG_SIZE = 512
NUM_CLASSES = 2
BATCH_SIZE = 4
EPOCHS = 20
LR = 1e-3
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


# os.makedirs(SAVE_PRED, exist_ok=True)

#dataset
class NpyDataset(Dataset):
    def __init__(self, img_dir, mask_dir=None):
        self.img_dir = img_dir
        self.mask_dir = mask_dir
        self.files = sorted(os.listdir(img_dir))

        self.mean = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
        self.std = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)

    def __len__(self):
        return len(self.files)

    def __getitem__(self, idx):
        name = self.files[idx]

        img = np.load(os.path.join(self.img_dir, name))
        img = torch.from_numpy(img).float()

        # ---- handle channel order ----
        if img.ndim != 3:
            raise ValueError(f"Bad image ndim {img.shape}")

        # H W 3 -> 3 H W
        if img.shape[-1] == 3:
            img = img.permute(2, 0, 1)

        # already 3 H W
        elif img.shape[0] == 3:
            pass

        else:
            raise ValueError(f"Unsupported image shape {img.shape}")

        # normalize range
        if img.max() > 1:
            img = img / 255.0

        # resize
        img = F.interpolate(
            img.unsqueeze(0),
            size=(IMG_SIZE, IMG_SIZE),
            mode="bilinear",
            align_corners=False,
        ).squeeze(0)

        # imagenet normalization
        img = (img - self.mean) / self.std

        # inference only
        if self.mask_dir is None:
            return img, name

        # ---- mask ----
        mask = np.load(os.path.join(self.mask_dir, name))
        mask = torch.from_numpy(mask).long()

        if mask.ndim != 2:
            raise ValueError(f"Bad mask shape {mask.shape}")

        mask = F.interpolate(
            mask.unsqueeze(0).unsqueeze(0).float(),
            size=(IMG_SIZE, IMG_SIZE),
            mode="nearest",
        ).squeeze(0).long()

        return img, mask


# model 
class SegHead(nn.Module):
    def __init__(self, in_ch, num_classes):
        super().__init__()
        self.conv1 = nn.Conv2d(in_ch, 256, 3, padding=1)
        self.conv2 = nn.Conv2d(256, num_classes, 1)

    def forward(self, x, out_size):
        x = F.relu(self.conv1(x))
        x = self.conv2(x)
        return F.interpolate(x, size=out_size, mode="bilinear", align_corners=False)

class DinoSeg(nn.Module):
    def __init__(self, backbone, head):
        super().__init__()
        self.backbone = backbone
        self.head = head

    def forward(self, images):
        with torch.no_grad():
            out = self.backbone(images).last_hidden_state

        b, n, c = out.shape
        h = w = int(n ** 0.5)
        feats = out.permute(0, 2, 1).reshape(b, c, h, w)

        return self.head(feats, images.shape[-2:])

# metrics 
def dice(pred, gt):
    pred = pred.flatten()
    gt = gt.flatten()
    inter = (pred * gt).sum()
    return (2 * inter + 1e-6) / (pred.sum() + gt.sum() + 1e-6)

def iou(pred, gt):
    inter = (pred & gt).sum()
    union = (pred | gt).sum()
    return (inter + 1e-6) / (union + 1e-6)

# training 
def train():
    backbone = AutoModel.from_pretrained(MODEL_NAME).to(DEVICE)
    for p in backbone.parameters():
        p.requires_grad = False
    backbone.eval()

    model = DinoSeg(backbone, SegHead(768, NUM_CLASSES)).to(DEVICE)

    train_ds = NpyDataset(TRAIN_IMG, TRAIN_MASK)
    val_ds = NpyDataset(VAL_IMG, VAL_MASK)

    train_loader = DataLoader(train_ds, BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_ds, BATCH_SIZE)

    optimizer = torch.optim.AdamW(model.head.parameters(), lr=LR)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(EPOCHS):
        model.train()
        loss_sum = 0

        for imgs, masks in train_loader:
            imgs, masks = imgs.to(DEVICE), masks.to(DEVICE)
            preds = model(imgs)
            loss = criterion(preds, masks)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            loss_sum += loss.item()

        model.eval()
        dices, ious = [], []

        with torch.no_grad():
            for imgs, masks in val_loader:
                imgs, masks = imgs.to(DEVICE), masks.to(DEVICE)
                pred = model(imgs).argmax(1)
                dices.append(dice(pred, masks).item())
                ious.append(iou(pred.bool(), masks.bool()).item())

        print(
            f"Epoch {epoch+1} | "
            f"Loss {loss_sum/len(train_loader):.4f} | "
            f"Dice {np.mean(dices):.4f} | "
            f"IoU {np.mean(ious):.4f}"
        )

    torch.save(model.head.state_dict(), "seg_head.pth")

# #  inference 
# def inference():
#     backbone = AutoModel.from_pretrained(MODEL_NAME).to(DEVICE)
#     backbone.eval()

#     head = SegHead(768, NUM_CLASSES).to(DEVICE)
#     head.load_state_dict(torch.load("seg_head.pth"))
#     model = DinoSeg(backbone, head).to(DEVICE)
#     model.eval()

#     test_ds = NpyDataset(TEST_IMG)
#     loader = DataLoader(test_ds, batch_size=1)

#     with torch.no_grad():
#         for img, name in loader:
#             img = img.to(DEVICE)
#             pred = model(img).argmax(1).squeeze(0).cpu().numpy()
#             np.save(os.path.join(SAVE_PRED, name[0]), pred)

#     print("Inference done")

# main 
if __name__ == "__main__":
    train()
    # inference()


  from .autonotebook import tqdm as notebook_tqdm


RuntimeError: shape '[4, 768, 32, 32]' is invalid for input of size 3161088