In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        pass
        # print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
!pip install torchvision torch faiss-cpu opencv-python-headless timm torchmetrics pytorch-metric-learning

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting pytorch-metric-learning
  Downloading pytorch_metric_learning-2.8.1-py3-none-any.whl.metadata (18 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-

In [3]:
import os
import glob
import random
import numpy as np
import torch
from types import SimpleNamespace

def set_seed(seed: int = 42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.use_deterministic_algorithms(True, warn_only=True)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark     = False
set_seed(42)


input_dirs = glob.glob('/kaggle/input/*flowers*')
if input_dirs:
    dataset_root = input_dirs[0]
else:
    dataset_root = '../input/flowers-recognition/flowers'

cfg = SimpleNamespace(
    seed=42,
    verbose=True,
    dataset=SimpleNamespace(
        root=dataset_root,
        image_size=224,
        num_workers=0
    ),
    splits=SimpleNamespace(
        training_ratio=0.8
    ),
    training=SimpleNamespace(
        batch_size=32,
        epochs=15,
        lr_backbone=1e-4,
        lr_head=1e-3
    ),
    index=SimpleNamespace(
        type='hnsw',
        metric='cosine',
        hnsw=SimpleNamespace(
            M=32,
            ef_construction=200,
            ef_search=50
        )
    ),
    models=[
        'resnet50',
        'efficientnet_b0',
        'clip_zeroshot',
        'clip_finetune',
        'metric_learning',
        'dinov2',
        ''
    ],
    evaluation=SimpleNamespace(
        k=5
    ),
    api=SimpleNamespace(
        host='0.0.0.0',
        port=8000
    )
)


In [4]:
from pathlib import Path
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import random, os, torch, numpy as np

class FlowersDataset(Dataset):

    def __init__(self, root, transform=None):
        self.samples, self.labels = [], []
        classes = sorted(os.listdir(root))
        self.class_to_idx = {c:i for i,c in enumerate(classes)}
        for cls in classes:
            for p in Path(root, cls).glob('*'):
                self.samples.append(str(p))
                self.labels.append(self.class_to_idx[cls])
        self.transform = transform
        
    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        img = Image.open(self.samples[idx]).convert('RGB')
        if self.transform: img = self.transform(img)
        return img, self.labels[idx], self.samples[idx]


In [5]:
MEAN = (0.48145466, 0.4578275, 0.40821073)
STD  = (0.26862954, 0.26130258, 0.27577711)

clip_val_tf = transforms.Compose([
    transforms.Resize(224, interpolation=transforms.InterpolationMode.BICUBIC),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(MEAN, STD),
])

In [6]:
def set_seed(seed: int = 42):
    # 1 Python
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

    # 2 NumPy
    np.random.seed(seed)

    # 3 PyTorch
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.use_deterministic_algorithms(True, warn_only=True)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark     = False

    # 4 FAISS (HNSW), sklearn, etc.
    try:
        import faiss
        faiss.rand.seed(seed)
    except ImportError:
        pass


In [7]:
full_ds = FlowersDataset("/kaggle/input/flowers-recognition/flowers", None)
idxs = list(range(len(full_ds)))
random.Random(42).shuffle(idxs)

from collections import defaultdict
cls_bins = defaultdict(list)
for i in idxs:
    cls_bins[full_ds.labels[i]].append(i)

train_idx, test_idx = [], []
for lst in cls_bins.values():
    cut = int(len(lst) * 0.8)
    train_idx += lst[:cut]
    test_idx  += lst[cut:]

train_set = torch.utils.data.Subset(full_ds, train_idx)
test_set  = torch.utils.data.Subset(full_ds, test_idx)
train_set.dataset.transform = clip_val_tf
test_set.dataset.transform  = clip_val_tf

In [8]:
import abc
class FeatureExtractor(abc.ABC):
    dim: int
    @abc.abstractmethod
    def fit(self, loader): pass
    @abc.abstractmethod
    def encode(self, images): pass

In [9]:
from copy import deepcopy

class FineTuneMixin:
    def _make_val_split(self, train_subset, val_ratio=1-cfg.splits.training_ratio):
        idx = train_subset.indices if hasattr(train_subset, "indices") else list(range(len(train_subset)))
        split = int(len(idx) * (1 - val_ratio))
        return (
            torch.utils.data.Subset(train_subset.dataset, idx[:split]),
            torch.utils.data.Subset(train_subset.dataset, idx[split:])
        )

    def _train_one_epoch(self, loader, criterion, optimizer, scheduler):
        self.backbone.train()
        running_loss, correct, total = 0.0, 0, 0
        for x, y, _ in tqdm(loader, disable=not cfg.verbose, leave=False):
            x, y = x.cuda(), y.cuda()
            optimizer.zero_grad()
            out = self.backbone(x)
            loss = criterion(out, y)
            loss.backward()
            optimizer.step()
            if scheduler: scheduler.step()

            running_loss += loss.item() * x.size(0)
            preds = out.argmax(dim=1)
            correct += (preds == y).sum().item()
            total += x.size(0)
        return running_loss / total, correct / total

    @torch.no_grad()
    def _validate(self, loader, criterion):
        self.backbone.eval()
        vl, vc, vt = 0.0, 0, 0
        for x, y, _ in loader:
            x, y = x.cuda(), y.cuda()
            out = self.backbone(x)
            loss = criterion(out, y)
            vl += loss.item() * x.size(0)
            vc += (out.argmax(1) == y).sum().item()
            vt += x.size(0)
        return vl / vt, vc / vt

    def _fine_tune(self, train_loader, *, max_epochs=cfg.training.epochs, patience=3,
                   lr_head=cfg.training.lr_head, lr_base=cfg.training.lr_backbone, weight_decay=1e-4):
        train_ds, val_ds = self._make_val_split(train_loader.dataset, 0.1)
        train_dl = torch.utils.data.DataLoader(
            train_ds, batch_size=train_loader.batch_size, shuffle=True,
            num_workers=cfg.dataset.num_workers, drop_last=True)
        val_dl = torch.utils.data.DataLoader(
            val_ds, batch_size=train_loader.batch_size, shuffle=False,
            num_workers=cfg.dataset.num_workers)

        optim = torch.optim.AdamW(
            [
                {"params": self.head_params, "lr": lr_head, "weight_decay": weight_decay},
                {"params": self.base_params, "lr": lr_base, "weight_decay": weight_decay}
            ]
        )
        sched = torch.optim.lr_scheduler.CosineAnnealingLR(optim, T_max=max_epochs*len(train_dl))
        criterion = getattr(self, "loss_fn", nn.CrossEntropyLoss())
        best_wts, best_acc, wait = deepcopy(self.backbone.state_dict()), 0.0, 0

        for epoch in range(1, max_epochs + 1):
            tl, ta = self._train_one_epoch(train_dl, criterion, optim, sched)
            vl, va = self._validate(val_dl, criterion)
            print(f"Epoch {epoch:02d}: train loss={tl:.4f} acc={ta:.3f} | "
                  f"val loss={vl:.4f} acc={va:.3f}")
            if va > best_acc + 1e-4:
                best_acc, best_wts, wait = va, deepcopy(self.backbone.state_dict()), 0
            else:
                wait += 1
                if wait >= patience:
                    print("Early stopping triggered")
                    break

        self.backbone.load_state_dict(best_wts)
        print(f"Best val acc={best_acc:.3f} (epoch {epoch-wait})")

In [10]:
class ResNetExtractor(FineTuneMixin, FeatureExtractor):
    def __init__(self, num_classes=5):
        self.backbone = timm.create_model('resnet50', pretrained=True, drop_rate=0.2)
        self.backbone.reset_classifier(num_classes)
        self.dim = self.backbone.num_features

        classifier = self.backbone.get_classifier()
        self.head_params = list(classifier.parameters())
        self.base_params = [p for p in self.backbone.parameters() if id(p) not in set(map(id, self.head_params))]

    def fit(self, train_loader):
        print("Fine tune ResNet50")
        self.backbone.cuda()
        self._fine_tune(train_loader,
                        max_epochs=cfg.training.epochs,
                        patience=3,
                        lr_head=cfg.training.lr_head,
                        lr_base=cfg.training.lr_backbone)

    @staticmethod
    def _pool(feats):
        """
        B*C*H*W  ->  B*C или оставляет если уже
        """
        if feats.ndim == 4:
            feats = feats.mean(dim=(-1, -2))
        elif feats.ndim == 3:
            feats = feats.squeeze(-1)
        return torch.nn.functional.normalize(feats, dim=-1)

    @torch.no_grad()
    def encode(self, images):
        if isinstance(images, torch.Tensor):
            feats = self.backbone.forward_features(images.cuda())
            #print(feats.shape)
            feats = self._pool(feats)
            return feats.cpu().numpy().astype('float32')

In [11]:
class EfficientNetExtractor(FineTuneMixin, FeatureExtractor):
    def __init__(self, num_classes=5):
        self.backbone = timm.create_model(
            'efficientnet_b0',
            pretrained=True,
            drop_rate=0.2
        )
        self.backbone.reset_classifier(num_classes)
        self.dim = self.backbone.num_features
        classifier = self.backbone.get_classifier()
        self.head_params = list(classifier.parameters())
        self.base_params = [p for p in self.backbone.parameters()
                            if id(p) not in set(map(id, self.head_params))]

    def fit(self, train_loader):
        print("Fine tune EfficientNet-B0")
        self.backbone.cuda()
        self._fine_tune(train_loader,
                        max_epochs=cfg.training.epochs,
                        patience=3,
                        lr_head=cfg.training.lr_head,
                        lr_base=cfg.training.lr_backbone)
    @staticmethod
    def _pool(feats):
        """
        B*C*H*W  ->  B*C или оставляет если уже
        """
        if feats.ndim == 4:
            feats = feats.mean(dim=(-1, -2))
        elif feats.ndim == 3:
            feats = feats.squeeze(-1)
        return torch.nn.functional.normalize(feats, dim=-1)

    @torch.no_grad()
    def encode(self, images):
        if isinstance(images, torch.Tensor):
            feats = self.backbone.forward_features(images.cuda())
            #print(feats.shape)
            feats = self._pool(feats)
            return feats.cpu().numpy().astype('float32')

In [12]:
!pip install transformers accelerate scikit-learn



In [13]:
from transformers import CLIPProcessor, CLIPModel
class CLIPHFExtractor(FeatureExtractor):
    """
    CLIP ViT-B/32 по умолчанию без fine tune
    """
    def __init__(self,
                 model_name: str = "openai/clip-vit-base-patch32",
                 device: str = "cuda" if torch.cuda.is_available() else "cpu"):
        self.device = device
        self.processor = CLIPProcessor.from_pretrained(model_name)
        self.model     = CLIPModel.from_pretrained(model_name).to(device)
        self.model.eval()
        self.dim = self.model.config.projection_dim
        self.to_pil = transforms.ToPILImage()

    def fit(self, *_): pass

    @torch.no_grad()
    def encode(self, images):
        if isinstance(images, torch.Tensor):
            pil = [self.to_pil(img.cpu()) for img in images]
        else:
            pil = [Image.open(p).convert("RGB") for p in images]
        inputs = self.processor(images=pil, return_tensors="pt", padding=True).to(self.device)
        feats = self.model.get_image_features(**inputs)
        feats = F.normalize(feats, p=2, dim=-1)
        return feats.cpu().numpy().astype("float32")

2025-07-09 12:59:09.369966: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752065949.541702      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752065949.594376      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [14]:
from pytorch_metric_learning import losses, miners
import torch.nn as nn
import inspect
class MetricExtractor(nn.Module):
    """
    ResNet-18 backbone + Linear(256) + TripletLoss
    """
    def __init__(self, embed_dim: int = 256):
        super().__init__()
        self.backbone = timm.create_model(
            "resnet18", pretrained=True, num_classes=0, drop_rate=0.2
        )
        self.embed = nn.Linear(self.backbone.num_features, embed_dim)
        self.dim   = embed_dim

        self.loss_fn = losses.TripletMarginLoss(margin=0.2)
        self.miner   = miners.TripletMarginMiner(
            margin=0.2, type_of_triplets="semihard"
        )

    def fit(
        self,
        dl: DataLoader,
        *,
        epochs: int = cfg.training.epochs,
        patience: int = 4,
        lr_head: float = cfg.training.lr_head,
        lr_base: float = cfg.training.lr_backbone,
        weight_decay: float = 1e-4,
    ):
        idx = dl.dataset.indices if hasattr(dl.dataset, "indices") else range(len(dl.dataset))
        n_val = int(0.1 * len(idx))
        train_idx, val_idx = idx[:-n_val], idx[-n_val:]

        base_ds = dl.dataset.dataset if hasattr(dl.dataset, "dataset") else dl.dataset
        train_ds = Subset(base_ds, train_idx)
        val_ds   = Subset(base_ds, val_idx)

        train_dl = DataLoader(
            train_ds, batch_size=dl.batch_size, shuffle=True,
            num_workers=cfg.dataset.num_workers, drop_last=True
        )
        val_dl = DataLoader(
            val_ds, batch_size=dl.batch_size, shuffle=False,
            num_workers=cfg.dataset.num_workers
        )

        opt = torch.optim.AdamW(
            [
                {"params": self.embed.parameters(), "lr": lr_head, "weight_decay": weight_decay},
                {"params": self.backbone.parameters(), "lr": lr_base, "weight_decay": weight_decay},
            ]
        )
        sched = torch.optim.lr_scheduler.CosineAnnealingLR(
            opt, T_max=epochs * len(train_dl)
        )

        self.backbone.cuda(); self.embed.cuda()

        best_knn, wait = 0.0, 0
        best_state     = {k: v.clone() for k, v in self.state_dict().items()}

        for ep in range(1, epochs + 1):

            self.train()
            run_loss, seen = 0.0, 0
            for x, y, _ in tqdm(train_dl, leave=False, desc=f"E{ep:02d} train"):
                x, y = x.cuda(), y.cuda()
                opt.zero_grad()
                emb = F.normalize(self.embed(self.backbone(x)), dim=-1)
                hard = self.miner(emb, y)
                loss = self.loss_fn(emb, y, hard)
                loss.backward(); opt.step(); sched.step()
                run_loss += loss.item() * x.size(0)
                seen     += x.size(0)
            train_loss = run_loss / seen
            knn_acc = self._val_knn_acc(train_dl, val_dl, k=5)

            print(f"Epoch {ep:02d}: loss={train_loss:.4f} | val kNN@1={knn_acc:.3f}")

            if knn_acc > best_knn + 1e-4:
                best_knn, best_state, wait = knn_acc, \
                    {k: v.clone() for k, v in self.state_dict().items()}, 0
            else:
                wait += 1
                if wait >= patience:
                    print("Early stopping.")
                    break

        self.load_state_dict(best_state)
        print(f"Best val kNN@1 = {best_knn:.3f}")

    @torch.no_grad()
    def _val_knn_acc(self, train_dl: DataLoader, val_dl: DataLoader, *, k: int = 5) -> float:
        """
        Строит Flat-индекс из train-эмбеддингов и меряет top-1 точность на val
        """
        tr_vecs, tr_lbls = [], []
        for x, y, _ in train_dl:
            z = F.normalize(self.embed(self.backbone(x.cuda())), dim=-1)
            tr_vecs.append(z.cpu()); tr_lbls.append(y)
        tr_vecs = torch.cat(tr_vecs).numpy().astype("float32")
        tr_lbls = torch.cat(tr_lbls).numpy()

        faiss.normalize_L2(tr_vecs)
        index = faiss.IndexFlatIP(tr_vecs.shape[1])
        index.add(tr_vecs)

        correct, total = 0, 0
        for x, y, _ in val_dl:
            q = F.normalize(self.embed(self.backbone(x.cuda())), dim=-1).cpu().numpy()
            faiss.normalize_L2(q)
            _, I = index.search(q, k)                # (B,k)
            pred = tr_lbls[I[:, 0]]                  # ближайший сосед
            correct += np.sum(pred == y.numpy())
            total   += y.size(0)
        return correct / total

    @torch.no_grad()
    def encode(self, images):
        """Возвращает numpy (B,D)"""
        if isinstance(images, torch.Tensor):
            x = images.cuda()
        else:
            x = torch.stack([clip_val_tf(Image.open(p).convert("RGB")) for p in images]).cuda()

        z = F.normalize(self.embed(self.backbone(x)), dim=-1)
        return z.cpu().numpy().astype("float32")


In [15]:
class _ClipImageEncoder(nn.Module):
    def __init__(self, clip_model):
        super().__init__()
        self.clip = clip_model

    def forward(self, x):                 # x: B*3*H*W
        return self.clip.get_image_features(pixel_values=x)


In [16]:
class _Wrapper:
    def __init__(self, loader, fn_map):
        self.loader = loader
        self.fn_map = fn_map
        self.batch_size = loader.batch_size
        self.dataset = loader.dataset
    def __iter__(self):
        return self.fn_map(self.loader)
    def __len__(self):
        return len(self.loader)

class CLIPFineTuneExtractor(FineTuneMixin, FeatureExtractor):
    def __init__(self, model_name="openai/clip-vit-base-patch32"):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        self.processor = CLIPProcessor.from_pretrained(model_name)
        self.clip      = CLIPModel.from_pretrained(model_name).to(self.device)
        self.dim       = self.clip.config.projection_dim
        self.head_params = list(self.clip.visual_projection.parameters())
        self.base_params = list(self.clip.vision_model.parameters())

        for p in self.clip.text_model.parameters():
            p.requires_grad = False

        self.backbone = _ClipImageEncoder(self.clip)
        self.t2pil    = transforms.ToPILImage()

    def fit(self, train_loader):
        print("Fine tune CLIP")

        def _tensor_loader(dl):
            for x, y, pths in dl:
                pil = [self.t2pil(img.cpu()) for img in x]
                px  = self.processor(images=pil, return_tensors="pt", padding=True)["pixel_values"]
                yield px, y, pths

        wrapped = _Wrapper(train_loader, _tensor_loader)
        self.clip.train()
        self._fine_tune(
            wrapped,
            max_epochs=cfg.training.epochs,
            patience=2,
            lr_head=cfg.training.lr_head,
            lr_base=cfg.training.lr_backbone
        )

    @torch.no_grad()
    def encode(self, images):
        if isinstance(images, torch.Tensor):
            pil = [self.t2pil(img.cpu()) for img in images]
        else:
            pil = [Image.open(p).convert("RGB") for p in images]

        px = self.processor(images=pil, return_tensors="pt", padding=True)["pixel_values"].to(self.device)
        feats = self.clip.get_image_features(pixel_values=px)
        return F.normalize(feats, p=2, dim=-1).cpu().numpy().astype("float32")

In [24]:
class DINOv2Extractor(FeatureExtractor):
    def __init__(self, model_name: str = "vit_base_patch14_dinov2"):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.IMG = 518
        self.backbone = timm.create_model(
            model_name,
            pretrained=True,
            num_classes=0,
            img_size=self.IMG,          # задаём правильный размер тк дино на 518
        ).to(self.device).eval()

        self.dim = self.backbone.num_features

        self.tf = transforms.Compose([
            transforms.Resize(self.IMG, interpolation=transforms.InterpolationMode.BICUBIC),
            transforms.CenterCrop(self.IMG),
            transforms.ToTensor(),
            transforms.Normalize(mean=(0.485, 0.456, 0.406),
                                 std=(0.229, 0.224, 0.225)),
        ])

    def fit(self, *_): 
        pass

    @torch.no_grad()
    def encode(self, images):
        if isinstance(images, torch.Tensor):
            x = images.to(self.device)
            if x.shape[-1] != self.IMG:
                x = F.interpolate(x, size=self.IMG, mode="bicubic", align_corners=False)
        else:
            x = torch.stack([self.tf(Image.open(p).convert("RGB")) for p in images]).to(self.device)

        feats = self.backbone(x)            # B * 768
        feats = F.normalize(feats, dim=-1)
        return feats.cpu().numpy().astype("float32")

In [18]:
import faiss


def build_index(vecs: np.ndarray, metric: str = "cosine") -> faiss.Index:
    """
    vecs – (N,D) float32,
    metric: 'cosine' | 'l2'
    """
    vecs = np.ascontiguousarray(vecs, dtype="float32")

    if metric == "cosine":
        faiss.normalize_L2(vecs)
        index = faiss.IndexFlatIP(vecs.shape[1])
    elif metric == "l2":
        index = faiss.IndexFlatL2(vecs.shape[1])
    else:
        raise ValueError("метрика должна быть 'cosine' or 'l2'")

    index.add(vecs)
    return index

In [19]:
from tqdm.auto import tqdm
from sklearn.metrics import average_precision_score

def evaluate(extractor, train_loader, test_loader):
    """
    → Precision@5, mAP, faiss-index, rel_paths
    """
    extractor.fit(train_loader)

    # ---------- кодируем базу без shuffle ----------
    enc_loader = DataLoader(
        train_loader.dataset,
        batch_size=train_loader.batch_size,
        shuffle=False,
        num_workers=cfg.dataset.num_workers,
    )

    db_vecs, db_labels, rel_paths = [], [], []
    for x, y, paths in enc_loader:
        db_vecs.append(extractor.encode(x))
        db_labels.extend(y.numpy())
        rel_paths.extend([os.path.relpath(p, cfg.dataset.root) for p in paths])

    db_vecs   = np.vstack(db_vecs).astype("float32")
    db_labels = np.asarray(db_labels)

    if cfg.index.metric == "cosine":
        faiss.normalize_L2(db_vecs)
        index = faiss.IndexFlatIP(db_vecs.shape[1])
    else:
        index = faiss.IndexFlatL2(db_vecs.shape[1])
    index.add(db_vecs)

    # ---------- Precision@5 и mAP ----------
    k = cfg.evaluation.k
    prec_sum, ap_list = 0.0, []
    for x, y, _ in test_loader:
        q = extractor.encode(x)
        if cfg.index.metric == "cosine":
            faiss.normalize_L2(q)
        D, I = index.search(q, db_vecs.shape[0])        # полный ранжированный список
        for lbl, d_row, i_row in zip(y.numpy(), D, I):
            prec_sum += np.sum(db_labels[i_row[:k]] == lbl) / k
            rel = (db_labels[i_row] == lbl).astype(int)
            score = d_row if cfg.index.metric != "l2" else -d_row
            ap_list.append(average_precision_score(rel, score))

    precision = prec_sum / len(test_loader.dataset)
    mAP = float(np.mean(ap_list))
    return precision, mAP, index, rel_paths

In [26]:
from collections import Counter
from pathlib import Path
from datetime import datetime
import torch.nn.functional as F
from torch.utils.data import Subset

import timm

extractors = {
    'resnet50'      : ResNetExtractor(),
    'efficientnet_b0': EfficientNetExtractor(),
    'clip_zeroshot' : CLIPHFExtractor(),
    'metric_learning': MetricExtractor(),
    'clip_finetune' : CLIPFineTuneExtractor(),
    'dinov2'        : DINOv2Extractor()
}

train_loader = DataLoader(train_set, batch_size=cfg.training.batch_size, 
                          shuffle=False, num_workers=cfg.dataset.num_workers, drop_last=True)
test_loader  = DataLoader(test_set,  batch_size=cfg.training.batch_size, 
                          shuffle=False, num_workers=cfg.dataset.num_workers)
save_dir = Path("checkpoints")
save_dir.mkdir(exist_ok=True, parents=True)
scores = {}
for name, extractor in extractors.items():
    print(f"\n=== {name.upper()} ===")
    p, mAP, idx, rel_paths = evaluate(extractor, train_loader, test_loader)
    scores[name] = (p, mAP)
    print(f"Precision@{cfg.evaluation.k}: {p:.3f}")
    print(f"mAP@{cfg.evaluation.k}: {mAP:.3f}")
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    base  = f"{name}_{stamp}"

    if hasattr(extractor, "state_dict"):
        torch.save(extractor.state_dict(), save_dir / f"{base}.pth")
    elif hasattr(extractor, "backbone") and hasattr(extractor.backbone, "state_dict"):
        torch.save(extractor.backbone.state_dict(), save_dir / f"{base}.pth")

    faiss.write_index(idx, str(save_dir / f"{base}.faiss"))
    np.save(save_dir / f"{base}_paths.npy", np.array(rel_paths, dtype=object))
    print(f"Saved: {base}.pth / .faiss / _paths.npy")


=== RESNET50 ===
Fine tune ResNet50


  0%|          | 0/97 [00:00<?, ?it/s]

Epoch 01: train loss=0.9206 acc=0.719 | val loss=0.8580 acc=0.697


  0%|          | 0/97 [00:00<?, ?it/s]

Epoch 02: train loss=0.3313 acc=0.895 | val loss=0.6186 acc=0.766


  0%|          | 0/97 [00:00<?, ?it/s]

Epoch 03: train loss=0.1885 acc=0.944 | val loss=0.4813 acc=0.835


  0%|          | 0/97 [00:00<?, ?it/s]

Epoch 04: train loss=0.1232 acc=0.963 | val loss=0.4736 acc=0.812


  0%|          | 0/97 [00:00<?, ?it/s]

Epoch 05: train loss=0.0936 acc=0.973 | val loss=0.4357 acc=0.818


  0%|          | 0/97 [00:00<?, ?it/s]

Epoch 06: train loss=0.0528 acc=0.990 | val loss=0.4264 acc=0.841


  0%|          | 0/97 [00:00<?, ?it/s]

Epoch 07: train loss=0.0438 acc=0.990 | val loss=0.4688 acc=0.838


  0%|          | 0/97 [00:00<?, ?it/s]

Epoch 08: train loss=0.0347 acc=0.992 | val loss=0.4347 acc=0.844


  0%|          | 0/97 [00:00<?, ?it/s]

Epoch 09: train loss=0.0326 acc=0.991 | val loss=0.4285 acc=0.853


  0%|          | 0/97 [00:00<?, ?it/s]

Epoch 10: train loss=0.0223 acc=0.996 | val loss=0.4259 acc=0.858


  0%|          | 0/97 [00:00<?, ?it/s]

Epoch 11: train loss=0.0197 acc=0.997 | val loss=0.4393 acc=0.847


  0%|          | 0/97 [00:00<?, ?it/s]

Epoch 12: train loss=0.0153 acc=0.999 | val loss=0.4297 acc=0.867


  0%|          | 0/97 [00:00<?, ?it/s]

Epoch 13: train loss=0.0150 acc=0.997 | val loss=0.4544 acc=0.853


  0%|          | 0/97 [00:00<?, ?it/s]

Epoch 14: train loss=0.0141 acc=0.998 | val loss=0.4227 acc=0.864


  0%|          | 0/97 [00:00<?, ?it/s]

Epoch 15: train loss=0.0162 acc=0.997 | val loss=0.4565 acc=0.850
Early stopping triggered
Best val acc=0.867 (epoch 12)
Precision@5: 0.926
mAP@5: 0.826
Saved: resnet50_20250709_131618.pth / .faiss / _paths.npy

=== EFFICIENTNET_B0 ===
Fine tune EfficientNet-B0


  0%|          | 0/97 [00:00<?, ?it/s]

  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Epoch 01: train loss=0.5469 acc=0.821 | val loss=0.4323 acc=0.870


  0%|          | 0/97 [00:00<?, ?it/s]

Epoch 02: train loss=0.1488 acc=0.954 | val loss=0.4558 acc=0.858


  0%|          | 0/97 [00:00<?, ?it/s]

Epoch 03: train loss=0.0574 acc=0.986 | val loss=0.4605 acc=0.867


  0%|          | 0/97 [00:00<?, ?it/s]

Epoch 04: train loss=0.0280 acc=0.993 | val loss=0.4595 acc=0.882


  0%|          | 0/97 [00:00<?, ?it/s]

Epoch 05: train loss=0.0228 acc=0.993 | val loss=0.4137 acc=0.887


  0%|          | 0/97 [00:00<?, ?it/s]

Epoch 06: train loss=0.0115 acc=0.997 | val loss=0.4392 acc=0.896


  0%|          | 0/97 [00:00<?, ?it/s]

Epoch 07: train loss=0.0103 acc=0.998 | val loss=0.4554 acc=0.893


  0%|          | 0/97 [00:00<?, ?it/s]

Epoch 08: train loss=0.0108 acc=0.997 | val loss=0.5013 acc=0.879


  0%|          | 0/97 [00:00<?, ?it/s]

Epoch 09: train loss=0.0103 acc=0.997 | val loss=0.4327 acc=0.893
Early stopping triggered
Best val acc=0.896 (epoch 6)
Precision@5: 0.938
mAP@5: 0.895
Saved: efficientnet_b0_20250709_132110.pth / .faiss / _paths.npy

=== CLIP_ZEROSHOT ===


  return F.linear(input, self.weight, self.bias)
  return F.linear(input, self.weight, self.bias)
  return F.linear(input, self.weight, self.bias)
  return F.linear(input, self.weight, self.bias)
  return F.linear(input, self.weight, self.bias)
  return F.linear(input, self.weight, self.bias)
  return F.linear(input, self.weight, self.bias)
  return F.linear(input, self.weight, self.bias)
  return F.linear(input, self.weight, self.bias)
  return F.linear(input, self.weight, self.bias)
  return F.linear(input, self.weight, self.bias)
  return F.linear(input, self.weight, self.bias)
  return F.linear(input, self.weight, self.bias)
  return F.linear(input, self.weight, self.bias)
  return F.linear(input, self.weight, self.bias)
  return F.linear(input, self.weight, self.bias)
  return F.linear(input, self.weight, self.bias)
  return F.linear(input, self.weight, self.bias)
  return F.linear(input, self.weight, self.bias)
  return F.linear(input, self.weight, self.bias)
  return F.linear(in

Precision@5: 0.548
mAP@5: 0.279
Saved: clip_zeroshot_20250709_132153.pth / .faiss / _paths.npy

=== METRIC_LEARNING ===


  return F.linear(input, self.weight, self.bias)
  return F.linear(input, self.weight, self.bias)


E01 train:   0%|          | 0/97 [00:00<?, ?it/s]

  return _VF.cdist(x1, x2, p, None)  # type: ignore[attr-defined]
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Epoch 01: loss=0.0957 | val kNN@1=0.096


E02 train:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 02: loss=0.0915 | val kNN@1=0.171


E03 train:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 03: loss=0.0899 | val kNN@1=0.183


E04 train:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 04: loss=0.0869 | val kNN@1=0.145


E05 train:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 05: loss=0.0857 | val kNN@1=0.197


E06 train:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 06: loss=0.0856 | val kNN@1=0.188


E07 train:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 07: loss=0.0783 | val kNN@1=0.197


E08 train:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 08: loss=0.0822 | val kNN@1=0.171


E09 train:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 09: loss=0.0721 | val kNN@1=0.197
Early stopping.
Best val kNN@1 = 0.197
Precision@5: 0.344
mAP@5: 0.254
Saved: metric_learning_20250709_132846.pth / .faiss / _paths.npy

=== CLIP_FINETUNE ===
Fine tune CLIP


  0%|          | 0/97 [00:00<?, ?it/s]

  return F.linear(input, self.weight, self.bias)
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Epoch 01: train loss=1.1290 acc=0.602 | val loss=1.2541 acc=0.338


  0%|          | 0/97 [00:00<?, ?it/s]

Epoch 02: train loss=0.6591 acc=0.750 | val loss=1.4468 acc=0.263


  0%|          | 0/97 [00:00<?, ?it/s]

Epoch 03: train loss=0.4783 acc=0.833 | val loss=0.7082 acc=0.728


  0%|          | 0/97 [00:00<?, ?it/s]

Epoch 04: train loss=0.3343 acc=0.885 | val loss=1.2944 acc=0.581


  0%|          | 0/97 [00:00<?, ?it/s]

Epoch 05: train loss=0.2209 acc=0.924 | val loss=1.1196 acc=0.688
Early stopping triggered
Best val acc=0.728 (epoch 3)


  return F.linear(input, self.weight, self.bias)
  return F.linear(input, self.weight, self.bias)
  return F.linear(input, self.weight, self.bias)
  return F.linear(input, self.weight, self.bias)
  return F.linear(input, self.weight, self.bias)
  return F.linear(input, self.weight, self.bias)
  return F.linear(input, self.weight, self.bias)
  return F.linear(input, self.weight, self.bias)
  return F.linear(input, self.weight, self.bias)
  return F.linear(input, self.weight, self.bias)
  return F.linear(input, self.weight, self.bias)
  return F.linear(input, self.weight, self.bias)
  return F.linear(input, self.weight, self.bias)
  return F.linear(input, self.weight, self.bias)
  return F.linear(input, self.weight, self.bias)
  return F.linear(input, self.weight, self.bias)
  return F.linear(input, self.weight, self.bias)
  return F.linear(input, self.weight, self.bias)
  return F.linear(input, self.weight, self.bias)
  return F.linear(input, self.weight, self.bias)
  return F.linear(in

Precision@5: 0.331
mAP@5: 0.238
Saved: clip_finetune_20250709_133223.pth / .faiss / _paths.npy

=== DINOV2 ===
Precision@5: 0.974
mAP@5: 0.869
Saved: dinov2_20250709_133628.pth / .faiss / _paths.npy


In [None]:
# best_name = max(scores, key=scores.get)
# extr = extractors[best_name]
# extr.fit(train_loader)


# all_paths = [s for _,_,paths in DataLoader(full_ds, batch_size=128) for s in paths]
# all_feats=[]
# for x,_,_ in DataLoader(full_ds, batch_size=128):
#     all_feats.append(extr.encode(x))
# all_feats = np.vstack(all_feats)
# index = build_index(all_feats, cfg.index.metric)
# faiss.write_index(index, f'{best_name}.faiss')

# # веса модели (если DL)
# torch.save(extr.backbone.state_dict(), f'{best_name}.pth')
# np.save('paths.npy', np.array(all_paths))


In [None]:
!zip -r /kaggle/working/output.zip /kaggle/working/output/