In [52]:
import pandas as pd
import numpy as np

In [53]:
train_df = pd.read_csv("train_tiger.csv")
train_df.head()

Unnamed: 0,file_name,individual_name,label,sequence,side,path,split,is_query,is_gallery
0,e923dc51c3d3f6ff66ddb56b03f6c535.JPG,0,0,34,right,../train/e923dc51c3d3f6ff66ddb56b03f6c535.JPG,validation,1.0,1.0
1,ad168605cf575df7071a8fd3243183d7.JPG,0,0,34,right,../train/ad168605cf575df7071a8fd3243183d7.JPG,validation,1.0,1.0
2,4c5e1e09c364d17cdce4ae943535e9b9.JPG,0,0,45,right,../train/4c5e1e09c364d17cdce4ae943535e9b9.JPG,validation,1.0,1.0
3,a2347dc80412aaf70f17ff7bdb1b87cb.JPG,0,0,45,right,../train/a2347dc80412aaf70f17ff7bdb1b87cb.JPG,validation,1.0,1.0
4,10d51e412312e8b6d28275e201b04a0e.JPG,0,0,45,right,../train/10d51e412312e8b6d28275e201b04a0e.JPG,validation,1.0,1.0


# Обучение модели

In [54]:
from pathlib import Path
from pprint import pprint
from typing import Tuple

import hydra
import pytorch_lightning as pl
from omegaconf import DictConfig
from oml.const import TCfg
from oml.datasets.images import get_retrieval_images_datasets
from oml.lightning.callbacks.metric import MetricValCallback
from oml.lightning.modules.extractor import ExtractorModule, ExtractorModuleDDP
from oml.lightning.pipelines.parser import (
    check_is_config_for_ddp,
    parse_logger_from_config,
    parse_ckpt_callback_from_config,
    parse_engine_params_from_config,
    parse_sampler_from_config,
    parse_scheduler_from_config,
)
from oml.metrics.embeddings import EmbeddingMetrics
from oml.registry.losses import get_criterion_by_cfg
from oml.registry.models import get_extractor_by_cfg
from oml.registry.optimizers import get_optimizer_by_cfg
from oml.registry.transforms import TRANSFORMS_REGISTRY, get_transforms_by_cfg
from oml.utils.misc import dictconfig_to_dict, set_global_seed
from torch.utils.data import DataLoader

import torch

import albumentations as albu
import cv2
from albumentations.pytorch import ToTensorV2
from oml.const import MEAN, PAD_COLOR, STD, TNormParam

In [55]:
from datetime import datetime

postfix = "metric_learning"

current_dateTime = datetime.now()
y = current_dateTime.year
month = current_dateTime.month
d = current_dateTime.day
hour = current_dateTime.hour
minute = current_dateTime.minute
s = current_dateTime.second
ms = current_dateTime.microsecond

cfg: TCfg = {
    "postfix": postfix,
    "seed": 42,
    "image_size": 224,
    "accelerator": "gpu",
    "devices": 1, 
    "dataframe_name": "train_tiger.csv",
    "dataset_root": "./",
    "logs_root": "logs/",
    "logs_folder": f"{y}-{month}-{d}-{hour}-{minute}-{s}-{ms}_{postfix}",
    "num_workers": 4,
    "cache_size": 0,
    "sampler": None,
    "bs_train": 32,
    "bs_val": 64,  
    "max_epochs": 7,  # number of epochs to train
    "valid_period": 1, 
    "save_dir": ".",

    "metric_args": {
        "metrics_to_exclude_from_visualization": ["cmc"],
        "map_top_k": [1, 3, 5], 
        "return_only_overall_category": False,
        "visualize_only_overall_category": False
    },

    "log_images": True,
    "metric_for_checkpointing": "OVERALL/map/5",


    "extractor":{
        "name": "vit",
        "args":{
            "arch": "vitl14",
            # "gem_p": 3.0,
            # "remove_fc": True,
            "normalise_features": False,
            "weights": "vitl14_dinov2",
        },
    },

    "criterion": {
        "name": "arcface",
        "args":{
            "smoothing_epsilon": 0.0,
            # "m": 0.4,
            # "s": 64,
            "in_features": 1024,
            "num_classes": 6626,
        },
    },

    "optimizer":{
        "name": "adam",
        "args":{
            "lr": 1e-5,
        },
    },

    "scheduling": None,
    "logger":{
        "name": "tensorboard",  
        "args":{
            "save_dir": "."
        }
    }
}


In [56]:
def get_transforms(im_size: int, mean: TNormParam = MEAN, std: TNormParam = STD) -> albu.Compose:
    """
    Use default oml albu augs, but without HorizontalFlip.
    :param im_size:
    :param mean:
    :param std:
    :return:
    """
    return albu.Compose(
        [
            albu.LongestMaxSize(max_size=im_size),
            albu.PadIfNeeded(
                min_height=im_size,
                min_width=im_size,
                border_mode=cv2.BORDER_CONSTANT,
                value=PAD_COLOR,
            ),
            albu.Normalize(mean=mean, std=std),
            ToTensorV2(),
        ],
    )

In [57]:
def get_retrieval_loaders(cfg: TCfg) -> Tuple[DataLoader, DataLoader]:
    train_dataset, valid_dataset = get_retrieval_images_datasets(
        dataset_root=Path(cfg['dataset_root']),
        transforms_train=get_transforms(cfg['image_size']),
        transforms_val=get_transforms(cfg['image_size']),
        dataframe_name=cfg['dataframe_name'],
        cache_size=cfg['cache_size'],
        verbose=cfg.get('show_dataset_warnings', True),
    )    

    loader_train = DataLoader(
        dataset=train_dataset,
        num_workers=cfg['num_workers'],
        batch_size=cfg['bs_train'],
        drop_last=True,
        shuffle=True,
    )

    loader_val = DataLoader(dataset=valid_dataset, batch_size=cfg['bs_val'], num_workers=cfg['num_workers'])

    return loader_train, loader_val


In [58]:
def extractor_training_pipeline(cfg: TCfg) -> None:
    set_global_seed(cfg['seed'])

    cfg = dictconfig_to_dict(cfg)
    print(cfg)
    
    logger = parse_logger_from_config(cfg)
    logger.log_pipeline_info(cfg)

    loader_train, loaders_val = get_retrieval_loaders(cfg)
    extractor = get_extractor_by_cfg(cfg['extractor'])
    criterion = get_criterion_by_cfg(cfg['criterion'], **{'label2category': loader_train.dataset.get_label2category()})
    optimizable_parameters = [
        {'lr': cfg['optimizer']['args']['lr'], 'params': extractor.parameters()},
        {'lr': cfg['optimizer']['args']['lr'], 'params': criterion.parameters()},
    ]
    optimizer = get_optimizer_by_cfg(cfg['optimizer'], **{'params': optimizable_parameters})  # type: ignore

    module_kwargs = {}
    module_kwargs.update(parse_scheduler_from_config(cfg, optimizer=optimizer))
    module_constructor = ExtractorModule  # type: ignore

    pl_module = module_constructor(
        extractor=extractor,
        criterion=criterion,
        optimizer=optimizer,
        input_tensors_key=loader_train.dataset.input_tensors_key,
        labels_key=loader_train.dataset.labels_key,
        freeze_n_epochs=cfg.get('freeze_n_epochs', 0),
        **module_kwargs,
    )

    metrics_constructor = EmbeddingMetrics
    metrics_calc = metrics_constructor(
        dataset = loaders_val.dataset,
        **cfg.get('metric_args', {}),
    )


    metrics_clb_constructor = MetricValCallback
    metrics_clb = metrics_clb_constructor(
        metric=metrics_calc,
        log_images=cfg.get('log_images', False),
    )

    trainer = pl.Trainer(
        max_epochs=cfg['max_epochs'],
        num_sanity_val_steps=0,
        check_val_every_n_epoch=cfg['valid_period'],
        default_root_dir=str(Path.cwd()),
        enable_checkpointing=True,
        enable_progress_bar=True,
        enable_model_summary=True,
        callbacks=[metrics_clb, parse_ckpt_callback_from_config(cfg)],
        logger=logger,
        precision=16,
        # **trainer_engine_params,
        **cfg.get('lightning_trainer_extra_args', {}),
    )

    trainer.fit(model=pl_module, train_dataloaders=loader_train, val_dataloaders=loaders_val)


In [59]:
extractor_training_pipeline(cfg)

{'postfix': 'metric_learning', 'seed': 42, 'image_size': 224, 'accelerator': 'gpu', 'devices': 1, 'dataframe_name': 'train_tiger.csv', 'dataset_root': './', 'logs_root': 'logs/', 'logs_folder': '2025-4-9-14-48-39-78694_metric_learning', 'num_workers': 4, 'cache_size': 0, 'sampler': None, 'bs_train': 32, 'bs_val': 64, 'max_epochs': 7, 'valid_period': 1, 'save_dir': '.', 'metric_args': {'metrics_to_exclude_from_visualization': ['cmc'], 'map_top_k': [1, 3, 5], 'return_only_overall_category': False, 'visualize_only_overall_category': False}, 'log_images': True, 'metric_for_checkpointing': 'OVERALL/map/5', 'extractor': {'name': 'vit', 'args': {'arch': 'vitl14', 'normalise_features': False, 'weights': 'vitl14_dinov2'}}, 'criterion': {'name': 'arcface', 'args': {'smoothing_epsilon': 0.0, 'in_features': 1024, 'num_classes': 6626}}, 'optimizer': {'name': 'adam', 'args': {'lr': 1e-05}}, 'scheduling': None, 'logger': {'name': 'tensorboard', 'args': {'save_dir': '.'}}}


  albu.PadIfNeeded(


https://dl.fbaipublicfiles.com/dinov2/dinov2_vitl14/dinov2_vitl14_pretrain.pth
Checkpoint is already here.


/venv/main/lib/python3.10/site-packages/lightning_fabric/connector.py:571: `precision=16` is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/venv/main/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /baseline/checkpoints exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type         | Params | Mode 
---------------------------------------------------
0 | model     | ViTExtractor | 304 M  | train
1 | criterion | ArcFaceLoss  | 6.8 M  | train
---------------------------------------------------
311 M     Trainable params
0         Non-trainable params
311 M     Total params
1,244.615 Total estimated model params size (MB)
418       Modules in train mode
0         Modules in eval mod

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Finding nearest neighbors.:   0%|          | 0/1 [00:00<?, ?it/s]

CMC@5:   0%|          | 0/2160 [00:00<?, ?it/s]

Precision@5:   0%|          | 0/2160 [00:00<?, ?it/s]

MAP@1:   0%|          | 0/2160 [00:00<?, ?it/s]

MAP@3:   0%|          | 0/2160 [00:00<?, ?it/s]

MAP@5:   0%|          | 0/2160 [00:00<?, ?it/s]


Metrics:
{'OVERALL': {'cmc': {5: tensor(0.8389)},
             'map': {1: tensor(0.6315), 3: tensor(0.6897), 5: tensor(0.6860)},
             'pcf': {0.5: tensor(0.0107)},
             'precision': {5: tensor(0.5607)}}}


Epoch 0, global step 170: 'OVERALL/map/5' reached 0.68597 (best 0.68597), saving model to '/baseline/checkpoints/best-v4.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Finding nearest neighbors.:   0%|          | 0/1 [00:00<?, ?it/s]

CMC@5:   0%|          | 0/2160 [00:00<?, ?it/s]

Precision@5:   0%|          | 0/2160 [00:00<?, ?it/s]

MAP@1:   0%|          | 0/2160 [00:00<?, ?it/s]

MAP@3:   0%|          | 0/2160 [00:00<?, ?it/s]

MAP@5:   0%|          | 0/2160 [00:00<?, ?it/s]


Metrics:
{'OVERALL': {'cmc': {5: tensor(0.8745)},
             'map': {1: tensor(0.7426), 3: tensor(0.7769), 5: tensor(0.7728)},
             'pcf': {0.5: tensor(0.0156)},
             'precision': {5: tensor(0.6877)}}}


Epoch 1, global step 340: 'OVERALL/map/5' reached 0.77279 (best 0.77279), saving model to '/baseline/checkpoints/best-v4.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Finding nearest neighbors.:   0%|          | 0/1 [00:00<?, ?it/s]

CMC@5:   0%|          | 0/2160 [00:00<?, ?it/s]

Precision@5:   0%|          | 0/2160 [00:00<?, ?it/s]

MAP@1:   0%|          | 0/2160 [00:00<?, ?it/s]

MAP@3:   0%|          | 0/2160 [00:00<?, ?it/s]

MAP@5:   0%|          | 0/2160 [00:00<?, ?it/s]


Metrics:
{'OVERALL': {'cmc': {5: tensor(0.8903)},
             'map': {1: tensor(0.7806), 3: tensor(0.8115), 5: tensor(0.8055)},
             'pcf': {0.5: tensor(0.0176)},
             'precision': {5: tensor(0.7345)}}}


Epoch 2, global step 510: 'OVERALL/map/5' reached 0.80553 (best 0.80553), saving model to '/baseline/checkpoints/best-v4.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Finding nearest neighbors.:   0%|          | 0/1 [00:00<?, ?it/s]

CMC@5:   0%|          | 0/2160 [00:00<?, ?it/s]

Precision@5:   0%|          | 0/2160 [00:00<?, ?it/s]

MAP@1:   0%|          | 0/2160 [00:00<?, ?it/s]

MAP@3:   0%|          | 0/2160 [00:00<?, ?it/s]

MAP@5:   0%|          | 0/2160 [00:00<?, ?it/s]


Metrics:
{'OVERALL': {'cmc': {5: tensor(0.9014)},
             'map': {1: tensor(0.7884), 3: tensor(0.8228), 5: tensor(0.8171)},
             'pcf': {0.5: tensor(0.0205)},
             'precision': {5: tensor(0.7493)}}}


Epoch 3, global step 680: 'OVERALL/map/5' reached 0.81713 (best 0.81713), saving model to '/baseline/checkpoints/best-v4.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Finding nearest neighbors.:   0%|          | 0/1 [00:00<?, ?it/s]

CMC@5:   0%|          | 0/2160 [00:00<?, ?it/s]

Precision@5:   0%|          | 0/2160 [00:00<?, ?it/s]

MAP@1:   0%|          | 0/2160 [00:00<?, ?it/s]

MAP@3:   0%|          | 0/2160 [00:00<?, ?it/s]

MAP@5:   0%|          | 0/2160 [00:00<?, ?it/s]


Metrics:
{'OVERALL': {'cmc': {5: tensor(0.9083)},
             'map': {1: tensor(0.7954), 3: tensor(0.8243), 5: tensor(0.8219)},
             'pcf': {0.5: tensor(0.0215)},
             'precision': {5: tensor(0.7566)}}}


Epoch 4, global step 850: 'OVERALL/map/5' reached 0.82185 (best 0.82185), saving model to '/baseline/checkpoints/best-v4.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Finding nearest neighbors.:   0%|          | 0/1 [00:00<?, ?it/s]

CMC@5:   0%|          | 0/2160 [00:00<?, ?it/s]

Precision@5:   0%|          | 0/2160 [00:00<?, ?it/s]

MAP@1:   0%|          | 0/2160 [00:00<?, ?it/s]

MAP@3:   0%|          | 0/2160 [00:00<?, ?it/s]

MAP@5:   0%|          | 0/2160 [00:00<?, ?it/s]


Metrics:
{'OVERALL': {'cmc': {5: tensor(0.8977)},
             'map': {1: tensor(0.7972), 3: tensor(0.8256), 5: tensor(0.8237)},
             'pcf': {0.5: tensor(0.0215)},
             'precision': {5: tensor(0.7625)}}}


Epoch 5, global step 1020: 'OVERALL/map/5' reached 0.82367 (best 0.82367), saving model to '/baseline/checkpoints/best-v4.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Finding nearest neighbors.:   0%|          | 0/1 [00:00<?, ?it/s]

CMC@5:   0%|          | 0/2160 [00:00<?, ?it/s]

Precision@5:   0%|          | 0/2160 [00:00<?, ?it/s]

MAP@1:   0%|          | 0/2160 [00:00<?, ?it/s]

MAP@3:   0%|          | 0/2160 [00:00<?, ?it/s]

MAP@5:   0%|          | 0/2160 [00:00<?, ?it/s]


Metrics:
{'OVERALL': {'cmc': {5: tensor(0.9088)},
             'map': {1: tensor(0.7903), 3: tensor(0.8240), 5: tensor(0.8236)},
             'pcf': {0.5: tensor(0.0234)},
             'precision': {5: tensor(0.7645)}}}


Epoch 6, global step 1190: 'OVERALL/map/5' was not in top 1
`Trainer.fit` stopped: `max_epochs=7` reached.
