In [1]:
# https://platform.olimpiada-ai.ro/en/problems/73

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import torch
import torch.nn.functional as F
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("image-feature-extraction", model="facebook/dinov2-large", return_tensors='pt')

2026-01-23 03:30:30.472861: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1769139030.704411      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1769139030.774922      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1769139031.330788      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1769139031.330830      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1769139031.330834      55 computation_placer.cc:177] computation placer alr

config.json:   0%|          | 0.00/549 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/436 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Device set to use cpu


In [3]:
def operate(big_path):
    query_embeddings = []
    query_indices = [int(x.split('.')[0]) for x in os.listdir(os.path.join(big_path, 'query'))]
    os.listdir(os.path.join(big_path, 'query'))
    for query_img_path in tqdm([os.path.join(big_path, 'query', f'{x}.png') for x in query_indices], desc='Query Embeddings'):
        emb = pipe(query_img_path)[0, 0, :]
        query_embeddings.append(emb)
    query_embeddings = torch.stack(query_embeddings)

    gallery_embeddings = []
    gallery_indices = [int(x.split('.')[0]) for x in os.listdir(os.path.join(big_path, 'gallery'))]
    for gallery_img_path in tqdm([os.path.join(big_path, 'gallery', f'{x}.png') for x in gallery_indices], desc='Gallery Embeddings'):
        emb = pipe(gallery_img_path)[0, 0, :]
        gallery_embeddings.append(emb)
    gallery_embeddings = torch.stack(gallery_embeddings)

    similarities = (F.normalize(query_embeddings, dim=1) @ F.normalize(gallery_embeddings, dim=1).T)

    _, ind = torch.topk(similarities, k=2, dim=1)
    chosen_gallery_indices = torch.tensor(gallery_indices)[ind[:, 1]]

    df = pd.DataFrame({
        'image_path': [os.path.join(os.path.join(*big_path.split('/')[-2:]), 'query', f'{x}.png') for x in query_indices],
        'label': chosen_gallery_indices
    })
    
    return df

In [4]:
valid_subm = operate('/kaggle/input/toilet-sign-matching/dataset/validation_set')
test_subm = operate('/kaggle/input/toilet-sign-matching/dataset/test_set')

subm = pd.concat([valid_subm, test_subm])

subm.to_csv("submission.csv", index=False)

subm.head()

Query Embeddings:   0%|          | 0/10 [00:00<?, ?it/s]

Gallery Embeddings:   0%|          | 0/20 [00:00<?, ?it/s]

Query Embeddings:   0%|          | 0/30 [00:00<?, ?it/s]

Gallery Embeddings:   0%|          | 0/60 [00:00<?, ?it/s]

Unnamed: 0,image_path,label
0,dataset/validation_set/query/4.png,19
1,dataset/validation_set/query/9.png,13
2,dataset/validation_set/query/1.png,2
3,dataset/validation_set/query/2.png,18
4,dataset/validation_set/query/10.png,20
