In [4]:
%pip install chromadb


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [5]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms
from PIL import Image, ImageFile
from pathlib import Path
from tqdm import tqdm
import json
import chromadb  # –ó–∞–º–µ–Ω–∏–ª–∏ faiss –Ω–∞ chromadb
from datetime import datetime

ImageFile.LOAD_TRUNCATED_IMAGES = True

device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
print(f"Device: {device}")


  import pynvml  # type: ignore[import]


Device: mps


## 1. –ó–∞–≥—Ä—É–∑–∫–∞ –¥–∞—Ç–∞—Å–µ—Ç–∞

In [6]:
# –ó–∞–≥—Ä—É–∂–∞–µ–º CSV —Å –∏–Ω—Ñ–æ—Ä–º–∞—Ü–∏–µ–π –æ–± –∏–∑–æ–±—Ä–∞–∂–µ–Ω–∏—è—Ö
df = pd.read_csv('../data/balanced_animals_dataset.csv')
print(f"–í—Å–µ–≥–æ –∏–∑–æ–±—Ä–∞–∂–µ–Ω–∏–π: {len(df)}")
print(f"\n–ö–ª–∞—Å—Å—ã: {df['scientific_name'].unique()}")
print(f"\n–†–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ –ø–æ –∫–ª–∞—Å—Å–∞–º:\n{df['scientific_name'].value_counts()}")

–í—Å–µ–≥–æ –∏–∑–æ–±—Ä–∞–∂–µ–Ω–∏–π: 12000

–ö–ª–∞—Å—Å—ã: ['Canis aureus' 'Canis familiaris' 'Canis familiaris dingo'
 'Canis latrans' 'Canis lupus']

–†–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ –ø–æ –∫–ª–∞—Å—Å–∞–º:
scientific_name
Canis aureus              2400
Canis familiaris          2400
Canis familiaris dingo    2400
Canis latrans             2400
Canis lupus               2400
Name: count, dtype: int64


## 2. –°–æ–∑–¥–∞–Ω–∏–µ —ç–∫—Å—Ç—Ä–∞–∫—Ç–æ—Ä–∞ —ç–º–±–µ–¥–¥–∏–Ω–≥–æ–≤

In [7]:
class EmbeddingExtractor(nn.Module):
    """–ú–æ–¥–µ–ª—å –¥–ª—è –∏–∑–≤–ª–µ—á–µ–Ω–∏—è —ç–º–±–µ–¥–¥–∏–Ω–≥–æ–≤ –∏–∑ EfficientNet V2 M"""
    
    def __init__(self, base_model):
        super().__init__()
        # –î–ª—è EfficientNet: features + avgpool
        self.features = base_model.features
        self.avgpool = base_model.avgpool
        
    def forward(self, x):
        x = self.features(x)
        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        return x


def load_embedding_extractor(checkpoint_path):
    """–ó–∞–≥—Ä—É–∂–∞–µ—Ç –º–æ–¥–µ–ª—å –∏ —Å–æ–∑–¥–∞–µ—Ç —ç–∫—Å—Ç—Ä–∞–∫—Ç–æ—Ä —ç–º–±–µ–¥–¥–∏–Ω–≥–æ–≤"""
    
    # –ó–∞–≥—Ä—É–∂–∞–µ–º checkpoint
    checkpoint = torch.load(checkpoint_path, map_location=device)
    params = checkpoint['params']
    
    print(f"–ó–∞–≥—Ä—É–∂–µ–Ω–∞ –º–æ–¥–µ–ª—å: {params['model']['name']}")
    print(f"–ö–æ–ª–∏—á–µ—Å—Ç–≤–æ –∫–ª–∞—Å—Å–æ–≤: {params['model']['num_classes']}")
    
    # –°–æ–∑–¥–∞–µ–º –±–∞–∑–æ–≤—É—é –º–æ–¥–µ–ª—å
    model_name = params['model']['name']
    num_classes = params['model']['num_classes']
    
    if model_name == 'efficientnet_v2_m':
        weights = getattr(models.EfficientNet_V2_M_Weights, params['model']['pretrained'])
        base_model = models.efficientnet_v2_m(weights=weights)
        num_features = base_model.classifier[1].in_features
        base_model.classifier[1] = nn.Linear(num_features, num_classes)
    else:
        raise ValueError(f"–ú–æ–¥–µ–ª—å {model_name} –Ω–µ –ø–æ–¥–¥–µ—Ä–∂–∏–≤–∞–µ—Ç—Å—è")
    
    # –ó–∞–≥—Ä—É–∂–∞–µ–º –≤–µ—Å–∞
    base_model.load_state_dict(checkpoint['model_state_dict'])
    
    # –°–æ–∑–¥–∞–µ–º —ç–∫—Å—Ç—Ä–∞–∫—Ç–æ—Ä —ç–º–±–µ–¥–¥–∏–Ω–≥–æ–≤ (–±–µ–∑ –∫–ª–∞—Å—Å–∏—Ñ–∏–∫–∞—Ü–∏–æ–Ω–Ω–æ–≥–æ —Å–ª–æ—è)
    embedding_extractor = EmbeddingExtractor(base_model)
    embedding_extractor.eval()
    embedding_extractor = embedding_extractor.to(device)
    
    # –û–ø—Ä–µ–¥–µ–ª—è–µ–º —Ä–∞–∑–º–µ—Ä–Ω–æ—Å—Ç—å —ç–º–±–µ–¥–¥–∏–Ω–≥–∞
    with torch.no_grad():
        dummy_input = torch.randn(1, 3, 224, 224).to(device)
        dummy_output = embedding_extractor(dummy_input)
        embedding_dim = dummy_output.shape[1]
    
    print(f"–†–∞–∑–º–µ—Ä–Ω–æ—Å—Ç—å —ç–º–±–µ–¥–¥–∏–Ω–≥–∞: {embedding_dim}")
    
    return embedding_extractor, embedding_dim, checkpoint

In [8]:
# –ó–∞–≥—Ä—É–∂–∞–µ–º –º–æ–¥–µ–ª—å
model_path = Path('../models/best_model.pth')
embedding_extractor, embedding_dim, checkpoint = load_embedding_extractor(model_path)

# –ò–∑–≤–ª–µ–∫–∞–µ–º –º–∞–ø–ø–∏–Ω–≥–∏ –∫–ª–∞—Å—Å–æ–≤
idx_to_label = checkpoint['idx_to_label']
label_to_idx = checkpoint['label_to_idx']

–ó–∞–≥—Ä—É–∂–µ–Ω–∞ –º–æ–¥–µ–ª—å: efficientnet_v2_m
–ö–æ–ª–∏—á–µ—Å—Ç–≤–æ –∫–ª–∞—Å—Å–æ–≤: 5
–†–∞–∑–º–µ—Ä–Ω–æ—Å—Ç—å —ç–º–±–µ–¥–¥–∏–Ω–≥–∞: 1280


## 3. –ü–æ–¥–≥–æ—Ç–æ–≤–∫–∞ DataLoader

In [9]:
class ImageDataset(Dataset):
    """Dataset –¥–ª—è –∑–∞–≥—Ä—É–∑–∫–∏ –∏–∑–æ–±—Ä–∞–∂–µ–Ω–∏–π"""
    
    def __init__(self, dataframe, transform=None):
        self.df = dataframe.reset_index(drop=True)
        self.transform = transform
        self.base_path = Path('../animal_images')
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        species = row['scientific_name'].replace(' ', '_')
        img_path = self.base_path / species / f"{row['uuid']}.jpg"
        
        try:
            image = Image.open(img_path).convert('RGB')
        except Exception as e:
            print(f"–û—à–∏–±–∫–∞ –∑–∞–≥—Ä—É–∑–∫–∏ {img_path}: {e}")
            image = Image.new('RGB', (224, 224), color=(0, 0, 0))
        
        if self.transform:
            image = self.transform(image)
        
        # –í–æ–∑–≤—Ä–∞—â–∞–µ–º –∏–∑–æ–±—Ä–∞–∂–µ–Ω–∏–µ –∏ –º–µ—Ç–∞–¥–∞–Ω–Ω—ã–µ
        metadata = {
            'uuid': row['uuid'],
            'scientific_name': row['scientific_name'],
            'path': str(img_path.relative_to(Path('../'))),
            'index': idx
        }
        
        return image, metadata


# –¢—Ä–∞–Ω—Å—Ñ–æ—Ä–º–∞—Ü–∏–∏ (—Ç–∞–∫–∏–µ –∂–µ –∫–∞–∫ –ø—Ä–∏ –≤–∞–ª–∏–¥–∞—Ü–∏–∏)
transform = transforms.Compose([
    transforms.Resize(384),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# –°–æ–∑–¥–∞–µ–º dataset –∏ dataloader
dataset = ImageDataset(df, transform=transform)
dataloader = DataLoader(dataset, batch_size=32, shuffle=False, num_workers=0)

print(f"–°–æ–∑–¥–∞–Ω DataLoader —Å {len(dataset)} –∏–∑–æ–±—Ä–∞–∂–µ–Ω–∏—è–º–∏")

–°–æ–∑–¥–∞–Ω DataLoader —Å 12000 –∏–∑–æ–±—Ä–∞–∂–µ–Ω–∏—è–º–∏


## 4. –ò–∑–≤–ª–µ—á–µ–Ω–∏–µ —ç–º–±–µ–¥–¥–∏–Ω–≥–æ–≤

In [10]:
def extract_embeddings(model, dataloader, device):
    """–ò–∑–≤–ª–µ–∫–∞–µ—Ç —ç–º–±–µ–¥–¥–∏–Ω–≥–∏ –¥–ª—è –≤—Å–µ—Ö –∏–∑–æ–±—Ä–∞–∂–µ–Ω–∏–π"""
    
    all_embeddings = []
    all_metadata = []
    
    model.eval()
    
    with torch.no_grad():
        for images, metadata_batch in tqdm(dataloader, desc="–ò–∑–≤–ª–µ—á–µ–Ω–∏–µ —ç–º–±–µ–¥–¥–∏–Ω–≥–æ–≤"):
            images = images.to(device)
            
            # –ü–æ–ª—É—á–∞–µ–º —ç–º–±–µ–¥–¥–∏–Ω–≥–∏
            embeddings = model(images)
            
            embeddings_np = embeddings.cpu().numpy().astype(np.float32)
            embeddings_np = np.ascontiguousarray(embeddings_np)
            all_embeddings.append(embeddings_np)
            
            # –°–æ—Ö—Ä–∞–Ω—è–µ–º –º–µ—Ç–∞–¥–∞–Ω–Ω—ã–µ
            for i in range(len(metadata_batch['uuid'])):
                meta = {
                    'uuid': metadata_batch['uuid'][i],
                    'scientific_name': metadata_batch['scientific_name'][i],
                    'path': metadata_batch['path'][i],
                    'embedding_index': len(all_metadata)
                }
                all_metadata.append(meta)
    
    # –û–±—ä–µ–¥–∏–Ω—è–µ–º –≤—Å–µ —ç–º–±–µ–¥–¥–∏–Ω–≥–∏
    all_embeddings = np.vstack(all_embeddings)
    all_embeddings = np.ascontiguousarray(all_embeddings, dtype=np.float32)
    
    print(f"\n–ò–∑–≤–ª–µ—á–µ–Ω–æ —ç–º–±–µ–¥–¥–∏–Ω–≥–æ–≤: {all_embeddings.shape}")
    print(f"–ú–µ—Ç–∞–¥–∞–Ω–Ω—ã—Ö: {len(all_metadata)}")
    print(f"–¢–∏–ø –¥–∞–Ω–Ω—ã—Ö: {all_embeddings.dtype}")
    print(f"Contiguous: {all_embeddings.flags['C_CONTIGUOUS']}")
    
    return all_embeddings, all_metadata


In [11]:
# –ò–∑–≤–ª–µ–∫–∞–µ–º —ç–º–±–µ–¥–¥–∏–Ω–≥–∏
embeddings, metadata = extract_embeddings(embedding_extractor, dataloader, device)

print(f"\n–§–æ—Ä–º–∞ –º–∞—Å—Å–∏–≤–∞ —ç–º–±–µ–¥–¥–∏–Ω–≥–æ–≤: {embeddings.shape}")
print(f"–¢–∏–ø –¥–∞–Ω–Ω—ã—Ö: {embeddings.dtype}")
print(f"\n–ü—Ä–∏–º–µ—Ä –º–µ—Ç–∞–¥–∞–Ω–Ω—ã—Ö –ø–µ—Ä–≤–æ–≥–æ –∏–∑–æ–±—Ä–∞–∂–µ–Ω–∏—è:")
print(json.dumps(metadata[0], indent=2, ensure_ascii=False))

–ò–∑–≤–ª–µ—á–µ–Ω–∏–µ —ç–º–±–µ–¥–¥–∏–Ω–≥–æ–≤: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 375/375 [01:59<00:00,  3.13it/s]


–ò–∑–≤–ª–µ—á–µ–Ω–æ —ç–º–±–µ–¥–¥–∏–Ω–≥–æ–≤: (12000, 1280)
–ú–µ—Ç–∞–¥–∞–Ω–Ω—ã—Ö: 12000
–¢–∏–ø –¥–∞–Ω–Ω—ã—Ö: float32
Contiguous: True

–§–æ—Ä–º–∞ –º–∞—Å—Å–∏–≤–∞ —ç–º–±–µ–¥–¥–∏–Ω–≥–æ–≤: (12000, 1280)
–¢–∏–ø –¥–∞–Ω–Ω—ã—Ö: float32

–ü—Ä–∏–º–µ—Ä –º–µ—Ç–∞–¥–∞–Ω–Ω—ã—Ö –ø–µ—Ä–≤–æ–≥–æ –∏–∑–æ–±—Ä–∞–∂–µ–Ω–∏—è:
{
  "uuid": "710acdf3-5470-44ca-a1ac-32fb42c95b70",
  "scientific_name": "Canis aureus",
  "path": "animal_images/Canis_aureus/710acdf3-5470-44ca-a1ac-32fb42c95b70.jpg",
  "embedding_index": 0
}





## 5. –ü–æ—Å—Ç—Ä–æ–µ–Ω–∏–µ chroma –∏–Ω–¥–µ–∫—Å–∞

In [12]:
def build_chromadb_collection(embeddings, metadata, collection_name="animal_embeddings"):
    """–°—Ç—Ä–æ–∏—Ç ChromaDB –∫–æ–ª–ª–µ–∫—Ü–∏—é –¥–ª—è –ø–æ–∏—Å–∫–∞ –ø–æ—Ö–æ–∂–∏—Ö –≤–µ–∫—Ç–æ—Ä–æ–≤
    
    Args:
        embeddings: numpy array —Å —ç–º–±–µ–¥–¥–∏–Ω–≥–∞–º–∏ (n_samples, embedding_dim)
        metadata: —Å–ø–∏—Å–æ–∫ —Å–ª–æ–≤–∞—Ä–µ–π —Å –º–µ—Ç–∞–¥–∞–Ω–Ω—ã–º–∏ –¥–ª—è –∫–∞–∂–¥–æ–≥–æ –∏–∑–æ–±—Ä–∞–∂–µ–Ω–∏—è
        collection_name: –Ω–∞–∑–≤–∞–Ω–∏–µ –∫–æ–ª–ª–µ–∫—Ü–∏–∏
    
    Returns:
        client: ChromaDB –∫–ª–∏–µ–Ω—Ç
        collection: ChromaDB –∫–æ–ª–ª–µ–∫—Ü–∏—è
    """
    
    print(f"–°–æ–∑–¥–∞–Ω–∏–µ ChromaDB –∫–æ–ª–ª–µ–∫—Ü–∏–∏ '{collection_name}'...")
    
    # –°–æ–∑–¥–∞–µ–º persistent –∫–ª–∏–µ–Ω—Ç (—Å–æ—Ö—Ä–∞–Ω—è–µ—Ç—Å—è –Ω–∞ –¥–∏—Å–∫)
    chroma_path = Path('../embeddings/chromadb')
    chroma_path.mkdir(parents=True, exist_ok=True)
    
    client = chromadb.PersistentClient(path=str(chroma_path))
    
    # –£–¥–∞–ª—è–µ–º –∫–æ–ª–ª–µ–∫—Ü–∏—é –µ—Å–ª–∏ —Å—É—â–µ—Å—Ç–≤—É–µ—Ç (–¥–ª—è —á–∏—Å—Ç–æ–≥–æ —Å—Ç–∞—Ä—Ç–∞)
    try:
        client.delete_collection(name=collection_name)
        print("–°—Ç–∞—Ä–∞—è –∫–æ–ª–ª–µ–∫—Ü–∏—è —É–¥–∞–ª–µ–Ω–∞")
    except:
        pass
    
    # –°–æ–∑–¥–∞–µ–º –Ω–æ–≤—É—é –∫–æ–ª–ª–µ–∫—Ü–∏—é —Å –∫–æ—Å–∏–Ω—É—Å–Ω—ã–º —Ä–∞—Å—Å—Ç–æ—è–Ω–∏–µ–º
    collection = client.create_collection(
        name=collection_name,
        metadata={"hnsw:space": "cosine"}  # cosine, l2, –∏–ª–∏ ip
    )
    
    # –ü–æ–¥–≥–æ—Ç–∞–≤–ª–∏–≤–∞–µ–º –¥–∞–Ω–Ω—ã–µ –¥–ª—è ChromaDB
    ids = [meta['uuid'] for meta in metadata]
    embeddings_list = embeddings.tolist()
    
    # –ü–æ–¥–≥–æ—Ç–∞–≤–ª–∏–≤–∞–µ–º –º–µ—Ç–∞–¥–∞–Ω–Ω—ã–µ (ChromaDB —Ç—Ä–µ–±—É–µ—Ç —Å—Ç—Ä–æ–∫–æ–≤—ã–µ –∑–Ω–∞—á–µ–Ω–∏—è)
    metadatas = [
        {
            'scientific_name': meta['scientific_name'],
            'path': meta['path'],
            'embedding_index': str(meta['embedding_index'])
        }
        for meta in metadata
    ]
    
    # –î–æ–±–∞–≤–ª—è–µ–º –¥–∞–Ω–Ω—ã–µ –±–∞—Ç—á–∞–º–∏ (ChromaDB —Ä–µ–∫–æ–º–µ–Ω–¥—É–µ—Ç –±–∞—Ç—á–∏ ~40000)
    batch_size = 1000
    num_batches = (len(ids) + batch_size - 1) // batch_size
    
    print(f"–î–æ–±–∞–≤–ª–µ–Ω–∏–µ {len(ids)} –≤–µ–∫—Ç–æ—Ä–æ–≤ –≤ {num_batches} –±–∞—Ç—á–∞—Ö...")
    
    for i in tqdm(range(0, len(ids), batch_size), desc="–ó–∞–≥—Ä—É–∑–∫–∞ –≤ ChromaDB"):
        batch_end = min(i + batch_size, len(ids))
        
        collection.add(
            ids=ids[i:batch_end],
            embeddings=embeddings_list[i:batch_end],
            metadatas=metadatas[i:batch_end]
        )
    
    print(f"‚úì –ö–æ–ª–ª–µ–∫—Ü–∏—è —Å–æ–∑–¥–∞–Ω–∞. –ö–æ–ª–∏—á–µ—Å—Ç–≤–æ –≤–µ–∫—Ç–æ—Ä–æ–≤: {collection.count()}")
    
    return client, collection


In [13]:
# –°—Ç—Ä–æ–∏–º ChromaDB –∫–æ–ª–ª–µ–∫—Ü–∏—é
client, collection = build_chromadb_collection(embeddings, metadata)

–°–æ–∑–¥–∞–Ω–∏–µ ChromaDB –∫–æ–ª–ª–µ–∫—Ü–∏–∏ 'animal_embeddings'...
–î–æ–±–∞–≤–ª–µ–Ω–∏–µ 12000 –≤–µ–∫—Ç–æ—Ä–æ–≤ –≤ 12 –±–∞—Ç—á–∞—Ö...


–ó–∞–≥—Ä—É–∑–∫–∞ –≤ ChromaDB: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 12/12 [00:06<00:00,  1.82it/s]

‚úì –ö–æ–ª–ª–µ–∫—Ü–∏—è —Å–æ–∑–¥–∞–Ω–∞. –ö–æ–ª–∏—á–µ—Å—Ç–≤–æ –≤–µ–∫—Ç–æ—Ä–æ–≤: 12000





## 6. –¢–µ—Å—Ç–∏—Ä–æ–≤–∞–Ω–∏–µ –ø–æ–∏—Å–∫–∞

In [14]:
def test_search(collection, metadata, test_idx=0, k=10):
    """–¢–µ—Å—Ç–∏—Ä—É–µ—Ç –ø–æ–∏—Å–∫ –ø–æ—Ö–æ–∂–∏—Ö –∏–∑–æ–±—Ä–∞–∂–µ–Ω–∏–π –≤ ChromaDB"""
    
    # –ü–æ–ª—É—á–∞–µ–º UUID —Ç–µ—Å—Ç–æ–≤–æ–≥–æ –∏–∑–æ–±—Ä–∞–∂–µ–Ω–∏—è
    query_uuid = metadata[test_idx]['uuid']
    
    print(f"\n–¢–µ—Å—Ç–æ–≤–æ–µ –∏–∑–æ–±—Ä–∞–∂–µ–Ω–∏–µ (–∏–Ω–¥–µ–∫—Å {test_idx}):")
    print(f"  UUID: {metadata[test_idx]['uuid']}")
    print(f"  –ö–ª–∞—Å—Å: {metadata[test_idx]['scientific_name']}")
    print(f"  –ü—É—Ç—å: {metadata[test_idx]['path']}")
    
    # –ü–æ–ª—É—á–∞–µ–º —ç–º–±–µ–¥–¥–∏–Ω–≥ –∏–∑ –∫–æ–ª–ª–µ–∫—Ü–∏–∏
    query_result = collection.get(
        ids=[query_uuid],
        include=['embeddings']
    )
    
    query_embedding = query_result['embeddings'][0]
    
    # –ò—â–µ–º k –±–ª–∏–∂–∞–π—à–∏—Ö —Å–æ—Å–µ–¥–µ–π
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=k,
        include=['metadatas', 'distances']
    )
    
    print(f"\n–¢–æ–ø-{k} –ø–æ—Ö–æ–∂–∏—Ö –∏–∑–æ–±—Ä–∞–∂–µ–Ω–∏–π:")
    for i, (uuid, distance, meta) in enumerate(zip(
        results['ids'][0], 
        results['distances'][0], 
        results['metadatas'][0]
    )):
        # –î–ª—è –∫–æ—Å–∏–Ω—É—Å–Ω–æ–≥–æ —Ä–∞—Å—Å—Ç–æ—è–Ω–∏—è: similarity = 1 - distance
        similarity = (1 - distance) * 100
        print(f"  {i+1}. –ü–æ—Ö–æ–∂–µ—Å—Ç—å: {similarity:.2f}% | –ö–ª–∞—Å—Å: {meta['scientific_name']} | UUID: {uuid}")
    
    return results


In [15]:
# –¢–µ—Å—Ç–∏—Ä—É–µ–º –ø–æ–∏—Å–∫ –Ω–∞ –Ω–µ—Å–∫–æ–ª—å–∫–∏—Ö –ø—Ä–∏–º–µ—Ä–∞—Ö
print("=" * 80)
print("–¢–ï–°–¢ 1: –ü–µ—Ä–≤–æ–µ –∏–∑–æ–±—Ä–∞–∂–µ–Ω–∏–µ")
print("=" * 80)
test_search(collection, metadata, test_idx=0, k=10)

print("\n" + "=" * 80)
print("–¢–ï–°–¢ 2: –°–ª—É—á–∞–π–Ω–æ–µ –∏–∑–æ–±—Ä–∞–∂–µ–Ω–∏–µ")
print("=" * 80)
random_idx = np.random.randint(0, len(metadata))
test_search(collection, metadata, test_idx=random_idx, k=10)


–¢–ï–°–¢ 1: –ü–µ—Ä–≤–æ–µ –∏–∑–æ–±—Ä–∞–∂–µ–Ω–∏–µ

–¢–µ—Å—Ç–æ–≤–æ–µ –∏–∑–æ–±—Ä–∞–∂–µ–Ω–∏–µ (–∏–Ω–¥–µ–∫—Å 0):
  UUID: 710acdf3-5470-44ca-a1ac-32fb42c95b70
  –ö–ª–∞—Å—Å: Canis aureus
  –ü—É—Ç—å: animal_images/Canis_aureus/710acdf3-5470-44ca-a1ac-32fb42c95b70.jpg

–¢–æ–ø-10 –ø–æ—Ö–æ–∂–∏—Ö –∏–∑–æ–±—Ä–∞–∂–µ–Ω–∏–π:
  1. –ü–æ—Ö–æ–∂–µ—Å—Ç—å: 100.00% | –ö–ª–∞—Å—Å: Canis aureus | UUID: 710acdf3-5470-44ca-a1ac-32fb42c95b70
  2. –ü–æ—Ö–æ–∂–µ—Å—Ç—å: 63.19% | –ö–ª–∞—Å—Å: Canis aureus | UUID: 06a9b4ca-e660-4bd4-906b-c88f88bd52b4
  3. –ü–æ—Ö–æ–∂–µ—Å—Ç—å: 58.98% | –ö–ª–∞—Å—Å: Canis aureus | UUID: 5fcc3a75-fc6f-45a8-9ae9-628bb59b16d9
  4. –ü–æ—Ö–æ–∂–µ—Å—Ç—å: 58.94% | –ö–ª–∞—Å—Å: Canis latrans | UUID: b6144f2b-a437-481a-8bcc-be838f77e7a8
  5. –ü–æ—Ö–æ–∂–µ—Å—Ç—å: 58.74% | –ö–ª–∞—Å—Å: Canis lupus | UUID: ddcbfea3-556a-44bd-aac4-72df84d98d56
  6. –ü–æ—Ö–æ–∂–µ—Å—Ç—å: 58.33% | –ö–ª–∞—Å—Å: Canis aureus | UUID: ba65859e-54e9-48ba-95eb-8a4ee9d3c945
  7. –ü–æ—Ö–æ–∂–µ—Å—Ç—å: 58.32% | –ö–ª–∞—Å—Å: Canis latrans | UUID

{'ids': [['ab495944-6f16-44f1-80b0-87ef3c547285',
   'b8fdc71f-6ac7-41c6-a031-6ba641aa90ae',
   'c3e0e713-cda4-4514-a2b3-c8d1dd450c7f',
   '91cd24b4-b01f-4f5d-8207-5dc842fe3f12',
   '04416934-7ce2-4cf9-aff0-aef73a88434f',
   'd5f3df05-2cd4-4c87-98eb-01ccd3b73840',
   '587463b4-4fe7-498f-ab8c-c97062bd4dd1',
   '4900678f-00b8-445d-b93c-9a084f4ca360',
   'fab4eea6-eff5-453a-a1a9-92dc6cec2146',
   'f0fbd7ae-d23d-43f6-9a9b-7a62e18f4066']],
 'embeddings': None,
 'documents': None,
 'uris': None,
 'included': ['metadatas', 'distances'],
 'data': None,
 'metadatas': [[{'path': 'animal_images/Canis_familiaris_dingo/ab495944-6f16-44f1-80b0-87ef3c547285.jpg',
    'embedding_index': '4882',
    'scientific_name': 'Canis familiaris dingo'},
   {'path': 'animal_images/Canis_latrans/b8fdc71f-6ac7-41c6-a031-6ba641aa90ae.jpg',
    'embedding_index': '8763',
    'scientific_name': 'Canis latrans'},
   {'embedding_index': '9518',
    'path': 'animal_images/Canis_latrans/c3e0e713-cda4-4514-a2b3-c8d1dd450c

## 7. –°–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ –∏–Ω–¥–µ–∫—Å–∞ –∏ –º–µ—Ç–∞–¥–∞–Ω–Ω—ã—Ö

In [16]:
# –°–æ–∑–¥–∞–µ–º –¥–∏—Ä–µ–∫—Ç–æ—Ä–∏—é –¥–ª—è —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∏—è
embeddings_dir = Path('../embeddings')
embeddings_dir.mkdir(exist_ok=True)

# ChromaDB —É–∂–µ —Å–æ—Ö—Ä–∞–Ω–µ–Ω –≤ persistent mode
print(f"‚úì ChromaDB —Å–æ—Ö—Ä–∞–Ω–µ–Ω –≤: {embeddings_dir / 'chromadb'}")

# –°–æ—Ö—Ä–∞–Ω—è–µ–º –º–µ—Ç–∞–¥–∞–Ω–Ω—ã–µ
metadata_dict = {
    'images': metadata,
    'metadata': {
        'total_images': len(metadata),
        'embedding_dim': embedding_dim,
        'model': checkpoint['params']['model']['name'],
        'use_cosine_similarity': True,
        'created_at': datetime.now().isoformat(),
        'classes': list(idx_to_label.values()),
        'collection_name': 'animal_embeddings'
    }
}

metadata_path = embeddings_dir / 'image_metadata.json'
with open(metadata_path, 'w', encoding='utf-8') as f:
    json.dump(metadata_dict, f, indent=2, ensure_ascii=False)
print(f"‚úì –ú–µ—Ç–∞–¥–∞–Ω–Ω—ã–µ —Å–æ—Ö—Ä–∞–Ω–µ–Ω—ã: {metadata_path}")

# –û–ø—Ü–∏–æ–Ω–∞–ª—å–Ω–æ: —Å–æ—Ö—Ä–∞–Ω—è–µ–º —Å–∞–º–∏ —ç–º–±–µ–¥–¥–∏–Ω–≥–∏ (–¥–ª—è –∞–Ω–∞–ª–∏–∑–∞)
embeddings_path = embeddings_dir / 'embeddings.npy'
np.save(embeddings_path, embeddings)
print(f"‚úì –≠–º–±–µ–¥–¥–∏–Ω–≥–∏ —Å–æ—Ö—Ä–∞–Ω–µ–Ω—ã: {embeddings_path}")

# –°—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞ —Ä–∞–∑–º–µ—Ä–æ–≤
import shutil
chroma_size = sum(f.stat().st_size for f in (embeddings_dir / 'chromadb').rglob('*') if f.is_file())
print(f"\n–í—Å–µ —Ñ–∞–π–ª—ã —Å–æ—Ö—Ä–∞–Ω–µ–Ω—ã –≤ –¥–∏—Ä–µ–∫—Ç–æ—Ä–∏–∏: {embeddings_dir}")
print(f"–†–∞–∑–º–µ—Ä ChromaDB: {chroma_size / 1024 / 1024:.2f} MB")
print(f"–†–∞–∑–º–µ—Ä –º–µ—Ç–∞–¥–∞–Ω–Ω—ã—Ö: {metadata_path.stat().st_size / 1024:.2f} KB")
print(f"–†–∞–∑–º–µ—Ä —ç–º–±–µ–¥–¥–∏–Ω–≥–æ–≤: {embeddings_path.stat().st_size / 1024 / 1024:.2f} MB")


‚úì ChromaDB —Å–æ—Ö—Ä–∞–Ω–µ–Ω –≤: ../embeddings/chromadb
‚úì –ú–µ—Ç–∞–¥–∞–Ω–Ω—ã–µ —Å–æ—Ö—Ä–∞–Ω–µ–Ω—ã: ../embeddings/image_metadata.json
‚úì –≠–º–±–µ–¥–¥–∏–Ω–≥–∏ —Å–æ—Ö—Ä–∞–Ω–µ–Ω—ã: ../embeddings/embeddings.npy

–í—Å–µ —Ñ–∞–π–ª—ã —Å–æ—Ö—Ä–∞–Ω–µ–Ω—ã –≤ –¥–∏—Ä–µ–∫—Ç–æ—Ä–∏–∏: ../embeddings
–†–∞–∑–º–µ—Ä ChromaDB: 73.82 MB
–†–∞–∑–º–µ—Ä –º–µ—Ç–∞–¥–∞–Ω–Ω—ã—Ö: 2680.17 KB
–†–∞–∑–º–µ—Ä —ç–º–±–µ–¥–¥–∏–Ω–≥–æ–≤: 58.59 MB


## 8. –ü—Ä–æ–≤–µ—Ä–∫–∞ –∑–∞–≥—Ä—É–∑–∫–∏

In [17]:
# –ü—Ä–æ–≤–µ—Ä—è–µ–º, —á—Ç–æ ChromaDB –º–æ–∂–Ω–æ –∑–∞–≥—Ä—É–∑–∏—Ç—å –æ–±—Ä–∞—Ç–Ω–æ
print("–ü—Ä–æ–≤–µ—Ä–∫–∞ –∑–∞–≥—Ä—É–∑–∫–∏ ChromaDB...")

# –ó–∞–≥—Ä—É–∂–∞–µ–º —Å—É—â–µ—Å—Ç–≤—É—é—â–∏–π –∫–ª–∏–µ–Ω—Ç
loaded_client = chromadb.PersistentClient(path=str(embeddings_dir / 'chromadb'))
loaded_collection = loaded_client.get_collection(name="animal_embeddings")
print(f"‚úì ChromaDB –∑–∞–≥—Ä—É–∂–µ–Ω. –ö–æ–ª–∏—á–µ—Å—Ç–≤–æ –≤–µ–∫—Ç–æ—Ä–æ–≤: {loaded_collection.count()}")

with open(metadata_path, 'r', encoding='utf-8') as f:
    loaded_metadata = json.load(f)
print(f"‚úì –ú–µ—Ç–∞–¥–∞–Ω–Ω—ã–µ –∑–∞–≥—Ä—É–∂–µ–Ω—ã. –ö–æ–ª–∏—á–µ—Å—Ç–≤–æ –∏–∑–æ–±—Ä–∞–∂–µ–Ω–∏–π: {loaded_metadata['metadata']['total_images']}")

loaded_embeddings = np.load(embeddings_path)
print(f"‚úì –≠–º–±–µ–¥–¥–∏–Ω–≥–∏ –∑–∞–≥—Ä—É–∂–µ–Ω—ã. –§–æ—Ä–º–∞: {loaded_embeddings.shape}")

print("\n‚úÖ –í—Å–µ —Ñ–∞–π–ª—ã —É—Å–ø–µ—à–Ω–æ —Å–æ—Ö—Ä–∞–Ω–µ–Ω—ã –∏ –∑–∞–≥—Ä—É–∂–µ–Ω—ã!")

# –¢–µ—Å—Ç–æ–≤—ã–π –ø–æ–∏—Å–∫ –Ω–∞ –∑–∞–≥—Ä—É–∂–µ–Ω–Ω–æ–π –∫–æ–ª–ª–µ–∫—Ü–∏–∏
print("\n" + "=" * 80)
print("–¢–ï–°–¢: –ü–æ–∏—Å–∫ –≤ –∑–∞–≥—Ä—É–∂–µ–Ω–Ω–æ–π –∫–æ–ª–ª–µ–∫—Ü–∏–∏")
print("=" * 80)
test_idx = 100
test_search(loaded_collection, loaded_metadata['images'], test_idx=test_idx, k=5)


–ü—Ä–æ–≤–µ—Ä–∫–∞ –∑–∞–≥—Ä—É–∑–∫–∏ ChromaDB...
‚úì ChromaDB –∑–∞–≥—Ä—É–∂–µ–Ω. –ö–æ–ª–∏—á–µ—Å—Ç–≤–æ –≤–µ–∫—Ç–æ—Ä–æ–≤: 12000
‚úì –ú–µ—Ç–∞–¥–∞–Ω–Ω—ã–µ –∑–∞–≥—Ä—É–∂–µ–Ω—ã. –ö–æ–ª–∏—á–µ—Å—Ç–≤–æ –∏–∑–æ–±—Ä–∞–∂–µ–Ω–∏–π: 12000
‚úì –≠–º–±–µ–¥–¥–∏–Ω–≥–∏ –∑–∞–≥—Ä—É–∂–µ–Ω—ã. –§–æ—Ä–º–∞: (12000, 1280)

‚úÖ –í—Å–µ —Ñ–∞–π–ª—ã —É—Å–ø–µ—à–Ω–æ —Å–æ—Ö—Ä–∞–Ω–µ–Ω—ã –∏ –∑–∞–≥—Ä—É–∂–µ–Ω—ã!

–¢–ï–°–¢: –ü–æ–∏—Å–∫ –≤ –∑–∞–≥—Ä—É–∂–µ–Ω–Ω–æ–π –∫–æ–ª–ª–µ–∫—Ü–∏–∏

–¢–µ—Å—Ç–æ–≤–æ–µ –∏–∑–æ–±—Ä–∞–∂–µ–Ω–∏–µ (–∏–Ω–¥–µ–∫—Å 100):
  UUID: e14cf0ed-919f-4d09-8f9a-eff023047033
  –ö–ª–∞—Å—Å: Canis aureus
  –ü—É—Ç—å: animal_images/Canis_aureus/e14cf0ed-919f-4d09-8f9a-eff023047033.jpg

–¢–æ–ø-5 –ø–æ—Ö–æ–∂–∏—Ö –∏–∑–æ–±—Ä–∞–∂–µ–Ω–∏–π:
  1. –ü–æ—Ö–æ–∂–µ—Å—Ç—å: 100.00% | –ö–ª–∞—Å—Å: Canis aureus | UUID: e14cf0ed-919f-4d09-8f9a-eff023047033
  2. –ü–æ—Ö–æ–∂–µ—Å—Ç—å: 53.18% | –ö–ª–∞—Å—Å: Canis aureus | UUID: 03629448-a094-419e-92ad-ba91fa546782
  3. –ü–æ—Ö–æ–∂–µ—Å—Ç—å: 51.27% | –ö–ª–∞—Å—Å: Canis latrans | UUID: 6b51439f-dbe2-4f7c-adf0-7

{'ids': [['e14cf0ed-919f-4d09-8f9a-eff023047033',
   '03629448-a094-419e-92ad-ba91fa546782',
   '6b51439f-dbe2-4f7c-adf0-703ab05d6335',
   '538cd4c7-9945-4236-b74f-29c559aa1983',
   'c66356a9-689d-48e3-a2e1-60530a38860f']],
 'embeddings': None,
 'documents': None,
 'uris': None,
 'included': ['metadatas', 'distances'],
 'data': None,
 'metadatas': [[{'embedding_index': '100',
    'scientific_name': 'Canis aureus',
    'path': 'animal_images/Canis_aureus/e14cf0ed-919f-4d09-8f9a-eff023047033.jpg'},
   {'scientific_name': 'Canis aureus',
    'embedding_index': '323',
    'path': 'animal_images/Canis_aureus/03629448-a094-419e-92ad-ba91fa546782.jpg'},
   {'scientific_name': 'Canis latrans',
    'path': 'animal_images/Canis_latrans/6b51439f-dbe2-4f7c-adf0-703ab05d6335.jpg',
    'embedding_index': '9319'},
   {'scientific_name': 'Canis aureus',
    'path': 'animal_images/Canis_aureus/538cd4c7-9945-4236-b74f-29c559aa1983.jpg',
    'embedding_index': '735'},
   {'path': 'animal_images/Canis_lat

## 9. –°—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞ –∏ –≤–∏–∑—É–∞–ª–∏–∑–∞—Ü–∏—è

In [18]:
# –°—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞ –ø–æ –∫–ª–∞—Å—Å–∞–º
class_counts = {}
for meta in metadata:
    class_name = meta['scientific_name']
    class_counts[class_name] = class_counts.get(class_name, 0) + 1

print("–†–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ –∏–∑–æ–±—Ä–∞–∂–µ–Ω–∏–π –ø–æ –∫–ª–∞—Å—Å–∞–º:")
for class_name, count in sorted(class_counts.items()):
    print(f"  {class_name}: {count} –∏–∑–æ–±—Ä–∞–∂–µ–Ω–∏–π")

print(f"\n–í—Å–µ–≥–æ –∫–ª–∞—Å—Å–æ–≤: {len(class_counts)}")
print(f"–í—Å–µ–≥–æ –∏–∑–æ–±—Ä–∞–∂–µ–Ω–∏–π: {sum(class_counts.values())}")

–†–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ –∏–∑–æ–±—Ä–∞–∂–µ–Ω–∏–π –ø–æ –∫–ª–∞—Å—Å–∞–º:
  Canis aureus: 2400 –∏–∑–æ–±—Ä–∞–∂–µ–Ω–∏–π
  Canis familiaris: 2400 –∏–∑–æ–±—Ä–∞–∂–µ–Ω–∏–π
  Canis familiaris dingo: 2400 –∏–∑–æ–±—Ä–∞–∂–µ–Ω–∏–π
  Canis latrans: 2400 –∏–∑–æ–±—Ä–∞–∂–µ–Ω–∏–π
  Canis lupus: 2400 –∏–∑–æ–±—Ä–∞–∂–µ–Ω–∏–π

–í—Å–µ–≥–æ –∫–ª–∞—Å—Å–æ–≤: 5
–í—Å–µ–≥–æ –∏–∑–æ–±—Ä–∞–∂–µ–Ω–∏–π: 12000


## –ì–æ—Ç–æ–≤–æ! üéâ

–ò–Ω–¥–µ–∫—Å —ç–º–±–µ–¥–¥–∏–Ω–≥–æ–≤ —É—Å–ø–µ—à–Ω–æ –ø–æ—Å—Ç—Ä–æ–µ–Ω –∏ —Å–æ—Ö—Ä–∞–Ω–µ–Ω. –¢–µ–ø–µ—Ä—å –º–æ–∂–Ω–æ –∏—Å–ø–æ–ª—å–∑–æ–≤–∞—Ç—å –µ–≥–æ –≤ –≤–µ–±-–ø—Ä–∏–ª–æ–∂–µ–Ω–∏–∏ –¥–ª—è –ø–æ–∏—Å–∫–∞ –ø–æ—Ö–æ–∂–∏—Ö –∏–∑–æ–±—Ä–∞–∂–µ–Ω–∏–π.

**–°–æ–∑–¥–∞–Ω–Ω—ã–µ —Ñ–∞–π–ª—ã:**
- `embeddings/faiss_index.bin` - FAISS –∏–Ω–¥–µ–∫—Å –¥–ª—è –±—ã—Å—Ç—Ä–æ–≥–æ –ø–æ–∏—Å–∫–∞
- `embeddings/image_metadata.json` - –º–µ—Ç–∞–¥–∞–Ω–Ω—ã–µ –∏–∑–æ–±—Ä–∞–∂–µ–Ω–∏–π
- `embeddings/embeddings.npy` - —Å–æ—Ö—Ä–∞–Ω–µ–Ω–Ω—ã–µ —ç–º–±–µ–¥–¥–∏–Ω–≥–∏ (–æ–ø—Ü–∏–æ–Ω–∞–ª—å–Ω–æ)

**–°–ª–µ–¥—É—é—â–∏–µ —à–∞–≥–∏:**
1. –°–æ–∑–¥–∞—Ç—å –º–æ–¥—É–ª–∏ –¥–ª—è –≤–µ–±-–ø—Ä–∏–ª–æ–∂–µ–Ω–∏—è (`embedding_extractor.py`, `vector_db.py`, `similarity_search.py`)
2. –ò–Ω—Ç–µ–≥—Ä–∏—Ä–æ–≤–∞—Ç—å –ø–æ–∏—Å–∫ –ø–æ—Ö–æ–∂–∏—Ö –∏–∑–æ–±—Ä–∞–∂–µ–Ω–∏–π –≤ UI
3. –ü—Ä–æ—Ç–µ—Å—Ç–∏—Ä–æ–≤–∞—Ç—å —Ä–∞–±–æ—Ç—É —Å–∏—Å—Ç–µ–º—ã