This notebook allows you to **add new images** to your existing search index without rebuilding everything from scratch.

### Steps:
1. Load existing index and metadata
2. Scan `data/raw` (recursively) for any NEW images — an image counts as new when its file name is not already listed in the metadata
3. Process only the new images
4. Append to index and save

In [12]:
import sys
import faiss
import numpy as np
import json
from pathlib import Path
from tqdm import tqdm

# Add project root to path
sys.path.append('..')

from indexer.feature_extractor import MultimodalFeatureExtractor
from utils.config import config

In [None]:
# 1. Load Existing Data
# Resolves the two on-disk artifacts this notebook updates (the JSON metadata
# and the FAISS index file) and loads the metadata into memory.
print("Loading existing metadata...")

metadata_path = config.PROCESSED_DATA_DIR / "metadata.json"
index_path = config.PROCESSED_DATA_DIR / "faiss_index.bin"

# Fail fast with a clear error instead of only printing: the cells below
# need `existing_data` / `existing_paths`, so continuing without them would
# just surface later as an unrelated NameError.
if not metadata_path.exists():
    raise FileNotFoundError(
        f"No existing metadata found at {metadata_path}! Please run full pipeline first."
    )

# The index file is only read in step 4, after the (slow) feature extraction;
# checking for it here avoids wasting that work when it is missing.
if not index_path.exists():
    raise FileNotFoundError(
        f"No existing index found at {index_path}! Please run full pipeline first."
    )

with open(metadata_path, 'r') as f:
    existing_data = json.load(f)

# Set of already-indexed image paths, used by the next step for membership tests.
existing_paths = set(existing_data['image_paths'])
print(f"Loaded {len(existing_paths)} existing images")

Loading existing metadata...
✓ Loaded 2055 existing images


In [None]:
# 2. Find New Images
# Recursively scans the raw data directory for image files and keeps the
# ones whose file name is not already present in the loaded metadata.
print("Scanning for new images...")

image_extensions = {'.jpg', '.jpeg', '.png', '.webp', '.bmp'}
data_dir = config.RAW_DATA_DIR

# Sort the scan result: iterating a set of extensions and the directory
# traversal order are both unspecified, and the order of `new_images` is the
# order in which vectors are later appended to the index. Sorting makes
# repeated runs over the same folder produce the same ordering.
all_images = sorted(
    img_path
    for ext in image_extensions
    for img_path in data_dir.glob(f"**/*{ext}")
)

# Robust comparison using filenames only: stored paths may be relative or
# absolute, so only the final path component is compared. Consequence: a file
# that shares its name with an already-indexed image in another folder is
# treated as existing.
existing_filenames = {Path(p).name for p in existing_paths}

new_images = [
    img_path for img_path in all_images
    if img_path.name not in existing_filenames
]

print(f"Found {len(all_images)} total images in folder")
print(f"Found {len(new_images)} NEW images to process")

Scanning for new images...
Found 2064 total images in folder
✨ Found 9 NEW images to process


In [None]:
# 3. Process New Images
# Runs the feature extractor over every image in `new_images` and collects
# three parallel lists: embeddings, per-image metadata and path strings.

# Initialise the result lists before branching so they always exist. The
# save step tests `new_embeddings`; if the lists were only created when
# there is work to do, an up-to-date index would make that step fail with a
# NameError on a fresh kernel.
new_embeddings = []
new_metadata = []
new_paths = []

if not new_images:
    print("index is up to date! Nothing to do.")
else:
    print("Initializing extractor (this takes a moment)...")
    extractor = MultimodalFeatureExtractor()

    for img_path in tqdm(new_images, desc="Processing new images"):
        try:
            # extract_and_combine returns three values; the third is unused here.
            emb, meta, _ = extractor.extract_and_combine(img_path)

            # Append all three together so the lists stay index-aligned.
            new_embeddings.append(emb)
            new_metadata.append(meta)
            new_paths.append(str(img_path))

        except Exception as e:
            # Best effort: report the failing image and carry on with the rest.
            # A skipped image is not recorded, so it is picked up again on the
            # next run of this notebook.
            print(f" Error processing {img_path.name}: {e}")

    print(f"✓ Processed {len(new_embeddings)} new images")

Initializing extractor (this takes a moment)...
Initializing MultimodalFeatureExtractor...
✓ All models loaded successfully!
✓ Using fusion strategy: weighted_concat



Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md


[A
[A
[A
[A
[A
[A
[A
[A
Processing new images: 100%|██████████| 9/9 [00:47<00:00,  5.27s/it]

✓ Processed 9 new images





In [None]:
# 4. Append and Save
# Appends the freshly extracted embeddings to the FAISS index and the
# matching paths/metadata to the JSON metadata, then persists both.

# Drop anything whose path is already recorded in the metadata. After a
# successful save the in-memory metadata contains the new paths, so running
# this cell a second time finds nothing to add instead of appending the same
# vectors twice.
known_paths = set(existing_data['image_paths'])
pending = [
    (path, emb, meta)
    for path, emb, meta in zip(new_paths, new_embeddings, new_metadata)
    if path not in known_paths
]

if not pending:
    print("Nothing new to append.")
else:
    pending_paths = [path for path, _, _ in pending]
    pending_embeddings = [emb for _, emb, _ in pending]
    pending_metadata = [meta for _, _, meta in pending]

    # Load FAISS index
    print("Loading existing index...")
    index = faiss.read_index(str(index_path))

    # NOTE(review): this assumes one index vector per entry in 'image_paths'
    # (position i in the index <-> image_paths[i]) — confirm against the
    # search code. Only a warning, so an existing setup keeps working.
    if index.ntotal != len(existing_data['image_paths']):
        print(
            f"Warning: index holds {index.ntotal} vectors but metadata lists "
            f"{len(existing_data['image_paths'])} images"
        )

    # Append new vectors
    embeddings_array = np.array(pending_embeddings).astype('float32')
    # Check the shape up front so a mismatch gives a readable error.
    if embeddings_array.ndim != 2 or embeddings_array.shape[1] != index.d:
        raise ValueError(
            f"New embeddings have shape {embeddings_array.shape}, "
            f"expected (n, {index.d}) to match the existing index"
        )
    print("Adding to index...")
    index.add(embeddings_array)

    # Build the updated metadata as a copy; the in-memory `existing_data` is
    # only modified once both files are safely on disk.
    updated_data = dict(existing_data)
    updated_data['image_paths'] = existing_data['image_paths'] + pending_paths
    updated_data['metadata'] = existing_data['metadata'] + pending_metadata

    # SAVE EVERYTHING
    print("Saving updates...")

    # Write both artifacts to temporary siblings first and rename afterwards,
    # so a failure while writing (e.g. non-serialisable metadata, full disk)
    # leaves the previous index and metadata files untouched.
    index_tmp = index_path.with_name(index_path.name + ".tmp")
    metadata_tmp = metadata_path.with_name(metadata_path.name + ".tmp")

    # 1. Index
    faiss.write_index(index, str(index_tmp))

    # 2. Metadata
    with open(metadata_tmp, 'w') as f:
        json.dump(updated_data, f, indent=2)

    index_tmp.replace(index_path)
    metadata_tmp.replace(metadata_path)

    # Update data structures now that the save succeeded
    existing_data['image_paths'].extend(pending_paths)
    existing_data['metadata'].extend(pending_metadata)

    print(f"✅ Success! Index now has {index.ntotal} total images.")

Loading existing index...
Adding to index...
Saving updates...
✅ Success! Index now has 2064 total images.
