In [17]:
# Use the %pip magic (not !pip) so the package lands in the environment of the
# running kernel, and pin the release so a re-run resolves the same version.
# NOTE(review): the recorded output shows this install upgrading torch/pydantic
# and reporting resolver conflicts with other installed packages; consider a
# dedicated environment for this notebook.
%pip install -q sentence-transformers==3.2.1

Collecting sentence-transformers
  Downloading sentence_transformers-3.2.1-py3-none-any.whl (255 kB)
[K     |████████████████████████████████| 255 kB 2.6 MB/s eta 0:00:01
[?25hCollecting transformers<5.0.0,>=4.41.0
  Downloading transformers-4.46.3-py3-none-any.whl (10.0 MB)
[K     |████████████████████████████████| 10.0 MB 74.0 MB/s eta 0:00:01
Collecting torch>=1.11.0
  Downloading torch-2.4.1-cp38-cp38-manylinux1_x86_64.whl (797.1 MB)
[K     |████████████████████████████████| 797.1 MB 21 kB/s s eta 0:00:012   |█▍                              | 34.1 MB 43.8 MB/s eta 0:00:18     |████████████▊                   | 316.1 MB 25.6 MB/s eta 0:00:19     |████████████████████            | 498.6 MB 22.8 MB/s eta 0:00:14     |██████████████████████▊         | 565.0 MB 22.0 MB/s eta 0:00:11
Collecting safetensors>=0.4.1
  Downloading safetensors-0.5.2-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (461 kB)
[K     |████████████████████████████████| 461 kB 19.9 MB/s eta 0:00:01
[?

Collecting nvidia-nvjitlink-cu12
  Downloading nvidia_nvjitlink_cu12-12.6.85-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl (19.7 MB)
[K     |████████████████████████████████| 19.7 MB 40.0 MB/s eta 0:00:01
ERROR: prodigy 1.11.7 has requirement fastapi<0.69.0,>=0.65.1, but you'll have fastapi 0.95.2 which is incompatible.
ERROR: prodigy 1.11.7 has requirement pydantic!=1.8,!=1.8.1,<1.9.0,>=1.7.4, but you'll have pydantic 2.10.4 which is incompatible.
ERROR: prodigy 1.11.7 has requirement uvicorn<0.14.0,>=0.13.3, but you'll have uvicorn 0.33.0 which is incompatible.
ERROR: fastapi 0.95.2 has requirement pydantic!=1.7,!=1.7.1,!=1.7.2,!=1.7.3,!=1.8,!=1.8.1,<2.0.0,>=1.6.2, but you'll have pydantic 2.10.4 which is incompatible.
ERROR: torchvision 0.11.2 has requirement torch==1.10.1, but you'll have torch 2.4.1 which is incompatible.
ERROR: llama-parse 0.5.15 has requirement click<9.0.0,>=8.1.7, but you'll have click 8.1.2 which is 

In [19]:
import json
import os
import numpy as np
from sentence_transformers import SentenceTransformer

# Input and output locations (relative paths, resolved against the working directory)
STORAGE_PATH = "../backend/data/embeddings/"
JSON_PATH = "../backend/data/raw/uy-tourism.json"

# Make sure the output directory is present before anything is written to it
os.makedirs(name=STORAGE_PATH, exist_ok=True)

def _clean_field(entry, key):
    """Return ``entry[key]`` as a stripped string; missing or null values become ""."""
    value = entry.get(key)
    # JSON null arrives as None, which has no .strip(); treat it like a missing key.
    return "" if value is None else str(value).strip()


def process_and_index(json_path, storage_path):
    """Embed every entry of a JSON file and persist the results to disk.

    Iterates over the loaded JSON data, builds one text document per entry from
    its "Title", "Location", "Address" and "Text" fields, encodes all documents
    with the 'all-MiniLM-L6-v2' SentenceTransformer model and writes three
    files into ``storage_path``:

    * ``embeddings.npy`` - the array returned by ``model.encode``
    * ``documents.json`` - the list of document strings
    * ``metadata.json``  - one ``{"title", "location", "address"}`` dict per document

    Args:
        json_path: Path of the JSON file to index (read as UTF-8). Each entry
            is expected to be a dict-like object supporting ``.get``.
        storage_path: Directory that receives the output files. It is created
            (including parents) if it does not exist yet.

    Returns:
        None. Results are written to disk and progress is printed.

    Raises:
        OSError: If the input cannot be read or an output cannot be written.
        json.JSONDecodeError: If the input file is not valid JSON.
    """
    print("Starting indexing process...")

    # Load JSON data. The encoding is explicit so accented text is decoded the
    # same way regardless of the platform's default locale.
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    print(f"Loaded {len(data)} entries from JSON file")

    # Prepare documents and their metadata
    documents = []
    metadata = []

    for entry in data:
        title = _clean_field(entry, "Title")
        location = _clean_field(entry, "Location")
        text = _clean_field(entry, "Text")
        address = _clean_field(entry, "Address")

        # Combine content
        full_content = f"Title: {title}. Location: {location}. Address: {address}\n\n{text}"

        # Store document and its metadata
        documents.append(full_content)
        metadata.append({
            "title": title,
            "location": location,
            "address": address
        })

    # Load the model only after the input has been read and prepared, so a bad
    # input file fails before the (comparatively slow) model load.
    model = SentenceTransformer('all-MiniLM-L6-v2')

    # Generate embeddings
    print("Generating embeddings...")
    embeddings = model.encode(documents, show_progress_bar=True)

    # The caller may pass any target directory, so create it here instead of
    # relying on it having been created beforehand.
    os.makedirs(storage_path or ".", exist_ok=True)

    # Save everything. os.path.join avoids the doubled separator that string
    # concatenation produces when storage_path already ends with "/".
    np.save(os.path.join(storage_path, "embeddings.npy"), embeddings)
    with open(os.path.join(storage_path, "documents.json"), "w", encoding="utf-8") as f:
        json.dump(documents, f)
    with open(os.path.join(storage_path, "metadata.json"), "w", encoding="utf-8") as f:
        json.dump(metadata, f)

    print(f"Successfully saved {len(documents)} documents and their embeddings at {storage_path}")
    print(f"Embedding dimension: {embeddings.shape}")

# Build the index from the configured input file and write it to the storage directory
process_and_index(json_path=JSON_PATH, storage_path=STORAGE_PATH)

Starting indexing process...
Loaded 8 entries from JSON file
Generating embeddings...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Successfully saved 8 documents and their embeddings at ../backend/data/embeddings/
Embedding dimension: (8, 384)
