# Embedding & Vector Database (ChromaDB)


## 1. Libraries

In [1]:
# Install required packages. %pip (rather than !pip) installs into the environment of the running kernel.
# NOTE(review): versions are unpinned — consider pinning (pkg==x.y.z) so the notebook stays reproducible.
%pip install sentence-transformers chromadb langchain pandas numpy tqdm

Note: you may need to restart the kernel to use updated packages.


In [17]:
import json
import os
import pandas as pd
import numpy as np
import pickle
from typing import List, Dict, Any
from pathlib import Path
from tqdm import tqdm

# RAG components
from sentence_transformers import SentenceTransformer
import chromadb
from langchain.docstore.document import Document

## 2. Load data

In [18]:

# Load the pickled list of chunks from data/processed.
# NOTE: pickle.load can execute arbitrary code — only load files you produced yourself.
chunks_path = 'data/processed/chunked_documents.pkl'
with open(chunks_path, 'rb') as chunks_file:
    chunked_documents = pickle.load(chunks_file)

print(f"{len(chunked_documents)} chunks")


257 chunks


## 3. Initialize the embedding model

In [22]:
# Model identifier kept in one place so it is easy to spot and swap.
EMBEDDING_MODEL_NAME = "intfloat/multilingual-e5-base"
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

## 4. Create Embeddings with Task Instructions

In [None]:
# Every chunk is prefixed with "passage: " before encoding
# (queries are prefixed with "query: " at search time).
doc_texts = []
for doc in chunked_documents:
    doc_texts.append(f"passage: {doc.page_content}")

# Encode all chunks in one call, requesting normalized vectors.
all_embeddings = embedding_model.encode(
    doc_texts,
    show_progress_bar=True,
    normalize_embeddings=True,
)

# Persist the embedding matrix next to the processed data.
# NOTE(review): the file name says "nomic" although the vectors come from
# `embedding_model` — confirm whether downstream readers depend on this name before renaming.
os.makedirs('data/processed', exist_ok=True)
np.save('data/processed/nomic_embeddings.npy', all_embeddings)

Batches: 100%|██████████| 9/9 [01:08<00:00,  7.59s/it]


## 5. Setup ChromaDB Vector Database

In [None]:

# Open (or create) the on-disk Chroma store under ./chroma_db_multi_lang.
chroma_client = chromadb.PersistentClient(path="./chroma_db_multi_lang")

# get_or_create_collection keeps this cell re-runnable: with a persistent store,
# create_collection raises as soon as the collection exists from a previous run.
# "hnsw:space": "cosine" asks the index to report cosine distances.
collection = chroma_client.get_or_create_collection(
    name="apec_nomic_fixed",
    metadata={"hnsw:space": "cosine"},
)


In [None]:

# Prepare parallel lists: one id, embedding, text and metadata dict per chunk.
ids = [f"chunk_{i}" for i in range(len(chunked_documents))]
embeddings = all_embeddings.tolist()
documents = [doc.page_content for doc in chunked_documents]
metadatas = [doc.metadata for doc in chunked_documents]

# Guard against hidden kernel state: embeddings computed from a different
# (older) chunk list would silently be paired with the wrong texts.
assert len(embeddings) == len(ids), (
    f"all_embeddings has {len(embeddings)} rows but there are {len(ids)} chunks; "
    "re-run the embedding cell"
)

batch_size = 100
for i in tqdm(range(0, len(ids), batch_size), desc="Adding to ChromaDB"):
    end_idx = min(i + batch_size, len(ids))

    # upsert instead of add: ids are deterministic ("chunk_<i>"), so on a re-run
    # add would reject or skip ids that already exist (behaviour depends on the
    # chromadb version), leaving stale records. upsert inserts new ids and
    # overwrites existing ones.
    # NOTE(review): ids beyond the current chunk count from an earlier, larger
    # run are not removed here.
    collection.upsert(
        ids=ids[i:end_idx],
        embeddings=embeddings[i:end_idx],
        documents=documents[i:end_idx],
        metadatas=metadatas[i:end_idx]
    )

print(f"✅ Added {collection.count()} documents")

## 6. Test

In [None]:
def search_fixed(query, top_k=5):
    """Search the Chroma collection for `query` and print the hits.

    The query is prefixed with "query: ", encoded with the module-level
    `embedding_model` (normalized embeddings requested), and sent to the
    module-level `collection`.

    Args:
        query: Free-text search string.
        top_k: Maximum number of results to request from the collection.

    Returns:
        The raw result mapping returned by `collection.query`, with the
        "documents", "metadatas" and "distances" entries requested here.
    """
    print(f"🔍 Searching: '{query}'")

    query_text = f"query: {query}"
    query_embedding = embedding_model.encode([query_text], normalize_embeddings=True)

    results = collection.query(
        query_embeddings=query_embedding.tolist(),
        n_results=top_k,
        include=["documents", "metadatas", "distances"]
    )

    # Only one query is sent, so the hits are in the first row of each field.
    hit_docs = results['documents'][0]
    hit_metadatas = results['metadatas'][0]
    hit_distances = results['distances'][0]

    print(f"\n📊 Found {len(hit_docs)} results:")
    print("=" * 60)

    for i, (doc, metadata, distance) in enumerate(zip(hit_docs, hit_metadatas, hit_distances)):
        # Converts distance to similarity; assumes the collection reports
        # cosine distance (1 - cosine similarity).
        similarity = 1 - distance
        # A record stored without metadata comes back as None, not {}.
        title = (metadata or {}).get('title', 'N/A')

        print(f"\nResult {i+1}:")
        print(f"   Distance: {distance:.3f}")
        print(f"   Similarity: {similarity:.3f}")
        print(f"   Title: {title}")
        print(f"   Content: {doc[:150]}...")

    return results



In [None]:

# Smoke-test queries in English and Vietnamese.
test_queries = [
    "What is APEC?",
    'APEC là gì',
    "APEC member countries",
    "APEC 2025 meetings schedule events",
    'Lịch họp APEC 2025 sự kiện',
]

# Visual divider printed after each query's results.
query_separator = "\n" + "=" * 80 + "\n"

for query in test_queries:
    search_fixed(query, top_k=3)
    print(query_separator)