#### Date: 30th July 2025 — Project: RAG for AMLGO LABS

In [3]:
import json
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
import os
from typing import List
import pandas as pd

print(" Starting embeddings and vector database creation...")

# Load processed chunks.
# The location can be overridden with the AMLGO_CHUNKS_PATH environment
# variable so the notebook is not tied to one machine's home directory;
# when the variable is unset the original path is used unchanged.
chunks_path = os.environ.get(
    'AMLGO_CHUNKS_PATH',
    '/home/petpooja/Documents/amlgo/chunks/processed_chunks.json',
)
with open(chunks_path, 'r', encoding='utf-8') as f:
    chunks_data = json.load(f)

# Keep only the text of each chunk for embedding; the full records remain
# available in chunks_data.
chunks = [chunk['text'] for chunk in chunks_data['chunks']]
print(f"Loaded {len(chunks)} chunks")

# Fail with an explicit message rather than an IndexError on chunks[0].
if not chunks:
    raise ValueError(f"No chunks found in {chunks_path}")
print(f"Sample chunk length: {len(chunks[0])} characters")

 Starting embeddings and vector database creation...
Loaded 306 chunks
Sample chunk length: 17 characters


In [4]:
# Load the sentence-transformer that will embed every chunk
model_name = 'all-MiniLM-L6-v2'
print(f"Loading embedding model: {model_name}")

embedding_model = SentenceTransformer(model_name)
print("Model loaded successfully!")

# Report the limits the model itself exposes
print(
    f"Model max sequence length: {embedding_model.max_seq_length}",
    f"Embedding dimension: {embedding_model.get_sentence_embedding_dimension()}",
    sep="\n",
)

🤖 Loading embedding model: all-MiniLM-L6-v2


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Model loaded successfully!
Model max sequence length: 256
Embedding dimension: 384


In [None]:
# Embed every chunk in a single encode() call
print("Generating embeddings...")
print("This may take a few minutes depending on document size...")

embeddings = embedding_model.encode(
    chunks,
    normalize_embeddings=True,  # normalized so inner product can act as cosine similarity
    convert_to_numpy=True,
    show_progress_bar=True,
)

print(
    "\nEmbeddings generated!",
    f"Embeddings shape: {embeddings.shape}",
    f"Embedding type: {embeddings.dtype}",
    sep="\n",
)

# Persist the raw embedding matrix, creating the output directory on first run
os.makedirs('../vectordb', exist_ok=True)
np.save('../vectordb/embeddings.npy', embeddings)
print("Embeddings saved to ../vectordb/embeddings.npy")

🔄 Generating embeddings...
This may take a few minutes depending on document size...


Batches:   0%|          | 0/10 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)



✅ Embeddings generated!
Embeddings shape: (306, 384)
Embedding type: float32
💾 Embeddings saved to ../vectordb/embeddings.npy


In [None]:
# Build the FAISS index used for similarity search
print("Creating FAISS vector database...")

dimension = embeddings.shape[1]
print(f"Vector dimension: {dimension}")

# Flat inner-product index; the embeddings were normalized when generated,
# so inner product is used as cosine similarity.
index = faiss.IndexFlatIP(dimension)

# Cast to float32 before handing the vectors to the index
embeddings_float32 = embeddings.astype('float32')
index.add(embeddings_float32)

print(
    "FAISS index created!",
    f"Total vectors in index: {index.ntotal}",
    sep="\n",
)

# Write the index to disk
faiss.write_index(index, '../vectordb/document_index.faiss')
print("FAISS index saved to ../vectordb/document_index.faiss")

🗃️ Creating FAISS vector database...
Vector dimension: 384
✅ FAISS index created!
Total vectors in index: 306
💾 FAISS index saved to ../vectordb/document_index.faiss


In [None]:
# Bundle the information needed to interpret the saved index into one record
metadata = dict(
    document_info=chunks_data['metadata'],
    embedding_model=model_name,
    vector_dimension=dimension,
    total_vectors=len(chunks),
    index_type='FAISS_IndexFlatIP',
    similarity_metric='cosine_similarity',
    chunks=chunks_data['chunks'],  # full chunk records, not only their text
)

# Write the record as human-readable JSON
with open('../vectordb/metadata.json', mode='w', encoding='utf-8') as metadata_file:
    json.dump(metadata, metadata_file, ensure_ascii=False, indent=2)

print("Metadata saved to ../vectordb/metadata.json")

# Test retrieval functionality
def test_retrieval(query: str, top_k: int = 3):
    """Test the retrieval system"""
    print(f"\nTesting retrieval for query: '{query}'")
    
    # Encode query
    query_embedding = embedding_model.encode([query], normalize_embeddings=True)
    query_embedding = query_embedding.astype('float32')
    
    # Search
    scores, indices = index.search(query_embedding, top_k)
    
    print(f"Top {top_k} results:")
    for i, (score, idx) in enumerate(zip(scores[0], indices[0])):
        print(f"\nResult {i+1}:")
        print(f"Score: {score:.4f}")
        print(f"Chunk ID: {idx}")
        print(f"Text: {chunks[idx][:150]}...")

# Smoke-test the index with a few representative queries
test_queries = [
    "What are eBay's return policies?",
    "seller fees and charges",
    "dispute resolution process",
]

for sample_query in test_queries:
    test_retrieval(sample_query, top_k=2)

# Closing summary of the artifacts written by this notebook
print(
    "\nVector database creation complete!",
    "Created files:",
    "  - ../vectordb/document_index.faiss",
    "  - ../vectordb/embeddings.npy",
    "  - ../vectordb/metadata.json",
    sep="\n",
)

💾 Metadata saved to ../vectordb/metadata.json

🔍 Testing retrieval for query: 'What are eBay's return policies?'
📊 Top 2 results:

Result 1:
Score: 0.7369
Chunk ID: 124
Text: Where settings have been set to automatically accept requests for returns or replacements, an eBay -generated return shipping label will be provided t...

Result 2:
Score: 0.7163
Chunk ID: 119
Text: 14. Additional Terms Returns and cancellations for sellers Sellers can create rules to automate replacements, returns, and refunds under certain circu...

🔍 Testing retrieval for query: 'seller fees and charges'
📊 Top 2 results:

Result 1:
Score: 0.7369
Chunk ID: 44
Text: The fees we charge sellers for using our Services to sell goods and services are listed on our Selling fees pages....

Result 2:
Score: 0.7022
Chunk ID: 42
Text: Fees and Taxes We charge sellers for the use of our Services....

🔍 Testing retrieval for query: 'dispute resolution process'
📊 Top 2 results:

Result 1:
Score: 0.6269
Chunk ID: 186
Text: For