In [None]:
# Import required libraries
import sys
import os
from pathlib import Path

# Add src to path
sys.path.append('../src')

from retrieval.embeddings import EmbeddingService, prepare_documents_for_embedding
from retrieval.vector_store import create_vector_store
import json
from typing import List, Dict, Any

In [None]:
# Load raw documents
def load_raw_documents(data_dir: str) -> List[Dict[str, Any]]:
    """Load every Markdown document found in ``<data_dir>/raw``.

    Files are visited in sorted path order so the returned list (and anything
    derived from it downstream) is the same from run to run; ``Path.glob`` on
    its own yields entries in arbitrary, filesystem-dependent order.

    Args:
        data_dir: Directory that contains the ``raw`` sub-directory.

    Returns:
        One dict per ``*.md`` file directly inside ``raw``, with the keys
        ``doc_id``, ``title``, ``content``, ``url`` and ``category``.
        The list is empty when no file matches.
    """
    raw_dir = Path(data_dir) / 'raw'
    documents = []

    for file_path in sorted(raw_dir.glob('*.md')):
        stem = file_path.stem
        documents.append({
            'doc_id': f"KB-{stem.upper()}",
            'title': stem.replace('_', ' ').title(),
            'content': file_path.read_text(encoding='utf-8'),
            'url': f"https://kb.company.com/{stem}",
            # Any file whose name does not contain "billing" falls back to "account".
            'category': 'billing' if 'billing' in stem.lower() else 'account',
        })

    return documents

# Load every Markdown file under ../data/knowledge_base/raw and report how many were found
raw_documents = load_raw_documents('../data/knowledge_base')
print(f"Loaded {len(raw_documents)} documents")

In [None]:
# Prepare documents for embedding.
# NOTE(review): the variable name and the message below suggest that
# prepare_documents_for_embedding splits documents into chunks; confirm in retrieval.embeddings.
chunked_documents = prepare_documents_for_embedding(raw_documents)
print(f"Created {len(chunked_documents)} chunks")

In [None]:
# Generate embeddings
async def generate_embeddings(documents: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Attach an embedding to every document.

    The ``content`` of each document is sent to ``EmbeddingService.embed_texts``
    in a single call, and the result at the same position is stored under the
    document's ``embedding`` key. The dicts are updated in place and the same
    list object is returned.

    Args:
        documents: Dicts that each carry a ``content`` entry.

    Returns:
        The ``documents`` list that was passed in, with ``embedding`` set on
        every item. An empty list is returned unchanged without creating the
        embedding service.

    Raises:
        ValueError: If the service returns a different number of embeddings
            than texts submitted. No document is modified in that case.
    """
    if not documents:
        # Nothing to embed: skip building the service and sending an empty batch.
        return documents

    embedding_service = EmbeddingService()

    texts = [doc['content'] for doc in documents]
    # Materialise the result so it can be counted whatever iterable the service returns.
    embeddings = list(await embedding_service.embed_texts(texts))

    # zip() stops at the shorter input, which would silently leave trailing
    # documents without an embedding; fail loudly instead.
    if len(embeddings) != len(texts):
        raise ValueError(
            f"Expected {len(texts)} embeddings but received {len(embeddings)}"
        )

    for doc, embedding in zip(documents, embeddings):
        doc['embedding'] = embedding

    return documents

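# Note: generate_embeddings is a coroutine, so it must be awaited.
# Jupyter/IPython cells support top-level `await`; uncomment the line below to run it here.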
# embedded_documents = await generate_embeddings(chunked_documents)

In [None]:
# Save processed documents
def save_processed_documents(documents: List[Dict[str, Any]], output_dir: str):
    """Write each document to its own JSON file inside ``output_dir``.

    The file name is ``<doc_id>_<chunk_id>.json``; a document without a
    ``chunk_id`` entry uses ``c-0``. Documents that resolve to the same file
    name overwrite one another, so several chunks of one document that all
    lack ``chunk_id`` end up as a single file.

    Args:
        documents: Dicts to serialise. Each must contain ``doc_id`` and hold
            only values that ``json.dump`` can encode.
        output_dir: Destination directory. It is created, together with any
            missing parent directories, when it does not exist yet.

    Raises:
        KeyError: If a document has no ``doc_id``.
        TypeError: If a document holds a value ``json.dump`` cannot serialise.
    """
    output_path = Path(output_dir)
    # parents=True so a fresh checkout without the intermediate directories still works.
    output_path.mkdir(parents=True, exist_ok=True)

    for doc in documents:
        chunk_id = doc.get('chunk_id', 'c-0')
        filepath = output_path / f"{doc['doc_id']}_{chunk_id}.json"

        with open(filepath, 'w', encoding='utf-8') as f:
            # ensure_ascii=False keeps non-ASCII text readable in the saved file.
            json.dump(doc, f, indent=2, ensure_ascii=False)

# save_processed_documents(chunked_documents, '../data/knowledge_base/processed')