In [1]:

import os
import json
import logging
from pathlib import Path
from typing import List, Dict, Any
from datetime import datetime

# Core libraries
from dotenv import load_dotenv
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

# LlamaIndex imports
from llama_index.core import Document, SimpleDirectoryReader
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.core.schema import BaseNode, TextNode


# Load environment variables before any API client is constructed.
# NOTE(review): OpenAI() and Pinecone() are created later in this notebook with
# no explicit credentials, so they presumably rely on keys made available
# here (e.g. from a .env file) — confirm.
load_dotenv()

# Configure logging: INFO level, "timestamp - level - message" line format,
# then create the module-level logger for this notebook.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Status banner showing that the setup cell ran to completion.
print("✅ All imports loaded successfully!")
print("📁 Ready to process tagged region sermon content")

✅ All imports loaded successfully!
📁 Ready to process tagged region sermon content


In [2]:
# Build a reader over ./devo_dir (recursive=True, so subdirectories are
# included) and load its contents into `documents`, which the chunking cell
# below consumes.
reader = SimpleDirectoryReader(input_dir="./devo_dir", recursive=True)
documents = reader.load_data()
# Report how many documents were loaded.
print(f"📄 Loaded {len(documents)} documents from './devo_dir'")

📄 Loaded 1 documents from './devo_dir'


In [3]:

# Embedding configuration. EMBEDDING_MODEL is read by the embedding and query
# functions defined further down in this notebook.
EMBEDDING_MODEL = "text-embedding-3-large"
EMBEDDING_DIMENSIONS = 3072

# Chunking configuration: chunk_size=512 with chunk_overlap=20, keeping document
# metadata and previous/next relationships on every node.
parser = SimpleNodeParser(
    include_prev_next_rel=True,
    include_metadata=True,
    chunk_overlap=20,
    chunk_size=512,
)

# Split the loaded documents into nodes and report how many were produced.
nodes = parser.get_nodes_from_documents(documents)
print(f"📝 Parsed {len(nodes)} nodes from documents")

📝 Parsed 4 nodes from documents


In [None]:
# Process and embed all document chunks
print("🔄 Processing document chunks for embedding...")

def process_and_embed_documents(documents, nodes, openai_client, pinecone_index,
                                embedding_model=None, batch_size=100):
    """
    Process document nodes, generate embeddings, and upsert to Pinecone with metadata.

    Args:
        documents: The source documents. Not read by this function; kept so
            existing callers keep working.
        nodes: Sequence of parsed nodes. Each node must expose ``text``,
            ``metadata`` (a dict) and ``id_``.
        openai_client: Client exposing ``embeddings.create(input=..., model=...)``.
        pinecone_index: Index handle exposing ``upsert(vectors=...)`` whose
            response has an ``upserted_count`` attribute.
        embedding_model: Embedding model name. When ``None`` (default) the
            module-level ``EMBEDDING_MODEL`` is used.
        batch_size: Maximum number of vectors sent per ``upsert`` call, so a
            large corpus is not pushed in one oversized request.

    Returns:
        An object with an ``upserted_count`` attribute holding the total number
        of vectors reported as upserted across all batches (0 when ``nodes`` is
        empty).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # Local import keeps this cell self-contained.
    from types import SimpleNamespace

    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # The global is only looked up when no explicit model was supplied.
    model = EMBEDDING_MODEL if embedding_model is None else embedding_model
    total_nodes = len(nodes)
    vectors_to_upsert = []

    for i, node in enumerate(nodes):
        print(f"📝 Processing chunk {i+1}/{total_nodes}...")

        # Generate embedding for the node text (one request per node)
        response = openai_client.embeddings.create(
            input=node.text,
            model=model
        )
        embedding = response.data[0].embedding

        # Create metadata for the chunk. The raw text is stored alongside the
        # vector so query results can display it without a second lookup.
        metadata = {
            'text': node.text,
            'source_document': node.metadata.get('file_name', 'unknown'),
            'chunk_index': i,
            'char_count': len(node.text),
            'document_id': node.metadata.get('doc_id', str(node.id_)),
            'file_path': node.metadata.get('file_path', ''),
            'creation_date': node.metadata.get('creation_date', ''),
        }

        # NOTE(review): the id embeds node.id_; if that value is regenerated on
        # every parse, re-running this cell adds new vectors instead of
        # overwriting the previous ones — confirm.
        vectors_to_upsert.append({
            'id': f"chunk_{i}_{node.id_}",
            'values': embedding,
            'metadata': metadata
        })

    # Nothing to send: skip the API call instead of issuing an empty upsert.
    if not vectors_to_upsert:
        print("⚠️ No document chunks to upsert; skipping Pinecone call")
        return SimpleNamespace(upserted_count=0)

    # Batch upsert to Pinecone, batch_size vectors at a time
    print(f"📤 Upserting {len(vectors_to_upsert)} vectors to Pinecone...")
    upserted_total = 0
    for start in range(0, len(vectors_to_upsert), batch_size):
        batch = vectors_to_upsert[start:start + batch_size]
        batch_response = pinecone_index.upsert(vectors=batch)
        upserted_total += batch_response.upserted_count

    print(f"✅ Successfully processed and upserted {len(vectors_to_upsert)} document chunks")
    return SimpleNamespace(upserted_count=upserted_total)

# Initialize clients
# NOTE(review): neither client is given explicit credentials; presumably both
# read their API keys from the environment prepared by load_dotenv() — confirm.
client = OpenAI()
pc = Pinecone()
# Handle to the "aog-devo" index.
# NOTE(review): no visible cell creates this index, so it is assumed to exist
# already (with a dimension matching the embedding model) — confirm.
aog_index = pc.Index("aog-devo")

# Process all documents: embed every parsed node and upsert it into the index,
# then report the count taken from the returned object's `upserted_count`.
upsert_response = process_and_embed_documents(documents, nodes, client, aog_index)
print(f"📊 Upserted vectors count: {upsert_response.upserted_count}")

In [None]:
def test_rag_functionality(pinecone_index, openai_client, test_query: str = "What does the Bible teach about faith?"):
    """
    Test the RAG functionality for retrieving relevant devotional content.
    """
    print(f"🔍 Testing RAG functionality with query: '{test_query}'")
    
    try:
        # Create embedding for the test query
        query_response = openai_client.embeddings.create(
            input=test_query,
            model=EMBEDDING_MODEL
        )
        query_embedding = query_response.data[0].embedding
        
        # Search Pinecone for similar content chunks
        search_response = pinecone_index.query(
            vector=query_embedding,
            top_k=3,
            include_metadata=True,
            include_values=False
        )
        
        print(f"🎯 Found {len(search_response.matches)} relevant chunks")
        
        # Display results
        for i, match in enumerate(search_response.matches, 1):
            print(f"\n📄 Result {i} (Similarity Score: {match.score:.4f}):")
            if match.metadata:
                source = match.metadata.get('source_document', 'Unknown source')
                chunk_idx = match.metadata.get('chunk_index', 'N/A')
                char_count = match.metadata.get('char_count', 'N/A')
                
                print(f"   📁 Source: {source} | Chunk: {chunk_idx} | Length: {char_count} chars")
                print(f"   📖 Content: {match.metadata.get('text', 'No text available')[:300]}...")
                print("   " + "-" * 80)
            else:
                print("   ❌ No metadata available")
        
        return search_response
        
    except Exception as e:
        print(f"❌ Error testing RAG functionality: {str(e)}")
        return None

# Smoke-test retrieval end to end with a handful of devotional-themed questions.
print("🧪 Testing complete RAG pipeline...")
test_queries = [
    "What does the Bible teach about faith?",
    "How can Scripture guide my daily life?",
    "What are the foundations of faith?",
]

# Each query is preceded by a separator line. `test_result` is overwritten on
# every pass, so only the last query's response is kept after the loop.
for query in test_queries:
    print("\n" + "=" * 80)
    test_result = test_rag_functionality(
        pinecone_index=aog_index,
        openai_client=client,
        test_query=query,
    )

print("\n✅ Complete RAG functionality testing finished")