In [3]:
import os
import json
import numpy as np
import faiss
from typing import List, Dict, Any
import requests
from sentence_transformers import SentenceTransformer
import fitz  # PyMuPDF
from io import BytesIO
import re
import pickle

class PhilippineHistoryRAG:
    def __init__(self, 
                 pdf_path: str = "philippine_history.pdf",
                 embedding_model: str = "all-MiniLM-L6-v2",
                 ollama_model: str = "llama2",
                 ollama_url: str = "http://localhost:11434",
                 chunk_size: int = 500,
                 chunk_overlap: int = 50):
        """
        Initialize the Philippine History RAG system.
        
        Args:
            pdf_path: Path to the Philippine history PDF file
            embedding_model: Name of the sentence transformer model for embeddings
            ollama_model: Name of the Ollama model to use
            ollama_url: Base URL for Ollama API
            chunk_size: Size of text chunks in characters
            chunk_overlap: Overlap between chunks in characters
        """
        self.pdf_path = pdf_path
        self.embedding_model_name = embedding_model
        self.ollama_model = ollama_model
        self.ollama_url = ollama_url
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        
        # Initialize components
        print("Loading embedding model...")
        self.embedding_model = SentenceTransformer(embedding_model)
        self.embedding_dim = self.embedding_model.get_sentence_embedding_dimension()
        
        # Storage for chunks and index
        self.chunks = []
        self.faiss_index = None
        
    def load_and_extract_text(self) -> str:
        """
        Load the PDF file and extract text content using PyMuPDF.
        
        Returns:
            Extracted text from the PDF
        """
        print(f"Loading PDF: {self.pdf_path}")
        
        try:
            # Open the PDF document
            pdf_document = fitz.open(self.pdf_path)
            text = ""
            
            print(f"Processing {len(pdf_document)} pages...")
            
            for page_num in range(len(pdf_document)):
                page = pdf_document[page_num]
                page_text = page.get_text()
                text += f"\n--- Page {page_num + 1} ---\n{page_text}"
                print(f"Extracted page {page_num + 1}/{len(pdf_document)}")
            
            # Close the document
            pdf_document.close()
            
            print(f"Successfully extracted {len(text)} characters from PDF")
            return text
                
        except FileNotFoundError:
            raise FileNotFoundError(f"PDF file not found: {self.pdf_path}")
        except Exception as e:
            raise Exception(f"Error reading PDF: {str(e)}")
    
    def clean_text(self, text: str) -> str:
        """
        Clean and preprocess the extracted text.
        
        Args:
            text: Raw text from PDF
            
        Returns:
            Cleaned text
        """
        # Remove excessive whitespace
        text = re.sub(r'\s+', ' ', text)
        
        # Remove special characters but keep basic punctuation
        text = re.sub(r'[^\w\s.,;:!?()-]', ' ', text)
        
        # Remove multiple consecutive punctuation
        text = re.sub(r'[.,;:!?]{2,}', '.', text)
        
        return text.strip()
    
    def create_chunks(self, text: str) -> List[Dict[str, Any]]:
        """
        Split text into overlapping chunks.
        
        Args:
            text: Text to chunk
            
        Returns:
            List of chunk dictionaries with text and metadata
        """
        print("Creating text chunks...")
        
        cleaned_text = self.clean_text(text)
        chunks = []
        
        # Split by sentences first for better chunk boundaries
        sentences = re.split(r'(?<=[.!?])\s+', cleaned_text)
        
        current_chunk = ""
        chunk_id = 0
        
        for sentence in sentences:
            # If adding this sentence would exceed chunk size, save current chunk
            if len(current_chunk) + len(sentence) > self.chunk_size and current_chunk:
                chunks.append({
                    'id': chunk_id,
                    'text': current_chunk.strip(),
                    'length': len(current_chunk)
                })
                
                # Start new chunk with overlap
                overlap_text = current_chunk[-self.chunk_overlap:] if len(current_chunk) > self.chunk_overlap else current_chunk
                current_chunk = overlap_text + " " + sentence
                chunk_id += 1
            else:
                current_chunk += " " + sentence if current_chunk else sentence
        
        # Add the last chunk
        if current_chunk:
            chunks.append({
                'id': chunk_id,
                'text': current_chunk.strip(),
                'length': len(current_chunk)
            })
        
        print(f"Created {len(chunks)} chunks")
        return chunks
    
    def create_embeddings(self, chunks: List[Dict[str, Any]]) -> np.ndarray:
        """
        Create embeddings for all chunks.
        
        Args:
            chunks: List of text chunks
            
        Returns:
            Numpy array of embeddings
        """
        print("Creating embeddings...")
        
        texts = [chunk['text'] for chunk in chunks]
        embeddings = self.embedding_model.encode(texts, show_progress_bar=True)
        
        print(f"Created embeddings with shape: {embeddings.shape}")
        return embeddings
    
    def build_faiss_index(self, embeddings: np.ndarray) -> faiss.Index:
        """
        Build FAISS index from embeddings.
        
        Args:
            embeddings: Array of embeddings
            
        Returns:
            FAISS index
        """
        print("Building FAISS index...")
        
        # Create a FAISS index (using L2 distance)
        index = faiss.IndexFlatL2(self.embedding_dim)
        
        # Add embeddings to index
        index.add(embeddings.astype(np.float32))
        
        print(f"FAISS index built with {index.ntotal} vectors")
        return index
    
    def setup_index(self):
        """
        Complete setup: load PDF, create chunks, embeddings, and FAISS index.
        """
        print("=== Setting up Philippine History RAG System ===")
        
        # Load and process PDF
        text = self.load_and_extract_text()
        self.chunks = self.create_chunks(text)
        
        # Create embeddings and FAISS index
        embeddings = self.create_embeddings(self.chunks)
        self.faiss_index = self.build_faiss_index(embeddings)
        
        print("=== Setup complete! ===\n")
    
    def save_index(self, index_path: str = "faiss_index.index", chunks_path: str = "chunks.pkl"):
        """
        Save the FAISS index and chunks to disk.
        
        Args:
            index_path: Path to save FAISS index
            chunks_path: Path to save chunks
        """
        if self.faiss_index is None:
            raise ValueError("No index to save. Run setup_index() first.")
            
        faiss.write_index(self.faiss_index, index_path)
        with open(chunks_path, 'wb') as f:
            pickle.dump(self.chunks, f)
        print(f"Index saved to {index_path}, chunks saved to {chunks_path}")
    
    def load_index(self, index_path: str = "faiss_index.index", chunks_path: str = "chunks.pkl"):
        """
        Load the FAISS index and chunks from disk.
        
        Args:
            index_path: Path to load FAISS index
            chunks_path: Path to load chunks
        """
        self.faiss_index = faiss.read_index(index_path)
        with open(chunks_path, 'rb') as f:
            self.chunks = pickle.load(f)
        print(f"Index loaded from {index_path}, chunks loaded from {chunks_path}")
    
    def query_ollama(self, prompt: str) -> str:
        """
        Send a query to Ollama and get response.
        
        Args:
            prompt: The prompt to send to Ollama
            
        Returns:
            Response from Ollama
        """
        try:
            response = requests.post(
                f"{self.ollama_url}/api/generate",
                json={
                    "model": self.ollama_model,
                    "prompt": prompt,
                    "stream": False
                },
                timeout=120
            )
            
            if response.status_code == 200:
                return response.json()['response']
            else:
                return f"Error: HTTP {response.status_code} - {response.text}"
                
        except requests.exceptions.ConnectionError:
            return "Error: Could not connect to Ollama. Make sure Ollama is running on localhost:11434"
        except requests.exceptions.Timeout:
            return "Error: Request to Ollama timed out"
        except Exception as e:
            return f"Error communicating with Ollama: {str(e)}"
    
    def retrieve_relevant_chunks(self, query: str, top_k: int = 5) -> List[Dict[str, Any]]:
        """
        Retrieve the most relevant chunks for a query.
        
        Args:
            query: User query
            top_k: Number of top chunks to retrieve
            
        Returns:
            List of relevant chunks with similarity scores
        """
        if self.faiss_index is None:
            raise ValueError("Index not built. Run setup_index() first.")
        
        # Create embedding for the query
        query_embedding = self.embedding_model.encode([query])
        
        # Search the index
        distances, indices = self.faiss_index.search(query_embedding.astype(np.float32), top_k)
        
        # Get relevant chunks
        relevant_chunks = []
        for i, (distance, idx) in enumerate(zip(distances[0], indices[0])):
            if idx < len(self.chunks):  # Valid index
                chunk = self.chunks[idx].copy()
                chunk['similarity_score'] = float(distance)
                chunk['rank'] = i + 1
                relevant_chunks.append(chunk)
        
        return relevant_chunks
    
    def build_context(self, relevant_chunks: List[Dict[str, Any]]) -> str:
        """
        Build context string from relevant chunks.
        
        Args:
            relevant_chunks: List of relevant chunks
            
        Returns:
            Formatted context string
        """
        context_parts = []
        for chunk in relevant_chunks:
            context_parts.append(f"[Chunk {chunk['rank']}]: {chunk['text']}")
        
        return "\n\n".join(context_parts)
    
    def answer_query(self, query: str, top_k: int = 5) -> Dict[str, Any]:
        """
        Answer a query using the RAG system.
        
        Args:
            query: User question
            top_k: Number of chunks to retrieve
            
        Returns:
            Dictionary containing the answer and metadata
        """
        print(f"Processing query: {query}")
        print("-" * 50)
        
        # Retrieve relevant chunks
        relevant_chunks = self.retrieve_relevant_chunks(query, top_k)
        
        if not relevant_chunks:
            return {
                'query': query,
                'answer': "I couldn't find relevant information to answer your question.",
                'relevant_chunks': [],
                'context': ""
            }
        
        # Build context
        context = self.build_context(relevant_chunks)
        
        # Create prompt for Ollama
        prompt = f"""Based on the following context about Philippine history, please answer the question accurately and concisely.

Context:
{context}

Question: {query}

Answer: """
        
        print("Retrieving answer from Ollama...")
        answer = self.query_ollama(prompt)
        
        return {
            'query': query,
            'answer': answer,
            'relevant_chunks': relevant_chunks,
            'context': context
        }
    
    def print_detailed_response(self, result: Dict[str, Any]):
        """
        Print a detailed response with context and sources.
        
        Args:
            result: Result dictionary from answer_query
        """
        print("=" * 60)
        print("PHILIPPINE HISTORY RAG SYSTEM - QUERY RESULT")
        print("=" * 60)
        print(f"Query: {result['query']}")
        print("-" * 60)
        print("ANSWER:")
        print(result['answer'])
        print("-" * 60)
        print(f"RETRIEVED CONTEXT ({len(result['relevant_chunks'])} chunks):")
        
        for chunk in result['relevant_chunks']:
            print(f"\n[Chunk {chunk['rank']} - Score: {chunk['similarity_score']:.4f}]")
            print(chunk['text'][:200] + "..." if len(chunk['text']) > 200 else chunk['text'])
        
        print("=" * 60)

def main():
    """
    Main function to demonstrate the RAG system.
    """
    # Initialize the RAG system
    rag = PhilippineHistoryRAG(
        pdf_path="PHILIPPINE-HISTORY-SOURCE-BOOK-FINAL-SEP022021.pdf",
        embedding_model="all-MiniLM-L6-v2",
        ollama_model="gemma3:1b",  # Change this to your preferred Ollama model
        chunk_size=500,
        chunk_overlap=50
    )
    
    # Check if saved index exists
    if os.path.exists("faiss_index.index") and os.path.exists("chunks.pkl"):
        print("Found existing index files. Loading...")
        rag.load_index()
    else:
        print("No existing index found. Creating new index...")
        rag.setup_index()
        # Save for future use
        rag.save_index()
    
    # Example queries
    sample_queries = [
        "When did the EDSA People Power Revolution happen?",
        "Who was Jose Rizal?",
        "What happened during the Spanish colonization of the Philippines?",
        "Tell me about Ferdinand Marcos and Martial Law",
        "What was the Katipunan?"
    ]
    
    print("Philippine History RAG System is ready!")
    print("\nSample queries you can try:")
    for i, query in enumerate(sample_queries, 1):
        print(f"{i}. {query}")
    
    # Interactive query loop
    while True:
        print("\n" + "="*50)
        user_query = input("Enter your question about Philippine history (or 'quit' to exit): ").strip()
        
        if user_query.lower() in ['quit', 'exit', 'q']:
            print("Thank you for using the Philippine History RAG system!")
            break
        
        if not user_query:
            print("Please enter a valid question.")
            continue
        
        try:
            # Get answer
            result = rag.answer_query(user_query, top_k=3)
            
            # Print detailed response
            rag.print_detailed_response(result)
            
        except Exception as e:
            print(f"Error processing query: {str(e)}")

if __name__ == "__main__":
    main()

Required packages:
  - faiss-cpu
  - sentence-transformers
  - PyMuPDF
  - numpy
  - requests

Install with: pip install faiss-cpu sentence-transformers PyMuPDF numpy requests
--------------------------------------------------
Loading embedding model...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

No existing index found. Creating new index...
=== Setting up Philippine History RAG System ===
Loading PDF: PHILIPPINE-HISTORY-SOURCE-BOOK-FINAL-SEP022021.pdf
Processing 646 pages...
Extracted page 1/646
Extracted page 2/646
Extracted page 3/646
Extracted page 4/646
Extracted page 5/646
Extracted page 6/646
Extracted page 7/646
Extracted page 8/646
Extracted page 9/646
Extracted page 10/646
Extracted page 11/646
Extracted page 12/646
Extracted page 13/646
Extracted page 14/646
Extracted page 15/646
Extracted page 16/646
Extracted page 17/646
Extracted page 18/646
Extracted page 19/646
Extracted page 20/646
Extracted page 21/646
Extracted page 22/646
Extracted page 23/646
Extracted page 24/646
Extracted page 25/646
Extracted page 26/646
Extracted page 27/646
Extracted page 28/646
Extracted page 29/646
Extracted page 30/646
Extracted page 31/646
Extracted page 32/646
Extracted page 33/646
Extracted page 34/646
Extracted page 35/646
Extracted page 36/646
Extracted page 37/646
Extracted p

Batches:   0%|          | 0/127 [00:00<?, ?it/s]

Created embeddings with shape: (4061, 384)
Building FAISS index...
FAISS index built with 4061 vectors
=== Setup complete! ===

Index saved to faiss_index.index, chunks saved to chunks.pkl
Philippine History RAG System is ready!

Sample queries you can try:
1. When did the EDSA People Power Revolution happen?
2. Who was Jose Rizal?
3. What happened during the Spanish colonization of the Philippines?
4. Tell me about Ferdinand Marcos and Martial Law
5. What was the Katipunan?



Enter your question about Philippine history (or 'quit' to exit):  When did the EDSA People Power Revolution happen?


Processing query: When did the EDSA People Power Revolution happen?
--------------------------------------------------
Retrieving answer from Ollama...
PHILIPPINE HISTORY RAG SYSTEM - QUERY RESULT
Query: When did the EDSA People Power Revolution happen?
------------------------------------------------------------
ANSWER:
February 25, 1986.
------------------------------------------------------------
RETRIEVED CONTEXT (3 chunks):

[Chunk 1 - Score: 0.7608]
itarianism supplanted the democratic institutions. On February 25, 1986, after four days of bloodless EDSA People Power Revolution, Marcos was compelled to step down and flee with his family to Hawaii...

[Chunk 2 - Score: 0.8177]
itarianism supplanted the democratic institutions. On February 25, 1986, after four days of bloodless EDSA People Power Revolution, Marcos was compelled to step down and flee with his family to Hawaii...

[Chunk 3 - Score: 0.8408]
72, following a spate of bombings in Metro Manila. Constitutional authoritaria

Enter your question about Philippine history (or 'quit' to exit):  "Who is José Rizal and why is he important?


Processing query: "Who is José Rizal and why is he important?
--------------------------------------------------
Retrieving answer from Ollama...
PHILIPPINE HISTORY RAG SYSTEM - QUERY RESULT
Query: "Who is José Rizal and why is he important?
------------------------------------------------------------
ANSWER:
José Rizal was a Filipino nationalist who lived during the late 19th century. He is important because he highlighted the problems of the Philippines, identified enemies of his country, and advocated for effective solutions to overcome them, embodying a positive nationalism that inspired Filipinos to strive for security, prosperity, and happiness for their nation.
------------------------------------------------------------
RETRIEVED CONTEXT (3 chunks):

[Chunk 1 - Score: 0.5460]
y in which to seek the answer. How did Rizal live? Perhaps we should first ask What did he live for? Rizal lived for his country. How did he live for his country? He lived for it first by understandin...



Enter your question about Philippine history (or 'quit' to exit):  Tell me about the Spanish colonization of the Philippines.


Processing query: Tell me about the Spanish colonization of the Philippines.
--------------------------------------------------
Retrieving answer from Ollama...
PHILIPPINE HISTORY RAG SYSTEM - QUERY RESULT
Query: Tell me about the Spanish colonization of the Philippines.
------------------------------------------------------------
ANSWER:
The Spanish colonization of the Philippines began in 1521 with the arrival of European navigators like Magellan, marking the end of Spanish control of the islands. It was a period of intense struggle for freedom for the Filipino people, who migrated to escape oppression.
------------------------------------------------------------
RETRIEVED CONTEXT (3 chunks):

[Chunk 1 - Score: 0.5356]
ending the Spanish control of the islands in 1898. The set of materials included here presents the various facets of colonial life in the nineteenth century Philippines. These were written by various ...

[Chunk 2 - Score: 0.6715]
d the Arts. 2001 Hernandez, Jose Rhomm

Enter your question about Philippine history (or 'quit' to exit):  quit


Thank you for using the Philippine History RAG system!


In [23]:
!pytest teamified_assessment_unit_tests.py -v

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


platform linux -- Python 3.13.5, pytest-8.4.1, pluggy-1.6.0 -- /home/aditya/teamified_assessment/bin/python3.13
cachedir: .pytest_cache
rootdir: /home/aditya/teamified_assessment
plugins: mock-3.14.1, cov-6.2.1, langsmith-0.4.8, anyio-4.9.0
collected 24 items                                                                                                                                                                           [0m[1m

teamified_assessment_unit_tests.py::TestPhilippineHistoryRAGInit::test_init_default_parameters [32mPASSED[0m[33m                                                                                  [  4%][0m
teamified_assessment_unit_tests.py::TestPDFProcessing::test_load_and_extract_text_success [31mFAILED[0m[31m                                                                                       [  8%][0m
teamified_assessment_unit_tests.py::TestPDFProcessing::test_load_and_extract_text_file_not_found [32mPASSED[0m[31m                           