# RAG TAG: Simple RAG Pipeline with Ollama and Mistral
    
This notebook demonstrates a minimal implementation of a Retrieval-Augmented Generation (RAG) workflow using:

- PyMuPDF (fitz) for text extraction,
- SentenceTransformers for embeddings,
- FAISS for vector search,
- Ollama with Mistral for text generation.

No LangChain or other high-level frameworks are used.
# Introduction to Simple RAG

Retrieval-Augmented Generation (RAG) is a hybrid approach that combines information retrieval with generative models. It enhances the performance of language models by incorporating external knowledge, which improves accuracy and factual correctness.

In a Simple RAG setup, we follow these steps:

1. **Data Ingestion**: Load and preprocess the text data.
2. **Chunking**: Break the data into smaller chunks to improve retrieval performance.
3. **Embedding Creation**: Convert the text chunks into numerical representations using an embedding model.
4. **Semantic Search**: Retrieve relevant chunks based on a user query.
5. **Response Generation**: Use a language model to generate a response based on retrieved text.

This notebook implements a Simple RAG approach, evaluates the model's response, and explores various improvements.


# Import necessary libraries

In [None]:
import fitz  # PyMuPDF for PDF extraction
import numpy as np
import requests  # For Ollama API
from sentence_transformers import SentenceTransformer  # For local embeddings
import faiss  # For vector search
import json  # For loading validation data
import os
import time


# 1. Extract text from PDF

In [None]:
def extract_text_from_pdf(pdf_path):
    """Extract all text from a PDF file.

    Args:
        pdf_path: Path of the PDF document to read.

    Returns:
        The text of every page, concatenated in page order.
    """
    # Opening the document as a context manager closes it (and releases the
    # underlying file handle) even if reading a page raises.
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)


# 2. Chunk text into segments

In [None]:
def chunk_text(text, chunk_size=1000, overlap=200):
    """Split text into overlapping chunks of specified size.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by consecutive chunks; must be
            non-negative and smaller than ``chunk_size``.

    Returns:
        A list of substrings of ``text``. Each chunk starts
        ``chunk_size - overlap`` characters after the previous one, and the
        last chunk ends at the end of ``text``. Empty text yields ``[]``.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    # With overlap >= chunk_size the window would never advance and the loop
    # below would not terminate.
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap
    chunks = []
    start = 0
    while start < len(text):
        end = min(start + chunk_size, len(text))
        chunks.append(text[start:end])
        if end == len(text):
            # The text is fully covered; another iteration would only emit a
            # suffix of the chunk just appended.
            break
        start += step
    return chunks

# 3. Create embeddings using local Sentence Transformer model

In [None]:
# SentenceTransformer instances already constructed, keyed by model name, so
# that repeated calls do not rebuild the model each time.
_EMBEDDING_MODELS = {}


def create_embeddings(texts, model_name='all-MiniLM-L6-v2'):
    """Generate unit-length embeddings for a list of text chunks.

    Args:
        texts: List of strings to embed.
        model_name: Name of the SentenceTransformer model to use.

    Returns:
        A float32 NumPy array with one L2-normalized row per input text, or
        an empty 1-D float32 array when ``texts`` is empty.
    """
    if len(texts) == 0:
        # Nothing to encode; skip constructing the model entirely.
        return np.empty((0,), dtype='float32')

    # Construct each model at most once per process and reuse it afterwards.
    model = _EMBEDDING_MODELS.get(model_name)
    if model is None:
        model = SentenceTransformer(model_name)
        _EMBEDDING_MODELS[model_name] = model

    # NOTE(review): assumes encode() returns a 2-D (n_texts, dim) array for
    # list input - confirm against the installed sentence-transformers version.
    embeddings = np.asarray(model.encode(texts), dtype='float32')

    # Normalize rows so that an inner product equals cosine similarity.
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    # Leave all-zero rows untouched instead of dividing by zero (NaN rows).
    norms[norms == 0] = 1.0
    return embeddings / norms

# 4. Create FAISS index for vector search

In [None]:
def create_vector_index(embeddings):
    """Build a flat inner-product FAISS index containing ``embeddings``.

    Args:
        embeddings: 2-D array of vectors; the second axis gives the vector
            dimension used to size the index.

    Returns:
        The populated ``faiss.IndexFlatIP`` instance.
    """
    n_dims = embeddings.shape[1]
    # Inner product equals cosine similarity when the vectors are unit length.
    vector_index = faiss.IndexFlatIP(n_dims)
    vector_index.add(embeddings)
    return vector_index

# 5. Search for relevant chunks

In [None]:
def search_vectors(index, query_embedding, texts, k=3):
    """Find the k most relevant text chunks for a query embedding.

    Args:
        index: Vector index exposing ``search(queries, k)`` that returns a
            ``(similarities, indices)`` pair of 2-D arrays.
        query_embedding: 1-D query vector; it is L2-normalized here.
        texts: The chunks that were indexed, in the order they were added.
        k: Maximum number of results to return.

    Returns:
        A tuple ``(chunks, similarities)``: the matching chunks, best first,
        and an array of their similarity scores. Fewer than ``k`` results are
        returned when fewer chunks are available.
    """
    # Never ask for more neighbours than there are chunks. FAISS reports
    # missing neighbours with the label -1, and texts[-1] would silently
    # return the last chunk as a bogus hit.
    k = min(k, len(texts))
    if k <= 0:
        return [], np.empty(0, dtype='float32')

    # Normalize the query; skip the division for an all-zero vector.
    query = np.asarray(query_embedding)
    norm = np.linalg.norm(query)
    if norm > 0:
        query = query / norm
    query = query.astype('float32').reshape(1, -1)

    # Search for similar vectors
    similarities, indices = index.search(query, k)

    # Keep only labels that refer to an actual chunk.
    labels = indices[0]
    valid = (labels >= 0) & (labels < len(texts))
    return [texts[i] for i in labels[valid]], similarities[0][valid]

# 6. Query Ollama with Mistral model

In [None]:
def query_ollama_mistral(prompt, model="mistral", timeout=120):
    """Send a prompt to a local Ollama server and return the generated text.

    Args:
        prompt: The full prompt text to send.
        model: Name of the Ollama model to query.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        The generated text, or a string starting with ``"Error: "`` if the
        request fails or the reply cannot be parsed.
    """
    try:
        response = requests.post(
            "http://localhost:11434/api/generate",
            json={
                "model": model,
                "prompt": prompt,
                # /api/generate streams newline-delimited JSON by default,
                # which response.json() cannot parse; request one JSON object.
                "stream": False,
            },
            # A non-streamed reply only arrives once generation has finished,
            # so the wait can be considerably longer than for a first token.
            timeout=timeout,
        )
        # Surface HTTP errors (e.g. model not pulled) instead of failing
        # later on a missing "response" key.
        response.raise_for_status()
        return response.json()["response"]
    except (requests.RequestException, ValueError, KeyError) as e:
        # Best-effort by design: report the problem and return it as text.
        print(f"Error querying Ollama: {e}")
        return f"Error: {e}"

# 7. Generate response

In [None]:
def generate_response(query, context):
    """Ask the language model to answer ``query`` using only ``context``.

    The prompt instructs the model to reply with a fixed "not enough
    information" sentence when the context does not contain the answer.

    Args:
        query: The user's question.
        context: Retrieved text to ground the answer in.

    Returns:
        Whatever ``query_ollama_mistral`` returns for the assembled prompt.
    """
    pad = "    "
    # Every prompt line carries a four-space indent; the blank separator
    # lines consist of the indent alone.
    prompt_lines = [
        "",
        pad + "Please answer the following question based only on the provided context information.",
        pad + 'If the answer cannot be determined from the context, respond with "I don\'t have enough information to answer that."',
        pad,
        pad + "Context:",
        f"{pad}{context}",
        pad,
        f"{pad}Question: {query}",
        pad,
        pad + "Answer:",
        pad,
    ]
    return query_ollama_mistral("\n".join(prompt_lines))

# 8. Complete RAG pipeline

In [None]:
def rag_pipeline(query, file_path):
    """Run the complete RAG pipeline from document to answer.

    Args:
        query: The question to answer.
        file_path: Path of the source document. Files ending in ``.pdf``
            (any letter case) are read with ``extract_text_from_pdf``; any
            other file is read as UTF-8 text.

    Returns:
        A tuple ``(response, relevant_chunks, similarities)``. When the file
        is missing or contains no text, ``response`` is an error message and
        the other two elements are empty lists.
    """
    # Check if file exists
    if not os.path.exists(file_path):
        return f"Error: File {file_path} not found", [], []

    # Extract text based on file type (suffix match is case-insensitive so
    # that e.g. "REPORT.PDF" is not read as plain text).
    if file_path.lower().endswith('.pdf'):
        text = extract_text_from_pdf(file_path)
    else:
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()

    # Chunk the text
    chunks = chunk_text(text)
    print(f"Created {len(chunks)} text chunks")

    # Without chunks there is nothing to embed, index, or retrieve.
    if not chunks:
        return f"Error: No text found in {file_path}", [], []

    # Embed the chunks and index them for similarity search
    chunk_embeddings = create_embeddings(chunks)
    index = create_vector_index(chunk_embeddings)

    # Embed the query through the same function as the chunks so both always
    # come from the same model, instead of constructing a second model here.
    query_embedding = create_embeddings([query])[0]

    # Search for relevant chunks
    relevant_chunks, similarities = search_vectors(index, query_embedding, chunks)

    # Join retrieved contexts
    context = "\n\n".join(relevant_chunks)

    # Generate response using Mistral; perf_counter is a monotonic clock
    # suited to measuring elapsed time.
    start_time = time.perf_counter()
    response = generate_response(query, context)
    elapsed = time.perf_counter() - start_time

    print(f"Response generated in {elapsed:.2f} seconds")

    return response, relevant_chunks, similarities

# 9. Example usage

# Make sure Ollama is running and Mistral model is downloaded (ollama pull mistral)


In [None]:
# Prerequisite: a running Ollama server with the Mistral model available
# (ollama pull mistral).
file_path = "data/quantum.txt"  # Update with your actual file path
query = "What are the stages in the path to practical quantum computing?"

print("Running RAG pipeline with Ollama + Mistral...\n")
response, chunks, scores = rag_pipeline(query, file_path)

# Report the answer, then a short preview of each retrieved chunk.
print("\n--- Results ---")
print(f"Query: {query}")
print(f"\nResponse from Mistral:\n{response}")
print("\nTop retrieved chunks:")
for rank, (chunk, score) in enumerate(zip(chunks, scores), start=1):
    print(f"\nChunk {rank} (similarity score: {score:.4f}):")
    # Long chunks are cut to their first 200 characters plus an ellipsis.
    if len(chunk) > 200:
        print(chunk[:200] + "...")
    else:
        print(chunk)

# 10. Evaluation with validation data (optional)

In [None]:
def evaluate_with_validation_data(val_file_path, document_path):
    """Evaluate the RAG pipeline using validation data.

    Args:
        val_file_path: Path of a UTF-8 JSON file holding a list of objects
            with the keys ``"question"``, ``"ideal_answer"`` and
            ``"has_answer"``.
        document_path: Path of the document passed to ``rag_pipeline``.

    Returns:
        A list with one result dict per successfully evaluated item. Items
        that fail are reported and skipped; an unreadable or malformed
        validation file yields an empty list.
    """
    # Load the validation set; read errors and invalid JSON/encoding
    # (both ValueError subclasses) abort the evaluation.
    try:
        with open(val_file_path, 'r', encoding='utf-8') as f:
            val_data = json.load(f)
    except (OSError, ValueError) as e:
        print(f"Evaluation error: {e}")
        return []

    if not isinstance(val_data, list):
        print("Evaluation error: validation data must be a JSON list")
        return []

    results = []
    for item in val_data:
        # Handle failures per item so one bad entry does not discard the
        # results already collected for the others.
        try:
            query = item["question"]
            print(f"Testing: {query}")

            response, chunks, _ = rag_pipeline(query, document_path)

            # Preview of the best-matching chunk, capped at 200 characters.
            top_chunk = chunks[0] if chunks else ""
            if len(top_chunk) > 200:
                top_chunk = top_chunk[:200] + "..."

            results.append({
                "question": query,
                "generated_answer": response,
                "ideal_answer": item["ideal_answer"],
                "has_answer": item["has_answer"],
                "top_chunk": top_chunk,
            })
        except Exception as e:
            print(f"Evaluation error: {e}")

    return results

# Uncomment to run evaluation
# val_results = evaluate_with_validation_data("data/val.json", file_path)
# print("\n--- Evaluation Results ---")
# for result in val_results:
#     print(f"Q: {result['question']}")
#     print(f"Generated: {result['generated_answer']}")
#     print(f"Ideal: {result['ideal_answer']}")
#     print("-" * 50)