# Pipeline

Data ingestion -> Document Store (Azure AI Search)

## 1. Ingest pdf(s)

Ingest the PDF(s) found in the `data/` folder.

In [2]:
import base64

def encode_pdf_to_base64(file_path):
    """
    Return the PDF at ``file_path`` as a base64 ``data:`` URI string.

    The file is read in binary mode, base64-encoded, and prefixed with
    ``data:application/pdf;base64,``. The document is inlined this way
    because Azure MaaS endpoints usually don't accept local paths.
    """
    with open(file_path, "rb") as handle:
        raw_bytes = handle.read()

    b64_text = base64.b64encode(raw_bytes).decode("utf-8")

    # Mistral expects this exact prefix in front of the encoded payload
    return "data:application/pdf;base64," + b64_text

## 2. Run OCR

Run OCR to extract the text of each page using the Mistral Document AI model (https://docs.mistral.ai/capabilities/document_ai), which is hosted on Azure AI Foundry.

In [None]:
# currently broken

import glob
import os
import dotenv
import requests

# Load environment variables from .env file
dotenv.load_dotenv()

# OCR job: send every PDF in data/ to the OCR endpoint and collect the
# per-page markdown text in `results`. Failures are reported per file and
# never abort the remaining files.

# One entry per successfully processed PDF:
# {"source_context": file name, "file_path": path, "pages": [{"page_num", "text"}, ...]}
results = []
pdf_files = glob.glob(os.path.join("data", "*.pdf"))

print(f"Found {len(pdf_files)} PDFs. Starting OCR job...\n")

headers = {
    "Authorization": f"Bearer {os.getenv('AZURE_OPENAI_API_KEY')}",
    "Content-Type": "application/json"
}

# (connect, read) timeouts in seconds. Without a timeout `requests` waits
# forever on a stalled connection and the whole cell hangs.
OCR_TIMEOUT_SECONDS = (10, 300)

for file_path in pdf_files:
    file_name = os.path.basename(file_path)
    print(f"Processing: {file_name}...", end=" ")

    try:
        # 1. Prepare Payload (the PDF is inlined as a base64 data URI)
        payload = {
            "model": "mistral-document-ai-2505",
            "document": {
                "type": "document_url",
                "document_url": encode_pdf_to_base64(file_path)
            },
            "include_image_base64": False # optionally set to true
        }

        # 2. Send Request
        response = requests.post(
            os.getenv("AZURE_OPENAI_ENDPOINT"),
            headers=headers,
            json=payload,
            timeout=OCR_TIMEOUT_SECONDS,
        )

        # 3. Handle Response
        if response.status_code != 200:
            print(f"Error {response.status_code}: {response.text}")
            continue

        try:
            data = response.json()
        except ValueError as e:
            # HTTP 200 but the body is not JSON. On its own the decoder message
            # ("Expecting value: line 1 column 1 (char 0)") says nothing about
            # what came back, so show the content type and start of the body.
            # NOTE(review): this typically points at the URL rather than the
            # payload - confirm AZURE_OPENAI_ENDPOINT holds the full OCR route.
            content_type = response.headers.get("Content-Type", "unknown")
            print(
                f"Failed: {e} (HTTP 200 but the body is not JSON; "
                f"Content-Type: {content_type!r}; "
                f"body starts with: {response.text[:200]!r})"
            )
            continue

        # preserve page location
        page_data = [
            {
                "page_num": i + 1,  # Humans start at 1
                "text": page['markdown']
            }
            for i, page in enumerate(data.get('pages', []))
        ]

        results.append({
            "source_context": file_name,
            "file_path": file_path,
            "pages": page_data # Store the list, not the string
        })
        print("Done.")

    except Exception as e:
        # Best-effort batch job: report the failure and move on to the next file
        print(f"Failed: {str(e)}")

print("\nAll files processed.")

Found 2 PDFs. Starting OCR job...

Processing: embedding_retrieval.pdf... Failed: Expecting value: line 1 column 1 (char 0)
Processing: refrag_research.pdf... Failed: Expecting value: line 1 column 1 (char 0)

All files processed.


## 3. Chunking

Chunk the OCR text with a simple text splitter (https://learn.microsoft.com/en-us/azure/search/vector-search-how-to-chunk-documents#langchain-data-chunking-example)

In [None]:
# !pip install langchain-text-splitters
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter settings, kept together so they are easy to tune
_splitter_settings = {
    # Characters per chunk (adjust based on your embedding model limit)
    "chunk_size": 3000,
    # Overlap keeps context between neighbouring chunks
    "chunk_overlap": 500,
    # Try to split by paragraphs first, then lines, then words
    "separators": ["\n\n", "\n", " ", ""],
}

splitter = RecursiveCharacterTextSplitter(**_splitter_settings)

In [None]:
# Flat list of chunk records built from the `results` of the OCR step.
# Each page is split on its own, so every chunk keeps its page number.
chunked_data = []

for doc in results:
    source_name = doc['source_context']

    for page in doc['pages']:
        page_number = page['page_num']
        pieces = splitter.split_text(page['text'])

        # One record per piece; the index restarts at 0 on every page
        chunked_data.extend(
            {
                "chunk_id": f"{source_name}_p{page_number}_{idx}",
                "source": source_name,
                "page": page_number,
                "text": piece,
            }
            for idx, piece in enumerate(pieces)
        )

print(f"Generated {len(chunked_data)} chunks with page numbers.")

In [None]:
# Sanity check: show the source and a short excerpt of the first two chunks
for chunk in chunked_data[:2]:
    print(f"Chunk from {chunk['source']}")
    excerpt = chunk['text'][:150]  # first 150 characters only
    print(f"{excerpt}...")
    print("\n")

## 4. Embedding

Generate vector embeddings per chunk using the Azure OpenAI embedding model. (https://learn.microsoft.com/en-us/azure/ai-foundry/openai/how-to/embeddings?view=foundry-classic&tabs=csharp)

In [None]:
from openai import AzureOpenAI
import os

# Setup Client
# The Azure OpenAI client gets its own name so that get_embedding keeps
# working when `client` is rebound later (the upload step assigns a
# SearchClient to that name).
embedding_client = AzureOpenAI(
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version="2024-02-01",
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT")
)
client = embedding_client  # original name, kept for code that still refers to it


def get_embedding(text, model="text-embedding-3-large"):
    """
    Return the embedding vector for ``text``.

    Args:
        text: The string to embed. Newlines are replaced with spaces before
            the request is sent.
        model: Name passed as ``model`` to the embeddings API. Defaults to
            the value this notebook has always used.

    Returns:
        The ``embedding`` of the first item in the API response.

    Raises:
        Whatever the client raises for a failed request; errors are left to
        the caller to handle.
    """
    text = text.replace("\n", " ") # Clean newlines to avoid token weirdness
    response = embedding_client.embeddings.create(
        input=[text],
        model=model,
    )
    return response.data[0].embedding

# Apply to all chunks: each chunk that embeds successfully gets a 'values'
# key holding its vector. Chunks that fail are reported and left without it.
print(f"Embedding {len(chunked_data)} chunks...")

failed_chunks = []  # indices of chunks whose embedding call raised

for i, chunk in enumerate(chunked_data):
    try:
        chunk['values'] = get_embedding(chunk['text'])
    except Exception as e:
        # Best-effort: remember the failure and carry on with the next chunk
        failed_chunks.append(i)
        print(f"\nError on chunk {i}: {e}")
        continue

    if i % 10 == 0:
        print(".", end="") # Progress bar

if failed_chunks:
    # Do not claim success when some chunks ended up without a vector
    print(
        f"\nDone with errors: {len(failed_chunks)} of {len(chunked_data)} "
        f"chunks have no embedding (indices: {failed_chunks})."
    )
else:
    print("\nDone! Embeddings generated.")

## 5. Vector DB

Index in Azure AI Search: store chunk text + metadata (document id, page number, folder, category, source_link) + embedding vector; enable vector search.

Get the data ready for upload.

In [None]:
import uuid

# Turn every embedded chunk into a document for the search index.
documents_to_upload = []
skipped_chunks = 0  # chunks that have no embedding vector

print(f"Preparing payload from {len(chunked_data)} chunks...")

for chunk in chunked_data:
    # A chunk whose embedding call failed has no 'values' key. Skip it
    # instead of letting a KeyError abort the whole preparation step.
    if 'values' not in chunk:
        skipped_chunks += 1
        continue

    # Map to your Azure Search Index Schema
    # NOTE(review): field names must match the index definition - confirm.
    azure_doc = {
        "id": str(uuid.uuid4()),
        "content": chunk['text'],
        "contentVector": chunk['values'],
        # Citation will look like: "Source: report.pdf (Page 4)"
        "location": f"Source: {chunk['source']} (Page {chunk['page']})"
    }

    documents_to_upload.append(azure_doc)

if skipped_chunks:
    print(f"Skipped {skipped_chunks} chunks without an embedding.")

print(f"Ready to upload {len(documents_to_upload)} documents.")

Upload to Azure AI Search.

In [None]:
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient

# Initialize Client
# NOTE: this reuses the name `client`, replacing whatever it referred to in
# earlier cells.
credential = AzureKeyCredential(os.getenv("AZURE_SEARCH_API_KEY"))
client = SearchClient(endpoint=os.getenv("AZURE_SEARCH_ENDPOINT"),
                      index_name=os.getenv("AZURE_SEARCH_INDEX_NAME"),
                      credential=credential)

# Upload in batches (Azure has a limit of ~1000 docs per request)
BATCH_SIZE = 1000
MAX_REPORTED_FAILURES = 5  # cap on per-document error lines printed per batch
failed_uploads = 0

for i in range(0, len(documents_to_upload), BATCH_SIZE):
    batch = documents_to_upload[i : i + BATCH_SIZE]

    try:
        result = client.upload_documents(documents=batch)
    except Exception as e:
        # The whole request failed; count every document in it and continue
        failed_uploads += len(batch)
        print(f"Error uploading batch {i}: {e}")
        continue

    # A request can succeed while individual documents are rejected, so
    # check the per-document result instead of assuming success.
    rejected = [item for item in result if not item.succeeded]
    if not rejected:
        print(f"Uploaded batch {i} - {i+len(batch)}: Success")
        continue

    failed_uploads += len(rejected)
    print(
        f"Uploaded batch {i} - {i+len(batch)}: "
        f"{len(rejected)} of {len(batch)} documents failed"
    )
    for item in rejected[:MAX_REPORTED_FAILURES]:
        print(f"  {item.key}: {item.error_message}")

if failed_uploads:
    print(f"Upload finished with {failed_uploads} failed documents.")
else:
    print("Upload Complete.")

## 6. Testing

Validate end-to-end: run a few test queries, confirm top results point back to the right page/chunk, and iterate on chunking/cleaning. 

## 7. Use AI Search MCP on GHCP

An MCP server for Azure AI Search (for vector/hybrid search) needs to be set up for this step: https://github.com/tomgutt/azure-ai-search-mcp

## 8. Integrate MCP with OpenWebUI

OpenWebUI doesn't natively support stdio MCP configurations; use the `mcpo` Python library to make it work.