# PDF processing using langchain and llama-index

In [2]:
import os

from dotenv import load_dotenv
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_core.prompts import PromptTemplate
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_openai import AzureChatOpenAI
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [3]:
# Load variables from a local .env file into the process environment.
# The model clients created further down read their settings from there
# (for example AZURE_OPENAI_API_VERSION via os.getenv).
load_dotenv()

True

## langchain

In [6]:
def langchain_rag_pipeline(
    pdf_path: str,
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
    top_k: int = 3,
    min_page: "int | None" = 5,
):
    """Build a retrieval-augmented question-answering chain over one PDF.

    The PDF is loaded page by page, split into overlapping character chunks,
    embedded with ``sentence-transformers/all-mpnet-base-v2`` and indexed in a
    Chroma collection. No ``persist_directory`` is passed to Chroma, so this
    function does not write the index to disk.

    Args:
        pdf_path: Path of the PDF file to index.
        chunk_size: Maximum number of characters per chunk.
        chunk_overlap: Number of characters shared by consecutive chunks.
        top_k: Number of chunks retrieved for each question.
        min_page: Only chunks whose ``page`` metadata is ``>= min_page`` are
            searched. PyPDFLoader numbers pages from 0, so the default of 5
            skips the first five pages. Pass ``None`` to search every page.

    Returns:
        A runnable chain. ``chain.invoke(question)`` takes the question as a
        plain string and returns the chat model's message; read ``.content``
        for the answer text (no output parser is attached).

    Raises:
        ValueError: If no text chunks could be produced from ``pdf_path``.
    """
    # Load the PDF (one document per page) and split it into chunks.
    # add_start_index=True records each chunk's character offset in metadata.
    pages = PyPDFLoader(pdf_path).load()
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(pages)
    if not chunks:
        # Fail here with a clear message instead of deep inside the vector
        # store when there is nothing to embed (e.g. a PDF without text).
        raise ValueError(f"No text chunks could be extracted from {pdf_path!r}")

    # Embed the chunks and index them in Chroma.
    # NOTE(review): no collection name is given, so repeated calls in the same
    # kernel may add to the same default collection — confirm if re-running.
    vector_store = Chroma.from_documents(
        documents=chunks,
        embedding=HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-mpnet-base-v2"
        ),
    )

    # Similarity retriever, optionally restricted by page metadata.
    search_kwargs = {"k": top_k}
    if min_page is not None:
        search_kwargs["filter"] = {"page": {"$gte": min_page}}
    retriever = vector_store.as_retriever(
        search_type="similarity",
        search_kwargs=search_kwargs,
    )

    # Prompt that restricts the model to the retrieved context.
    prompt_template = PromptTemplate.from_template(
        """Answer the question based only on the following context:
        {context}
        
        Question: {question}
        """
    )

    def format_docs(docs):
        """Join the retrieved chunks into one context string."""
        return "\n\n".join(doc.page_content for doc in docs)

    llm = AzureChatOpenAI(
        model="gpt-4o", api_version=os.getenv("AZURE_OPENAI_API_VERSION")
    )

    # Chain order:
    # 1. Retrieve chunks for the question and format them as context
    #    (the question string itself is passed through unchanged).
    # 2. Fill the prompt with context and question.
    # 3. Send the prompt to the LLM and return its message as-is.
    rag_chain = (
        {"context": retriever | format_docs, "question": lambda x: x}
        | prompt_template
        | llm
    )

    return rag_chain

In [7]:
# Usage: build the LangChain pipeline for the sample PDF, ask one question and
# print the text of the model's reply.
# NOTE: the retriever built by langchain_rag_pipeline only searches chunks whose
# `page` metadata is >= 5, so content from earlier pages is never retrieved.
pdf_file_path = "data/Understanding_Climate_Change.pdf"
qa = langchain_rag_pipeline(pdf_file_path)
result = qa.invoke("What is the main topic of section 3?")
print(result.content)

The main topic of section 3 is **Global Vision**, which focuses on inspiring collective action and fostering hope through a shared vision for a sustainable future, including a healthy planet, thriving ecosystems, and equitable societies.


## llama-index

In [32]:
from chromadb import Client
from langchain.embeddings import HuggingFaceEmbeddings
from llama_index.core import SimpleDirectoryReader, StorageContext, VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.langchain import LangchainEmbedding
from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.vector_stores.chroma import ChromaVectorStore

In [29]:
# LangChain HuggingFace embedding model kept at notebook scope.
# NOTE(review): llamaindex_rag_pipeline constructs its own embedding model, so
# this instance is not referenced in the visible cells — confirm whether it is
# still needed before removing it.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")


def llamaindex_rag_pipeline(pdf_path: str, persist_dir: str = "llamaindex_db"):
    """Build a LlamaIndex query engine over one PDF, backed by a Chroma collection.

    The PDF is read with ``SimpleDirectoryReader``, split into 512-token
    sentence-aware chunks, embedded with a LangChain HuggingFace model and
    stored in the Chroma collection ``"docs"``. Retrieval is plain vector
    similarity (top 6 chunks); no keyword or hybrid search is configured.

    Args:
        pdf_path: Path of the PDF file to index.
        persist_dir: Currently unused. It is kept so existing callers that
            pass it keep working; the Chroma client created here is a plain
            ``Client()`` with no storage path.

    Returns:
        A query engine. ``engine.query(question)`` returns a response object
        whose ``.response`` attribute holds the answer text.
    """
    # Load the PDF into LlamaIndex documents.
    documents = SimpleDirectoryReader(input_files=[pdf_path]).load_data()

    # Chroma-backed vector store.
    # NOTE(review): get_or_create_collection reuses an existing "docs"
    # collection, so repeated calls in one kernel may append duplicate
    # chunks — confirm if this function is re-run.
    client = Client()
    vector_store = ChromaVectorStore(
        chroma_collection=client.get_or_create_collection("docs")
    )
    # The index receives its vector store through a StorageContext. A bare
    # `vector_store=` keyword to from_documents is not one of its parameters,
    # so the Chroma store would not be attached to the index.
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    # Wrap the LangChain embedding model so LlamaIndex can call it.
    lc_embed_model = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-mpnet-base-v2"
    )
    embed_model = LangchainEmbedding(lc_embed_model)

    index = VectorStoreIndex.from_documents(
        documents,
        storage_context=storage_context,
        transformations=[SentenceSplitter(chunk_size=512)],
        embed_model=embed_model,
    )

    llm = AzureOpenAI(
        model="gpt-4.1",
        engine="gpt-4.1",
        api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
    )

    query_engine = index.as_query_engine(
        llm=llm,
        similarity_top_k=6,
    )

    return query_engine

In [30]:
# Usage: build the LlamaIndex query engine for the sample PDF, ask one question
# and print the answer text.
pdf_file_path = "data/Understanding_Climate_Change.pdf"
engine = llamaindex_rag_pipeline(pdf_file_path)
response = engine.query("What are the key findings in this document?")
print(response.response)

The document highlights several key findings regarding climate change and the actions needed to address it:

1. **Intergenerational Equity and Responsibility:** There is a strong emphasis on the responsibility to protect the rights and well-being of future generations. This involves long-term thinking, sustainable resource management, and fostering a sense of stewardship and legacy.

2. **Holistic and Inclusive Approaches:** Addressing climate change requires integrating environmental, social, and economic dimensions. Collaboration across sectors, innovation, and global solidarity are essential for effective climate action.

3. **Empowerment and Education:** Educating and empowering individuals, especially youth, is crucial. Integrating climate education at all levels and supporting youth leadership can drive positive change and build a culture of sustainability.

4. **Climate Justice and Social Equity:** The impacts of climate change are not evenly distributed, with vulnerable and mar