In [1]:
# Standard library imports
import os
import sys
from pathlib import Path

# LangChain Document Loaders
from langchain_community.document_loaders import PyPDFLoader

# LangChain Text Splitters
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Ollama Integration
from langchain_ollama import OllamaEmbeddings, ChatOllama

# ChromaDB Vector Store
from langchain_chroma import Chroma

# LangChain Core Components
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

print("âœ“ All imports successful!")
print("âœ“ Ready for local offline RAG!")
print(f"\nPython version: {sys.version}")

âœ“ All imports successful!
âœ“ Ready for local offline RAG!

Python version: 3.12.2 (tags/v3.12.2:6abddd9, Feb  6 2024, 21:26:36) [MSC v.1937 64 bit (AMD64)]


In [2]:
!ollama list

NAME                        ID              SIZE      MODIFIED     
gemma3:270m                 e7d36fb2c3b3    291 MB    5 hours ago     
mxbai-embed-large:latest    468836162de7    669 MB    6 hours ago     
tinyllama:latest            2644915ede35    637 MB    6 hours ago     
nomic-embed-text:latest     0a109f422b47    274 MB    18 hours ago    
gpt-oss:120b-cloud          569662207105    -         23 hours ago    


In [4]:
print("Testing Ollama connection...\n")

try:
    test_llm = ChatOllama(model="gemma3:270m", temperature=0)
    response = test_llm.invoke("Say 'Hello! I am running locally on your machine!'")
    
    print("âœ“ Ollama is working!")
    print(f"Response: {response.content}")
    
except Exception as e:
    print(f"âœ— Error connecting to Ollama: {e}")
    print("\nMake sure Ollama is running. Try: ollama serve")

Testing Ollama connection...

âœ“ Ollama is working!
Response: Hello!



In [15]:
#1.LOAD PDF
pdf_path = "attention.pdf"
loader = PyPDFLoader(pdf_path)
document = loader.lazy_load()
#2.CHUNKING
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1024,
    chunk_overlap = 128,
    length_function = len,
    separators = ["\n\n", "\n", " ", ""]
)
chunk = text_splitter.split_documents(document)
#3.EMBEDDING
embedding = OllamaEmbeddings(
    model = "nomic-embed-text:latest",
)
#4.VECTOR DB
persist_directory = "./chroma_db"
vector_store = Chroma.from_documents(
    documents=chunk,
    embedding=embedding,
    persist_directory=persist_directory,
    collection_name="local_rag_ollama.ipynb"
)


In [21]:
#RETRIVER
retriever = vector_store.as_retriever(
    search_type = "similarity",
    kwargs = {"k":4}
)

#OLLAMA
llm = ChatOllama(
    model = "gemma3:270m",
    temperature= 0
)

In [26]:
system_prompt = (
    "You are a helpful assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer based on the context, say that you don't know. "
    "Keep the answer concise and accurate.\n\n"
    "Context: {context}\n\n"
    "Question: {question}"
)

prompt = ChatPromptTemplate.from_template(system_prompt)

def format_docs(docs):
    """Format retrieved documents into a single string."""
    return "\n\n".join(doc.page_content for doc in docs)

rag_chain = (
    {
      "context" : retriever | format_docs,
      "question" : RunnablePassthrough()
    }  
    | prompt
    | llm
    | StrOutputParser()
)
print("âœ“ RAG chain created successfully using LCEL!")
print("\nRAG Pipeline Flow:")
print("  1. User provides a query")
print("  2. Retriever finds top 4 relevant chunks (local ChromaDB)")
print("  3. Chunks are formatted as context")
print("  4. Context + question formatted with prompt template")
print("  5. Local LLM (gemma3:1b) generates answer")
print("  6. Answer parsed and returned")
print("\nðŸ”’ Everything runs locally on your machine!")

âœ“ RAG chain created successfully using LCEL!

RAG Pipeline Flow:
  1. User provides a query
  2. Retriever finds top 4 relevant chunks (local ChromaDB)
  3. Chunks are formatted as context
  4. Context + question formatted with prompt template
  5. Local LLM (gemma3:1b) generates answer
  6. Answer parsed and returned

ðŸ”’ Everything runs locally on your machine!


In [28]:
query1 = "What is the main topic or contribution of this document?"
answer =rag_chain.invoke(query1)
print("=" * 80)
print("ANSWER:")
print("=" * 80)
print(answer)
print("\n" + "=" * 80)

# Show source documents
print("\nSOURCE DOCUMENTS USED:")
print("=" * 80)
retrieved_docs = retriever.invoke(query1)
for i, doc in enumerate(retrieved_docs):
    print(f"\nDocument {i+1}:")
    print(f"  Page: {doc.metadata.get('page', 'N/A')}")
    print(f"  Content: {doc.page_content[:200]}...")
    print("-" * 80)

ANSWER:
The main topic or contribution of this document is the development of a transformer-based model for natural language processing, specifically focusing on the ability of the model to handle long-range dependencies in the input sequence.


SOURCE DOCUMENTS USED:

Document 1:
  Page: 12
  Content: Attention Visualizations
Input-Input Layer5
It
is
in
this
spirit
that
a
majority
of
American
governments
have
passed
new
laws
since
2009
making
the
registration
or
voting
process
more
difficult
.
<EOS...
--------------------------------------------------------------------------------

Document 2:
  Page: 13
  Content: Input-Input Layer5
The
Law
will
never
be
perfect
,
but
its
application
should
be
just
-
this
is
what
we
are
missing
,
in
my
opinion
.
<EOS>
<pad>
The
Law
will
never
be
perfect
,
but
its
application
sh...
--------------------------------------------------------------------------------

Document 3:
  Page: 9
  Content: Table 4: The Transformer generalizes well to English cons

In [31]:

# Example Query 3: Your custom question
custom_query = "What specific details are mentioned about the methodology or approach?"

print(f"Query: {custom_query}")
print("\nProcessing locally...\n")

answer = rag_chain.invoke(custom_query)

print("=" * 80)
print("ANSWER:")
print("=" * 80)
print(answer)
print("\n" + "=" * 80)

Query: What specific details are mentioned about the methodology or approach?

Processing locally...

ANSWER:
The methodology or approach used to address the question is described in Figure 3.


