# Ollama PDF RAG Notebook

## Import Libraries


In [1]:
# Imports
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_community.document_loaders import UnstructuredHTMLLoader
from langchain_community.document_loaders import UnstructuredFileLoader
from langchain_ollama import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_ollama.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

# Jupyter-specific imports
from IPython.display import display, Markdown

# Set environment variable for protobuf
import os
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"

In [6]:
import os
os.environ['GOOGLE_API_KEY'] = ""

## Load PDF

In [2]:
folder_path = "./data/"

In [3]:
# Load PDF
all_documents = []  # Stores all loaded documents
for filename in os.listdir(folder_path):
    local_path = os.path.join(folder_path,filename)
    if local_path:
        # loader = UnstructuredPDFLoader(file_path=local_path)
        loader = UnstructuredMarkdownLoader(file_path=local_path) 
        # loader = UnstructuredHTMLLoader(file_path=local_path)  better than markdwon
        # loader = UnstructuredFileLoader(file_path=local_path) good
        data = loader.load()
        all_documents.extend(data)  # Add to combined list
        print(f" loaded successfully: {local_path}")
    else:
        print("not Upload a  file : {local_path}")

 loaded successfully: ./data/academmic-new.md
 loaded successfully: ./data/bus-new.md
 loaded successfully: ./data/cutoof-new.md
 loaded successfully: ./data/holidays.md
 loaded successfully: ./data/lab-end.md
 loaded successfully: ./data/placements.md
 loaded successfully: ./data/results-new.md
 loaded successfully: ./data/sem-end-new.md
 loaded successfully: ./data/syllabus.md
 loaded successfully: ./data/time_tables.md


## Split text into chunks

In [4]:
# Split text into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_documents(all_documents)
print(f"Text split into {len(chunks)} chunks")

Text split into 793 chunks


## Create vector database

In [7]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings
embedding_function = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

In [8]:
# Create vector database
vector_db = Chroma.from_documents(
    documents=chunks,
    embedding=embedding_function,
    collection_name="local-rag",
    persist_directory="./chroma_db"
)
print("Vector database created successfully")

Vector database created successfully


## Set up LLM and Retrieval

In [9]:
from langchain_google_genai import ChatGoogleGenerativeAI

In [10]:
# Set up LLM and retrieval
#local_model = "llama3.2"  # or whichever model you prefer
# llm = ChatOllama(model=local_model)
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash")

In [11]:
vector_db = Chroma(
    collection_name="local-rag",
    embedding_function=embedding_function,
    persist_directory="./chroma_db"
)

In [12]:
# Query prompt template
QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""You are an AI language model assistant. Your task is to generate 2
    different versions of the given user question to retrieve relevant documents from
    a vector database. By generating multiple perspectives on the user question, your
    goal is to help the user overcome some of the limitations of the distance-based
    similarity search. Provide these alternative questions separated by newlines.
    Original question: {question}""",
)

# Set up retriever
retriever = MultiQueryRetriever.from_llm(
    vector_db.as_retriever(), 
    llm,
    prompt=QUERY_PROMPT
)

## Create chain

In [13]:
# RAG prompt template
template = """Answer the question based ONLY on the following context:
{context}
Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

In [14]:
# Create chain
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

## Chat with PDF

In [15]:
def chat_with_pdf(question):
    """
    Chat with the PDF using the RAG chain.
    """
    return display(Markdown(chain.invoke(question)))

In [18]:
# Example 1
chat_with_pdf("What are the placements in CSE with packages?")

Based on the provided text, here's a partial list of CSE placements with packages.  The data is incomplete and inconsistently formatted across the documents, making a fully comprehensive answer impossible:


* **ARMIET Asst Professor (CSE):** 6.6 Lakhs
* **Cognida. ai. Software Engineer:** 7.0 Lakhs (2 placements)
* **IBM Software Engineer:** 11.0 Lakhs (1 placement)
* **Accolite Software Engineer:** 8.0 Lakhs (2 placements)
* **Accenture Adv. Associate Software Engineer:** 6.5 Lakhs (1 placement)
* **Accenture Associate Software Engineer:** 4.5 Lakhs (33 placements)
* **PayPal PPO Software Engineer:** 34.0 Lakhs (1 placement)
* **Microsoft PPO Software Engineer:** 51.0 Lakhs (1 placement)
* **ThoughtSpot Software Engineer:** 11.0 Lakhs (6 placements)
* **Amazon Software Development Engineer:** 47.0 Lakhs (1 placement)
* **Ideagen Trainee Software Engineer:** 7.0 Lakhs (1 placement)
* **CloudAngles Software Engineer Trainee:** 5.0 Lakhs (2 placements)


Note:  This list only includes information explicitly stating "CSE" and a CTC (Cost To Company) figure.  Many other entries mention software engineering roles, but without explicit CSE designation.  Also, some entries are missing crucial details like the number of placements.

## Clean up (optional)

In [1]:
# Optional: Clean up when done 
vector_db.delete_collection()
print("Vector database deleted successfully")

NameError: name 'vector_db' is not defined