In [None]:
#  RAG pdf implementation using langchain and ollama
# https://python.langchain.com/docs/tutorials/rag/

In [26]:
# imports
from langchain_ollama.llms import OllamaLLM
from langchain_core.prompts import ChatPromptTemplate

from langchain_ollama import OllamaEmbeddings
from langchain_chroma import Chroma
from langchain_core.documents import Document
import os 
import pandas as pd

from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [27]:
# Define the LLM that generates the answers: the "llama3.2" model via OllamaLLM.
# NOTE(review): presumably requires a running Ollama server with this model pulled — confirm.
model = OllamaLLM(model="llama3.2")

In [42]:
# Prompt template for the RAG chain. It exposes two placeholders that must be
# supplied on every invocation: {context} (the retrieved text) and {question}.
# NOTE(review): the prompt text contains typos ("based in", "retrive"); it is
# runtime text sent to the model, so it is left untouched here.
template = """
You are an assistant that answers questions based on the provided context. The context is about files and documents.
You should answer based in the context and you don't try to invent false information. If you can't answer the question, you retrive a response saying that you don't have the information to answer that.

Context:
{context}

Question: {question}

Answer:
"""

In [43]:
# Turn the raw template string into a prompt object.
prompt = ChatPromptTemplate.from_template(template)
# Pipe the prompt into the model: invoking `chain` formats the prompt and
# passes the result to the LLM.
chain = prompt | model

In [33]:
# Smoke test: invoke the chain with an empty context to check that the prompt
# and the model are wired together before any document is indexed.
smoke_test_inputs = {"context": [], "question": "What is the best product?"}
result = chain.invoke(smoke_test_inputs)
print(result)

I don't see any context provided. Can you please provide some information or clarify what kind of products you're referring to (e.g., electronics, beauty products, clothing, etc.)? I'll do my best to help!


In [4]:
#  ======================= Embedding =============================

In [34]:
# PDF files to index, given as paths relative to the notebook's working directory.
pdf_files = [
    "Andino_Agustin_Propuesta_PFC_2024.pdf",
    "Informe_PFC_Andino_Agustin.pdf",
]

In [35]:
# Embedding model used to vectorize text; it is handed to the Chroma vector
# store further down as its embedding function.
embeddings = OllamaEmbeddings(model="mxbai-embed-large")

In [36]:
# Directory where the vector store is persisted.
db_location = "./chrome_langchain_db"
# Documents only need to be loaded and indexed when that directory does not
# exist yet, i.e. on the first run.
db_already_exists = os.path.exists(db_location)
add_documents = not db_already_exists

In [None]:
# Load the PDF files, but only when the vector store still has to be built.
# `documents` is always defined (possibly empty) so the cells below can run
# on a fresh kernel even when loading is skipped.
documents = []
if add_documents:
    for pdf_file in pdf_files:
        if not os.path.exists(pdf_file):
            print(f"File not found: {pdf_file}")
            continue
        # Parse each PDF once and reuse the result for both the collection
        # and the page count (the loader is not invoked a second time).
        pages = PyPDFLoader(pdf_file).load()
        documents.extend(pages)
        print(f"Loaded {pdf_file}: {len(pages)} pages")

    print(f"Total pages of documents loaded: {len(documents)}")
else:
    print("Documents already loaded")

Loaded Andino_Agustin_Propuesta_PFC_2024.pdf: 33 pages
Loaded Informe_PFC_Andino_Agustin.pdf: 193 pages
Total pages of documents loaded: 226


In [11]:
# Configure the splitter that breaks the loaded documents into chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # target chunk size, measured with length_function
    chunk_overlap=200,  # overlap between consecutive chunks, in the same unit
    length_function=len,  # len() on a string counts characters
)

In [12]:
# Split the loaded documents into chunks and report how many were produced.
# Requires `documents` from the loading cell above.
texts = text_splitter.split_documents(documents)
print(f"Total text chunks created: {len(texts)}")

Total text chunks created: 492


In [37]:
# Preview the first three chunks; content longer than 500 characters is
# truncated and suffixed with "...".
for chunk_number, chunk in enumerate(texts[:3], start=1):
    print(f"\n--- Chunk {chunk_number} ---")
    content = chunk.page_content
    if len(content) > 500:
        print(content[:500] + "...")
    else:
        print(content)


--- Chunk 1 ---
UNIVERSIDADNACIONALDELLITORALFacultaddeIngenieríayCienciasHídricas
PROPUESTADEPROYECTOFINALDECARRERAINGENIERÍAINFORMÁTICA
Desarrollodemódulodeguardiaambulatoriacontriajeautomatizadomedianteinteligenciaartificial enel sistemawebSaDER.
Alumno: Andino,DanielAgustin
Director: MaríaAgustinaCarignan
Co-Director:
Asesortemático:
SantaFe,Agostode2024

--- Chunk 2 ---
Índice
Resúmen 3PalabrasClaves 3Justificación 3Objetivo 6Alcance 7Metodología 8Plandetareas 13Cronogramadeactividades 17Puntosdeseguimiento 19Riesgos 20Recursos 24Presupuesto 25Plandecomunicaciones 28Bibliografía 30Referencias 31
2

--- Chunk 3 ---
Resumen


In [38]:
# Open (or create) the Chroma collection that backs the retriever.
# Nothing is added to the store in this cell; indexing happens in the next one.
vector_store = Chroma(
    collection_name="pdf_seeker",
    persist_directory=db_location,  # same directory checked when computing add_documents
    embedding_function=embeddings
)

Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


In [44]:
# Index the split chunks (`texts`), not the whole-page `documents`: retrieval
# should return focused passages, and the chunking step above is otherwise unused.
# NOTE(review): a store that was already persisted with whole pages is not
# rebuilt by this cell; delete the directory at `db_location` to re-index.
if add_documents:
    vector_store.add_documents(documents=texts)

In [45]:
# Expose the vector store as a retriever; it is invoked below with the
# question text and returns documents whose page_content forms the context.
retriever = vector_store.as_retriever()

In [None]:
# Interactive question loop: retrieve context for each question and ask the chain.
while True:
    question = input("Enter your question (q to quit): ")
    # Ignore surrounding whitespace so " q " also quits.
    if question.strip().lower() == "q":
        break
    search = retriever.invoke(question)
    # The template expects the variables "context" and "question"; join the
    # retrieved documents into a single context string for the first one.
    context = "\n\n".join(doc.page_content for doc in search)
    result = chain.invoke({"context": context, "question": question})
    print(result)

In [47]:
# Fixed example question used for the single, non-interactive run below.
question = "Name of the final project, the university or the educational institution and the name of the alumn who did it"

In [48]:
# Retrieve the documents matching the question.
search = retriever.invoke(question)

# Join the retrieved documents into one context string.
context = "\n\n".join(doc.page_content for doc in search)

# The chain fills the template's {context} and {question} placeholders itself,
# so no hand-built prompt string is needed here.
result = chain.invoke({"context": context, "question": question})
print(result)

The final project is called "Desarrollo de módulo de guardia ambulatoria con triaje automatizado mediante inteligencia artificial en el sistema web SaDER".

The university or educational institution where the student carried out the project is: UNIVERSIDAD NACIONAL DEL LITORAL.

The name of the alumn who did the project is: ANDINO DANIEL AGUSTÍN.
