In [1]:
import os
from typing import List, Optional
from uuid import uuid4

from langchain_chroma import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.documents import Document
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_ollama import OllamaLLM, OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [2]:
# Folder locations used by the notebook. They are assembled with os.path.join
# so the separator matches the host OS; on Windows the resulting strings are
# identical to the previous hard-coded "..\\doc\\..." values.
path_of_documents = os.path.join("..", "doc", "original")  # folder listed by load_documents()
path_to_export_md = os.path.join("..", "doc", "md")        # presumably the Markdown export target; unused in the visible cells
persist_directory = os.path.join("..", "chroma")           # presumably the Chroma persistence folder; unused in the visible cells

## Recursive Character Splitting

> **TODO:** Try semantic chunking as an alternative to recursive character splitting.

In [3]:
def recursive_character_splitting(text:str, source:str, chunk_size:int = 1000, percentage=0.2) -> List[Document]:
  """Split ``text`` into overlapping chunks and wrap each one in a Document.

  Args:
    text: The full text to split.
    source: Value stored under the ``"source"`` metadata key of every chunk.
    chunk_size: Value passed to the splitter as ``chunk_size``.
    percentage: Fraction of ``chunk_size`` used as the overlap between
      chunks; the product is truncated to an int.

  Returns:
    One Document per chunk produced by the splitter, in splitter order.
  """
  overlap = int(chunk_size*percentage)
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)

  documents:List[Document] = []
  for piece in text_splitter.split_text(text=text):
    # Every chunk carries its own metadata dict pointing back at the origin.
    documents.append(Document(page_content=piece, metadata={"source": source}))
  return documents

def read_document(doc:str, page_separator:str = "\n\n") -> str:
  """Load a PDF and return the text of all its pages as a single string.

  Args:
    doc: Path of the PDF file handed to PyPDFLoader.
    page_separator: Text placed between consecutive pages. Defaults to a
      blank line so page boundaries do not add punctuation that is absent
      from the document (the previous ", " separator leaked a comma into
      the text at every page break).

  Returns:
    The ``page_content`` of every page yielded by the loader, joined with
    ``page_separator``. An empty string if the loader yields no pages.
  """
  loader = PyPDFLoader(doc)
  # lazy_load yields pages one at a time; only the extracted text is kept.
  pages = [page.page_content for page in loader.lazy_load()]
  return page_separator.join(pages)

def load_documents(directory: Optional[str] = None) -> List[str]:
  """Return the path of every entry inside the documents folder.

  Args:
    directory: Folder to list. When omitted, the notebook-level
      ``path_of_documents`` is used.

  Returns:
    One path per directory entry (the folder joined with the entry name),
    sorted by entry name so repeated runs see the documents in the same
    order. Entries are not filtered: sub-folders and non-PDF files are
    returned as well.

  Raises:
    OSError: Propagated from ``os.listdir`` when the folder cannot be listed.
  """
  folder = path_of_documents if directory is None else directory
  # os.path.join picks the separator for the host OS instead of a literal "\\".
  return [os.path.join(folder, name) for name in sorted(os.listdir(folder))]

In [None]:
def print_chunks(chunks:List[Document]) -> None:
  """Print every chunk under a numbered banner, padded with blank lines.

  Args:
    chunks: The items to print; numbering in the banner starts at 1.
  """
  spacer = "\n\n"
  for number, chunk in enumerate(chunks, start=1):
    print(spacer)
    print(f"--------> Chunk {number} <--------")
    print(chunk)
    print(spacer)

In [4]:
# Build the corpus: read every path returned by load_documents() and split
# its text into chunks tagged with the path they came from.
dirs = load_documents()
result = []
for doc_path in dirs:  # not named `dir`, which would shadow the builtin
  text = read_document(doc_path)
  # read_document already returns a single string, so it is passed through
  # unchanged. Calling " ".join(text) on a str would insert a space between
  # every character and corrupt the text before it is split.
  chunks = recursive_character_splitting(text, doc_path)
  result.extend(chunks)

In [None]:
# Print every chunk collected in `result` for visual inspection; the output
# grows with the number of chunks.
print_chunks(result)

## In-Memory Vector Store and Retriever

In [None]:
# Embedding model requested from Ollama by name.
# NOTE(review): assumes "mxbai-embed-large" is already available to the local
# Ollama instance; confirm before running.
embeddings = OllamaEmbeddings(
  model = "mxbai-embed-large"
)
# Build an InMemoryVectorStore from the chunks in `result`, using the
# embedding model above.
vs = InMemoryVectorStore.from_documents(
  result, 
  embedding=embeddings
)

In [None]:
# as_retriever() is called without arguments, so the store's default
# retrieval settings apply.
retriever = vs.as_retriever()
# Sample query in Spanish ("What is a complex number?").
retrieved_documents = retriever.invoke("¿Qué es un número complejo?")

# Number of documents returned for the query.
print(len(retrieved_documents))

In [None]:
# Show the text of each retrieved document under a banner numbered from 1.
for position, hit in enumerate(retrieved_documents, start=1):
  banner = f"=====> Document {position} <====="
  print(banner)
  print(hit.page_content)

## Map Reduce

![Summary of data](../resources/summary_of_data.png)

In [None]:
# Prompt text (Spanish) asking for a summary of the topics in a set of
# document fragments. `{documents}` is a placeholder.
# NOTE(review): presumably filled in with the fragments by a later cell that
# is not visible here; confirm how it is formatted.
prompt = """ 
Resume los temas de los siguientes fragmentos de un documento

{documents} 
"""

In [None]:
GEMMA = "gemma3:1b"             # ollama gemma3:1b
#DEEPSEEK = "deepseek-r1:1.5b"   # ollama deepseek-r1:1.5b

# The model name is passed as the `model` keyword: OllamaLLM is a pydantic
# model and accepts keyword fields only, so a positional model name raises
# TypeError at construction time.
llm = OllamaLLM(model=GEMMA, temperature=0.8)
# Smoke test: the reply is shown as the cell's output.
llm.invoke("2+2?")