# Generate Chunks

This notebook loads previously generated documents, splits them into smaller chunks, and builds a multi-representation index.

## 1. Imports

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
import json
from langchain.docstore.document import Document
from collections import defaultdict
from tqdm import tqdm
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import InMemoryStore
from langchain.docstore.document import Document
import uuid
from tqdm import tqdm
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import InMemoryStore  # o otro como FAISSStore, RedisStore, etc.

import numpy
import torch
# Report which numpy / torch builds the kernel picked up: first the version
# of each library, then the file each one was imported from.
for label, lib in (("Numpy", numpy), ("Torch", torch)):
    print(f"{label}:", lib.__version__)
for label, lib in (("Numpy", numpy), ("Torch", torch)):
    print(f"{label} from:", lib.__file__)


import pickle

Numpy: 1.26.4
Torch: 2.2.2
Numpy from: /Users/Bauti/Development/lawyerAgent-AI/.lawyer-agent-venv/lib/python3.12/site-packages/numpy/__init__.py
Torch from: /Users/Bauti/Development/lawyerAgent-AI/.lawyer-agent-venv/lib/python3.12/site-packages/torch/__init__.py


---

## 2. Load and Merge Documents

- Read the JSONL file containing page-level extracts.
- Although each page was saved as a separate Document, every page begins with a common identifier for its full judicial decision.
- We therefore merge pages back into complete documents rather than chunking at the page level.
- By using a multi-representation index, we preserve the full context of each decision during retrieval.

In [3]:
# Load the page-level extracts. The file is JSON Lines: one JSON object per
# line, with the page text under "page_content" and an optional "metadata"
# dict (defaults to {} when absent).
documents = []
with open("data/documents_fallos.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        # Skip blank lines (e.g. an extra newline at the end of the file);
        # json.loads would raise JSONDecodeError on them.
        if not line.strip():
            continue
        entry = json.loads(line)
        documents.append(
            Document(
                page_content=entry["page_content"],
                metadata=entry.get("metadata", {}),
            )
        )

In [4]:
# Bucket the page-level Documents by the decision they belong to.
# The key is the token between the first two "#" characters of the page text,
# e.g. "#38414262#..." -> "38414262". A page without any "#" raises IndexError.
grouped = defaultdict(list)
for page in documents:
    decision_key = page.page_content.split("#", 2)[1]
    grouped[decision_key].append(page)

In [5]:
# Rebuild one Document per decision from its pages.
merged_documents = []
for pages in grouped.values():
    # Pages are ordered by their "page" metadata; pages without it sort as 0.
    ordered = sorted(pages, key=lambda page: page.metadata.get("page", 0))
    merged_documents.append(
        Document(
            # Raw page texts joined with newlines (nothing is stripped here).
            page_content="\n".join(page.page_content for page in ordered),
            # Metadata is copied from the first page; this assumes every page
            # of a decision carries the same metadata.
            metadata=ordered[0].metadata.copy(),
        )
    )


In [6]:
# Sanity check: number of merged Documents (one per fallo_id group).
len(merged_documents)

4091

In [None]:
def strip_identifier_lines(text: str) -> str:
    """Return ``text`` without its ``#<id>#...`` identifier lines.

    A line counts as an identifier line when, ignoring surrounding
    whitespace, it starts with "#" and has a non-empty alphanumeric token
    before the next "#" (e.g. "#38414262#..."). Every such line is dropped,
    not only the first one, because the merged text is a newline-join of raw
    pages and each page carries the identifier.

    Unlike ``text.split('\\n', 1)[1]``, this never raises on single-line
    text and is idempotent: applying it again changes nothing, so re-running
    the cell cannot eat real content.
    """
    kept = []
    for line in text.split("\n"):
        head = line.strip()
        parts = head.split("#", 2)
        is_identifier = (
            head.startswith("#") and len(parts) == 3 and parts[1].isalnum()
        )
        if not is_identifier:
            kept.append(line)
    return "\n".join(kept)


# Remove the identifier lines from every merged decision (in place, so the
# following cells keep reading from merged_documents).
for doc in merged_documents:
    doc.page_content = strip_identifier_lines(doc.page_content)

merged_documents[0].page_content

'Poder Judicial de la Nación\nCÁMARA COMERCIAL - SALA F\nEn Buenos Aires a los cinco días del mes de mayo de dos mil veinticinco,\nreunidos los Señores Jueces de Cámara en la Sala de Acuerdos fueron\ntraídos para conocer los autos “TORRES NOELI AYELEN C/CALEDONIA\n EXPTE. N°\nCOMPAÑÍA ARGENTINA DE SEGUROS SA S/ ORDINARIO”\nCOM 1110/2023; en los que al practicarse la desinsaculación que ordena el\nart. 268 del Código Procesal Civil y Comercial de la Nación resultó que la\nvotación debía tener lugar en el siguiente orden: Vocalía 16, Vocalía 18 y\nVocalía 17. Dado que la Vocalía Nº 18 se halla actualmente vacante,\nintervendrán la Dra. Alejandra N. Tevez y el Dr. Ernesto Lucchelli (art. 109\nRJN).\nSe deja constancia que las referencias de las fechas de las\nactuaciones y las fojas de cada una de ellas son las que surgen de los\nregistros digitales del expediente.\nEstudiados los autos la Cámara plantea la siguiente cuestión a\nresolver:\n¿Es arreglada a derecho la sentencia apelada de f

---

## 3. Build the Multi-Representation Index

In [None]:
# Number of leading / trailing characters used for the "intro" and
# "resolucion" representations of each decision.
EXCERPT_CHARS = 4000

multivector_documents = []
parent_documents = []

for doc in merged_documents:
    full_text = doc.page_content
    parent_id = str(uuid.uuid4())  # unique ID linking the representations to this decision
    metadata_base = {**doc.metadata, "parent_id": parent_id}

    # Three representations per decision, appended in this order:
    # the whole text, its first EXCERPT_CHARS characters and its last
    # EXCERPT_CHARS characters. For texts shorter than EXCERPT_CHARS the
    # three contents are identical.
    representations = {
        "full_text": full_text,
        "intro": full_text[:EXCERPT_CHARS],
        "resolucion": full_text[-EXCERPT_CHARS:],
    }
    for representation, text in representations.items():
        multivector_documents.append(Document(
            page_content=text,
            metadata={**metadata_base, "representation": representation}
        ))

    # Parent document. "id" is written last so that a pre-existing "id" key
    # in the source metadata cannot overwrite the generated parent_id; the
    # representations above point at parent_id, so the two must match.
    parent_documents.append(Document(
        page_content=full_text,
        metadata={**doc.metadata, "id": parent_id}
    ))


# Parents will be used to generate the response after retrieving relevant chunks
with open("parent_documents.pkl", "wb") as f:
    pickle.dump(parent_documents, f)

In [15]:
# Serialise the parent documents to parent_documents.pkl (overwrites the file
# if it already exists). The cell that builds parent_documents ends with the
# same dump, so this cell only rewrites that file.
with open("parent_documents.pkl", mode="wb") as pickle_file:
    pickle.dump(parent_documents, pickle_file)

In [9]:
# Count how many representations have more than 20,000 characters of text.
long_docs = list(
    filter(lambda d: len(d.page_content) > 20_000, multivector_documents)
)
print(f"{len(long_docs)} documentos tienen más de 20k caracteres")

380 documentos tienen más de 20k caracteres


In [10]:
# Keep only the representations whose text is at most MAX_CHARS characters.
MAX_CHARS = 15_000
cleaned_docs = []
for candidate in multivector_documents:
    if len(candidate.page_content) <= MAX_CHARS:
        cleaned_docs.append(candidate)

---

## 4. Embeddings, Vector DB and Retriever

In [None]:
# Embedding model used to vectorise every representation.
embedding = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# Initialise the vector store, backed by persist_directory.
vectorstore = Chroma(
    embedding_function=embedding,
    persist_directory="./multivector_chroma_db_001"
)

# Add the multi-vector representations.
# The length-filtered `cleaned_docs` is indexed here; previously it was
# computed and never used, and the unfiltered `multivector_documents` was
# indexed instead. Decisions whose full text is dropped by the filter keep
# their "intro" / "resolucion" representations (at most 4000 characters
# each), so every parent stays reachable.
# Documents are added in batches so that progress is visible and no single
# call carries the whole corpus.
# NOTE(review): re-running this cell against an existing persist_directory
# presumably adds the same representations again - confirm, and clear the
# directory before a full rebuild if so.
INDEX_BATCH_SIZE = 1000
for start in tqdm(
    range(0, len(cleaned_docs), INDEX_BATCH_SIZE),
    desc="Indexing representations",
):
    vectorstore.add_documents(cleaned_docs[start:start + INDEX_BATCH_SIZE])

# Store for the parent documents (the ones returned as the answer context),
# keyed by the "id" stored in each parent's metadata.
store = InMemoryStore()
store.mset([(d.metadata["id"], d) for d in parent_documents])

# Define the retriever: vector hits are mapped to parents through the
# "parent_id" metadata key of each representation.
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    docstore=store,
    id_key="parent_id",
    search_kwargs={"k": 3}
)

# Save the database.
# NOTE(review): the stray output under this cell looks like a warning raised
# by this call; recent Chroma versions are documented to persist
# automatically - confirm whether the call can be dropped.
vectorstore.persist()

  vectorstore.persist()
