In [None]:
# Requires Python 3.12: make sure the notebook kernel runs a Python 3.12 interpreter.

In [11]:
!pip install pandas openpyxl langchain langchain-classic langchain-text-splitters langchain-huggingface langchain-community chromadb openai sentence-transformers tiktoken huggingface_hub

[31mERROR: Could not find a version that satisfies the requirement langchain-huggingface-hub (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for langchain-huggingface-hub[0m[31m
[0m

In [36]:
# Fallback install: run this cell if module imports fail after a change of Python version.
# Requires Python 3.12 (in Jupyter, make sure the kernel is set to Python 3.12).
# Calling pip through sys.executable installs into the interpreter that runs this kernel.
import sys
!{sys.executable} -m pip install pandas openpyxl langchain langchain-classic langchain-text-splitters langchain-huggingface langchain-community chromadb openai sentence-transformers tiktoken huggingface_hub



In [28]:
import os
import time

import pandas as pd
from langchain_classic.chains import RetrievalQA
from langchain_classic.llms import HuggingFaceHub
from langchain_classic.schema import Document
from langchain_classic.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [29]:
# Folder that holds the input spreadsheets (relative path)
DATA_DIR = "../../data/"

# Full paths of the two Excel workbooks: ECLI cases first, advice letters second
ecli_path, letters_path = (
    os.path.join(DATA_DIR, workbook)
    for workbook in (
        "DATA ecli_nummers juni 2025 v1 (version 1).xlsx",
        "Dataset Advice letters on objections towing of bicycles.xlsx",
    )
)

In [30]:
# Read both workbooks into DataFrames (ECLI cases first, then advice letters)
df_ecli, df_letters = (pd.read_excel(workbook) for workbook in (ecli_path, letters_path))

# Keep only the free-text column of each dataset, as a list of strings.
# NOTE(review): with astype(str), empty cells may come through as the literal
# string "nan" - confirm that neither text column has missing values.
ecli_texts = df_ecli["ecli_tekst"].astype(str).to_list()
letters_texts = df_letters["geanonimiseerd_doc_inhoud"].astype(str).to_list()

# Sanity check: record counts, then the first 200 characters of one record per dataset
print("Number of ECLI cases:", len(ecli_texts))
print("Number of advice letters:", len(letters_texts))
print("Sample ECLI text:", ecli_texts[0][:200])
print("Sample advice letter text:", letters_texts[0][:200])

Number of ECLI cases: 2447
Number of advice letters: 567
Sample ECLI text: 
RvdW 2014/158




http://deeplink.rechtspraak.nl/uitspraak?id=ECLI:NL:HR:2014:39
text/html
public
2014-01-08T10:14:07
2014-01-08
Raad voor de Rechtspraak
nl
ECLI:NL:HR:2014:39 Hoge Raad , 07-01-2014 
Sample advice letter text: Advies van de bezwaarschriftencommissie Juridisch Bureau

Aan                Het college van burgemeester en wethouders van Amsterdam
Zaaknummer         JB.19.012268.001
Datum              2 december 


In [31]:
# Wrap ECLI cases as Documents.
# Each text is paired with its metadata by position via zip(), so the pairing does
# not depend on the DataFrame index labels, and .tolist() hands the metadata over
# as plain Python scalars rather than numpy scalars.
# TODO: add more metadata if required
case_docs = [
    Document(page_content=text, metadata={"source": "case", "ecli_nummer": ecli_nummer})
    for text, ecli_nummer in zip(ecli_texts, df_ecli["ecli_nummer"].tolist())
]

# Wrap advice letters as Documents (same positional pairing as above).
# TODO: add more metadata if required
advice_docs = [
    Document(
        page_content=text,
        metadata={
            "source": "advice",
            "zaaknummer": zaaknummer,
            "onderwerp": onderwerp,
        },
    )
    for text, zaaknummer, onderwerp in zip(
        letters_texts,
        df_letters["Octopus zaaknummer"].tolist(),
        df_letters["Onderwerp"].tolist(),
    )
]

# Combine all documents: cases first, then advice letters
all_docs = case_docs + advice_docs

print(f"Total documents prepared: {len(all_docs)}")
print("Sample document metadata:", all_docs[0].metadata)
print("Sample document text (first 200 chars):", all_docs[0].page_content[:200])

Total documents prepared: 3014
Sample document metadata: {'source': 'case', 'ecli_nummer': 'ECLI:NL:HR:2014:39'}
Sample document text (first 200 chars): 
RvdW 2014/158




http://deeplink.rechtspraak.nl/uitspraak?id=ECLI:NL:HR:2014:39
text/html
public
2014-01-08T10:14:07
2014-01-08
Raad voor de Rechtspraak
nl
ECLI:NL:HR:2014:39 Hoge Raad , 07-01-2014 


In [37]:
# Chunking parameters, both measured in characters:
#   CHUNK_SIZE    - maximum length of a single chunk
#   CHUNK_OVERLAP - characters shared by neighbouring chunks to preserve context
CHUNK_SIZE = 800
CHUNK_OVERLAP = 150

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

# Break every document (ECLI cases + advice letters) into smaller chunks;
# each chunk carries the metadata of the document it was cut from
split_docs = text_splitter.split_documents(all_docs)

# Report the document count before splitting and the chunk count after it
print("Original number of documents:", len(all_docs))
print("Number of chunks created after splitting:", len(split_docs))

# Show one chunk and its metadata as a quick verification
print("\nSample chunk text (first 200 chars):")
first_chunk = split_docs[0]
print(first_chunk.page_content[:200])

print("\nSample chunk metadata:")
print(first_chunk.metadata)

Original number of documents: 3014
Number of chunks created after splitting: 83904

Sample chunk text (first 200 chars):
RvdW 2014/158




http://deeplink.rechtspraak.nl/uitspraak?id=ECLI:NL:HR:2014:39
text/html
public
2014-01-08T10:14:07
2014-01-08
Raad voor de Rechtspraak
nl
ECLI:NL:HR:2014:39 Hoge Raad , 07-01-2014 /

Sample chunk metadata:
{'source': 'case', 'ecli_nummer': 'ECLI:NL:HR:2014:39'}


In [44]:
# Create Embeddings
# Purpose: Convert each text chunk into a numeric vector (embedding) that captures its semantic meaning
# These embeddings will be stored in a vector store (Chroma) for similarity search

def create_embeddings(split_docs, model_name):
    """
    Create an embeddings object for the given model and smoke-test it.

    Args:
        split_docs: Sequence of document chunks exposing ``page_content``.
            Only the first chunk is used, to check that the model can embed text.
            May be empty, in which case the check is skipped.
        model_name: Model identifier passed to ``HuggingFaceEmbeddings``.

    Returns:
        The initialized ``HuggingFaceEmbeddings`` object.
    """
    print("Initializing embeddings model...")
    print(f"Using model: {model_name}")

    # Initialize embeddings
    embeddings = HuggingFaceEmbeddings(model_name=model_name)

    # Without any chunk there is nothing to test on; indexing [0] would raise IndexError
    if len(split_docs) == 0:
        print("\nNo document chunks supplied; skipping the embedding test.")
        return embeddings

    # Test on the first chunk to confirm the model works
    print("\nTesting embedding on first chunk...")
    sample_text = split_docs[0].page_content
    test_embedding = embeddings.embed_documents([sample_text])

    print("Embedding created successfully!")
    print("Embedding vector length:", len(test_embedding[0]))
    print("First 5 values:", test_embedding[0][:5])

    return embeddings


In [45]:
# Embedding model applied to every chunk.
# Note: the texts are Dutch legal documents, so a Dutch model such as
# 'pdelobelle/robbert-v2-dutch-base' may be worth swapping in and testing later.
embedding_model = "sentence-transformers/all-mpnet-base-v2"

# Build the embeddings object that the vector-store cells rely on
embeddings = create_embeddings(split_docs, model_name=embedding_model)

Initializing embeddings model...
Using model: sentence-transformers/all-mpnet-base-v2

Testing embedding on first chunk...
Embedding created successfully!
Embedding vector length: 768
First 5 values: [0.04207202419638634, -0.03519180789589882, 0.020371224731206894, 0.041567977517843246, -0.09661170095205307]


In [51]:
# Directory (relative path) in which the Chroma database is persisted.
# The vector-store helper functions below use it as their default persist_directory,
# so this cell must run before the cells that define them.
PERSIST_DIR = "../resources/chroma_db"

In [47]:
# Create Chroma vector store from split documents
# Purpose: Store embeddings of all document chunks for fast similarity search
# Also preserves metadata (like source, case number) for context during retrieval

def create_vector_store(documents, persist_directory=PERSIST_DIR):
    """Create a new Chroma vector store from ``documents`` and save it on disk.

    The documents are embedded with the notebook-level ``embeddings`` object and
    stored in the "legal_rag" collection.

    Args:
        documents: Document chunks to embed and store.
        persist_directory: Directory of the Chroma database; created if missing.

    Returns:
        The populated Chroma vector store.
    """
    # perf_counter is monotonic, so the elapsed time cannot be skewed by clock changes
    start = time.perf_counter()

    # Create directory if it does not exist
    os.makedirs(persist_directory, exist_ok=True)

    vector_store = Chroma.from_documents(
        documents=documents,
        embedding=embeddings,
        collection_name="legal_rag",
        persist_directory=persist_directory,
    )

    # Chroma >= 0.4 persists automatically and newer wrappers no longer expose
    # persist(); call it only where it still exists (older versions need it).
    persist = getattr(vector_store, "persist", None)
    if callable(persist):
        persist()

    print("Chroma vector store created and persisted successfully!")
    print("Number of documents stored in vector store:", vector_store._collection.count())
    print(f"Vector store saved at: {persist_directory}")
    print(f"Time taken: {time.perf_counter() - start:.2f} seconds")

    return vector_store

In [48]:
def load_vector_store(persist_directory=PERSIST_DIR):
    """Open the "legal_rag" Chroma collection stored under ``persist_directory``.

    The store is wired to the notebook-level ``embeddings`` object.

    Args:
        persist_directory: Directory of the Chroma database to open.

    Returns:
        The Chroma vector store bound to that directory.
    """
    store_settings = {
        "collection_name": "legal_rag",
        "embedding_function": embeddings,
        "persist_directory": persist_directory,
    }
    store = Chroma(**store_settings)
    print("Loaded existing Chroma DB.")
    return store

In [49]:
def update_vector_store(new_documents, persist_directory=PERSIST_DIR):
    """Add new documents to the existing Chroma store in ``persist_directory``.

    Args:
        new_documents: Document chunks to embed and append to the store.
        persist_directory: Directory of the Chroma database to update.

    Returns:
        The updated Chroma vector store.
    """
    # perf_counter is monotonic, so the elapsed time cannot be skewed by clock changes
    start = time.perf_counter()

    # Open the existing store via the loader defined in this notebook
    # (the previous call targeted a non-existent load_vector_db)
    vector_store = load_vector_store(persist_directory)

    vector_store.add_documents(new_documents)

    # Chroma >= 0.4 persists automatically and newer wrappers no longer expose
    # persist(); call it only where it still exists (older versions need it).
    persist = getattr(vector_store, "persist", None)
    if callable(persist):
        persist()

    print("Chroma vector store updated with new documents!")
    print("Total number of documents stored in vector store:", vector_store._collection.count())
    print(f"Update time: {time.perf_counter() - start:.2f} seconds")

    return vector_store

In [52]:
# Sample usage.
# Embedding every chunk is slow, so reuse the store on disk when it already holds
# data and build it only otherwise. Rebuilding into an existing collection would
# also append all chunks a second time.
# To force a clean rebuild (e.g. after an interrupted run), delete PERSIST_DIR first.
store = load_vector_store() if os.path.isdir(PERSIST_DIR) else None
if store is None or store._collection.count() == 0:
    store = create_vector_store(split_docs)

# To append further chunks to the existing store later:
# update_vector_store(new_split_docs)

KeyboardInterrupt: 