# RAG Semantic Chunking Demo

## Semantic Chunking
Semantic chunking groups sentences by meaning, preserving complete information units.


In [12]:
from langchain_experimental.text_splitter import SemanticChunker
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.document_loaders import PyPDFLoader
from supabase import create_client, Client
import uuid
import os
from tqdm import tqdm


In [24]:
def load_env_file(file_path):
    env_vars = {}
    with open(file_path, 'r') as f:
        for line in f:
            line = line.strip()
            if line and not line.startswith('#') and '=' in line:
                key, value = line.split('=', 1)
                env_vars[key] = value
    return env_vars
env_vars = load_env_file('supabase-docker/.env')

SUPABASE_URL = env_vars.get('SUPABASE_PUBLIC_URL')
SUPABASE_KEY = env_vars.get('ANON_KEY')

supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)

user = supabase.auth.get_user()
print("✅ user:", user)


✅ user: None


In [3]:
loader: PyPDFLoader = PyPDFLoader("de-re-coquinaria.pdf")
docs_raw = loader.load()

In [4]:
embedding_model: HuggingFaceEmbeddings = HuggingFaceEmbeddings(model_name="paraphrase-multilingual-MiniLM-L12-v2")

In [5]:
text_splitter: SemanticChunker = SemanticChunker(
    embedding_model, 
    breakpoint_threshold_type="percentile"
)
docs_chunks = text_splitter.split_documents(docs_raw)

In [16]:
full_text = "".join([doc.page_content for doc in docs_raw])
chunks = text_splitter.create_documents([full_text])
print(chunks)

[Document(metadata={}, page_content='Marco Gavio Apicio C o c i n a  r o m a n a  \n \n1\n \n \n \n \n \nMarco Gavio Apicio \n \n \n \nCocina romana  \n \n \n \n \nTraducción de Bárbara Pastor Artigues \n \n \nTercera Edición, 1987 \n \n© Bárbara Pastor Artigues \n© EDITORIAL COLOQUIO, S.A. Juan Alvarez Mendizábal, 65 28008 MADRID \nISBN: 84-86093-36-8 \nDepósito Legal: M. 10370-1986  \nImprime: GRUPODIS, S.A. Juan Alvarez Mendizábal, 58  \n28008 MADRIDMarco Gavio Apicio C o c i n a  r o m a n a  \n \n2\n \n \n \n \nPROLOGO \n \n \n \nPoner prólogo a un libro tiene utilidad siempre que aporte interés al lector por su contenido; sin \nembargo, nuestro prólogo tiene otro objeto: es una aclaración a quien se disponga a leer este li bro \ntitulado “Cocina Romana”. Esto es, se trata de la simple transmisión literal de las recetas elaboradas \npor Apicio, autor del Tratado, sin otro ánimo crítico, cual pudiera ser aquel que impulsare a traducir \nun texto de la Antigüedad: un estudio eru dit

In [None]:
# Create SQL table in supabase
"""
    CREATE TABLE IF NOT EXISTS recipes (
        id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
        file_name TEXT NOT NULL,
        file_path TEXT NOT NULL, 
        file_content TEXT NOT NULL,
        embedding VECTOR(384) NOT NULL,
        created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW()
    );
"""
# also the search function
"""
DROP FUNCTION IF EXISTS public.search_recipes(vector(384), double precision, integer);

CREATE FUNCTION public.search_recipes(
    query_embedding vector(384),
    match_threshold float,
    match_count int
)
RETURNS TABLE(
    id uuid,
    file_name text,
    file_path text,
    file_content text,
    embedding vector(384),
    similarity float
)
LANGUAGE sql
AS $$
    SELECT 
        r.id,
        r.file_name,
        r.file_path,
        r.file_content,
        r.embedding,
        1 / (1 + (r.embedding <=> query_embedding)) AS similarity
    FROM recipes r
    WHERE (r.embedding <=> query_embedding) < match_threshold
    ORDER BY (r.embedding <=> query_embedding)
    LIMIT match_count;
$$;
"""

'\n    CREATE TABLE IF NOT EXISTS recipes (\n        id UUID PRIMARY KEY DEFAULT gen_random_uuid(),\n        file_name TEXT NOT NULL,\n        file_path TEXT NOT NULL, \n        file_content TEXT NOT NULL,\n        embedding VECTOR(384) NOT NULL,\n        created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW()\n    );\n'

In [None]:
# Delete all records from recipes table first
supabase.from_('recipes').delete().neq('id', '00000000-0000-0000-0000-000000000000').execute()

for i, chunk in enumerate(tqdm(chunks, desc="Processing chunks", unit="chunk")):
    content = chunk.page_content
    embedding = embedding_model.embed_query(content)

    try:
        if len(content) < 10:
            continue
        result = supabase.from_('recipes').insert({
            'id': str(uuid.uuid4()),
            'file_name': 'de-re-coquinaria.pdf',
            'file_path': f'de-re-coquinaria.pdf',
            'file_content': content,
            'embedding': embedding
        }).execute()
    except Exception as e:
        print(f"Error inserting chunk {i}: {e}")

Processing chunks: 100%|██████████| 200/200 [00:02<00:00, 84.15chunk/s]
