In [1]:
import re
import os
from pprint import pprint
from dotenv import load_dotenv
from langchain_community.document_loaders.pdf import PyMuPDFLoader
from langchain_openai import OpenAIEmbeddings
from pinecone import Pinecone, ServerlessSpec
from pinecone_text.sparse import BM25Encoder

load_dotenv()

file_path = '../../app/data/ustava.pdf'

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
print(os.path.exists(file_path))

True


In [2]:
def split_by_articles(text):
    """Split Slovenian Constitution text into articles."""
    articles = []
    pattern = r"\n\s*(\d+(?:\.\w+)?)\.\s*člen\s+"
    matches = list(re.finditer(pattern, text))
    
    print(f"Found {len(matches)} article markers")
    
    for i, match in enumerate(matches):
        article_start = match.start()
        article_number = match.group(1)
        
        if i < len(matches) - 1:
            next_start = matches[i + 1].start()
            article_text = text[article_start:next_start]
        else:
            article_text = text[article_start:]
        
        article_text = article_text.strip()
        is_sub_article = "." in article_number and not article_number.replace(".", "").isdigit()
        
        article_data = {
            "id": f"ustava_{article_number}_{i}",
            "text": article_text,
            "metadata": {
                "article_number": article_number,
                "article_type": "sub_article" if is_sub_article else "article",
                "source": "Ustava RS.pdf",
                "chunk_index": i,
                "total_chunks": len(matches),
                "law": "Ustava Republike Slovenije"
            }
        }
        
        if is_sub_article:
            article_data["metadata"]["main_article"] = article_number.split(".")[0]
        
        articles.append(article_data)
    
    return articles

In [4]:
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
pc.list_indexes().names()

['pravni-vodnik', 'hybrid-search-langchain-pinecone']

In [5]:
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small"
)

In [12]:
bm25 = BM25Encoder()

Load document

In [10]:
print("Loading PDF...")
loader = PyMuPDFLoader(file_path, mode='single')
document = loader.load()
text = document[0].page_content

Loading PDF...


Split document

In [12]:
articles = split_by_articles(text)
print(f"Created {len(articles)} articles")
print(articles[0])

NameError: name 'split_by_articles' is not defined

Train BM25 - on current document only

In [None]:
# texts_for_bm25 = [article['text'] for article in articles]
# bm25.fit(texts_for_bm25)

# bm25.dump('../../app/database/bm25_model.json')

100%|██████████| 174/174 [00:00<00:00, 1038.10it/s]


Create Hybrid Vectors

In [None]:
hybrid_vectors = []

for article in articles:
    try:
        # dense vector
        dense_vector = embeddings.embed_query(article['text'])

        # sparse vector
        sparse_vector = bm25.encode_documents(article['text'])

        hybrid_vector = {
            "id": article['id'],
            'values': dense_vector,
            'sparse_values': {
                'indices': sparse_vector['indices'],
                'values': sparse_vector['values']
            },
            'metadata': {
                **article['metadata'],
                'text': article['text']
            }
        }

        hybrid_vectors.append(hybrid_vector)
    except Exception as e:
        print(f"Error processing {article['id']}: e") 

In [None]:
pprint(hybrid_vectors[0])

{'id': 'ustava_1_0',
 'metadata': {'article_number': '1',
              'article_type': 'article',
              'chunk_index': 0,
              'law': 'Ustava Republike Slovenije',
              'source': 'Ustava RS.pdf',
              'text': '1. člen\nSlovenija je demokratična republika.',
              'total_chunks': 174},
 'sparse_values': {'indices': [697231871,
                               964707915,
                               2765784866,
                               3904829042,
                               1381030882,
                               874673559],
                   'values': [0.7031357164412498,
                              0.7031357164412498,
                              0.7031357164412498,
                              0.7031357164412498,
                              0.7031357164412498,
                              0.7031357164412498]},
 'values': [-0.028065351769328117,
            0.005382997449487448,
            0.011775855906307697,
         

Create Pinecone Index

In [21]:
index_name = 'pravni-vodnik'

if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=1536,
        metric='dotproduct',
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )
    
    # wait for index to be ready
    import time
    time.sleep(30)

Uplod to Pinecone

In [29]:
batch_size = 100
index = pc.Index(index_name)

print(f"Uploading {len(hybrid_vectors)} vectors in batches of {batch_size}...")

for i in range(0, len(hybrid_vectors), batch_size):
    batch = hybrid_vectors[i:i + batch_size]
    
    # Upload batch
    index.upsert(
        vectors=batch,
        namespace='ustava'
    )
    
    # Show progress
    batch_num = i // batch_size + 1
    total_batches = (len(hybrid_vectors) - 1) // batch_size + 1
    print(f"Uploaded batch {batch_num}/{total_batches} ({len(batch)} vectors)")

print(f"✅ Successfully uploaded {len(hybrid_vectors)} vectors total!")

Uploading 174 vectors in batches of 100...
Uploaded batch 1/2 (100 vectors)
Uploaded batch 2/2 (74 vectors)
✅ Successfully uploaded 174 vectors total!


In [28]:
stats = index.describe_index_stats()
stats['namespaces']['ustava']['vector_count']

100

In [38]:
query = 'Ali je slovenija ustavna država?'

sparse_vector = bm25.encode_queries(query)
dense_vector = embeddings.embed_query(query)

results = index.query(
    namespace='ustava',
    include_metadata=True,
    top_k=3,
    vector=dense_vector,
    sparse_vector={
        'indices': sparse_vector['indices'],
        'values': sparse_vector['values']
    }
)

In [44]:
matches = results.matches

for match in matches:
    print(match['metadata']['text'])

4. člen
Slovenija je ozemeljsko enotna in nedeljiva država.
2. člen
Slovenija je pravna in socialna država.
1. člen
Slovenija je demokratična republika.
