### Get articles from Mongo
Articles in mongo are the source of truth

In [3]:
from pymongo.mongo_client import MongoClient
import os

client = MongoClient(os.getenv("MONGODB_URI"))

db = client.get_database('pravni-vodnik')
articles_col = db.get_collection('articles')
articles = articles_col.find({'law_id': 'kz-1'}).to_list()

In [6]:
print(len(articles))
articles[0:5]

395


[{'_id': 'kz-1_2.0',
  'law_id': 'kz-1',
  'article_number': '2',
  'article_index': 2,
  'text': 'Nikomur ne sme biti izrečena kazen ali druga kazenska sankcija za dejanje, ki ga zakon ni določil kot kaznivo dejanje, še preden je bilo storjeno, in za katero ni bila z zakonom predpisana kazen ali druga kazenska sankcija.\n\nSistem kazenskih sankcij',
  'chapter': 'I. TEMELJNE DOLOČBE',
  'article_title': 'Ni kaznivega dejanja in kazni brez zakona',
  'language': 'sl'},
 {'_id': 'kz-1_1.0',
  'law_id': 'kz-1',
  'article_number': '1',
  'article_index': 1,
  'text': '(1) Kazenska odgovornost v Republiki Sloveniji se sme uveljavljati ob spoštovanju ustavno zagotovljenih človekovih pravic in temeljnih svoboščin v demokratični ureditvi ter na načelih pravne države.\n\n(2) Po tem zakoniku se kazenska odgovornost uveljavi s kaznovanjem polnoletnih oseb zaradi storjenih kaznivih dejanj na podlagi ugotovljene krivde.\n\n(3) Kazenska odgovornost se ne uveljavi zoper osebo, katere krivda je izkl

### Create sparse and dense indexes

In [7]:
from pinecone import ServerlessSpec
from pinecone.grpc import PineconeGRPC as Pinecone
import os

pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))

dense_index_name = "pravni-vodnik-dense"
sparse_index_name = "pravni-vodnik-sparse"

if not pc.has_index(dense_index_name):
    pc.create_index(
        name = dense_index_name,
        vector_type = 'dense',
        dimension = 1536,
        metric = 'cosine',
        spec = ServerlessSpec(
            cloud = 'aws',
            region = 'us-east-1'
        ),
        deletion_protection = 'disabled',
    )

if not pc.has_index(sparse_index_name):
    pc.create_index(
        name = sparse_index_name,
        vector_type = 'sparse',
        metric = 'dotproduct',
        spec = ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )

  from .autonotebook import tqdm as notebook_tqdm


### Upsert dense vectors

In [8]:
dense_index = pc.Index(host='https://pravni-vodnik-dense-3w1hkry.svc.aped-4627-b74a.pinecone.io')

In [9]:
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(
    model='text-embedding-3-small',
    dimensions=1536
)

Prepare records from mongo to be inserted (id, values, metadata)

In [10]:

def create_records(articles):
    records = []

    for article in articles:

        metadata = {
            "law_id": article['law_id'],
            "language": article['language'],
            'doc_id': article['_id']
        }

        if 'article_title' in article:
            records.append(
                {
                    "id": article['_id'],
                    "values": embeddings.embed_query(
                        f"""
                        CHAPTER: {article['chapter']}
                        ARTICLE TITLE: {article['article_title']}
                        CONTENT: {article['text']}
                        """),
                    "metadata": metadata
                }
            )
        else:
            records.append(
                {
                    "id": article['_id'],
                    "values": embeddings.embed_query(
                        f"""
                        CHAPTER: {article['chapter']}
                        CONTENT: {article['text']}
                        """),
                    "metadata": metadata
                }
            )

    return records

In [11]:
records = create_records(articles)

In [12]:
records[0]

{'id': 'kz-1_2.0',
 'values': [0.005957988556474447,
  0.037357162684202194,
  0.007624692749232054,
  0.04042236506938934,
  0.013046271167695522,
  0.05088237300515175,
  0.0015158387832343578,
  -0.021686311811208725,
  -0.006987705361098051,
  -0.021858729422092438,
  0.017088508233428,
  -0.024904776364564896,
  0.013410263694822788,
  -0.009885280393064022,
  0.03473258391022682,
  -0.0014703396009281278,
  0.013027112931013107,
  -0.018659424036741257,
  -0.0076486398465931416,
  0.05023101717233658,
  0.0032639624550938606,
  0.03601613640785217,
  -0.018697738647460938,
  0.011111360974609852,
  0.029770785942673683,
  -0.07222384959459305,
  0.014808762818574905,
  -0.0038985551800578833,
  0.04222317412495613,
  0.006202246993780136,
  0.05111226066946983,
  -0.014981180429458618,
  -0.009827807545661926,
  -0.029100272804498672,
  0.01628389209508896,
  -0.0202494990080595,
  -0.009550023823976517,
  -0.006968547590076923,
  0.021647997200489044,
  0.03208884596824646,
  -0

Upsert records to dense index

In [13]:
dense_index.upsert(
    vectors=records,
    namespace='__default__'
)

UpsertResponse(upserted_count=395, _response_info={'raw_headers': {'date': 'Sat, 03 Jan 2026 22:50:14 GMT', 'x-pinecone-request-lsn': '4', 'x-pinecone-request-logical-size': '2454182', 'x-pinecone-request-latency-ms': '1591', 'x-pinecone-request-id': '7309038561848048521', 'x-envoy-upstream-service-time': '363', 'x-pinecone-response-duration-ms': '1592', 'server': 'envoy'}})

Test querying dense index

In [14]:
query = "Kako se kaznuje uboj?"
query_embedding = embeddings.embed_query(query)

In [15]:
results = dense_index.query(
    vector=query_embedding,
    top_k=5,
    include_metadata=True,
    include_values=False
)

In [16]:
print(len(results.matches))
results

5


QueryResponse(matches=[{'id': 'kz-1_115.0',
 'metadata': {'doc_id': 'kz-1_115.0', 'language': 'sl', 'law_id': 'kz-1'},
 'score': 0.568202018737793,
 'sparse_values': None,
 'values': []}, {'id': 'kz-1_117.0',
 'metadata': {'doc_id': 'kz-1_117.0', 'language': 'sl', 'law_id': 'kz-1'},
 'score': 0.5368652939796448,
 'sparse_values': None,
 'values': []}, {'id': 'kz-1_341.0',
 'metadata': {'doc_id': 'kz-1_341.0', 'language': 'sl', 'law_id': 'kz-1'},
 'score': 0.5224228501319885,
 'sparse_values': None,
 'values': []}, {'id': 'kz-1_134.0',
 'metadata': {'doc_id': 'kz-1_134.0', 'language': 'sl', 'law_id': 'kz-1'},
 'score': 0.5164929032325745,
 'sparse_values': None,
 'values': []}, {'id': 'kz-1_120.0',
 'metadata': {'doc_id': 'kz-1_120.0', 'language': 'sl', 'law_id': 'kz-1'},
 'score': 0.5155706405639648,
 'sparse_values': None,
 'values': []}], namespace='', usage={'read_units': 1}, _response_info={'raw_headers': {'date': 'Sat, 03 Jan 2026 22:50:40 GMT', 'x-pinecone-max-indexed-lsn': '4', 

### Upsert sparse vectors

Train BM25 on corpus

In [17]:
from pinecone_text.sparse import BM25Encoder

bm25 = BM25Encoder(
    b=0.75,
    k1=1.2,
    lower_case=True,
    remove_punctuation=True,
    remove_stopwords=False,
    stem=False,
)

Prepare article for BM25

In [18]:
def prepare_article_bm25(article):
    content = article['text']
    title = article.get('article_title', '')
    chapter = article.get('chapter', '')

    # repeat title 3 times to add weight
    return f"{title} {title} {title} {content} {chapter}".strip()

Train BM25

In [22]:
# train on the entire corpus (all laws)

from pymongo.mongo_client import MongoClient
import os

client = MongoClient(os.getenv("MONGODB_URI"))

db = client.get_database('pravni-vodnik')
articles_col = db.get_collection('articles')
articles = articles_col.find().to_list()

In [23]:
print(len(articles))

571


In [24]:
all_texts = [prepare_article_bm25(article) for article in articles]
bm25.fit(all_texts)

100%|██████████| 571/571 [00:00<00:00, 930.94it/s] 


<pinecone_text.sparse.bm25_encoder.BM25Encoder at 0x76581a7e2ba0>

Create sparse values

In [25]:
from pinecone_text.sparse import SparseVector

def get_sparse_values_for_article(article) -> SparseVector:
    prepared = prepare_article_bm25(article).strip()
    
    vectors =  bm25._encode_single_document(prepared)
    print(vectors)
    return vectors

Create sparse records for upload

In [26]:
sparse_records = [
    {
        "id": article['_id'],
        "sparse_values": get_sparse_values_for_article(article),
        "metadata": {
            "law_id": article['law_id'],
            "language": article['language'],
            'doc_id': article['_id']
        }
    } for article in articles
]
sparse_records

{'indices': [2765784866, 3904829042, 1381030882, 874673559, 1458822944, 3545791278, 1107257502], 'values': [0.7379755371094301, 0.7379755371094301, 0.7379755371094301, 0.7379755371094301, 0.7379755371094301, 0.7379755371094301, 0.7379755371094301]}
{'indices': [2765784866, 3904829042, 493713151, 131900689, 129184470, 399117570, 1458822944, 3545791278, 1107257502], 'values': [0.7295066317262139, 0.7295066317262139, 0.7295066317262139, 0.7295066317262139, 0.7295066317262139, 0.7295066317262139, 0.7295066317262139, 0.7295066317262139, 0.7295066317262139]}
{'indices': [2765784866, 3904829042, 399117570, 2725676883, 1180899491, 733383643, 131900689, 3774532246, 3941739082, 2496253341, 130713793, 235532065, 3955371338, 1076637598, 3568414808, 361987090, 1518651811, 1029282494, 3182414933, 1817963848, 2641476617, 1215599678, 2107470092, 443323616, 1776829581, 3509337952, 1707564682, 4227771106, 3254163991, 4095983299, 4047169702, 3752041862, 2414541838, 2391793235, 2609875583, 647503473, 2575

[{'id': 'ustava_1.0',
  'sparse_values': {'indices': [2765784866,
    3904829042,
    1381030882,
    874673559,
    1458822944,
    3545791278,
    1107257502],
   'values': [0.7379755371094301,
    0.7379755371094301,
    0.7379755371094301,
    0.7379755371094301,
    0.7379755371094301,
    0.7379755371094301,
    0.7379755371094301]},
  'metadata': {'law_id': 'ustava', 'language': 'sl', 'doc_id': 'ustava_1.0'}},
 {'id': 'ustava_2.0',
  'sparse_values': {'indices': [2765784866,
    3904829042,
    493713151,
    131900689,
    129184470,
    399117570,
    1458822944,
    3545791278,
    1107257502],
   'values': [0.7295066317262139,
    0.7295066317262139,
    0.7295066317262139,
    0.7295066317262139,
    0.7295066317262139,
    0.7295066317262139,
    0.7295066317262139,
    0.7295066317262139,
    0.7295066317262139]},
  'metadata': {'law_id': 'ustava', 'language': 'sl', 'doc_id': 'ustava_2.0'}},
 {'id': 'ustava_3.0',
  'sparse_values': {'indices': [2765784866,
    3904829042,

In [27]:
len(sparse_records)

571

Upload sparse records

In [28]:
sparse_index = pc.Index(host="https://pravni-vodnik-sparse-3w1hkry.svc.aped-4627-b74a.pinecone.io")

sparse_index.upsert(vectors=sparse_records, namespace='__default__')

UpsertResponse(upserted_count=571, _response_info={'raw_headers': {'date': 'Sat, 03 Jan 2026 23:01:21 GMT', 'x-pinecone-request-lsn': '3', 'x-pinecone-request-logical-size': '349976', 'x-pinecone-request-latency-ms': '888', 'x-pinecone-request-id': '4536251664631163480', 'x-envoy-upstream-service-time': '481', 'x-pinecone-response-duration-ms': '890', 'server': 'envoy'}})

Test querying sparse index

In [None]:
query = 'Kdo ima v Sloveniji oblast?'

query_dict = bm25._encode_single_query(query)

sparse_results = sparse_index.query(
    namespace='__default__',
    sparse_vector=query_dict,
    top_k=2,
    include_metadata=True,
    include_values=False
)

sparse_results

### Hybrid Search

#### Semantic search (dense index)

1. Embed Query

In [12]:
dense_index = pc.Index(host='https://pravni-vodnik-dense-3w1hkry.svc.aped-4627-b74a.pinecone.io')
sparse_index = pc.Index(host="https://pravni-vodnik-sparse-3w1hkry.svc.aped-4627-b74a.pinecone.io")

queries = [
    "Kakšno vlogo ima državni zbor?",
    "Kaj je državni svet?",
    "Kako deluje parlament?"
]

dense_search_results = []
sparse_search_results = []

for query in queries:

    # Semantic search
    query_embeddings = embeddings.embed_query(query)
    dense_results = dense_index.query(
        vector=query_embeddings,
        top_k=5,
        include_metadata=True,
        include_values=False
    )
    dense_search_results.extend(dense_results.matches)

    # Lexical search
    sparse_results = sparse_index.query(
        namespace='__default__',
        sparse_vector=bm25._encode_single_query(query),
        top_k=5,
        include_metadata=True,
        include_values=False
    )
    sparse_search_results.extend(sparse_results.matches)



In [33]:
sorted_dense = sorted(dense_search_results, key=lambda x: x['score'], reverse=True)
sorted_sparse = sorted(sparse_search_results, key=lambda x: x['score'], reverse=True)


In [36]:
final_results = [*sorted_dense[:10], *sorted_sparse[:2]]
print(len(final_results))

doc_ids = [result['id'] for result in final_results]
doc_ids

12


['ustava_96.0',
 'ustava_86.0',
 'ustava_93.0',
 'ustava_97.0',
 'ustava_84.0',
 'ustava_97.0',
 'ustava_87.0',
 'ustava_131.0',
 'ustava_99.0',
 'ustava_101.0',
 'ustava_99.0',
 'ustava_96.0']