### Get articles from Mongo
Articles in mongo are the source of truth

In [22]:
from pymongo.mongo_client import MongoClient
import os

client = MongoClient(os.getenv("MONGODB_URI"))

db = client.get_database('pravni-vodnik')
articles_col = db.get_collection('articles')
articles = articles_col.find().to_list()

In [11]:
print(len(articles))
articles[0]

93


{'_id': 'gdpr_1.0',
 'law_id': 'gdpr',
 'article_number': '1',
 'article_index': 1,
 'text': 'Ta uredba določa pravila o varstvu posameznikov pri obdelavi osebnih podatkov in pravila o prostem pretoku osebnih podatkov.\n\nTa uredba varuje temeljne pravice in svoboščine posameznikov ter zlasti njihovo pravico do varstva osebnih podatkov.\n\nProsti pretok osebnih podatkov v Uniji ne sme biti omejen ali prepovedan iz razlogov, povezanih z varstvom posameznikov pri obdelavi osebnih podatkov.',
 'chapter': 'I. SPLOŠNE DOLOČBE',
 'article_title': 'Predmet urejanja in cilji',
 'language': 'sl'}

### Chunk Articles

In [12]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    separators=['\n\n', '\n', '.', ','],
    chunk_size=800,
    chunk_overlap=100
)

In [13]:
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(
    model='text-embedding-3-small',
    dimensions=1536
)

def create_chunked_dense_records(articles: list):
    records = []

    for article in articles:
        chunks = text_splitter.split_text(article['text'])

        for idx, chunk in enumerate(chunks, start=1):
            chunk_id = f"{article['_id']}_chunk_{idx}"

            metadata = {
                'law_id': article['law_id'],
                "language": article['language'],
                "article_id": article['_id'],
                "article_number": article.get('article_number', ''),
                "article_title": article.get('article_title', ''),
                'chapter': article.get('chapter', ''),
                'chunk_index': idx,
                'chunk_text': chunk
            }

            embedding_input = f"""
            CHAPTER: {article.get('chapter', '')}
            ARTICLE TITLE: {article.get('article_title', '')}
            CONTENT: {chunk}
            """

            records.append({
                "id": chunk_id,
                "values": embeddings.embed_query(embedding_input),
                "metadata": metadata
            })
            
    return records

### Create sparse and dense indexes

In [5]:
from pinecone import ServerlessSpec
from pinecone.grpc import PineconeGRPC as Pinecone
import os

pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))

dense_index_name = "pravni-vodnik-dense"
sparse_index_name = "pravni-vodnik-sparse"

if not pc.has_index(dense_index_name):
    pc.create_index(
        name = dense_index_name,
        vector_type = 'dense',
        dimension = 1536,
        metric = 'cosine',
        spec = ServerlessSpec(
            cloud = 'aws',
            region = 'us-east-1'
        ),
        deletion_protection = 'disabled',
    )

if not pc.has_index(sparse_index_name):
    pc.create_index(
        name = sparse_index_name,
        vector_type = 'sparse',
        metric = 'dotproduct',
        spec = ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )

  from .autonotebook import tqdm as notebook_tqdm


### Upsert dense vectors

In [14]:
dense_index = pc.Index(host='https://pravni-vodnik-dense-3w1hkry.svc.aped-4627-b74a.pinecone.io')

In [7]:
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(
    model='text-embedding-3-small',
    dimensions=1536
)

In [15]:
articles[0]

{'_id': 'gdpr_1.0',
 'law_id': 'gdpr',
 'article_number': '1',
 'article_index': 1,
 'text': 'Ta uredba določa pravila o varstvu posameznikov pri obdelavi osebnih podatkov in pravila o prostem pretoku osebnih podatkov.\n\nTa uredba varuje temeljne pravice in svoboščine posameznikov ter zlasti njihovo pravico do varstva osebnih podatkov.\n\nProsti pretok osebnih podatkov v Uniji ne sme biti omejen ali prepovedan iz razlogov, povezanih z varstvom posameznikov pri obdelavi osebnih podatkov.',
 'chapter': 'I. SPLOŠNE DOLOČBE',
 'article_title': 'Predmet urejanja in cilji',
 'language': 'sl'}

Prepare records from mongo to be inserted (id, values, metadata)

In [17]:
dense_records = create_chunked_dense_records(articles)

In [18]:
dense_records[0]

{'id': 'gdpr_1.0_chunk_1',
 'values': [0.015801141038537025,
  0.020565351471304893,
  0.026962189003825188,
  0.01886589452624321,
  -0.0061676958575844765,
  0.02067992277443409,
  0.0202789269387722,
  0.006248849909752607,
  0.0048047881573438644,
  -0.034027349203825,
  0.05125107243657112,
  -0.04514066129922867,
  -0.005017220042645931,
  0.0018772805342450738,
  0.03551676496863365,
  0.009394756518304348,
  -0.02077539637684822,
  -0.01603028178215027,
  0.01637399196624756,
  0.04269649460911751,
  0.0462481714785099,
  0.03595595061779022,
  -0.03316807374358177,
  0.007657108828425407,
  -0.01641218177974224,
  -0.07026972621679306,
  0.02102363295853138,
  0.004881168249994516,
  0.007270434405654669,
  -0.013872542418539524,
  -0.01176254078745842,
  -0.017758382484316826,
  0.03414192050695419,
  0.0010018926113843918,
  0.04296382516622543,
  -0.015848878771066666,
  -0.032175131142139435,
  -0.005074505228549242,
  0.012411772273480892,
  0.03404644504189491,
  -0.0091

Upsert records to dense index

In [19]:
BATCH_SIZE = 100

for i in range(0, len(dense_records), BATCH_SIZE):
    batch = dense_records[i:i+BATCH_SIZE]
    dense_index.upsert(
        vectors=batch,
        namespace="__default__"
    )
    print(f"Upserted batch {i} to {i+len(batch)}")

Upserted batch 0 to 100
Upserted batch 100 to 200
Upserted batch 200 to 300
Upserted batch 300 to 307


Test querying dense index

### Upsert sparse vectors

Train BM25 on corpus

In [20]:
from pinecone_text.sparse import BM25Encoder

bm25 = BM25Encoder(
    b=0.7,
    k1=1.5,
    lower_case=True,
    remove_punctuation=True,
    remove_stopwords=False,
    stem=False,
)

Prepare article for BM25

In [21]:
def prepare_chunk_bm25(article, chunk):
    title = article.get('article_title', '')
    chapter = article.get('chapter', '')

    # repeat title 3 times to add weight
    return f"{title} {title} {title} {chunk} {chapter}".strip()

Train BM25

In [23]:
all_chunk_texts = []

for article in articles:
    chunks = text_splitter.split_text(article['text'])
    for chunk in chunks:
        all_chunk_texts.append(prepare_chunk_bm25(article, chunk))

bm25.fit(all_chunk_texts)

100%|██████████| 2214/2214 [00:01<00:00, 1158.68it/s]


<pinecone_text.sparse.bm25_encoder.BM25Encoder at 0x7250193d5fd0>

Create sparse records for upload

In [24]:
sparse_records = []

for article in articles:
    chunks = text_splitter.split_text(article['text'])
    for idx, chunk in enumerate(chunks, start=1):
        chunk_id = f"{article['_id']}_chunk_{idx}"
        prepared = prepare_chunk_bm25(article, chunk)
        sparse_vector = bm25._encode_single_document(prepared)

        sparse_records.append({
            "id": chunk_id,
            "sparse_values": sparse_vector,
            "metadata": {
                'law_id': article['law_id'],
                'language': article['language'],
                'article_id': article['_id'],
                'article_number': article.get('article_number', ''),
                'article_title': article.get('article_title', ''),
                'chapter': article.get('chapter', ''),
                'chunk_index': idx,
                'chunk_text': chunk
            }
        })

Upload sparse records

In [30]:
sparse_index = pc.Index(host="https://pravni-vodnik-sparse-3w1hkry.svc.aped-4627-b74a.pinecone.io")

sparse_index.upsert(vectors=sparse_records[3000:4000], namespace='__default__')

PineconeException: UNKNOWN:Error received from peer  {grpc_status:3, grpc_message:"Invalid request."}

In [33]:
len(sparse_records[3000:4000])

0

Test querying sparse index

In [39]:
# query = 'Kdo ima v Sloveniji oblast?'

# query_dict = bm25._encode_single_query(query)

# sparse_results = sparse_index.query(
#     namespace='__default__',
#     sparse_vector=query_dict,
#     top_k=2,
#     include_metadata=True,
#     include_values=False
# )

# sparse_results

QueryResponse(matches=[{'id': 'ustava_3.0_chunk_1',
 'metadata': {'article_id': 'ustava_3.0',
              'article_number': '3',
              'article_title': '',
              'chapter': 'I. SPLOŠNE DOLOČBE',
              'chunk_index': 1.0,
              'language': 'sl',
              'law_id': 'ustava'},
 'score': 0.37335386872291565,
 'sparse_values': None,
 'values': []}, {'id': 'ustava_128.0_chunk_1',
 'metadata': {'article_id': 'ustava_128.0',
              'article_number': '128',
              'article_title': 'udeležba, državljanov pri izvajanju sodne '
                               'oblast',
              'chapter': 'IV. DRŽAVNA UREDITEV',
              'chunk_index': 1.0,
              'language': 'sl',
              'law_id': 'ustava'},
 'score': 0.2654499411582947,
 'sparse_values': None,
 'values': []}], namespace='__default__', usage={'read_units': 1}, _response_info={'raw_headers': {'date': 'Sun, 04 Jan 2026 14:33:41 GMT', 'x-pinecone-max-indexed-lsn': '1', 'x-pi

In [40]:
query_dict

{'indices': [2966185449, 2641476617, 3182414933, 1817963848, 1215599678],
 'values': [0.25446882073188654,
  0.1468815390925386,
  0.03392355920462732,
  0.22397487082534684,
  0.34075121014560067]}

### Hybrid Search

#### Semantic search (dense index)

1. Embed Query

In [12]:
dense_index = pc.Index(host='https://pravni-vodnik-dense-3w1hkry.svc.aped-4627-b74a.pinecone.io')
sparse_index = pc.Index(host="https://pravni-vodnik-sparse-3w1hkry.svc.aped-4627-b74a.pinecone.io")

queries = [
    "Kakšno vlogo ima državni zbor?",
    "Kaj je državni svet?",
    "Kako deluje parlament?"
]

dense_search_results = []
sparse_search_results = []

for query in queries:

    # Semantic search
    query_embeddings = embeddings.embed_query(query)
    dense_results = dense_index.query(
        vector=query_embeddings,
        top_k=5,
        include_metadata=True,
        include_values=False
    )
    dense_search_results.extend(dense_results.matches)

    # Lexical search
    sparse_results = sparse_index.query(
        namespace='__default__',
        sparse_vector=bm25._encode_single_query(query),
        top_k=5,
        include_metadata=True,
        include_values=False
    )
    sparse_search_results.extend(sparse_results.matches)



In [33]:
sorted_dense = sorted(dense_search_results, key=lambda x: x['score'], reverse=True)
sorted_sparse = sorted(sparse_search_results, key=lambda x: x['score'], reverse=True)


In [36]:
final_results = [*sorted_dense[:10], *sorted_sparse[:2]]
print(len(final_results))

doc_ids = [result['id'] for result in final_results]
doc_ids

12


['ustava_96.0',
 'ustava_86.0',
 'ustava_93.0',
 'ustava_97.0',
 'ustava_84.0',
 'ustava_97.0',
 'ustava_87.0',
 'ustava_131.0',
 'ustava_99.0',
 'ustava_101.0',
 'ustava_99.0',
 'ustava_96.0']