### Get articles from Mongo
Articles in mongo are the source of truth

In [10]:
from pymongo.mongo_client import MongoClient
import os

client = MongoClient(os.getenv("MONGODB_URI"))

db = client.get_database('pravni-vodnik')
articles_col = db.get_collection('articles')
articles = articles_col.find().to_list()

In [11]:
print(len(articles))

571


### Chunk Articles

In [12]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    separators=['\n\n', '\n', '.', ','],
    chunk_size=800,
    chunk_overlap=100
)

In [14]:
# test_chunks = text_splitter.split_text(articles[3]['text'])

# for i, chunk in enumerate(test_chunks, start=1):
#     print(f"Chunk {i}: {len(chunk)}")
#     print(chunk)



In [29]:
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(
    model='text-embedding-3-small',
    dimensions=1536
)

def create_chunked_dense_records(articles):
    records = []

    for article in articles:
        chunks = text_splitter.split_text(article['text'])

        for idx, chunk in enumerate(chunks, start=1):
            chunk_id = f"{article['_id']}_chunk_{idx}"

            metadata = {
                'law_id': article['law_id'],
                "language": article['language'],
                "article_id": article['_id'],
                "article_number": article.get('article_number', ''),
                "article_title": article.get('article_title', ''),
                'chapter': article.get('chapter', ''),
                'chunk_index': idx 
            }

            embedding_input = f"""
            CHAPTER: {article.get('chapter', '')}
            ARTICLE TITLE: {article.get('article_title', '')}
            CONTENT: {chunk}
            """

            records.append({
                "id": chunk_id,
                "values": embeddings.embed_query(embedding_input),
                "metadata": metadata
            })
            
    return records

### Create sparse and dense indexes

In [16]:
from pinecone import ServerlessSpec
from pinecone.grpc import PineconeGRPC as Pinecone
import os

pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))

dense_index_name = "pravni-vodnik-dense"
sparse_index_name = "pravni-vodnik-sparse"

if not pc.has_index(dense_index_name):
    pc.create_index(
        name = dense_index_name,
        vector_type = 'dense',
        dimension = 1536,
        metric = 'cosine',
        spec = ServerlessSpec(
            cloud = 'aws',
            region = 'us-east-1'
        ),
        deletion_protection = 'disabled',
    )

if not pc.has_index(sparse_index_name):
    pc.create_index(
        name = sparse_index_name,
        vector_type = 'sparse',
        metric = 'dotproduct',
        spec = ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )

  from .autonotebook import tqdm as notebook_tqdm


### Upsert dense vectors

In [17]:
dense_index = pc.Index(host='https://pravni-vodnik-dense-3w1hkry.svc.aped-4627-b74a.pinecone.io')

In [18]:
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(
    model='text-embedding-3-small',
    dimensions=1536
)

Prepare records from mongo to be inserted (id, values, metadata)

In [22]:
dense_records = create_chunked_dense_records(articles)

Upsert records to dense index

In [28]:
BATCH_SIZE = 100

for i in range(0, len(dense_records), BATCH_SIZE):
    batch = dense_records[i:i+BATCH_SIZE]
    dense_index.upsert(
        vectors=batch,
        namespace="__default__"
    )
    print(f"Upserted batch {i} to {i+len(batch)}")

Upserted batch 0 to 100
Upserted batch 100 to 200
Upserted batch 200 to 300
Upserted batch 300 to 400
Upserted batch 400 to 500
Upserted batch 500 to 600
Upserted batch 600 to 700
Upserted batch 700 to 800
Upserted batch 800 to 814


Test querying dense index

### Upsert sparse vectors

Train BM25 on corpus

In [34]:
from pinecone_text.sparse import BM25Encoder

bm25 = BM25Encoder(
    b=0.7,
    k1=1.5,
    lower_case=True,
    remove_punctuation=True,
    remove_stopwords=False,
    stem=False,
)

Prepare article for BM25

In [30]:
def prepare_chunk_bm25(article, chunk):
    title = article.get('article_title', '')
    chapter = article.get('chapter', '')

    # repeat title 3 times to add weight
    return f"{title} {title} {title} {chunk} {chapter}".strip()

Train BM25

In [35]:
all_chunk_texts = []

for article in articles:
    chunks = text_splitter.split_text(article['text'])
    for chunk in chunks:
        all_chunk_texts.append(prepare_chunk_bm25(article, chunk))

bm25.fit(all_chunk_texts)

  0%|          | 0/814 [00:00<?, ?it/s]

100%|██████████| 814/814 [00:00<00:00, 1174.61it/s]


<pinecone_text.sparse.bm25_encoder.BM25Encoder at 0x7a2ff5ff0b90>

Create sparse records for upload

In [36]:
sparse_records = []

for article in articles:
    chunks = text_splitter.split_text(article['text'])
    for idx, chunk in enumerate(chunks, start=1):
        chunk_id = f"{article['_id']}_chunk_{idx}"
        prepared = prepare_chunk_bm25(article, chunk)
        sparse_vector = bm25._encode_single_document(prepared)

        sparse_records.append({
            "id": chunk_id,
            "sparse_values": sparse_vector,
            "metadata": {
                'law_id': article['law_id'],
                'language': article['language'],
                'article_id': article['_id'],
                'article_number': article.get('article_number', ''),
                'article_title': article.get('article_title', ''),
                'chapter': article.get('chapter', ''),
                'chunk_index': idx
            }
        })

Upload sparse records

In [38]:
sparse_index = pc.Index(host="https://pravni-vodnik-sparse-3w1hkry.svc.aped-4627-b74a.pinecone.io")

sparse_index.upsert(vectors=sparse_records, namespace='__default__')

UpsertResponse(upserted_count=814, _response_info={'raw_headers': {'date': 'Sun, 04 Jan 2026 14:33:25 GMT', 'x-pinecone-request-lsn': '1', 'x-pinecone-request-logical-size': '567087', 'x-pinecone-request-latency-ms': '899', 'x-pinecone-request-id': '8178392621112211293', 'x-envoy-upstream-service-time': '389', 'x-pinecone-response-duration-ms': '901', 'server': 'envoy'}})

Test querying sparse index

In [None]:
# query = 'Kdo ima v Sloveniji oblast?'

# query_dict = bm25._encode_single_query(query)

# sparse_results = sparse_index.query(
#     namespace='__default__',
#     sparse_vector=query_dict,
#     top_k=2,
#     include_metadata=True,
#     include_values=False
# )

# sparse_results

QueryResponse(matches=[{'id': 'ustava_3.0_chunk_1',
 'metadata': {'article_id': 'ustava_3.0',
              'article_number': '3',
              'article_title': '',
              'chapter': 'I. SPLOŠNE DOLOČBE',
              'chunk_index': 1.0,
              'language': 'sl',
              'law_id': 'ustava'},
 'score': 0.37335386872291565,
 'sparse_values': None,
 'values': []}, {'id': 'ustava_128.0_chunk_1',
 'metadata': {'article_id': 'ustava_128.0',
              'article_number': '128',
              'article_title': 'udeležba, državljanov pri izvajanju sodne '
                               'oblast',
              'chapter': 'IV. DRŽAVNA UREDITEV',
              'chunk_index': 1.0,
              'language': 'sl',
              'law_id': 'ustava'},
 'score': 0.2654499411582947,
 'sparse_values': None,
 'values': []}], namespace='__default__', usage={'read_units': 1}, _response_info={'raw_headers': {'date': 'Sun, 04 Jan 2026 14:33:41 GMT', 'x-pinecone-max-indexed-lsn': '1', 'x-pi

In [40]:
query_dict

{'indices': [2966185449, 2641476617, 3182414933, 1817963848, 1215599678],
 'values': [0.25446882073188654,
  0.1468815390925386,
  0.03392355920462732,
  0.22397487082534684,
  0.34075121014560067]}

### Hybrid Search

#### Semantic search (dense index)

1. Embed Query

In [12]:
dense_index = pc.Index(host='https://pravni-vodnik-dense-3w1hkry.svc.aped-4627-b74a.pinecone.io')
sparse_index = pc.Index(host="https://pravni-vodnik-sparse-3w1hkry.svc.aped-4627-b74a.pinecone.io")

queries = [
    "Kakšno vlogo ima državni zbor?",
    "Kaj je državni svet?",
    "Kako deluje parlament?"
]

dense_search_results = []
sparse_search_results = []

for query in queries:

    # Semantic search
    query_embeddings = embeddings.embed_query(query)
    dense_results = dense_index.query(
        vector=query_embeddings,
        top_k=5,
        include_metadata=True,
        include_values=False
    )
    dense_search_results.extend(dense_results.matches)

    # Lexical search
    sparse_results = sparse_index.query(
        namespace='__default__',
        sparse_vector=bm25._encode_single_query(query),
        top_k=5,
        include_metadata=True,
        include_values=False
    )
    sparse_search_results.extend(sparse_results.matches)



In [33]:
sorted_dense = sorted(dense_search_results, key=lambda x: x['score'], reverse=True)
sorted_sparse = sorted(sparse_search_results, key=lambda x: x['score'], reverse=True)


In [36]:
final_results = [*sorted_dense[:10], *sorted_sparse[:2]]
print(len(final_results))

doc_ids = [result['id'] for result in final_results]
doc_ids

12


['ustava_96.0',
 'ustava_86.0',
 'ustava_93.0',
 'ustava_97.0',
 'ustava_84.0',
 'ustava_97.0',
 'ustava_87.0',
 'ustava_131.0',
 'ustava_99.0',
 'ustava_101.0',
 'ustava_99.0',
 'ustava_96.0']