### Get articles from Mongo
Articles in mongo are the source of truth

In [1]:
from pymongo.mongo_client import MongoClient
import os

client = MongoClient(os.getenv("MONGODB_URI"))

db = client.get_database('pravni-vodnik')
articles_col = db.get_collection('articles')
articles = articles_col.find().to_list()

In [2]:
articles[0:5]

[{'_id': 'ustava_1.0',
  'law_id': 'ustava',
  'article_number': '1',
  'article_index': 1,
  'text': 'Slovenija je demokratična republika.',
  'chapter': 'I. SPLOŠNE DOLOČBE',
  'language': 'sl'},
 {'_id': 'ustava_2.0',
  'law_id': 'ustava',
  'article_number': '2',
  'article_index': 2.0,
  'text': 'Slovenija je pravna in socialna država.',
  'chapter': 'I. SPLOŠNE DOLOČBE',
  'language': 'sl'},
 {'_id': 'ustava_3.0',
  'law_id': 'ustava',
  'article_number': '3',
  'article_index': 3.0,
  'text': 'Slovenija je država vseh svojih državljank in državljanov, ki temelji na trajni in neodtujljivi pravici slovenskega naroda do samoodločbe.\n\nV Sloveniji ima oblast ljudstvo. Državljanke in državljani jo izvršujejo neposredno in z volitvami, po načelu delitve oblasti na zakonodajno, izvršilno in sodno.',
  'chapter': 'I. SPLOŠNE DOLOČBE',
  'language': 'sl'},
 {'_id': 'ustava_3.1',
  'law_id': 'ustava',
  'article_number': '3.a',
  'article_index': 3.1,
  'text': 'Slovenija lahko z mednaro

### Create sparse and dense indexes

In [3]:
from pinecone import ServerlessSpec
from pinecone.grpc import PineconeGRPC as Pinecone
import os

pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))

dense_index_name = "pravni-vodnik-dense"
sparse_index_name = "pravni-vodnik-sparse"

if not pc.has_index(dense_index_name):
    pc.create_index(
        name = dense_index_name,
        vector_type = 'dense',
        dimension = 1536,
        metric = 'cosine',
        spec = ServerlessSpec(
            cloud = 'aws',
            region = 'us-east-1'
        ),
        deletion_protection = 'disabled',
    )

if not pc.has_index(sparse_index_name):
    pc.create_index(
        name = sparse_index_name,
        vector_type = 'sparse',
        metric = 'dotproduct',
        spec = ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )

  from .autonotebook import tqdm as notebook_tqdm


### Upsert dense vectors

In [4]:
dense_index = pc.Index(host='https://pravni-vodnik-dense-3w1hkry.svc.aped-4627-b74a.pinecone.io')

In [5]:
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(
    model='text-embedding-3-small',
    dimensions=1536
)

Prepare records from mongo to be inserted (id, values, metadata)

In [6]:

def create_records(articles):
    records = []

    for article in articles:

        metadata = {
            "law_id": article['law_id'],
            "language": article['language'],
            'doc_id': article['_id']
        }

        if 'article_title' in article:
            records.append(
                {
                    "id": article['_id'],
                    "values": embeddings.embed_query(
                        f"""
                        CHAPTER: {article['chapter']}
                        ARTICLE TITLE: {article['article_title']}
                        CONTENT: {article['text']}
                        """),
                    "metadata": metadata
                }
            )
        else:
            records.append(
                {
                    "id": article['_id'],
                    "values": embeddings.embed_query(
                        f"""
                        CHAPTER: {article['chapter']}
                        CONTENT: {article['text']}
                        """),
                    "metadata": metadata
                }
            )

    return records

In [7]:
records = create_records(articles)

In [8]:
records[0]

{'id': 'ustava_1.0',
 'values': [0.0016223904676735401,
  -0.012650181539356709,
  -0.0038486251141875982,
  -0.0030568139627575874,
  0.016512904316186905,
  0.04308204725384712,
  0.018712118268013,
  0.03686033561825752,
  0.0009333738125860691,
  -0.012584392912685871,
  0.04605192318558693,
  -0.011212234385311604,
  -0.009238580241799355,
  -0.012283646501600742,
  0.016870042309165,
  0.010967876762151718,
  -0.023082353174686432,
  0.013223481364548206,
  0.020864341408014297,
  0.04345798119902611,
  0.019125645980238914,
  0.00871227215975523,
  -0.00960981473326683,
  0.004323241766542196,
  0.018279794603586197,
  -0.050337571650743484,
  -0.005714198108762503,
  -0.05067591369152069,
  -0.01872151717543602,
  -0.007692551240324974,
  0.022499654442071915,
  -0.021465836092829704,
  0.02107110433280468,
  -0.03721747547388077,
  0.01663508266210556,
  -0.0032095371279865503,
  -0.015497882850468159,
  -0.05169093608856201,
  0.015704646706581116,
  0.0142854955047369,
  -0.

Upsert records to dense index

In [9]:
dense_index.upsert(
    vectors=records,
    namespace='__default__'
)

UpsertResponse(upserted_count=176, _response_info={'raw_headers': {'date': 'Sun, 28 Dec 2025 19:21:47 GMT', 'x-pinecone-request-lsn': '3', 'x-pinecone-request-logical-size': '1094498', 'x-pinecone-request-latency-ms': '1092', 'x-pinecone-request-id': '8376362583602712628', 'x-envoy-upstream-service-time': '473', 'x-pinecone-response-duration-ms': '1095', 'server': 'envoy'}})

Test querying dense index

In [11]:
query = "Ali je slovenija demokratična republika?"
query_embedding = embeddings.embed_query(query)

In [16]:
results = dense_index.query(
    vector=query_embedding,
    top_k=5,
    include_metadata=True,
    include_values=False
)

In [17]:
print(len(results.matches))
results

5


QueryResponse(matches=[{'id': 'ustava_1.0',
 'metadata': {'doc_id': 'ustava_1.0', 'language': 'sl', 'law_id': 'ustava'},
 'score': 0.7099829316139221,
 'sparse_values': None,
 'values': []}, {'id': 'ustava_3.0',
 'metadata': {'doc_id': 'ustava_3.0', 'language': 'sl', 'law_id': 'ustava'},
 'score': 0.6067467331886292,
 'sparse_values': None,
 'values': []}, {'id': 'ustava_4.0',
 'metadata': {'doc_id': 'ustava_4.0', 'language': 'sl', 'law_id': 'ustava'},
 'score': 0.5792542099952698,
 'sparse_values': None,
 'values': []}, {'id': 'ustava_80.0',
 'metadata': {'doc_id': 'ustava_80.0', 'language': 'sl', 'law_id': 'ustava'},
 'score': 0.5652799606323242,
 'sparse_values': None,
 'values': []}, {'id': 'ustava_2.0',
 'metadata': {'doc_id': 'ustava_2.0', 'language': 'sl', 'law_id': 'ustava'},
 'score': 0.5648813843727112,
 'sparse_values': None,
 'values': []}], namespace='', usage={'read_units': 1}, _response_info={'raw_headers': {'date': 'Sun, 28 Dec 2025 19:23:37 GMT', 'x-pinecone-max-indexe

### Upsert sparse vectors

Train BM25 on corpus

In [18]:
from pinecone_text.sparse import BM25Encoder

bm25 = BM25Encoder(
    b=0.75,
    k1=1.2,
    lower_case=True,
    remove_punctuation=True,
    remove_stopwords=False,
    stem=False,
)

Prepare article for BM25

In [38]:
def prepare_article_bm25(article):
    content = article['text']
    title = article.get('article_title', '')
    chapter = article.get('chapter', '')

    # repeat title 3 times to add weight
    return f"{title} {title} {title} {content} {chapter}".strip()

Train BM25

In [41]:
all_texts = [prepare_article_bm25(article) for article in articles]
bm25.fit(all_texts)

100%|██████████| 176/176 [00:00<00:00, 1573.38it/s]


<pinecone_text.sparse.bm25_encoder.BM25Encoder at 0x7415b1bd17f0>

Create sparse values

In [42]:
from pinecone_text.sparse import SparseVector

def get_sparse_values_for_article(article) -> SparseVector:
    prepared = prepare_article_bm25(article).strip()
    
    vectors =  bm25._encode_single_document(prepared)
    print(vectors)
    return vectors

Create sparse records for upload

In [43]:
sparse_records = [
    {
        "id": article['_id'],
        "sparse_values": get_sparse_values_for_article(article),
        "metadata": {
            "law_id": article['law_id'],
            "language": article['language'],
            'doc_id': article['_id']
        }
    } for article in articles
]
sparse_records

{'indices': [2765784866, 3904829042, 1381030882, 874673559, 1458822944, 3545791278, 1107257502], 'values': [0.7062411839860568, 0.7062411839860568, 0.7062411839860568, 0.7062411839860568, 0.7062411839860568, 0.7062411839860568, 0.7062411839860568]}
{'indices': [2765784866, 3904829042, 493713151, 131900689, 129184470, 399117570, 1458822944, 3545791278, 1107257502], 'values': [0.6900956160923687, 0.6900956160923687, 0.6900956160923687, 0.6900956160923687, 0.6900956160923687, 0.6900956160923687, 0.6900956160923687, 0.6900956160923687, 0.6900956160923687]}
{'indices': [2765784866, 3904829042, 399117570, 2725676883, 1180899491, 733383643, 131900689, 3774532246, 3941739082, 2496253341, 130713793, 235532065, 3955371338, 1076637598, 3568414808, 361987090, 1518651811, 1029282494, 3182414933, 1817963848, 2641476617, 1215599678, 2107470092, 443323616, 1776829581, 3509337952, 1707564682, 4227771106, 3254163991, 4095983299, 4047169702, 3752041862, 2414541838, 2391793235, 2609875583, 647503473, 2575

[{'id': 'ustava_1.0',
  'sparse_values': {'indices': [2765784866,
    3904829042,
    1381030882,
    874673559,
    1458822944,
    3545791278,
    1107257502],
   'values': [0.7062411839860568,
    0.7062411839860568,
    0.7062411839860568,
    0.7062411839860568,
    0.7062411839860568,
    0.7062411839860568,
    0.7062411839860568]},
  'metadata': {'law_id': 'ustava', 'language': 'sl', 'doc_id': 'ustava_1.0'}},
 {'id': 'ustava_2.0',
  'sparse_values': {'indices': [2765784866,
    3904829042,
    493713151,
    131900689,
    129184470,
    399117570,
    1458822944,
    3545791278,
    1107257502],
   'values': [0.6900956160923687,
    0.6900956160923687,
    0.6900956160923687,
    0.6900956160923687,
    0.6900956160923687,
    0.6900956160923687,
    0.6900956160923687,
    0.6900956160923687,
    0.6900956160923687]},
  'metadata': {'law_id': 'ustava', 'language': 'sl', 'doc_id': 'ustava_2.0'}},
 {'id': 'ustava_3.0',
  'sparse_values': {'indices': [2765784866,
    3904829042,

Upload sparse records

In [44]:
sparse_index = pc.Index(host="https://pravni-vodnik-sparse-3w1hkry.svc.aped-4627-b74a.pinecone.io")

sparse_index.upsert(vectors=sparse_records, namespace='__default__')

UpsertResponse(upserted_count=176, _response_info={'raw_headers': {'date': 'Sun, 28 Dec 2025 19:33:15 GMT', 'x-pinecone-request-lsn': '2', 'x-pinecone-request-logical-size': '65738', 'x-pinecone-request-latency-ms': '332', 'x-pinecone-request-id': '7655868801997787713', 'x-envoy-upstream-service-time': '228', 'x-pinecone-response-duration-ms': '335', 'server': 'envoy'}})

Test querying sparse index

In [45]:
query = 'Kdo ima v Sloveniji oblast?'

query_dict = bm25._encode_single_query(query)

sparse_results = sparse_index.query(
    namespace='__default__',
    sparse_vector=query_dict,
    top_k=2,
    include_metadata=True,
    include_values=False
)

sparse_results

QueryResponse(matches=[{'id': 'ustava_3.0',
 'metadata': {'doc_id': 'ustava_3.0', 'language': 'sl', 'law_id': 'ustava'},
 'score': 0.3228439688682556,
 'sparse_values': None,
 'values': []}, {'id': 'ustava_128.0',
 'metadata': {'doc_id': 'ustava_128.0', 'language': 'sl', 'law_id': 'ustava'},
 'score': 0.23654142022132874,
 'sparse_values': None,
 'values': []}], namespace='__default__', usage={'read_units': 1}, _response_info={'raw_headers': {'date': 'Sun, 28 Dec 2025 19:33:18 GMT', 'x-pinecone-max-indexed-lsn': '2', 'x-pinecone-request-latency-ms': '43', 'x-pinecone-request-id': '317319473325226587', 'x-envoy-upstream-service-time': '44', 'x-pinecone-response-duration-ms': '45', 'server': 'envoy'}})