### Get articles from Mongo
The articles stored in MongoDB are the source of truth.

In [1]:
from pymongo.mongo_client import MongoClient
import os

# Fail fast when the connection string is missing: os.getenv would return None,
# and MongoClient(None) falls back to its default host (localhost) instead of
# the intended deployment.
client = MongoClient(os.environ["MONGODB_URI"])

# Load every article document into memory as a list.
db = client.get_database('pravni-vodnik')
articles_col = db.get_collection('articles')
articles = articles_col.find().to_list()

In [29]:
# Peek at the first five loaded articles.
articles[:5]

[{'_id': 'ustava_1.0',
  'law_id': 'ustava',
  'article_number': '1',
  'article_index': 1,
  'text': 'Slovenija je demokratična republika.',
  'chapter': 'I. SPLOŠNE DOLOČBE',
  'language': 'sl'},
 {'_id': 'ustava_2.0',
  'law_id': 'ustava',
  'article_number': '2',
  'article_index': 2.0,
  'text': 'Slovenija je pravna in socialna država.',
  'chapter': 'I. SPLOŠNE DOLOČBE',
  'language': 'sl'},
 {'_id': 'ustava_3.0',
  'law_id': 'ustava',
  'article_number': '3',
  'article_index': 3.0,
  'text': 'Slovenija je država vseh svojih državljank in državljanov, ki temelji na trajni in neodtujljivi pravici slovenskega naroda do samoodločbe.\n\nV Sloveniji ima oblast ljudstvo. Državljanke in državljani jo izvršujejo neposredno in z volitvami, po načelu delitve oblasti na zakonodajno, izvršilno in sodno.',
  'chapter': 'I. SPLOŠNE DOLOČBE',
  'language': 'sl'},
 {'_id': 'ustava_3.1',
  'law_id': 'ustava',
  'article_number': '3.a',
  'article_index': 3.1,
  'text': 'Slovenija lahko z mednaro

### Create sparse and dense indexes

In [5]:
from pinecone import ServerlessSpec
from pinecone.grpc import PineconeGRPC as Pinecone
import os

pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))

dense_index_name = "pravni-vodnik-dense"
sparse_index_name = "pravni-vodnik-sparse"


def _ensure_index(name, **index_options):
    """Create the serverless index `name` (AWS, us-east-1) unless it already exists."""
    if pc.has_index(name):
        return
    pc.create_index(
        name=name,
        spec=ServerlessSpec(cloud='aws', region='us-east-1'),
        **index_options,
    )


# Dense index: 1536-dimensional vectors compared with the cosine metric.
_ensure_index(
    dense_index_name,
    vector_type='dense',
    dimension=1536,
    metric='cosine',
    deletion_protection='disabled',
)

# Sparse index: dot-product metric; no dimension is specified for it.
_ensure_index(sparse_index_name, vector_type='sparse', metric='dotproduct')

### Upsert dense vectors

In [None]:
# Look the host up from the index name instead of hard-coding it: a pasted host
# URL belongs to one concrete index and stops working if that index is
# recreated or the notebook is run against another project.
dense_index = pc.Index(host=pc.describe_index(dense_index_name).host)

['pravni-vodnik', 'pravni-vodnik-sparse', 'pravni-vodnik-dense']

In [10]:
from langchain_openai import OpenAIEmbeddings

# Embedding client: text-embedding-3-small at 1536 dimensions, the same
# dimension the dense index is created with.
embeddings = OpenAIEmbeddings(dimensions=1536, model='text-embedding-3-small')

Prepare the records to upsert from the MongoDB articles (each record has an id, values, and metadata).

In [23]:

def _article_embedding_text(article):
    """Build the text that gets embedded for a single article.

    The text lists the chapter, the article title (only when the article has an
    'article_title' key) and the article content, one labelled field per line.
    """
    # Layout: a leading newline, every line indented by 24 spaces, and a bare
    # indent after the last line. This exact whitespace is kept on purpose;
    # altering it would alter the embeddings and make new vectors inconsistent
    # with any that were produced from the same layout before.
    indent = " " * 24
    lines = [f"CHAPTER: {article['chapter']}"]
    if 'article_title' in article:
        lines.append(f"ARTICLE TITLE: {article['article_title']}")
    lines.append(f"CONTENT: {article['text']}")
    return "\n" + "".join(f"{indent}{line}\n" for line in lines) + indent


def create_records(articles):
    """Turn article documents into vector records ready for upserting.

    Args:
        articles: iterable of article dicts. Each must provide '_id', 'law_id',
            'language', 'chapter' and 'text'; 'article_title' is optional.

    Returns:
        A list with one dict per article, in input order, holding:
        'id' (the article '_id'), 'values' (the embedding of the article text)
        and 'metadata' ('law_id', 'language', 'doc_id').

    Raises:
        KeyError: if an article lacks one of the required keys.
    """
    ids = []
    texts = []
    metadatas = []

    for article in articles:
        metadatas.append(
            {
                "law_id": article['law_id'],
                "language": article['language'],
                'doc_id': article['_id']
            }
        )
        ids.append(article['_id'])
        texts.append(_article_embedding_text(article))

    # Nothing to embed: skip the embedding call entirely.
    if not texts:
        return []

    # Embed all texts in one batched call instead of one request per article.
    vectors = embeddings.embed_documents(texts)

    return [
        {"id": record_id, "values": vector, "metadata": metadata}
        for record_id, vector, metadata in zip(ids, vectors, metadatas)
    ]

In [None]:
# Build records for just the first five articles as a small trial batch.
records = create_records(articles[:5])

In [None]:
# Show the first prepared record (id, embedding values, metadata).
records[0]

{'id': 'ustava_35.0',
 'values': [0.05912259966135025,
  0.04120003804564476,
  0.0039319442585110664,
  0.03868650645017624,
  -0.016010094434022903,
  0.016137592494487762,
  0.0178679209202528,
  0.021201079711318016,
  0.008865658193826675,
  -0.047356363385915756,
  0.030909134075045586,
  -0.019725747406482697,
  -0.004610415082424879,
  -0.015445460565388203,
  0.011055889539420605,
  0.02622814103960991,
  0.017612924799323082,
  -0.010864642448723316,
  0.06338467448949814,
  0.02626456879079342,
  0.0064705186523497105,
  0.020235739648342133,
  -0.021783927455544472,
  0.010145190171897411,
  -0.03253018110990524,
  -0.055552657693624496,
  0.014689579606056213,
  0.003565387800335884,
  0.022439630702137947,
  0.002039966406300664,
  0.00970805436372757,
  -0.03409658372402191,
  0.03436979278922081,
  -0.012003016658127308,
  0.001852134708315134,
  0.005099916364997625,
  -0.01670222543179989,
  -0.030854493379592896,
  0.006893993820995092,
  0.05067130923271179,
  -0.00

Upsert records to dense index

In [None]:
# Write the prepared records into the '__default__' namespace of the dense index.
dense_index.upsert(namespace='__default__', vectors=records)

UpsertResponse(upserted_count=5, _response_info={'raw_headers': {'date': 'Sun, 28 Dec 2025 01:47:06 GMT', 'x-pinecone-request-lsn': '2', 'x-pinecone-request-logical-size': '31080', 'x-pinecone-request-latency-ms': '165', 'x-pinecone-request-id': '796312214815869703', 'x-envoy-upstream-service-time': '166', 'x-pinecone-response-duration-ms': '167', 'server': 'envoy'}})

Test querying dense index

In [33]:
# Test question in Slovenian (roughly: "Is Slovenia a democratic republic?").
query = "Ali je slovenija demokratična republika?"
# Embed the question with the same `embeddings` client to obtain the query vector.
query_embedding = embeddings.embed_query(query)

In [34]:
# Ask for the single best match, returning its metadata but not its vector values.
results = dense_index.query(
    top_k=1,
    vector=query_embedding,
    include_values=False,
    include_metadata=True,
)

In [35]:
# Display the raw query response.
results

QueryResponse(matches=[{'id': 'ustava_1.0',
 'metadata': {'doc_id': 'ustava_1.0', 'language': 'sl', 'law_id': 'ustava'},
 'score': 0.7099829316139221,
 'sparse_values': None,
 'values': []}], namespace='', usage={'read_units': 1}, _response_info={'raw_headers': {'date': 'Sun, 28 Dec 2025 01:53:55 GMT', 'x-pinecone-max-indexed-lsn': '2', 'x-pinecone-request-latency-ms': '87', 'x-pinecone-request-id': '6864209550491861403', 'x-envoy-upstream-service-time': '88', 'x-pinecone-response-duration-ms': '89', 'server': 'envoy'}})