### Get articles from Mongo
Articles in mongo are the source of truth

In [1]:
from pymongo.mongo_client import MongoClient
import os

client = MongoClient(os.getenv("MONGODB_URI"))

db = client.get_database('pravni-vodnik')
articles_col = db.get_collection('articles')
articles = articles_col.find().to_list()

In [None]:
articles[0:5]

### Create sparse and dense indexes

In [2]:
from pinecone import ServerlessSpec
from pinecone.grpc import PineconeGRPC as Pinecone
import os

pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))

dense_index_name = "pravni-vodnik-dense"
sparse_index_name = "pravni-vodnik-sparse"

if not pc.has_index(dense_index_name):
    pc.create_index(
        name = dense_index_name,
        vector_type = 'dense',
        dimension = 1536,
        metric = 'cosine',
        spec = ServerlessSpec(
            cloud = 'aws',
            region = 'us-east-1'
        ),
        deletion_protection = 'disabled',
    )

if not pc.has_index(sparse_index_name):
    pc.create_index(
        name = sparse_index_name,
        vector_type = 'sparse',
        metric = 'dotproduct',
        spec = ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )

  from .autonotebook import tqdm as notebook_tqdm


### Upsert dense vectors

In [None]:
dense_index = pc.Index(host='https://pravni-vodnik-dense-3w1hkry.svc.aped-4627-b74a.pinecone.io')

In [3]:
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(
    model='text-embedding-3-small',
    dimensions=1536
)

Prepare records from mongo to be inserted (id, values, metadata)

In [None]:

def create_records(articles):
    records = []

    for article in articles:

        metadata = {
            "law_id": article['law_id'],
            "language": article['language'],
            'doc_id': article['_id']
        }

        if 'article_title' in article:
            records.append(
                {
                    "id": article['_id'],
                    "values": embeddings.embed_query(
                        f"""
                        CHAPTER: {article['chapter']}
                        ARTICLE TITLE: {article['article_title']}
                        CONTENT: {article['text']}
                        """),
                    "metadata": metadata
                }
            )
        else:
            records.append(
                {
                    "id": article['_id'],
                    "values": embeddings.embed_query(
                        f"""
                        CHAPTER: {article['chapter']}
                        CONTENT: {article['text']}
                        """),
                    "metadata": metadata
                }
            )

    return records

In [None]:
records = create_records(articles)

In [None]:
records[0]

Upsert records to dense index

In [None]:
dense_index.upsert(
    vectors=records,
    namespace='__default__'
)

Test querying dense index

In [None]:
query = "Ali je slovenija demokratična republika?"
query_embedding = embeddings.embed_query(query)

In [None]:
results = dense_index.query(
    vector=query_embedding,
    top_k=5,
    include_metadata=True,
    include_values=False
)

In [None]:
print(len(results.matches))
results

### Upsert sparse vectors

Train BM25 on corpus

In [4]:
from pinecone_text.sparse import BM25Encoder

bm25 = BM25Encoder(
    b=0.75,
    k1=1.2,
    lower_case=True,
    remove_punctuation=True,
    remove_stopwords=False,
    stem=False,
)

Prepare article for BM25

In [5]:
def prepare_article_bm25(article):
    content = article['text']
    title = article.get('article_title', '')
    chapter = article.get('chapter', '')

    # repeat title 3 times to add weight
    return f"{title} {title} {title} {content} {chapter}".strip()

Train BM25

In [6]:
all_texts = [prepare_article_bm25(article) for article in articles]
bm25.fit(all_texts)

100%|██████████| 176/176 [00:00<00:00, 1074.33it/s]


<pinecone_text.sparse.bm25_encoder.BM25Encoder at 0x7b6f21fecad0>

Create sparse values

In [None]:
from pinecone_text.sparse import SparseVector

def get_sparse_values_for_article(article) -> SparseVector:
    prepared = prepare_article_bm25(article).strip()
    
    vectors =  bm25._encode_single_document(prepared)
    print(vectors)
    return vectors

Create sparse records for upload

In [None]:
sparse_records = [
    {
        "id": article['_id'],
        "sparse_values": get_sparse_values_for_article(article),
        "metadata": {
            "law_id": article['law_id'],
            "language": article['language'],
            'doc_id': article['_id']
        }
    } for article in articles
]
sparse_records

Upload sparse records

In [None]:
sparse_index = pc.Index(host="https://pravni-vodnik-sparse-3w1hkry.svc.aped-4627-b74a.pinecone.io")

sparse_index.upsert(vectors=sparse_records, namespace='__default__')

Test querying sparse index

In [None]:
query = 'Kdo ima v Sloveniji oblast?'

query_dict = bm25._encode_single_query(query)

sparse_results = sparse_index.query(
    namespace='__default__',
    sparse_vector=query_dict,
    top_k=2,
    include_metadata=True,
    include_values=False
)

sparse_results

### Hybrid Search

#### Semantic search (dense index)

1. Embed Query

In [12]:
dense_index = pc.Index(host='https://pravni-vodnik-dense-3w1hkry.svc.aped-4627-b74a.pinecone.io')
sparse_index = pc.Index(host="https://pravni-vodnik-sparse-3w1hkry.svc.aped-4627-b74a.pinecone.io")

queries = [
    "Kakšno vlogo ima državni zbor?",
    "Kaj je državni svet?",
    "Kako deluje parlament?"
]

dense_search_results = []
sparse_search_results = []

for query in queries:

    # Semantic search
    query_embeddings = embeddings.embed_query(query)
    dense_results = dense_index.query(
        vector=query_embeddings,
        top_k=5,
        include_metadata=True,
        include_values=False
    )
    dense_search_results.extend(dense_results.matches)

    # Lexical search
    sparse_results = sparse_index.query(
        namespace='__default__',
        sparse_vector=bm25._encode_single_query(query),
        top_k=5,
        include_metadata=True,
        include_values=False
    )
    sparse_search_results.extend(sparse_results.matches)



In [33]:
sorted_dense = sorted(dense_search_results, key=lambda x: x['score'], reverse=True)
sorted_sparse = sorted(sparse_search_results, key=lambda x: x['score'], reverse=True)


In [36]:
final_results = [*sorted_dense[:10], *sorted_sparse[:2]]
print(len(final_results))

doc_ids = [result['id'] for result in final_results]
doc_ids

12


['ustava_96.0',
 'ustava_86.0',
 'ustava_93.0',
 'ustava_97.0',
 'ustava_84.0',
 'ustava_97.0',
 'ustava_87.0',
 'ustava_131.0',
 'ustava_99.0',
 'ustava_101.0',
 'ustava_99.0',
 'ustava_96.0']