In [10]:
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment before
# any client below is constructed.
# NOTE(review): presumably this is where the Voyage API key comes from, since
# no key is passed explicitly anywhere in this notebook -- confirm.
load_dotenv()

True

In [11]:
from qdrant_client import QdrantClient, models

# Qdrant client used by the query cell below. Host and port are hard-coded to
# a local instance (localhost:6333).
client = QdrantClient(host="localhost", port=6333)

In [12]:
from fastembed import LateInteractionTextEmbedding 

# Late-interaction embedding model; its query embeddings are used as the final
# (re-scoring) query against the "late" named vector in the search cell.
late_model = LateInteractionTextEmbedding("answerdotai/answerai-colbert-small-v1")

def batch_embeddings_late(contents: list[str], batch_size: int = 100) -> list:
    """Embed query texts with the late-interaction model, batch by batch.

    Each batch is passed to ``late_model.query_embed`` (the query-side entry
    point), so this helper is intended for search queries, not for documents
    being indexed.

    Args:
        contents: Texts to embed. An empty list yields an empty result and
            the model is never called.
        batch_size: Maximum number of texts sent to the model per call.
            Defaults to 100, the value that was previously hard-coded.

    Returns:
        One embedding per input text, in input order. The element type is
        whatever ``query_embed`` yields (presumably one multi-vector array
        per text -- confirm against fastembed).

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    if batch_size < 1:
        # A negative step would make range() empty and silently return [].
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    all_embeddings = []
    for start in range(0, len(contents), batch_size):
        batch = contents[start:start + batch_size]
        # extend() drains whatever iterable query_embed returns, in order.
        all_embeddings.extend(late_model.query_embed(batch))

    return all_embeddings

In [13]:
# from fastembed import TextEmbedding

# dense_model = TextEmbedding(model_name="jinaai/jina-embeddings-v3")

# def batch_embeddings_dense(contents: list[str]):
#     all_embeddings = []
    
#     for i in range(0, len(contents), 100):
#         batch = contents[i:i+100]
#         embeddings = list(dense_model.query_embed(batch))
#         all_embeddings.extend(embeddings)
    
#     return all_embeddings

import voyageai

# Voyage AI client used for the dense query embeddings.
# NOTE(review): no API key is passed here, so the client presumably reads it
# from the environment (e.g. the variables loaded by load_dotenv) -- confirm.
vo = voyageai.Client()

def batch_embeddings_dense(contents: list[str], batch_size: int = 100) -> list:
    """Embed query texts with the Voyage dense model, batch by batch.

    Every batch is sent to ``vo.embed`` with ``model="voyage-finance-2"``,
    ``output_dimension=1024``, ``truncation=True`` and ``input_type="query"``,
    so this helper is intended for search queries, not for documents being
    indexed.

    Args:
        contents: Texts to embed. An empty list yields an empty result and
            the API is never called.
        batch_size: Maximum number of texts sent per API call. Defaults to
            100, the value that was previously hard-coded.
            NOTE(review): the Voyage API has its own per-request limits;
            check them before raising this.

    Returns:
        One embedding per input text, in input order, taken from the
        ``.embeddings`` attribute of each response (presumably a list of
        floats per text -- confirm against the voyageai client).

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    if batch_size < 1:
        # A negative step would make range() empty and silently return [].
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    all_embeddings = []
    for start in range(0, len(contents), batch_size):
        batch = contents[start:start + batch_size]
        response = vo.embed(
            batch,
            model="voyage-finance-2",
            output_dimension=1024,
            truncation=True,
            input_type="query",
        )
        all_embeddings.extend(response.embeddings)

    return all_embeddings

In [14]:
from fastembed import SparseTextEmbedding

# NOTE(review): the return value of this call is discarded (it is not the last
# expression of the cell), so nothing is displayed. It looks like a leftover
# from exploring the available models -- consider removing it.
SparseTextEmbedding.list_supported_models()

# Sparse embedding model used for the sparse prefetch in the search cell.
# NOTE(review): the search cell sends these embeddings to a named vector
# called "splade", while the model loaded here is BM42 -- confirm that the
# collection was indexed with this same model.
sparse_model = SparseTextEmbedding(model_name="Qdrant/bm42-all-minilm-l6-v2-attentions")

def batch_embeddings_sparse(contents: list[str], batch_size: int = 100) -> list:
    """Embed query texts with the sparse model, batch by batch.

    Each batch is passed to ``sparse_model.query_embed`` (the query-side
    entry point), so this helper is intended for search queries, not for
    documents being indexed.

    Args:
        contents: Texts to embed. An empty list yields an empty result and
            the model is never called.
        batch_size: Maximum number of texts sent to the model per call.
            Defaults to 100, the value that was previously hard-coded.

    Returns:
        One sparse embedding per input text, in input order. The elements
        are whatever ``query_embed`` yields; the search cell calls
        ``.as_object()`` on them before sending them to Qdrant.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    if batch_size < 1:
        # A negative step would make range() empty and silently return [].
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    all_embeddings = []
    for start in range(0, len(contents), batch_size):
        batch = contents[start:start + batch_size]
        # extend() drains whatever iterable query_embed returns, in order.
        all_embeddings.extend(sparse_model.query_embed(batch))

    return all_embeddings


In [16]:
import json
from pathlib import Path

# Output file stem -> query text. Each entry is written to its own JSON file
# under `output_dir`, so enabling a second query no longer overwrites the
# result of the first one.
query_outputs = {
    # "bank-mandiri": "News on Bank Mandiri and financial/banks sectors in Indonesia Stock Market",
    # "itmg": "PT Indo Tambangraya Megah ITMG news and coal sector news in Indonesia Stock Market",
    "the-fed": "The FED rate cut",
}

# Kept as a plain list of query strings for any code that iterates `queries`.
queries = list(query_outputs.values())

# Create the output directory up front instead of failing on the first write.
output_dir = Path("queries")
output_dir.mkdir(parents=True, exist_ok=True)

for stem, query in query_outputs.items():
    # Two-stage search: the dense and sparse prefetches each pull up to 200
    # candidates, which are then re-scored with the late-interaction query
    # against the "late" vector; the best 20 points are returned.
    result = client.query_points(
        collection_name="news",
        prefetch=[
            models.Prefetch(query=batch_embeddings_dense([query])[0], using="dense", limit=200),
            models.Prefetch(query=batch_embeddings_sparse([query])[0].as_object(), using="splade", limit=200),
        ],
        query=batch_embeddings_late([query])[0],
        with_payload=True,
        using="late",
        limit=20
    )

    final = {
        "query": query,
        "result": result.model_dump()["points"],
    }

    # Serialize before touching the file so that a serialization error does
    # not leave a truncated file behind; write_text closes the handle even if
    # the write itself fails.
    serialized = json.dumps(final, indent=2)
    (output_dir / f"{stem}.json").write_text(serialized, encoding="utf-8")

