In [None]:
import os

from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
load_dotenv()

# Thread-count hints for OpenMP / MKL backed native code. They are exported
# BEFORE onnxruntime is imported: such runtimes commonly read these variables
# once, when the native library is loaded, so setting them afterwards may have
# no effect.
os.environ["OMP_NUM_THREADS"] = "8"
os.environ["MKL_NUM_THREADS"] = "8"

import onnxruntime as ort

# ONNX Runtime session options with 8 intra-op and 8 inter-op threads.
# NOTE(review): nothing in the visible cells passes `sess_options` to a session
# or to the fastembed models, so these settings may currently be unused - confirm.
sess_options = ort.SessionOptions()
sess_options.intra_op_num_threads = 8
sess_options.inter_op_num_threads = 8


In [None]:
from qdrant_client import QdrantClient, models

# Qdrant client pointed at localhost:6333; the later cells use it to create the
# collection and to upsert points.
client = QdrantClient(host="localhost", port=6333)

In [None]:
from qdrant_client.models import VectorParams, Distance, SparseVectorParams

collection_name = "news"

# Create the collection only when it does not exist yet, so re-running this
# cell leaves an existing collection untouched.
if not client.collection_exists(collection_name):
    # "late": multivector field (96-dim vectors, cosine, MAX_SIM comparator).
    # m=0 is passed as its HNSW setting - presumably to skip building an HNSW
    # graph for this field; confirm against the Qdrant documentation.
    late_vector_params = VectorParams(
        size=96,
        distance=Distance.COSINE,
        multivector_config=models.MultiVectorConfig(
            comparator=models.MultiVectorComparator.MAX_SIM,
        ),
        hnsw_config=models.HnswConfigDiff(m=0),
    )

    # Named dense vector fields: a single 1024-dim cosine vector plus the
    # multivector field configured above.
    named_vectors = {
        "dense": VectorParams(size=1024, distance=Distance.COSINE),
        "late": late_vector_params,
    }

    # Named sparse vector field with the "idf" modifier.
    named_sparse_vectors = {
        "splade": SparseVectorParams(modifier="idf"),
    }

    client.create_collection(
        collection_name=collection_name,
        vectors_config=named_vectors,
        sparse_vectors_config=named_sparse_vectors,
    )

In [None]:
from fastembed import LateInteractionTextEmbedding 

# fastembed late-interaction text embedding model; its passage embeddings are
# stored in the "late" vector of each point by the ingestion cell.
# NOTE(review): the collection declares size=96 for "late" - presumably this
# model's per-token dimension; confirm.
late_model = LateInteractionTextEmbedding("answerdotai/answerai-colbert-small-v1")

def batch_embeddings_late(contents: list[str], batch_size: int = 100) -> list:
    """Embed texts with the late-interaction model, one batch at a time.

    Args:
        contents: Texts to embed, in the order the embeddings should come back.
        batch_size: Maximum number of texts handed to ``late_model.passage_embed``
            per call. Defaults to 100, the value that used to be hard-coded.

    Returns:
        A list with one item per input text, in input order, holding whatever
        ``late_model.passage_embed`` yields. Empty when ``contents`` is empty.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A negative step would make range() empty and silently drop every text.
        raise ValueError("batch_size must be a positive integer")

    all_embeddings = []

    for start in range(0, len(contents), batch_size):
        batch = contents[start:start + batch_size]
        # passage_embed's result is consumed fully before the next batch starts.
        all_embeddings.extend(late_model.passage_embed(batch))

    return all_embeddings

In [None]:
# from fastembed import TextEmbedding

# dense_model = TextEmbedding(model_name="jinaai/jina-embeddings-v3")

# def batch_embeddings_dense(contents: list[str]):
#     all_embeddings = []
    
#     for i in range(0, len(contents), 100):
#         batch = contents[i:i+100]
#         embeddings = list(dense_model.passage_embed(batch))
#         all_embeddings.extend(embeddings)
    
#     return all_embeddings

import voyageai

# Voyage AI client used for the dense embeddings. No API key is passed here -
# presumably it is read from the environment populated by load_dotenv() in the
# first cell; confirm the expected variable name against the voyageai docs.
vo = voyageai.Client()

def batch_embeddings_dense(contents: list[str], batch_size: int = 100) -> list:
    """Embed texts with the Voyage AI client, one request per batch.

    Each batch is sent to ``vo.embed`` with model ``"voyage-finance-2"``,
    ``output_dimension=1024`` and ``truncation=True``.

    Args:
        contents: Texts to embed, in the order the embeddings should come back.
        batch_size: Maximum number of texts sent per ``vo.embed`` call. Defaults
            to 100, the value that used to be hard-coded.

    Returns:
        The concatenation of every response's ``.embeddings``, in input order.
        Empty when ``contents`` is empty (no request is made).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A negative step would make range() empty and silently drop every text.
        raise ValueError("batch_size must be a positive integer")

    all_embeddings = []

    for start in range(0, len(contents), batch_size):
        batch = contents[start:start + batch_size]
        # NOTE(review): no `input_type` is passed; check the Voyage docs whether
        # a document/query input type is recommended for retrieval use. Changing
        # it would alter the vectors, so already-indexed data would need re-embedding.
        response = vo.embed(batch, model="voyage-finance-2", output_dimension=1024, truncation=True)
        all_embeddings.extend(response.embeddings)

    return all_embeddings

In [None]:
from fastembed import SparseTextEmbedding

# fastembed sparse embedding model; its passage embeddings are stored in the
# sparse vector of each point by the ingestion cell.
# (The earlier bare `SparseTextEmbedding.list_supported_models()` call was
# dropped: it was not the cell's last expression, so its result was discarded.)
# NOTE(review): the collection's sparse vector is named "splade" while this is
# a BM42 model - the name is only a label, but it may mislead readers.
sparse_model = SparseTextEmbedding(model_name="Qdrant/bm42-all-minilm-l6-v2-attentions")

def batch_embeddings_sparse(contents: list[str], batch_size: int = 100) -> list:
    """Embed texts with the sparse model, one batch at a time.

    Args:
        contents: Texts to embed, in the order the embeddings should come back.
        batch_size: Maximum number of texts handed to
            ``sparse_model.passage_embed`` per call. Defaults to 100, the value
            that used to be hard-coded.

    Returns:
        A list with one item per input text, in input order, holding whatever
        ``sparse_model.passage_embed`` yields. Empty when ``contents`` is empty.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A negative step would make range() empty and silently drop every text.
        raise ValueError("batch_size must be a positive integer")

    all_embeddings = []

    for start in range(0, len(contents), batch_size):
        batch = contents[start:start + batch_size]
        # passage_embed's result is consumed fully before the next batch starts.
        all_embeddings.extend(sparse_model.passage_embed(batch))

    return all_embeddings


In [None]:
import os
import json

def load_json_files_to_array(folder_path):
    """Parse every ``*.json`` file directly inside ``folder_path``.

    Files are visited in sorted filename order. ``os.listdir`` returns entries
    in arbitrary order, and callers that resume work by list position need the
    same order on every run.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A list with the parsed content of each readable ``.json`` file, ordered
        by filename. Files that cannot be opened or parsed are reported with a
        printed message and skipped.

    Raises:
        OSError: If ``folder_path`` itself cannot be listed.
    """
    json_array = []

    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".json"):
            continue

        file_path = os.path.join(folder_path, filename)

        try:
            with open(file_path, "r", encoding="utf-8") as f:
                json_array.append(json.load(f))
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole load.
            print(f"Error reading {filename}: {e}")

    return json_array

# Load every processed snapshot into memory. The path is relative, so it is
# resolved against the kernel's current working directory.
rawData = load_json_files_to_array("ai-apps/investment/dataset/stockbit-snips/processed")



In [None]:
import uuid

# Position in rawData to start from; raise it to resume after an interrupted
# run. Resuming by position assumes rawData is in the same order on every run.
start_idx = 0

# Number of points sent to Qdrant per upsert request.
batch_size = 25

for doc_idx, doc in enumerate(rawData[start_idx:], start=start_idx):
    print(f"processing doc {doc_idx}")

    # Flatten both news lists of the document into one list of chunks. Each
    # news dict is tagged in place (as before), so rawData is updated too.
    chunks = []

    for news in doc["marketNews"]:
        news["publishDate"] = doc["publishDate"]
        news["type"] = "market-news"
        chunks.append(news)

    for news in doc["tickerNews"]:
        news["publishDate"] = doc["publishDate"]
        news["type"] = "ticker-news"
        chunks.append(news)

    chunk_count = len(chunks)
    chunk_contents = [f"{c['title']}\n{c['content']}" for c in chunks]

    # One embedding of each kind per chunk, in chunk order.
    chunk_embeddings = batch_embeddings_dense(chunk_contents)
    sparse_embeddings = batch_embeddings_sparse(chunk_contents)
    late_embeddings = batch_embeddings_late(chunk_contents)

    # Fail loudly instead of indexing misaligned vectors.
    if not (len(chunk_embeddings) == len(sparse_embeddings) == len(late_embeddings) == chunk_count):
        raise RuntimeError(
            f"embedding count mismatch for doc {doc_idx}: "
            f"chunks={chunk_count}, dense={len(chunk_embeddings)}, "
            f"sparse={len(sparse_embeddings)}, late={len(late_embeddings)}"
        )

    points: list[models.PointStruct] = []

    for chunk_idx, chunk in enumerate(chunks):
        # Deterministic ID, so re-running overwrites the same points instead of
        # duplicating them. The "$" in the name is kept on purpose: removing it
        # would change every ID and orphan points that are already indexed.
        # NOTE(review): the ID depends only on publishDate and the chunk index,
        # so two documents sharing a publishDate would overwrite each other.
        # Named point_id (not `id`) so the builtin id() is not rebound for the
        # rest of the notebook; passed as a string, the canonical UUID form.
        point_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, f"{doc['publishDate']}-${chunk_idx}"))

        chunk["chunk_index"] = chunk_idx
        chunk["chunk_count"] = chunk_count

        points.append(models.PointStruct(
            id=point_id,
            payload=chunk,
            vector={
                "dense": chunk_embeddings[chunk_idx],
                "splade": sparse_embeddings[chunk_idx].as_object(),
                "late": late_embeddings[chunk_idx],
            },
        ))

    # Upsert in small batches; wait=True is passed on every request.
    for offset in range(0, len(points), batch_size):
        client.upsert(
            collection_name=collection_name,
            wait=True,
            points=points[offset:offset + batch_size],
        )

    print(f"success index {doc_idx}")
