In [1]:
from qdrant_client import QdrantClient, models

# Connection settings for the Qdrant server this notebook talks to.
qdrant_host = "localhost"
qdrant_port = 6333

client = QdrantClient(host=qdrant_host, port=qdrant_port)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
from qdrant_client.models import VectorParams, Distance, SparseVectorParams

# Create the collection only when it is missing, so re-running this cell
# leaves an existing collection untouched.
news_collection = "news"

if not client.collection_exists(news_collection):
    # One named dense vector and one named sparse vector per point.
    dense_vectors = {"dense": VectorParams(size=1024, distance=Distance.COSINE)}
    sparse_vectors = {"splade": SparseVectorParams(modifier="idf")}

    client.create_collection(
        collection_name=news_collection,
        vectors_config=dense_vectors,
        sparse_vectors_config=sparse_vectors,
    )

In [None]:
from fastembed import TextEmbedding

# Dense embedding model used by batch_embeddings_dense below.
dense_model_name = "jinaai/jina-embeddings-v3"
dense_providers = ["CUDAExecutionProvider"]

dense_model = TextEmbedding(model_name=dense_model_name, providers=dense_providers)

def batch_embeddings_dense(contents: list[str], batch_size: int = 100):
    """Embed ``contents`` with the module-level ``dense_model`` in batches.

    Args:
        contents: Texts to embed.
        batch_size: Maximum number of texts passed to a single
            ``dense_model.passage_embed`` call. Defaults to 100.

    Returns:
        A list with everything ``passage_embed`` yielded for each batch,
        concatenated in batch order. An empty ``contents`` returns ``[]``
        without calling the model.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        # A negative step makes range() empty, which would silently return
        # no embeddings at all; fail loudly instead.
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    all_embeddings = []

    for start in range(0, len(contents), batch_size):
        batch = contents[start:start + batch_size]
        all_embeddings.extend(dense_model.passage_embed(batch))

    return all_embeddings

  from .autonotebook import tqdm as notebook_tqdm
Fetching 6 files:  33%|███▎      | 2/6 [00:03<00:07,  1.94s/it]

In [None]:
from fastembed import SparseTextEmbedding

# Sparse embedding model used by batch_embeddings_sparse below.
# (The previous bare SparseTextEmbedding.list_supported_models() call was
# dropped: it was not the last expression of the cell, so its result was
# neither displayed nor used. Run it in its own cell to browse the models.)
sparse_model = SparseTextEmbedding(
    model_name="prithivida/Splade_PP_en_v1",
    providers=["CUDAExecutionProvider"],
)

def batch_embeddings_sparse(contents: list[str], batch_size: int = 100):
    """Embed ``contents`` with the module-level ``sparse_model`` in batches.

    Args:
        contents: Texts to embed.
        batch_size: Maximum number of texts passed to a single
            ``sparse_model.passage_embed`` call. Defaults to 100.

    Returns:
        A list with everything ``passage_embed`` yielded for each batch,
        concatenated in batch order. An empty ``contents`` returns ``[]``
        without calling the model.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        # A negative step makes range() empty, which would silently return
        # no embeddings at all; fail loudly instead.
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    all_embeddings = []

    for start in range(0, len(contents), batch_size):
        batch = contents[start:start + batch_size]
        all_embeddings.extend(sparse_model.passage_embed(batch))

    return all_embeddings


In [None]:
import os
import json

def load_json_files_to_array(folder_path):
    """Parse every ``.json`` file directly inside ``folder_path``.

    Files are visited in sorted filename order so that a given position in
    the returned list refers to the same file on every run (``os.listdir``
    alone gives no ordering guarantee).

    Args:
        folder_path: Directory to scan. Subdirectories are not descended into.

    Returns:
        A list with the parsed content of each file that loaded successfully,
        in sorted filename order. Files that cannot be read or parsed are
        reported on stdout and skipped.

    Raises:
        OSError: If ``folder_path`` itself cannot be listed.
    """
    json_array = []

    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".json"):
            continue

        file_path = os.path.join(folder_path, filename)

        try:
            with open(file_path, "r", encoding="utf-8") as f:
                data = json.load(f)
        except Exception as e:
            # Best-effort load: one bad file must not abort the whole scan.
            print(f"Error reading {filename}: {e}")
            continue

        json_array.append(data)
        print(f"Loaded: {filename}")

    return json_array

# Folder holding the JSON documents to index; each parsed file becomes one
# entry of rawData.
processed_dir = "ai-apps/investmen/dataset/stockbit-snips/processed"
rawData = load_json_files_to_array(processed_dir)



In [None]:
import uuid

# Resume offset into rawData: raise it to skip the first N documents.
start_idx = 0

# Number of points sent to Qdrant per upsert request.
batch_size = 25

# (document field holding the news items, value stored under the payload "type" key)
news_sections = (
    ("marketNews", "market-news"),
    ("tickerNews", "ticker-news"),
)

for doc_idx, doc in enumerate(rawData[start_idx:], start=start_idx):
    print(f"processing doc {doc_idx}")

    # Market news first, then ticker news. Each payload is a shallow copy of
    # the news item, so re-running this cell never sees (or leaves behind)
    # modified entries in rawData.
    chunks = [
        {**news, "publishDate": doc["publishDate"], "type": news_type}
        for field, news_type in news_sections
        for news in doc[field]
    ]
    chunk_count = len(chunks)

    # The same "title\ncontent" text feeds both the dense and the sparse model.
    chunk_contents = [f"{c['title']}\n{c['content']}" for c in chunks]

    chunk_embeddings = batch_embeddings_dense(chunk_contents)

    sparse_embeddings = batch_embeddings_sparse(chunk_contents)

    points: list[models.PointStruct] = []

    for i, chunk in enumerate(chunks):
        # Deterministic ID, so re-indexing a document overwrites its points
        # instead of duplicating them. The "$" in the seed is kept on purpose:
        # changing the seed would change every ID already stored.
        # Sent as the canonical UUID string rather than a uuid.UUID object,
        # and not named `id`, which would shadow the builtin kernel-wide.
        # NOTE(review): IDs depend only on publishDate and position, so two
        # documents sharing a publishDate would overwrite each other - confirm
        # publishDate is unique per document.
        point_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, f"{doc['publishDate']}-${i}"))

        chunk["chunk_index"] = i
        chunk["chunk_count"] = chunk_count

        points.append(models.PointStruct(
            id=point_id,
            payload=chunk,
            vector={
                "dense": chunk_embeddings[i],
                "splade": sparse_embeddings[i].as_object()
            }
        ))

    # Upload in slices of batch_size; wait=True blocks until each slice is applied.
    for offset in range(0, len(points), batch_size):
        client.upsert(
            collection_name="news",
            wait=True,
            points=points[offset:offset + batch_size],
        )

    print(f"success index {doc_idx}")


processing doc 0 with 10 chunks
success index 0
processing doc 1 with 5 chunks
success index 1
processing doc 2 with 22 chunks
success index 2
processing doc 3 with 3 chunks
success index 3
processing doc 4 with 2 chunks
success index 4
processing doc 5 with 2 chunks
success index 5
processing doc 6 with 3 chunks
success index 6
processing doc 7 with 3 chunks
success index 7
processing doc 8 with 4 chunks
success index 8
processing doc 9 with 16 chunks
success index 9
processing doc 10 with 14 chunks
success index 10
processing doc 11 with 6 chunks
success index 11
processing doc 12 with 4 chunks
success index 12
processing doc 13 with 3 chunks
success index 13
processing doc 14 with 4 chunks
success index 14
processing doc 15 with 3 chunks
success index 15
processing doc 16 with 2 chunks
success index 16
processing doc 17 with 2 chunks
success index 17
processing doc 18 with 3 chunks
success index 18
processing doc 19 with 4 chunks
success index 19
processing doc 20 with 4 chunks
succ