In [1]:
import logging
import sys, os
from dotenv import load_dotenv

# Load variables from the .env file one directory above the working directory
# into the process environment, then read the two API keys this notebook uses.
# Either key is None when the corresponding variable is not set.
load_dotenv(dotenv_path='../.env')

jinaai_api_key = os.getenv("JINAAI_API_KEY")
groq_api_key = os.getenv("GROQ_API_KEY")

from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Document
from llama_index.vector_stores.milvus import MilvusVectorStore
from llama_index.core import Settings
from IPython.display import Markdown, display
import textwrap

In [2]:
from llama_index.embeddings.jinaai import JinaEmbedding

# Use Jina's hosted embedding model for every dense embedding LlamaIndex
# computes. This model emits 768-dimensional vectors, so any vector store
# built on top of it must be created with dim=768.
Settings.embed_model = JinaEmbedding(
    model="jina-embeddings-v2-base-en",
    api_key=jinaai_api_key,
)

In [3]:
from llama_index.llms.groq import Groq

# Route every LLM call made through LlamaIndex to the `llama3-70b-8192`
# model served by Groq.
Settings.llm = Groq(
    api_key=groq_api_key,
    model="llama3-70b-8192",
)


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Read every file found under the Paul Graham data directory.
documents = SimpleDirectoryReader(input_dir="./data/paul_graham/").load_data()

# Print the id of the first loaded document as a quick check that loading worked.
print("Document ID:", documents[0].doc_id)

Document ID: 30629bd9-e75b-41ea-ae12-a1d8eb368b05


In [6]:
# Create the vector store and storage context used to index the documents
from llama_index.core import StorageContext
import os


# Hybrid (dense + sparse) Milvus collection on the local server.
# `dim` is the dense vector size and has to equal the embedding model's output
# width. No sparse_embedding_function is passed, so the store falls back to
# its default one (it logs "using default" when constructed).
vector_store = MilvusVectorStore(
    uri="http://127.0.0.1:19530",
    dim=768,
    enable_sparse=True,
    hybrid_ranker="RRFRanker",
    hybrid_ranker_params={"k": 60},
    overwrite=True,
)

# Storage context that makes the index persist its vectors in the Milvus store.
storage_context = StorageContext.from_defaults(vector_store=vector_store)


Sparse embedding function is not provided, using default.
Fetching 30 files: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [00:00<00:00, 156698.78it/s]


In [7]:
# Embed the loaded documents and insert them into the Milvus-backed index.
# The insert fails if the collection's schema dim differs from the embedding
# size produced by Settings.embed_model.
index = VectorStoreIndex.from_documents(
    documents=documents,
    storage_context=storage_context,
)

RPC error: [insert_rows], <MilvusException: (code=1100, message=the dim (768) of field data(embedding) is not equal to schema dim (1536): invalid parameter[expected=1536][actual=768])>, <Time:{'RPC start': '2024-07-20 16:56:05.435572', 'RPC error': '2024-07-20 16:56:05.453130'}>


MilvusException: <MilvusException: (code=1100, message=the dim (768) of field data(embedding) is not equal to schema dim (1536): invalid parameter[expected=1536][actual=768])>

In [None]:
# Build a query engine on top of the index using the default options.
query_engine = index.as_query_engine()


In [None]:
# Ask the first question and show the answer, wrapped at 100 columns in the
# same way as the follow-up query cell. Previously the print was commented
# out, so this cell ran a query whose result was never displayed.
response = query_engine.query("What did the author learn?")
print(textwrap.fill(str(response), 100))

In [None]:
# Ask the second question and print the answer wrapped at 100 columns.
response = query_engine.query("What was a hard moment for the author?")
print(
    textwrap.fill(str(response), width=100)
)

In [None]:
from FlagEmbedding import BGEM3FlagModel
from typing import List
from llama_index.vector_stores.milvus.utils import BaseSparseEmbeddingFunction


class ExampleEmbeddingFunction(BaseSparseEmbeddingFunction):
    """Sparse embedding function built on a BGE-M3 model's lexical weights.

    Queries and documents are encoded the same way: the model is asked for
    sparse output only, and each returned weight mapping is converted to a
    dict with integer keys.
    """

    def __init__(self, model_name: str = "BAAI/bge-m3", use_fp16: bool = False):
        """Load the underlying model.

        Args:
            model_name: Model identifier handed to ``BGEM3FlagModel``.
            use_fp16: Forwarded to ``BGEM3FlagModel`` unchanged.
        """
        self.model = BGEM3FlagModel(model_name, use_fp16=use_fp16)

    def encode_queries(self, queries: List[str]):
        """Return one ``{int: weight}`` dict per query string."""
        return self._encode_sparse(queries)

    def encode_documents(self, documents: List[str]):
        """Return one ``{int: weight}`` dict per document string."""
        return self._encode_sparse(documents)

    def _encode_sparse(self, texts: List[str]):
        """Shared implementation behind ``encode_queries``/``encode_documents``."""
        # Nothing to encode; skip the model call entirely.
        if not texts:
            return []
        # Only the sparse output is requested; dense and ColBERT vectors are
        # switched off.
        outputs = self.model.encode(
            texts,
            return_dense=False,
            return_sparse=True,
            return_colbert_vecs=False,
        )["lexical_weights"]
        return [self._to_standard_dict(output) for output in outputs]

    def _to_standard_dict(self, raw_output):
        """Copy a weight mapping into a plain dict, casting every key to int."""
        return {int(key): weight for key, weight in raw_output.items()}

In [None]:
# Hybrid Milvus collection that uses an explicit sparse embedding function
# instead of the library default.
vector_store = MilvusVectorStore(
    # Same local Milvus server as the first vector store in this notebook.
    uri='http://127.0.0.1:19530',
    # Must equal the dense embedding width. Settings.embed_model is
    # jina-embeddings-v2-base-en, which produces 768-d vectors; a 1536-d
    # schema makes inserts fail with "the dim (768) ... is not equal to
    # schema dim (1536)".
    dim=768,
    overwrite=True,
    enable_sparse=True,
    sparse_embedding_function=ExampleEmbeddingFunction(),
    hybrid_ranker="RRFRanker",
    hybrid_ranker_params={"k": 60},
)