# Load Embedder

In [1]:
from FlagEmbedding import FlagModel

instr = "Represent this sentence for searching relevant passages: Provide a detailed and accurate representation of the query to retrieve relevant technical documentation, explanations, or examples related to KServe."

model = FlagModel('BAAI/bge-large-en-v1.5', 
                  query_instruction_for_retrieval=instr,
                  use_fp16=True) # Setting use_fp16 to True speeds up computation with a slight performance degradation


tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/779 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

In [None]:
sentences_1 = ["样例数据-1", "样例数据-2"]
sentences_2 = ["样例数据-3", "样例数据-4"]
embeddings_1 = model.encode(sentences_1)
embeddings_2 = model.encode(sentences_2)
similarity = embeddings_1 @ embeddings_2.T
print(similarity)

# for s2p(short query to long passage) retrieval task, suggest to use encode_queries() which will automatically add the instruction to each query
# corpus in retrieval task can still use encode() or encode_corpus(), since they don't need instruction
queries = ['query_1', 'query_2']
passages = ["样例文档-1", "样例文档-2"]
q_embeddings = model.encode_queries(queries)
p_embeddings = model.encode(passages)
scores = q_embeddings @ p_embeddings.T

# Process KServe Documentation to Json

In [7]:
import os
import json
from langchain.document_loaders import UnstructuredMarkdownLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import numpy as np
from tqdm import tqdm

docs_dir = "./clones/KServe/website/docs"
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)

all_chunks = []

total_files = sum(len(files) for _, _, files in os.walk(docs_dir))

for root, dirs, files in tqdm(os.walk(docs_dir), total=total_files):
    for file in files:
        if file.endswith(".md"):
            path = os.path.join(root, file)
            
            loader = UnstructuredMarkdownLoader(path)
            docs = loader.load()
            
            chunks = text_splitter.split_documents(docs)
            
            for idx, chunk in enumerate(chunks):
                embedding_vector = model.encode(chunk.page_content).tolist()
                all_chunks.append({
                    'id': f"{os.path.relpath(path)}-{idx}",
                    'content': chunk.page_content,
                    'metadata': {
                        'source': os.path.relpath(path),
                        'category': root.split('/')[-1],
                        'filename': file,
                        'embedding': embedding_vector
                    }
                })


with open("./data/kserve/kserve_rag_data.json", "w", encoding="utf-8") as f:
    json.dump(all_chunks, f, ensure_ascii=False, indent=2)


 26%|██▌       | 128/499 [01:02<03:01,  2.04it/s]
