In [1]:
import numpy as np
import pandas as pd
import os
import json

# Load Embedder

In [55]:
from FlagEmbedding import FlagModel

In [56]:
instr = "Represent this sentence for searching relevant passages: Provide a detailed and accurate representation of the query to retrieve relevant technical documentation, explanations, or examples related to KServe."

model = FlagModel('BAAI/bge-large-en-v1.5', 
                  query_instruction_for_retrieval=instr,
                  use_fp16=True) # Setting use_fp16 to True speeds up computation with a slight performance degradation


# Process KServe Documentation to Json

In [53]:
import os
import json
from langchain.document_loaders import UnstructuredMarkdownLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import numpy as np
from tqdm import tqdm

In [None]:
docs_dir = "./clones/KServe/website/docs"
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)

all_chunks = []

total_files = sum(len(files) for _, _, files in os.walk(docs_dir))

for root, dirs, files in tqdm(os.walk(docs_dir), total=total_files):
    for file in files:
        if file.endswith(".md"):
            path = os.path.join(root, file)
            
            loader = UnstructuredMarkdownLoader(path)
            docs = loader.load()
            
            chunks = text_splitter.split_documents(docs)
            
            for idx, chunk in enumerate(chunks):
                embedding_vector = model.encode(chunk.page_content).tolist()
                all_chunks.append({
                    'id': f"{os.path.relpath(path)}-{idx}",
                    'content': chunk.page_content,
                    'metadata': {
                        'source': os.path.relpath(path),
                        'category': root.split('/')[-1],
                        'filename': file,
                        'embedding': embedding_vector
                    }
                })


with open("./data/kserve/kserve_rag_data.json", "w", encoding="utf-8") as f:
    json.dump(all_chunks, f, ensure_ascii=False, indent=2)


 26%|██▌       | 128/499 [01:02<03:01,  2.04it/s]


In [10]:
all_chunks[1000]

{'id': 'clones/KServe/website/docs/sdk_docs/docs/V1beta1PredictorSpec.md-26',
 'content': 'list[V1TopologySpreadConstraint] TopologySpreadConstraints describes how a group of pods ought to spread across topology domains. Scheduler will schedule pods in a way which abides by the constraints. All topologySpreadConstraints are ANDed. [optional] triton V1beta1TritonSpec [optional] volumes list[V1Volume] List of volumes that can be mounted by containers belonging to the pod. More info: https://kubernetes.io/docs/concepts/storage/volumes [optional] xgboost V1beta1XGBoostSpec [optional]',
 'metadata': {'source': 'clones/KServe/website/docs/sdk_docs/docs/V1beta1PredictorSpec.md',
  'category': 'docs',
  'filename': 'V1beta1PredictorSpec.md',
  'embedding': [0.01180267333984375,
   -0.0003330707550048828,
   -0.0101318359375,
   -0.01534271240234375,
   -0.0157318115234375,
   -0.0298004150390625,
   0.0107421875,
   0.014068603515625,
   0.00872039794921875,
   0.06414794921875,
   0.001910209

In [None]:
sentences_1 = ["样例数据-1", "样例数据-2"]
sentences_2 = ["样例数据-3", "样例数据-4"]
embeddings_1 = model.encode(sentences_1)
embeddings_2 = model.encode(sentences_2)
similarity = embeddings_1 @ embeddings_2.T
print(similarity)

# for s2p(short query to long passage) retrieval task, suggest to use encode_queries() which will automatically add the instruction to each query
# corpus in retrieval task can still use encode() or encode_corpus(), since they don't need instruction
queries = ['query_1', 'query_2']
passages = ["样例文档-1", "样例文档-2"]
q_embeddings = model.encode_queries(queries)
p_embeddings = model.encode(passages)
scores = q_embeddings @ p_embeddings.T

# Using Faiss to Search through Embeddings

In [3]:
import faiss

In [57]:
chunks = pd.read_json("./data/kserve/kserve_rag_data.json")
chunks = pd.json_normalize(chunks['metadata'])

In [12]:
type(chunks["embedding"][1][1])

float

In [None]:
embeddings = chunks["embedding"].to_numpy()
embeddings = np.vstack(embeddings)

In [36]:
embeddings.shape

(2876, 1024)

In [39]:
embeddings.dtype

dtype('float64')

In [40]:
embeddings = embeddings.astype("float32")

In [42]:
import faiss
import numpy as np
res = faiss.StandardGpuResources()
index = faiss.IndexFlatIP(embeddings.shape[1])
gpu_index_flat = faiss.index_cpu_to_gpu(res, 0, index)


In [47]:
index.add(embeddings)

In [43]:
gpu_index_flat.add(embeddings)

In [None]:
faiss.write_index(index, "faiss/faiss_index_mmap.index")

In [45]:
index

<faiss.swigfaiss.IndexFlatIP; proxy of <Swig Object of type 'faiss::IndexFlatIP *' at 0x7fad63e867c0> >

In [51]:
loaded_index = faiss.read_index("faiss/faiss_index_mmap.index", faiss.IO_FLAG_MMAP)


In [52]:
loaded_index

<faiss.swigfaiss.IndexFlat; proxy of <Swig Object of type 'faiss::IndexFlat *' at 0x7fad548648a0> >

In [None]:
def faiss_search(query, k=5):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    query_chucks = text_splitter.split_documents(query)

    q_embeddings = model.encode_queries(query_chucks)
    
    distances, indices = index.search(q_embeddings, k)

    retrieved_chunks = [chunk[i] for i in indices[0]]
    context = "\n".join([f"- {chunk['content']}" for chunk in retrieved_chunks])

    return context

# setup Ollama for llama3.3:70b

In [2]:
import ollama
from system_prompts.coder_agent import system_prompt as coder_system_prompt
from system_prompts.retrieve_agent import system_prompt as retrieve_system_prompt

In [None]:
def query_llama_rag(user_query, system_prompt, rag_context=""):
    """
    Query Llama 3.3:70B using Ollama for RAG-based retrieval.

    Args:
        user_query (str): The user's query.
        rag_context (str): Retrieved context from the RAG system.

    Returns:
        str: The response generated by Llama 3.3:70B.
    """

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Query: {user_query}\nContext: {rag_context}"}
    ]

    response = ollama.chat(model="llama3.3:70b", messages=messages)

    return response["message"]["content"]

In [None]:
user_query = "What are the prerequisites for installing KServe on Kubernetes?"
retrieve_response = query_llama_rag(user_query, retrieve_system_prompt)

In [None]:
rag_context = faiss_search(retrieve_response)

In [None]:
final_response = query_llama_rag(user_query, coder_system_prompt, rag_context)
print("Llama Response:", final_response)

# Set openweb-ui 

In [6]:
!sudo docker run -d -p 3000:8080 -e OLLAMA_BASE_URL=http://0.0.0.0:6666 -v open-webui:/app/backend/data --name open-webui --restart always ghcr.io/open-webui/open-webui:main

Unable to find image 'ghcr.io/open-webui/open-webui:main' locally


main: Pulling from open-webui/open-webui

[1B3256a31a: Pulling fs layer 
[1B0922284a: Pulling fs layer 
[1B3b698b72: Pulling fs layer 
[1Be2cb4ecc: Pulling fs layer 
[1B7593eba8: Pulling fs layer 
[1Bb700ef54: Pulling fs layer 
[1Bdeb00349: Pulling fs layer 
[3Bb700ef54: Waiting fs layer 
[2B720819a1: Waiting fs layer 
[1Ba6db44f8: Pulling fs layer 
[3B3f1759d3: Waiting fs layer 
[2Be02d3356: Waiting fs layer 
[2B02903e6c: Waiting fs layer 
[1B35071214: Pulling fs layer 
[5Be02d3356: Downloading  778.5MB/1.047GB[13A[2K[15A[2K[15A[2K[10A[2K[15A[2K[15A[2K[7A[2K[15A[2K[7A[2K[5A[2K[7A[2K[4A[2K[15A[2K[4A[2K[7A[2K[4A[2K[7A[2K[4A[2K[7A[2K[4A[2K[7A[2K[4A[2K[14A[2K[4A[2K[7A[2K[7A[2K[3A[2K[7A[2K[7A[2K[2A[2K[7A[2K[13A[2K[1A[2K[1A[2K[1A[2K[1A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[7A[2K[5A[2K[7A[2K[5A[2K[7A[2K[5A[2K[7A[2K[5A[2K[7A[2K[5A[2K[7A[2K[7A[2K[

In [None]:
from flask import Flask, request, jsonify

app = Flask(__name__)

@app.route('/api/completions', methods=['POST'])
def handle_completions():
    """
    Handle redirected requests from Ollama's API.
    """
    data = request.json  # Get JSON payload from the request
    print("Received request:", data)

    # Process the request and generate a response
    response_data = {
        "message": "Request successfully intercepted and processed in Python.",
        "input": data
    }

    return jsonify(response_data)

if __name__ == "__main__":
    app.run(host="0.0.0.0", port=5000)
