# RNGD SDK 문서 챗봇 

- 작성자: 이준원 (Joonwon Lee)

- 날짜: 2024-12-24

Kernel name: llm-quantize

In [3]:
from typing import List
from pydantic import BaseModel, Field, HttpUrl
import json
from markdownify import markdownify as md
from llama_index.core import Document 
from llama_index.core.schema import TextNode
import os
from llama_index.llms.ollama import Ollama
from llama_index.core.prompts import PromptTemplate

In [4]:
class Page(BaseModel):
    # One documentation page record. The Field descriptions are part of the
    # model schema and are kept verbatim.
    id: str = Field(..., description="ID of the Page")
    link: HttpUrl = Field(description="Url link of the page")
    name: str = Field(description="Name of the page")
    parent: str = Field(default="", description="ID of the parent page")
    child: List[str] = Field(default=[], description="List of ids of the child pages")
    description: str = Field(default="", description="Description of the page")
    description_clean: str = Field(default="", description="Content markdown")
    html_content: str = Field(default="", description="HTML code of the main content in the page")

    def __hash__(self):
        # Hash on the same (link, name) pair that __eq__ compares.
        identity = (self.link, self.name)
        return hash(identity)

    def __eq__(self, other):
        # Pages are equal when link and name both match; a non-Page is never equal.
        if isinstance(other, Page):
            mine = (self.link, self.name)
            theirs = (other.link, other.name)
            return mine == theirs
        return False
    
# llama-index Document subclass that adds one extra string field, page_content.
class CustomDocument(Document):
    page_content: str = Field(default="", description="Additional content for the document")

def convert_page_to_llama_index_document(page: Page) -> CustomDocument:
    """Build a CustomDocument from a Page.

    The page's ``description_clean`` is stored both as the document text and
    as ``page_content``. The link, name and parent id go into the metadata,
    together with the child id list serialized as a JSON string.
    """
    clean_text = page.description_clean
    page_metadata = {
        "source": str(page.link),
        "title": page.name,
        "parent_doc_id": page.parent,
        "child_doc_ids": json.dumps(page.child),
    }
    return CustomDocument(
        doc_id=page.id,
        metadata=page_metadata,
        text=clean_text,  # primary text
        page_content=clean_text,  # extra attribute
    )
    


DB 가져오기 

In [17]:
version_name = 'rngd'
# NOTE: despite its name, data_dir is the path of a single JSON file.
data_dir = f'../../data/db/db-{version_name}_sdk.json'

# Read with an explicit encoding so that non-ASCII page content does not
# depend on the platform's default locale encoding.
with open(data_dir, "r", encoding="utf-8") as f:
    data = json.load(f)

# Every entry of data['sdk'] is parsed as a JSON document describing one Page.
all_pages = [Page.model_validate_json(page) for page in data['sdk']]

def find_page_with_url(url: str) -> "Page | None":
    """Return the first page in ``all_pages`` whose link equals ``url``.

    The comparison is an exact string match against ``str(page.link)``.

    :param url: Link to look up.
    :return: The matching Page, or ``None`` when no page has that link, so
        callers must handle the ``None`` case.
    """
    return next((page for page in all_pages if str(page.link) == url), None)

all_pages



라마인덱스 "Document"로 변환

In [6]:
# Turn every Page into a llama-index document, then mirror each document as a
# TextNode that carries the same id, text and metadata.
docs = list(map(convert_page_to_llama_index_document, all_pages))
nodes = [
    TextNode(id_=source_doc.id_, text=source_doc.text, metadata=source_doc.metadata)
    for source_doc in docs
]

## 챗봇 테스트 (link 인풋으로 직접 넣어주기)

In [7]:
# Define LLM
import os
from llama_index.llms.ollama import Ollama
from llama_index.core.prompts import PromptTemplate

# Ollama-served model used for all completions in this notebook.
# request_timeout is 600 (presumably seconds — confirm against the Ollama client docs).
llm = Ollama(
    model="llama3.1:70b",
    request_timeout=600,
    temperature=0,
)
# Smoke test: one trivial completion to confirm the model responds.
llm.complete("hello")

CompletionResponse(text='Hello! How can I assist you today?', additional_kwargs={'model': 'llama3.1:70b', 'created_at': '2024-12-23T21:09:10.876773727Z', 'done': True, 'done_reason': 'stop', 'context': [128009, 128006, 882, 128007, 271, 15339, 128009, 128006, 78191, 128007, 271, 9906, 0, 2650, 649, 358, 7945, 499, 3432, 30], 'total_duration': 198428672560, 'load_duration': 197643985909, 'prompt_eval_count': 12, 'prompt_eval_duration': 114793000, 'eval_count': 10, 'eval_duration': 666525000}, raw={'model': 'llama3.1:70b', 'created_at': '2024-12-23T21:09:10.876773727Z', 'response': 'Hello! How can I assist you today?', 'done': True, 'done_reason': 'stop', 'context': [128009, 128006, 882, 128007, 271, 15339, 128009, 128006, 78191, 128007, 271, 9906, 0, 2650, 649, 358, 7945, 499, 3432, 30], 'total_duration': 198428672560, 'load_duration': 197643985909, 'prompt_eval_count': 12, 'prompt_eval_duration': 114793000, 'eval_count': 10, 'eval_duration': 666525000}, logprobs=None, delta=None)

In [8]:
# Prompt for the SDK assistant. It has two placeholders, {query} and
# {retrieved_contents}, which are filled through prompt.format(...).
prompt = PromptTemplate(
    template="""You are a highly knowledgeable assistant specializing in Furiosa's NPU SDK. Your task is to provide detailed and accurate responses to user queries about Furiosa SDK, including:

1. Interpreting and explaining code examples.
2. Providing guidance on CLI (Command Line Interface) commands and their usage.
3. Offering detailed information about supported software and hardware configurations.

For each query:
- Extract key details from the question and the provided context.
- Use the retrieved contents to generate a clear and step-by-step explanation.
- Always include relevant examples or commands, where applicable, to enhance understanding.

Make sure your response is concise but comprehensive, ensuring the user can act on your guidance immediately.

Question:
{query}

Context:
{retrieved_contents}
"""
)

In [12]:
# Manual test: answer one question using a hand-picked page as the context.
query_in = "What is bert?"

link_in = 'https://furiosa-ai.github.io/docs-dev/2024.1/en/getting_started/furiosa_mlperf.html'
page_in = find_page_with_url(link_in)
if page_in is None:
    # find_page_with_url returns None for an unknown link; fail here with a
    # clear message instead of an AttributeError inside the converter.
    raise ValueError(f"No page found for link: {link_in}")
document = convert_page_to_llama_index_document(page_in)

print('* Query:')
print(query_in)
print("="*60)
print('\n')

print('* link_gt:')
print(link_in)
print("="*60)
print('\n')

print('* ChatBot response:')

# The whole page text is passed as the context; no retriever is involved here.
full_prompt = prompt.format(query=query_in, retrieved_contents=document.text)
result = llm.complete(full_prompt)
print(result)

* Query:
What is bert?


* link_gt:
https://furiosa-ai.github.io/docs-dev/2024.1/en/getting_started/furiosa_mlperf.html


* ChatBot response:
**What is BERT?**

BERT (Bidirectional Encoder Representations from Transformers) is a popular deep learning model for natural language processing (NLP) tasks. It was developed by Google and introduced in the paper "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding" in 2018.

In the context of Furiosa's NPU SDK, BERT is one of the benchmarks used to evaluate the performance of machine learning models on FuriosaAI's hardware and software platforms. Specifically, the `furiosa-mlperf` command provides a subcommand called `bert-server` and `bert-offline`, which allow users to run BERT benchmark with server scenario and offline scenario, respectively.

**Running BERT Benchmark**

To run the BERT benchmark using the `furiosa-mlperf` command, you can use the following examples:

* Server Scenario:
```bash
furiosa-mlperf be

## 벡터DB 설정과 Retriever 테스트

In [1]:
# Define embedding
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Embedding model for the vector index, loaded by name through HuggingFaceEmbedding.
embed_model = HuggingFaceEmbedding(model_name="dunzhang/stella_en_1.5B_v5")

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# Load retriever from data dir
# Save from html nodes
import chromadb
from llama_index.core import VectorStoreIndex
from llama_index.core import StorageContext
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core.retrievers import VectorIndexRetriever

# Persistent Chroma collection that backs the vector index.
vectordb_save_path = "../../data/db/llama-index-resources/chroma"
collection_name = "stella"
chroma_client = chromadb.PersistentClient(path=vectordb_save_path)
chroma_collection = chroma_client.get_or_create_collection(collection_name)
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Embed and store the nodes only while the collection is still empty;
# otherwise reuse what is already on disk instead of re-embedding every node
# on each run. Delete the collection to force a rebuild after the pages change.
if chroma_collection.count() == 0:
    index = VectorStoreIndex(nodes, storage_context=storage_context, embed_model=embed_model)
else:
    index = VectorStoreIndex.from_vector_store(vector_store, embed_model=embed_model)

# Semantic retriever returning the 5 most similar nodes.
chroma_retriever = VectorIndexRetriever(index=index, similarity_top_k=5)

## Save and Load BM25Retriever
- research에서 수집한 모든 페이지를 llama-index document로 변환 후 bm25에 저장

In [11]:
# Save llama index document to bm25
# Save html nodes
from llama_index.retrievers.bm25 import BM25Retriever
import Stemmer

# Where the BM25 retriever is persisted.
bm25_save_path = "../../data/db/llama-index-resources/bm25"

# Build the BM25 retriever over the nodes (top 5 hits, English stemmer and
# language), then write it to disk.
bm25_retriever = BM25Retriever.from_defaults(
    language="en",
    stemmer=Stemmer.Stemmer("english"),
    similarity_top_k=5,
    nodes=nodes,
)
bm25_retriever.persist(bm25_save_path)

Finding newlines for mmindex: 100%|██████████| 111k/111k [00:00<00:00, 127MB/s]


In [12]:
# Load bm25
import Stemmer
from llama_index.retrievers.bm25 import BM25Retriever

# Reload the persisted retriever, then set top-k, stemmer and language explicitly
# (presumably because they are not restored by from_persist_dir — TODO confirm).
bm25_retriever = BM25Retriever.from_persist_dir(bm25_save_path)
bm25_retriever.similarity_top_k = 5
bm25_retriever.stemmer = Stemmer.Stemmer("english")
bm25_retriever.language = "en"

## Hybrid
- db: chroma + bm25 (두 방식이 상호보완적이므로)
- normalize: DBSF (Distribution-Based Score Fusion)
- algorithm: Convex Combination

In [13]:
import numpy as np

def normalize_dbsf(scores: List[float]):
    """Normalize scores by their distribution (mean +/- 3 standard deviations).

    The scores are rescaled linearly so that ``mean - 3 * std`` maps to 0 and
    ``mean + 3 * std`` maps to 1; the mean therefore maps to 0.5.

    :param scores: Raw scores (a list or anything numpy can convert to floats).
    :return: ``numpy.ndarray`` of normalized scores, same length as ``scores``.
        Empty input gives an empty array. When every score is identical
        (including a single score) the spread is zero, so each score is mapped
        to 0.5 instead of dividing by zero and producing NaN.
    """
    arr = np.asarray(scores, dtype=float)
    if arr.size == 0:
        return arr
    mean_value = np.mean(arr)
    std_value = np.std(arr)
    if std_value == 0:
        # No spread: every score sits exactly on the mean.
        return np.full(arr.shape, 0.5)
    min_value = mean_value - 3 * std_value
    max_value = mean_value + 3 * std_value
    return (arr - min_value) / (max_value - min_value)

In [14]:
import numpy as np
import pandas as pd

def hybrid_cc(lexical_results, semantic_results, top_k=5, alpha=0.5):
    """
    Fuse BM25 and semantic results with a convex combination of normalized scores.

    Each result list is normalized separately with ``normalize_dbsf``. A node
    missing from one list contributes 0 for that list. The fused score is
    ``alpha * lexical + (1 - alpha) * semantic``.

    :param lexical_results: Scored nodes from the BM25 retriever (objects with ``id_`` and ``score``).
    :param semantic_results: Scored nodes from the vector retriever (same attributes).
    :param top_k: Maximum number of nodes to return.
    :param alpha: Weight for BM25 scores (0 <= alpha <= 1). 1-alpha is weight for semantic scores.
    :return: Up to ``top_k`` of the input node objects, ordered by fused score
        (highest first). Each returned node's ``score`` is overwritten in place
        with its fused score.
    """
    if not lexical_results and not semantic_results:
        return []

    # Step 1: Collect ids and raw scores of both result lists
    bm25_ids = [result.id_ for result in lexical_results]
    bm25_scores = [result.score for result in lexical_results]
    chroma_ids = [result.id_ for result in semantic_results]
    chroma_scores = [result.score for result in semantic_results]

    # Step 2: Normalize each list on its own so the two scales are comparable
    lexical_series = pd.Series(dict(zip(bm25_ids, normalize_dbsf(bm25_scores))), dtype=float)
    semantic_series = pd.Series(dict(zip(chroma_ids, normalize_dbsf(chroma_scores))), dtype=float)

    # Step 3: Align by node id; the column order matches the labels below
    df = pd.concat([lexical_series, semantic_series], axis=1)
    df.columns = ["lexical", "semantic"]
    df = df.fillna(0)
    df["weighted_sum"] = alpha * df["lexical"] + (1.0 - alpha) * df["semantic"]
    df = df.sort_values(by="weighted_sum", ascending=False)

    retrieved_ids = df.index.tolist()[:top_k]
    retrieved_scores = df["weighted_sum"][:top_k].tolist()

    # One lookup table instead of rescanning both lists per id; setdefault keeps
    # the first occurrence, so a lexical node wins over a semantic one.
    nodes_by_id = {}
    for node in (*lexical_results, *semantic_results):
        nodes_by_id.setdefault(node.id_, node)

    retrieved_contents = []
    for node_id, fused_score in zip(retrieved_ids, retrieved_scores):
        node = nodes_by_id.get(node_id)
        if node is not None:
            node.score = fused_score
            retrieved_contents.append(node)

    return retrieved_contents

In [15]:
# Define cutoff
from llama_index.core.postprocessor import SimilarityPostprocessor

# Post-processor configured with similarity_cutoff=0.6.
# NOTE(review): later cells apply it to the weighted-sum scores written by
# hybrid_cc, not to raw retriever similarities — confirm 0.6 suits that scale.
cutoff = SimilarityPostprocessor(similarity_cutoff=0.6)

## 단계별 실행

여기서부터 수정

In [18]:
# QA set for this SDK version. NOTE: despite its name, qa_dir is the path of a
# single CSV file; its first column is used as the DataFrame index.
qa_dir = f'../../data/chatbot/qa-{version_name}_sdk.csv'

qa_with_link = pd.read_csv(qa_dir, encoding="utf-8", index_col=0)
# Show the row labelled 0 as a quick sanity check.
print(qa_with_link.loc[0])

page_id                  3e9b309f-d9d2-4ee7-be9e-9ffe421d4cfa
link        https://furiosa-ai.github.io/docs-dev/2024.1/e...
question    What are the planned features for Furiosa LLM'...
answer      Planned features for Furiosa LLM include Tenso...
Name: 0, dtype: object


In [20]:
qa_with_link

Unnamed: 0,page_id,link,question,answer
0,3e9b309f-d9d2-4ee7-be9e-9ffe421d4cfa,https://furiosa-ai.github.io/docs-dev/2024.1/e...,What are the planned features for Furiosa LLM'...,Planned features for Furiosa LLM include Tenso...
1,3e9b309f-d9d2-4ee7-be9e-9ffe421d4cfa,https://furiosa-ai.github.io/docs-dev/2024.1/e...,What are the key features of Furiosa LLM that ...,Furiosa LLM features include a vLLM-compatible...
2,3e9b309f-d9d2-4ee7-be9e-9ffe421d4cfa,https://furiosa-ai.github.io/docs-dev/2024.1/e...,How does Furiosa LLM manage efficient KV cache...,Furiosa LLM manages efficient KV cache through...
3,dcd59fbc-fb76-4f34-b6ec-ea88a833b047,https://furiosa-ai.github.io/docs-dev/2024.1/e...,What are the default configuration values for ...,The default configuration values for deploying...
4,dcd59fbc-fb76-4f34-b6ec-ea88a833b047,https://furiosa-ai.github.io/docs-dev/2024.1/e...,What are the functionalities provided by the F...,The Furiosa device plugin discovers Furiosa NP...
...,...,...,...,...
58,a214fb49-b797-4d38-b877-597b6bb059eb,https://furiosa-ai.github.io/docs-dev/2024.1/e...,What command can be used to verify the install...,The command 'lspci -nn | grep FuriosaAI' can b...
59,a214fb49-b797-4d38-b877-597b6bb059eb,https://furiosa-ai.github.io/docs-dev/2024.1/e...,What are the necessary steps to upgrade the fi...,"To upgrade the firmware of FuriosaAI devices, ..."
60,1ba93fae-bf2e-42c1-a66d-dabbee880912,https://furiosa-ai.github.io/docs-dev/2024.1/e...,What are the constraints when specifying NPU r...,"When specifying NPU resources, you can set NPU..."
61,1ba93fae-bf2e-42c1-a66d-dabbee880912,https://furiosa-ai.github.io/docs-dev/2024.1/e...,How does the deployment of Furiosa Feature Dis...,Furiosa Feature Discovery labels nodes based o...


In [23]:
qa_with_link['question'].values[0]

"What are the planned features for Furiosa LLM's future releases, and how do they enhance its capabilities?"

In [26]:
# Step 2: Retrieve documents for the first question of the QA set
first_question = qa_with_link['question'].values[0]
expected_page_id = qa_with_link['page_id'].values[0]
separator = "-"*10

print(f"Answer = {expected_page_id} \n")
print(separator)

## VectorDB (semantic search)
semantic_results = chroma_retriever.retrieve(first_question)

## BM25 (lexical search)
lexical_results = bm25_retriever.retrieve(first_question)
print(lexical_results)
print(separator)

## Hybrid (fusion of both result lists)
retrieved_contents = hybrid_cc(lexical_results=lexical_results, semantic_results=semantic_results)
print(retrieved_contents)
print(separator)


Answer = 3e9b309f-d9d2-4ee7-be9e-9ffe421d4cfa 

----------
----------
----------


In [28]:
# Step 3: Cutoff
# Pass the fused results through the SimilarityPostprocessor defined above
# (presumably dropping nodes scored below its cutoff — confirm in the llama-index docs).
retrieved_contents_filtered = cutoff.postprocess_nodes(retrieved_contents)
# Bare expression so the notebook displays the filtered list.
retrieved_contents_filtered



In [33]:
retrieved_contents_filtered[0].text



In [None]:
# Step 4: Complete prompt & Generate LLM answer
print(qa_with_link['question'].values[0])
# The prompt is built from the filtered nodes, so that is the list that must be
# non-empty; checking the unfiltered list could send an empty context.
if retrieved_contents_filtered:
    # Put the nodes' text into the prompt rather than the repr of the node list.
    context_text = "\n\n".join(node.text for node in retrieved_contents_filtered)
    full_prompt = prompt.format(query=qa_with_link['question'].values[0], retrieved_contents=context_text)
    result = llm.complete(full_prompt)
    print(result)

## Workflow

In [None]:
from typing import List
from llama_index.core.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field

# Structured response: the answer text plus a list of reference ids.
# Documented with comments rather than a docstring so the model's schema is unchanged.
class SubBotResponse(BaseModel):
    answer: str = Field(
        default="",
        description="Answer of llm based on user question and given context",
    )
    docs: List[str] = Field(
        default=[],
        description="List of reference_id of the metadata in Something to read.",
    )

# Output parser built for SubBotResponse.
# NOTE(review): it is not referenced by the cells visible in this notebook — confirm it is still needed.
output_parser = PydanticOutputParser(output_cls=SubBotResponse)

In [None]:
from llama_index.core.workflow import (
    Event,
    StartEvent,
    StopEvent,
    Workflow,
    step,
    Context,
)
from llama_index.core.schema import NodeWithScore
from llama_index.llms.ollama import Ollama

# Emitted by HybridFlow.retrieve: carries the nodes returned by hybrid_cc.
class RetrieverEvent(Event):
    retrieved_nodes: List[NodeWithScore]

# Emitted by HybridFlow.cutoff: carries the nodes returned by cutoff.postprocess_nodes.
class CutoffEvent(Event):
    retrieved_nodes_with_score: List[NodeWithScore]

# Emitted by HybridFlow.postprocess: carries the post-processed contents as a list of dicts.
class PostprocessEvent(Event):
    retrieved_contents: List[dict]

# Emitted by HybridFlow.prompt: carries the fully formatted prompt string.
class PromptEvent(Event):
    prompt: str

class HybridFlow(Workflow):
    """Question-answering workflow over the hybrid (BM25 + vector) retriever.

    Steps run in this order: retrieve -> cutoff -> postprocess -> prompt -> generate.
    The workflow is started with a ``query`` keyword and stops with the LLM
    answer as a string.
    """

    # Same model and settings as the notebook-level llm, so the workflow does
    # not run with a different request timeout or temperature.
    llm = Ollama(model="llama3.1:70b", request_timeout=600, temperature=0)

    @step
    async def retrieve(self, ctx: Context, ev: StartEvent) -> RetrieverEvent:
        """Run both retrievers for the query and fuse their results."""
        query = ev.query
        # NOTE(review): ctx.data is used as a plain dict here — confirm the
        # installed llama-index version still supports it.
        ctx.data["query"] = query
        # hybrid_cc takes the two result lists, not the query, so both
        # retrievers have to be called first.
        lexical_results = bm25_retriever.retrieve(query)
        semantic_results = chroma_retriever.retrieve(query)
        fused_nodes = hybrid_cc(
            lexical_results=lexical_results,
            semantic_results=semantic_results,
            top_k=3,
            alpha=0.18,
        )
        return RetrieverEvent(retrieved_nodes=fused_nodes)

    @step
    async def cutoff(self, ev: RetrieverEvent) -> CutoffEvent:
        """Pass the fused nodes through the module-level ``cutoff`` post-processor."""
        retrieved_nodes = ev.retrieved_nodes
        # Inside a method body the bare name `cutoff` resolves to the
        # module-level SimilarityPostprocessor, not to this method.
        return CutoffEvent(
            retrieved_nodes_with_score=cutoff.postprocess_nodes(retrieved_nodes)
        )

    @step
    async def postprocess(self, ev: CutoffEvent) -> PostprocessEvent:
        """Convert the filtered nodes into the contents passed to the prompt."""
        retrieved_nodes_with_score = ev.retrieved_nodes_with_score
        # NOTE(review): postprocess_nodes is not defined in the visible part of
        # this notebook — confirm it exists before running the workflow.
        return PostprocessEvent(retrieved_contents=postprocess_nodes(retrieved_nodes_with_score))

    @step
    async def prompt(self, ctx: Context, ev: PostprocessEvent) -> PromptEvent:
        """Fill the module-level prompt template with the query and the contents."""
        query = ctx.data["query"]
        retrieved_contents = ev.retrieved_contents
        # The bare name `prompt` resolves to the module-level PromptTemplate
        # here, not to this method.
        return PromptEvent(
            prompt=prompt.format(query=query, retrieved_contents=retrieved_contents)
        )

    @step
    async def generate(self, ev: PromptEvent) -> StopEvent:
        """Print the prompt, ask the LLM, and stop with its answer as a string."""
        prompt = ev.prompt
        print(prompt)
        response = await self.llm.acomplete(prompt)
        return StopEvent(result=str(response))

# Run the workflow on the second question of the QA set and print the answer.
# The top-level `await` relies on the notebook's running event loop.
# NOTE(review): timeout=60 may be too short for a 70b model — the earlier
# "hello" completion reported a load_duration of roughly 197 s. Confirm.
w = HybridFlow(timeout=60, verbose=True)
result = await w.run(query=qa_with_link['question'].values[1])
print(result)