# RNGD SDK 문서 챗봇 

- 작성자: 이준원 (Joonwon Lee)

- 날짜: 12/24, 2024

Kernel name: llm-quantize

In [1]:
from typing import List
from pydantic import BaseModel, Field, HttpUrl
import json
from markdownify import markdownify as md
from llama_index.core import Document 
from llama_index.core.schema import TextNode
import os
from llama_index.llms.ollama import Ollama
from llama_index.core.prompts import PromptTemplate

In [2]:
class Page(BaseModel):
    id: str = Field(..., description="ID of the Page")
    link: HttpUrl = Field(description="Url link of the page")
    name: str = Field(description="Name of the page")
    parent: str = Field(default="", description="ID of the parent page")
    child: List[str] = Field(default=[], description="List of ids of the child pages")
    description: str = Field(default="", description="Description of the page")
    description_clean: str = Field(default="", description="Content markdown")
    html_content: str = Field(default="", description="HTML code of the main content in the page")

    def __hash__(self):
        return hash((self.link, self.name))

    def __eq__(self, other):
        if not isinstance(other, Page):
            return False
        return (self.link, self.name) == (other.link, other.name)
    
class CustomDocument(Document):
    page_content: str = Field(default="", description="Additional content for the document")

def convert_page_to_llama_index_document(page: Page) -> CustomDocument:
    return CustomDocument(
        doc_id=page.id,
        metadata={
            "source": str(page.link),
            "title": page.name,
            "parent_doc_id": page.parent,
            "child_doc_ids": json.dumps(page.child),
        },
        text=page.description_clean,  # 기본 text
        page_content=page.description_clean,  # 추가 속성
    )   
    


DB 가져오기 

In [3]:
version_name = 'warboy'
data_dir = f'../../data/db/db-{version_name}_sdk.json'

with open(data_dir, "r") as f:
    data = json.load(f)
    all_pages = [Page.model_validate_json(page) for page in data['sdk']]

def find_page_with_url(url: str) -> Page:
    for page in all_pages:
        if str(page.link) == url:
            return page
    return None

all_pages



라마인덱스 "Document"로 변환

In [4]:
# Convert page to llama index Document and TextNode
docs = [convert_page_to_llama_index_document(page) for page in all_pages]
nodes = [TextNode(id_=doc.id_, text=doc.text, metadata=doc.metadata) for doc in docs]

## 챗봇 테스트 (link 인풋으로 직접 넣어주기)

In [5]:
# Define LLM
import os
from llama_index.llms.ollama import Ollama
from llama_index.core.prompts import PromptTemplate

llm = Ollama(model="llama3.1:70b", request_timeout=600,temperature=0)
llm.complete("hello")

CompletionResponse(text='Hello! How can I assist you today?', additional_kwargs={'model': 'llama3.1:70b', 'created_at': '2024-12-24T08:58:29.599469501Z', 'done': True, 'done_reason': 'stop', 'context': [128009, 128006, 882, 128007, 271, 15339, 128009, 128006, 78191, 128007, 271, 9906, 0, 2650, 649, 358, 7945, 499, 3432, 30], 'total_duration': 29463410216, 'load_duration': 28653532332, 'prompt_eval_count': 12, 'prompt_eval_duration': 114700000, 'eval_count': 10, 'eval_duration': 650808000}, raw={'model': 'llama3.1:70b', 'created_at': '2024-12-24T08:58:29.599469501Z', 'response': 'Hello! How can I assist you today?', 'done': True, 'done_reason': 'stop', 'context': [128009, 128006, 882, 128007, 271, 15339, 128009, 128006, 78191, 128007, 271, 9906, 0, 2650, 649, 358, 7945, 499, 3432, 30], 'total_duration': 29463410216, 'load_duration': 28653532332, 'prompt_eval_count': 12, 'prompt_eval_duration': 114700000, 'eval_count': 10, 'eval_duration': 650808000}, logprobs=None, delta=None)

In [6]:
prompt = PromptTemplate(
    template="""You are a highly knowledgeable assistant specializing in Furiosa's NPU SDK. Your task is to provide detailed and accurate responses to user queries about Furiosa SDK, including:

1. Interpreting and explaining code examples.
2. Providing guidance on CLI (Command Line Interface) commands and their usage.
3. Offering detailed information about supported software and hardware configurations.

For each query:
- Extract key details from the question and the provided context.
- Use the retrieved contents to generate a clear and step-by-step explanation.
- Always include relevant examples or commands, where applicable, to enhance understanding.

Make sure your response is concise but comprehensive, ensuring the user can act on your guidance immediately.

Question:
{query}

Context:
{retrieved_contents}
"""
)

In [60]:
prompt_simple = PromptTemplate(
    template="""You are a highly knowledgeable assistant specializing in Furiosa's NPU SDK. Your task is to provide a short responses to user queries.
For each query, extract the most relevant information from the retrieved contents to generate a clear and compact explanation to the question. Make the response within 3 sentences.

Question:
{query}

Context:
{retrieved_contents}
"""
)

In [7]:
# Manual test 
query_in = "What is bert?"

link_in = 'https://furiosa-ai.github.io/docs-dev/2024.1/en/getting_started/furiosa_mlperf.html'
document = convert_page_to_llama_index_document(find_page_with_url(link_in))

print('* Query:')
print(query_in)
print("="*60)
print('\n')

print('* link_gt:')
print(link_in)
print("="*60)
print('\n')

print('* ChatBot response:')

full_prompt = prompt.format(query=query_in, retrieved_contents=document.text)
result = llm.complete(full_prompt)
print(result)

AttributeError: 'NoneType' object has no attribute 'id'

## 벡터DB 설정과 Retriever 테스트

In [8]:
# Define embedding
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

embed_model = HuggingFaceEmbedding(model_name="dunzhang/stella_en_1.5B_v5")

In [9]:
# Load retriever from data dir
# Save from html nodes
import chromadb
from llama_index.core import VectorStoreIndex
from llama_index.core import StorageContext
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core.retrievers import VectorIndexRetriever

vectordb_save_path = "../../data/db/llama-index-resources/chroma"
collection_name = "stella"
chroma_client = chromadb.PersistentClient(path=vectordb_save_path)
chroma_collection = chroma_client.get_or_create_collection(collection_name)
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
# Save data
index = VectorStoreIndex(nodes, storage_context=storage_context, embed_model=embed_model)
# Load data
# index = VectorStoreIndex.from_vector_store(
#     vector_store, storage_context=storage_context, embed_model=embed_model,
# )
chroma_retriever = VectorIndexRetriever(index=index, similarity_top_k=5)

We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


## Save and Load BM25Retriever
- research에서 수집한 모든 페이지를 llama-index document로 변환 후 bm25에 저장

In [10]:
# Save llama index document to bm25
# Save html nodes
from llama_index.retrievers.bm25 import BM25Retriever
import Stemmer

bm25_retriever = BM25Retriever.from_defaults(
    nodes=nodes,
    similarity_top_k=5,
    stemmer=Stemmer.Stemmer("english"),
    language="en",
)
bm25_save_path = "../../data/db/llama-index-resources/bm25"
bm25_retriever.persist(bm25_save_path)

BM25S Count Tokens:   0%|          | 0/23 [00:00<?, ?it/s]

BM25S Compute Scores:   0%|          | 0/23 [00:00<?, ?it/s]

Finding newlines for mmindex:   0%|          | 0.00/244k [00:00<?, ?B/s]

In [11]:
# Load bm25
import Stemmer
from llama_index.retrievers.bm25 import BM25Retriever

bm25_retriever = BM25Retriever.from_persist_dir(bm25_save_path)
bm25_retriever.similarity_top_k = 5
bm25_retriever.stemmer = Stemmer.Stemmer("english")
bm25_retriever.language = "en"

## Hybrid
- db: chroma + bm25 (두 방식이 상호보완적이므로)
- normalize: dbsf
- algorithm: Convex Combination

In [12]:
import numpy as np

def normalize_dbsf(scores: List[str]):
	arr = np.array(scores)
	mean_value = np.mean(arr)
	std_value = np.std(arr)
	min_value = mean_value - 3 * std_value
	max_value = mean_value + 3 * std_value
	norm_score = (arr - min_value) / (max_value - min_value)
	return norm_score

In [40]:
import numpy as np
import pandas as pd

def hybrid_cc(lexical_results, semantic_results, top_k=5, alpha=0.5):
    """
    Perform hybrid search using convex combination of BM25 and semantic scores.
    
    :param query: Search query (string)
    :param alpha: Weight for BM25 scores (0 <= alpha <= 1). 1-alpha is weight for semantic scores.
    """
    # Step 1: Perform BM25 Search
    bm25_ids = np.array([result.id_ for result in lexical_results])
    bm25_scores = np.array([result.score for result in lexical_results])
    
    # Step 2: Perform Semantic Search using ChromaRetriever
    chroma_ids = np.array([result.id_ for result in semantic_results])
    chroma_scores = np.array([result.score for result in semantic_results])
    
    # Step 3: Normalize the Scores
    bm25_scores_norm = normalize_dbsf(bm25_scores)
    chroma_scores_norm = normalize_dbsf(chroma_scores)

    ids = [bm25_ids, chroma_ids]
    scores = [bm25_scores_norm, chroma_scores_norm]
    
    df = pd.concat(
		[pd.Series(dict(zip(_id, score))) for _id, score in zip(ids, scores)], axis=1
	)
    df.columns = ["semantic", "lexical"]
    df = df.fillna(0)
    df["weighted_sum"] = df.mul((alpha, 1.0 - alpha)).sum(axis=1)
    df = df.sort_values(by="weighted_sum", ascending=False)

    retrieved_ids, retrieved_scores = df.index.tolist()[:top_k], df["weighted_sum"][:top_k].tolist()
    retrieved_contents = []
    for idx, id in enumerate(retrieved_ids):
        content = next((node for node in lexical_results if node.id_ == id), None)
        if content is not None:
            content.score = retrieved_scores[idx]
            retrieved_contents.append(content)
            continue
        content = next((node for node in semantic_results if node.id_ == id), None)
        if content is not None:
            content.score = retrieved_scores[idx]
            retrieved_contents.append(content)

    return retrieved_contents

In [41]:
# Define cutoff
from llama_index.core.postprocessor import SimilarityPostprocessor

cutoff = SimilarityPostprocessor(similarity_cutoff=0.3)

## 단계 별 실행 

여기서부터 수정

In [16]:
qa_dir = f'../../data/chatbot/qa-{version_name}_sdk.csv'

qa_with_link = pd.read_csv(qa_dir, encoding="utf-8", index_col=0)
print(qa_with_link.loc[0])

page_id                  cf227685-cc4e-420e-b21a-e7da166093e5
link        https://furiosa-ai.github.io/docs/latest/en/so...
question    What steps are necessary to ensure that a Warb...
answer      First, enable IOMMU in both BIOS and Linux OS....
Name: 0, dtype: object


In [16]:
qa_with_link

Unnamed: 0,page_id,link,question,answer
0,3e9b309f-d9d2-4ee7-be9e-9ffe421d4cfa,https://furiosa-ai.github.io/docs-dev/2024.1/e...,What are the planned features for Furiosa LLM'...,Planned features for Furiosa LLM include Tenso...
1,3e9b309f-d9d2-4ee7-be9e-9ffe421d4cfa,https://furiosa-ai.github.io/docs-dev/2024.1/e...,What are the key features of Furiosa LLM that ...,Furiosa LLM features include a vLLM-compatible...
2,3e9b309f-d9d2-4ee7-be9e-9ffe421d4cfa,https://furiosa-ai.github.io/docs-dev/2024.1/e...,How does Furiosa LLM manage efficient KV cache...,Furiosa LLM manages efficient KV cache through...
3,dcd59fbc-fb76-4f34-b6ec-ea88a833b047,https://furiosa-ai.github.io/docs-dev/2024.1/e...,What are the default configuration values for ...,The default configuration values for deploying...
4,dcd59fbc-fb76-4f34-b6ec-ea88a833b047,https://furiosa-ai.github.io/docs-dev/2024.1/e...,What are the functionalities provided by the F...,The Furiosa device plugin discovers Furiosa NP...
...,...,...,...,...
58,a214fb49-b797-4d38-b877-597b6bb059eb,https://furiosa-ai.github.io/docs-dev/2024.1/e...,What command can be used to verify the install...,The command 'lspci -nn | grep FuriosaAI' can b...
59,a214fb49-b797-4d38-b877-597b6bb059eb,https://furiosa-ai.github.io/docs-dev/2024.1/e...,What are the necessary steps to upgrade the fi...,"To upgrade the firmware of FuriosaAI devices, ..."
60,1ba93fae-bf2e-42c1-a66d-dabbee880912,https://furiosa-ai.github.io/docs-dev/2024.1/e...,What are the constraints when specifying NPU r...,"When specifying NPU resources, you can set NPU..."
61,1ba93fae-bf2e-42c1-a66d-dabbee880912,https://furiosa-ai.github.io/docs-dev/2024.1/e...,How does the deployment of Furiosa Feature Dis...,Furiosa Feature Discovery labels nodes based o...


In [17]:
qa_with_link['question'].values[0]

'What steps are necessary to ensure that a Warboy device is recognized and available within a virtual machine using QEMU-KVM?'

In [18]:
# Step 2: Retrieve document
print(f"Answer = {qa_with_link['page_id'].values[0]} \n")
print("-"*10)

## VectorDB
semantic_results = chroma_retriever.retrieve(qa_with_link['question'].values[0])

## BM25
lexical_results = bm25_retriever.retrieve(qa_with_link['question'].values[0])
print(lexical_results)
print("-"*10)

## Hybrid
retrieved_contents = hybrid_cc(semantic_results=semantic_results, lexical_results=lexical_results)
print(retrieved_contents)
print("-"*10)


Answer = 3e9b309f-d9d2-4ee7-be9e-9ffe421d4cfa 

----------
----------
----------


In [19]:
# Step 3: Cutoff
retrieved_contents_filtered = cutoff.postprocess_nodes(retrieved_contents)
retrieved_contents_filtered



In [20]:
retrieved_contents_filtered[0].text



아래 셀에 문제 있을 수 있음. 체크 

In [23]:
# Step 4: Complete prompt & Generate LLM answer
print(qa_with_link['question'].values[0])
if len(retrieved_contents) > 0:
    full_prompt = prompt.format(query=qa_with_link['question'].values[0], retrieved_contents=retrieved_contents_filtered[0].text)
    result = llm.complete(full_prompt)
    print(result)

What are the planned features for Furiosa LLM's future releases, and how do they enhance its capabilities?
**Planned Features for Furiosa LLM's Future Releases**

Based on the provided context, I've extracted the key details about the planned features for Furiosa LLM's future releases. These features aim to enhance its capabilities and provide state-of-the-art serving optimization.

**1. Tensor Parallelism (Release 2024.2)**

Tensor Parallelism is a planned feature that will allow Furiosa LLM to parallelize tensor operations across multiple NPUs. This feature will enable more efficient processing of large models and improve overall performance.

**Example:** To utilize Tensor Parallelism, you can use the `--tensor-parallelism` flag when running your model with Furiosa LLM. For instance:
```bash
furiosa_llm --model my_model --input input_data --tensor-parallelism
```
This will enable tensor parallelism for your model and distribute the computation across multiple NPUs.

**2. Speculative

In [21]:
# Step 4: Complete prompt & Generate LLM answer
print(qa_with_link['question'].values[0])
if len(retrieved_contents) > 0:
    full_prompt = prompt.format(query=qa_with_link['question'].values[0], retrieved_contents=retrieved_contents_filtered)
    result = llm.complete(full_prompt)
    print(result)

What are the planned features for Furiosa LLM's future releases, and how do they enhance its capabilities?
**Planned Features for Furiosa LLM's Future Releases**

According to the provided context, several features are planned for future releases of Furiosa LLM, which will enhance its capabilities:

1.  **Tensor Parallelism**: This feature is planned for release in 2024.2 and will allow for parallelization across multiple NPUs.
2.  **Speculative Decoding**: This decoding algorithm is planned for a future release and will provide an additional option for users.
3.  **HuggingFace PEFT Support**: This feature is also planned for a future release, which will enable support for HuggingFace's PEFT (Parameter-Efficient Fine-Tuning) technique.

These features will further enhance Furiosa LLM's capabilities, providing users with more options for optimizing their models and improving performance.

**Current Features**

In addition to the planned features, Furiosa LLM currently provides several k

## Generate responses for all qa dataset

- rngd 

- warboy 

- faq

In [43]:
version_name = 'warboy'
# qa_dir = f'../../data/chatbot/qa-{version_name}_sdk.csv'
qa_dir = '../../data/handmade-faq/qa-warboy_sdk.csv'
qa_with_link = pd.read_csv(qa_dir, encoding="iso-8859-1", index_col=0)
qa_with_link

Unnamed: 0,page_id,link,question,answer
0,3e9b309f-d9d2-4ee7-be9e-9ffe421d4cfa,https://furiosa-ai.discourse.group/t/furiosaai...,How can I convert an ONNX file to an ENF file ...,You can use the following command: furiosa com...
1,3e9b309f-d9d2-4ee7-be9e-9ffe421d4cfa,https://furiosa-ai.discourse.group/t/furiosaai...,Can I specify NPU 0 and NPU 1 separately when ...,"Yes, you can execute tasks by specifying diffe..."
2,3e9b309f-d9d2-4ee7-be9e-9ffe421d4cfa,https://furiosa-ai.discourse.group/t/furiosaai...,Please provide a precise explanation of the ta...,Warboy only supports inference and does not su...
3,dcd59fbc-fb76-4f34-b6ec-ea88a833b047,https://furiosa-ai.discourse.group/t/furiosaai...,"What operators are supported by Warboy, and wh...",It is specialized in accelerating CNN-based mo...
4,dcd59fbc-fb76-4f34-b6ec-ea88a833b047,https://furiosa-ai.discourse.group/t/furiosaai...,Can accuracy drop after quantization? Are ther...,Accuracy may drop when quantizing an FP32 mode...
5,dcd59fbc-fb76-4f34-b6ec-ea88a833b047,https://furiosa-ai.discourse.group/t/furiosaai...,Why does the Insufficient Instruction Memory e...,Cause: This error occurs when the number of op...
6,3bb3bd0a-7bdc-45a7-8e3e-e556a52d7eda,https://furiosa-ai.discourse.group/t/furiosaai...,Why does the Incompatible configuration runtim...,Cause: This error occurs when the SDK version ...
7,3bb3bd0a-7bdc-45a7-8e3e-e556a52d7eda,https://furiosa-ai.discourse.group/t/furiosaai...,Why does the model inference time take longer ...,Cause: This occurs when the model uses operato...
8,3bb3bd0a-7bdc-45a7-8e3e-e556a52d7eda,https://furiosa-ai.github.io/docs-dev/2024.1/e...,What are the specific subcommands provided by ...,The 'furiosa-mlperf' command provides subcomma...
9,a3d94379-304a-4dbc-8300-39169378bfd5,https://furiosa-ai.github.io/docs-dev/2024.1/e...,What are the necessary components and steps re...,To launch the OpenAI-compatible Furiosa-LLM se...


In [59]:
print(prompt_simple.template)

You are a highly knowledgeable assistant specializing in Furiosa's NPU SDK. Your task is to provide an short, but accurate responses to user queries.
For each query, use the retrieved contents to generate a clear and compact explanation (1~3 sentences) to the question. Make sure your response is concise but comprehensive, ensuring the user can act on your guidance immediately. 

Question:
{query}

Context:
{retrieved_contents}



In [62]:
prompt_simple = PromptTemplate(
    template="""You are a highly knowledgeable assistant specializing in Furiosa's NPU SDK. Your task is to provide short responses to user queries.
For each query, extract the most relevant information from the retrieved contents to generate a clear and compact explanation to the question. Limit the response to within 3 sentences.

Question:
{query}

Context:
{retrieved_contents}"""
)

In [67]:
prompt_compact = PromptTemplate(
    template="""Your task is to provide short 1-sentence answer to user queries based on the provided context. 
Question:
{query}
Context:
{retrieved_contents}"""
)

In [66]:
question = qa_with_link['question'].values[0]
print(question)
print("--"*40)

# Step 1: VectorDB 검색
semantic_results = chroma_retriever.retrieve(question)

# Step 2: BM25 검색
lexical_results = bm25_retriever.retrieve(question)

# Step 3: Hybrid 검색
retrieved_contents = hybrid_cc(semantic_results=semantic_results, lexical_results=lexical_results)

# Step 4: Cutoff
retrieved_contents_filtered = cutoff.postprocess_nodes(retrieved_contents)

full_prompt_1 = prompt_compact.format(query=qa_with_link['question'].values[0], retrieved_contents=retrieved_contents_filtered[0].text[:200])
result = llm.complete(full_prompt_1)
print(result)

How can I convert an ONNX file to an ENF file after quantization to later use it by simply opening a session?
--------------------------------------------------------------------------------
This text is a documentation for optimizing the performance of deep learning models on Furiosa AI's NPU (Neural Processing Unit). The optimization techniques are categorized into two levels: model level and runtime level.

**Model Level Optimization**

1.  **Knowledge Distillation**: This technique involves training a smaller student model to mimic the behavior of a larger teacher model. The student model is trained using the output of the teacher model as its target, which helps to transfer knowledge from the teacher model to the student model.
2.  **Pruning**: Pruning involves removing redundant or unnecessary weights and connections in the neural network. This can help reduce the computational cost and memory requirements of the model.
3.  **Quantization**: Quantization involves reducing the pre

In [47]:
version_name = 'warboy'
# qa_dir = f'../../data/chatbot/qa-{version_name}_sdk.csv'
# qa_with_link = pd.read_csv(qa_dir, encoding="utf-8", index_col=0)

qa_dir = '../../data/handmade-faq/qa-warboy_sdk.csv'
qa_with_link = pd.read_csv(qa_dir, encoding="iso-8859-1", index_col=0)

# 새로운 컬럼 추가를 위해 DataFrame 복사
qa_with_link['generated'] = ""  # 생성된 텍스트를 저장할 새로운 컬럼 초기화

for idx, row in qa_with_link.iterrows():
    question = row['question']
    
    # Step 1: VectorDB 검색
    semantic_results = chroma_retriever.retrieve(question)

    # Step 2: BM25 검색
    lexical_results = bm25_retriever.retrieve(question)

    # Step 3: Hybrid 검색
    retrieved_contents = hybrid_cc(semantic_results=semantic_results, lexical_results=lexical_results)

    # Step 4: Cutoff
    retrieved_contents_filtered = cutoff.postprocess_nodes(retrieved_contents)

    # Step 5: LLM 처리
    if len(retrieved_contents_filtered) > 0:
        # full_prompt = prompt.format(query=question, retrieved_contents=retrieved_contents_filtered[0].text)
        full_prompt = prompt_simple.format(query=question, retrieved_contents=retrieved_contents_filtered[0].text)
        result = llm.complete(full_prompt)
        qa_with_link.at[idx, 'generated'] = result  # 결과를 'generated' 컬럼에 추가
    else:
        qa_with_link.at[idx, 'generated'] = "No relevant content found"  # 검색 결과가 없는 경우 기본 메시지 추가

# 저장
# output_file = f"qa_with_generated_results-{version_name}.csv"
output_file = f"qa_with_generated_results-{version_name}_simple3.csv"
qa_with_link.to_csv(output_file, index=False)
print(f"Results saved to {output_file}")

Results saved to qa_with_generated_results-warboy_simple2.csv


## Workflow

In [24]:
from typing import List
from llama_index.core.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field

class SubBotResponse(BaseModel):
    answer: str = Field(
        default="",
        description="Answer of llm based on user question and given context",
    )
    docs: List[str] = Field(
        default=[],
        description="List of reference_id of the metadata in Something to read.",
    )

output_parser = PydanticOutputParser(output_cls=SubBotResponse)

In [37]:
from llama_index.core.workflow import (
    Event,
    StartEvent,
    StopEvent,
    Workflow,
    step,
    Context,
)
from llama_index.core.schema import NodeWithScore
from llama_index.llms.ollama import Ollama

class RetrieverEvent(Event):
    retrieved_nodes: List[NodeWithScore]

class CutoffEvent(Event):
    retrieved_nodes_with_score: List[NodeWithScore]

class PostprocessEvent(Event):
    retrieved_contents: List[dict]

class PromptEvent(Event):
    prompt: str

class HybridFlow(Workflow):
    llm = Ollama(model="llama3.1:70b")

    @step
    async def retrieve(self, ctx: Context, ev: StartEvent) -> RetrieverEvent:
        query = ev.question
        ctx.data["query"] = query
        return RetrieverEvent(retrieved_nodes=hybrid_cc(query, 3, 0.18))

    @step
    async def cutoff(self, ev: RetrieverEvent) -> CutoffEvent:
        retrieved_nodes = ev.retrieved_nodes
        return CutoffEvent(
            retrieved_nodes_with_score=cutoff.postprocess_nodes(retrieved_nodes)
        )

    @step
    async def postprocess(self, ev: CutoffEvent) -> PostprocessEvent:
        retrieved_nodes_with_score = ev.retrieved_nodes_with_score
        return PostprocessEvent(retrieved_contents=postprocess_nodes(retrieved_nodes_with_score))

    @step
    async def prompt(self, ctx: Context, ev: PostprocessEvent) -> PromptEvent:
        query = ctx.data["query"]
        retrieved_contents = ev.retrieved_contents
        return PromptEvent(
            prompt=prompt.format(query=query, retrieved_contents=retrieved_contents)
        )

    @step
    async def generate(self, ev: PromptEvent) -> StopEvent:
        prompt = ev.prompt
        print(prompt)
        response = await self.llm.acomplete(prompt)
        return StopEvent(result=str(response))



In [38]:
w = HybridFlow(timeout=60, verbose=True)
result = await w.run(query=qa_with_link['question'].values[1])
print(result)

Running step retrieve


AttributeError: 'StartEvent' object has no attribute 'question'