# QA 데이터셋 만들기 

In [1]:
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
from typing import List
from pydantic import BaseModel, Field, HttpUrl

class Page(BaseModel):
    id: str = Field(..., description="ID of the Page")
    link: HttpUrl = Field(description="Url link of the page")
    name: str = Field(description="Name of the page")
    parent: str = Field(default="", description="ID of the parent page")
    child: List[str] = Field(default=[], description="List of ids of the child pages")
    description: str = Field(default="", description="Description of the page")
    description_clean: str = Field(default="", description="Content markdown")
    html_content: str = Field(default="", description="HTML code of the main content in the page")

    def __hash__(self):
        return hash((self.link, self.name))

    def __eq__(self, other):
        if not isinstance(other, Page):
            return False
        return (self.link, self.name) == (other.link, other.name)

## DB 가지고 오기 

In [3]:
import json

data_dir = '/Users/jwlee-pro/Documents/Workspace_2025/projects/llm-rag-chatbot/data/db-warboy_sdk_v1.json'

with open(data_dir, "r") as f:
    data = json.load(f)
    final_pages = [Page.model_validate_json(page) for page in data["sdk"]]


In [4]:
final_pages[0]



In [5]:
# final_pages의 첫 번째 객체를 출력
print(final_pages[0])

# final_pages 전체 크기 확인
print(f"Total pages loaded: {len(final_pages)}")

Total pages loaded: 23


## QA 데이터셋 만들기 

In [6]:
import os

# openAI key 
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")

version #1
- html -> 마크다운 

In [25]:
from markdownify import markdownify as md
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain.schema import HumanMessage
import pandas as pd

# LLM 설정
llm = ChatOpenAI(model="gpt-4o", temperature=0.2)

# 번역용 프롬프트 템플릿
prompt = PromptTemplate(template="""
You are an expert translator specializing in technical documentation. Your task is to translate the provided markdown content from English to Korean while retaining the original meaning and structure.

Instructions:
1. Translate all textual content to Korean.
2. Preserve programming code, CLI commands, and technical terms in English for readability.
3. Ensure the translated document is well-formatted in markdown style.
4. Avoid literal translations for idiomatic expressions; focus on clear, context-aware translations.

CONTENT:
{content}
""",
    input_variables=["content"],
)

# 번역 함수
def translate_documents(pages):
    """
    Translate the content of each page to Korean.

    Args:
        pages (List[Page]): List of Page objects.

    Returns:
        pd.DataFrame: DataFrame containing translated documents.
    """
    data = []

    for page in pages:
        try:
            # 프롬프트 생성
            prompt_message = prompt.format(content=page.description_clean)
            message = HumanMessage(content=prompt_message)

            # 번역 수행 (llm에 list 형태로 메시지 전달)
            translated_content = llm([message]).content

            # 데이터 추가
            data.append({
                "page_id": page.id,
                "link": str(page.link),
                "original_content": page.description_clean,
                "translated_content": translated_content,
            })
        except Exception as e:
            print(f"Error translating page {page.id}: {e}")

    # 번역 결과를 DataFrame으로 변환
    df = pd.DataFrame(data)
    return df



In [29]:
translated_df = translate_documents(final_pages)
translated_df

Unnamed: 0,page_id,link,original_content,translated_content
0,cf227685-cc4e-420e-b21a-e7da166093e5,https://furiosa-ai.github.io/docs/latest/en/so...,* Configuring Warboy Pass-through for Virtual ...,```markdown\n* 가상 머신을 위한 Warboy 패스스루 구성 * [페이지...
1,333851a4-2ea4-4903-87a2-0d50943faf1f,https://furiosa-ai.github.io/docs/latest/en/so...,* Performance Optimization * [View page source...,```markdown\n* 성능 최적화 * [페이지 소스 보기](../_source...
2,13973d88-9f7d-49e4-8019-68c7f29141b3,https://furiosa-ai.github.io/docs/latest/en/re...,* Release Notes - 0.9.0 * [View page source](....,```markdown\n* 릴리스 노트 - 0.9.0 * [페이지 소스 보기](.....
3,64f0ffea-6087-4f27-8889-83479e61e89e,https://furiosa-ai.github.io/docs/latest/en/so...,* Performance Profiling * [View page source](....,```markdown\n* 성능 프로파일링 * [페이지 소스 보기](../_sour...
4,65a65779-8c0e-48f0-890a-f6ecc20a9f41,https://furiosa-ai.github.io/docs/latest/en/re...,* Release Notes - 0.5.0 * [View page source](....,```markdown\n* 릴리스 노트 - 0.5.0 * [페이지 소스 보기](.....
5,b6c45005-84e1-46e6-a185-42be1be00b6e,https://furiosa-ai.github.io/docs/latest/en/so...,* Compiler * [View page source](../_sources/so...,```markdown\n* 컴파일러 * [페이지 소스 보기](../_sources/...
6,a50ce0f1-25f2-4882-a3e5-8da4cb7215b7,https://furiosa-ai.github.io/docs/latest/en/so...,* References * [View page source](../_sources/...,\n* 참고 문서 * [페이지 소스 보기](../_sources/software/r...
7,6c328d98-54c8-4c8e-bea7-5927c3921609,https://furiosa-ai.github.io/docs/latest/en/so...,* Model Server (Serving Framework) * [View pag...,```markdown\n* 모델 서버 (서빙 프레임워크) * [페이지 소스 보기](...
8,4336ecbc-28cf-41e6-863f-25212b860e38,https://furiosa-ai.github.io/docs/latest/en/so...,* Tutorial and Code Examples * [View page sour...,```markdown\n* 튜토리얼 및 코드 예제 * [페이지 소스 보기](../_...
9,e527d72f-b0b3-4132-b935-b89558bc7add,https://furiosa-ai.github.io/docs/latest/en/re...,* Release Notes - 0.8.0 * [View page source](....,```markdown\n* 릴리스 노트 - 0.8.0 * [페이지 소스 보기](.....


In [35]:
print(translated_df['translated_content'][0])

```markdown
* 가상 머신을 위한 Warboy 패스스루 구성 * [페이지 소스 보기](../_sources/software/vm_support.rst.txt)
---
이 섹션에서는 가상 머신에 Warboy 패스스루를 활성화하는 방법을 설명합니다. 이 섹션의 예제는 특정 VM 도구 `QEMU-KVM`을 기반으로 하지만, 다른 VM 도구에서도 작동합니다. 예제에서 사용된 환경은 다음과 같습니다:
* 호스트 OS: CentOS 8 * 게스트 OS: Ubuntu 20.04 * 가상 머신: QEMU-KVM
사전 준비 사항 [](#prerequisites "Permalink to this heading") -------------------------------------------------------------
* IOMMU와 VT-x가 BIOS에서 활성화되어 있어야 합니다. * 호스트 머신에 `qemu-kvm`, `libvirt`, `virt-install`이 설치되어 있어야 합니다.
설정 지침 [](#setup-instruction "Permalink to this heading") ---------------------------------------------------------------------
### 1. BIOS 및 Linux OS에서 IOMMU 활성화 [](#enabling-iommu-in-bios-and-linux-os "Permalink to this heading")
먼저, BIOS와 Linux OS에서 IOMMU를 활성화해야 합니다. 다음 명령어는 IOMMU가 활성화되었는지 보여줍니다.
``` 
dmesg | grep -e DMAR -e IOMMU
```
IOMMU가 활성화된 경우 DMAR 또는 IOMMU와 관련된 메시지를 볼 수 있습니다. DMAR 또는 IOMMU와 관련된 메시지를 찾을 수 없다면, BIOS, Linux OS 또는 둘 다에서 IOMMU를 활성화해야 합니다. BIOS에서 IOMMU를 활성화하는 방법은 서버나 메

In [31]:
# 번역 결과를 CSV 및 JSON으로 저장
data_dir = '/Users/jwlee-pro/Documents/Workspace_2025/projects/llm-rag-chatbot/data/translate'
file_dir = f'{data_dir}/qa-warboy_sdk'

translated_df.to_csv(f"{file_dir}.csv", index=False, encoding="utf-8")
translated_df.to_json(f"{file_dir}.json", orient="records", force_ascii=False)

## Generate QA of card page with RAGAS

* Note: I think generating QA pair with RAGAS has low quality (warning)

In [66]:
import os
from dotenv import load_dotenv

load_dotenv()

os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_API_KEY"] = os.getenv("LANGCHAIN_API_KEY")
os.environ["LANGCHAIN_PROJECT"] = "FuriosaAI RNGD Ragas QA Dataset"

In [87]:
from markdownify import markdownify as md
from llama_index.core import Document 

class CustomDocument(Document):
    page_content: str = Field(default="", description="Additional content for the document")

def convert_page_to_llama_index_document(page: Page) -> CustomDocument:
    return CustomDocument(
        doc_id=page.id,
        metadata={
            "source": str(page.link),
            "title": page.name,
            "parent_doc_id": page.parent,
            "child_doc_ids": json.dumps(page.child),
        },
        text=page.description_clean,  # 기본 text
        page_content=page.description_clean,  # 추가 속성
    )

docs = [convert_page_to_llama_index_document(page) for page in final_pages]
docs



In [68]:
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")

Wrapp the LLMs in LangchainLLMWrapper so that it can be used with ragas.

In [69]:
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
generator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))
generator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())

In [82]:
docs[0]



In [93]:
from ragas.testset import TestsetGenerator

generator = TestsetGenerator(llm=generator_llm, embedding_model=generator_embeddings)
dataset = generator.generate_with_langchain_docs(docs, testset_size=50)

Applying HeadlineSplitter:   0%|          | 0/21 [00:00<?, ?it/s]           unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
Applying SummaryExtractor:  41%|████      | 11/27 [00:03<00:02,  6.53it/s]Property 'summary' already exists in node 'fba6c3'. Skipping!
Applying SummaryExtractor:  44%|████▍     | 12/27 [00:03<00:03,  4.75it/s]Property 'summary' already exists in node '5d2c26'. Skipping!
Applying SummaryExtractor:  48%|████▊     | 13/27 [00:03<00:03,  4.38it/s]Property 'summary' already exists in node '8bf36e'. Skipping!
Property 'summary' already exists in node '4c3e39'. Skipping!
Applying SummaryExtractor:  59%|█████▉    | 16/27 [00:04<00:01,  5.89it/s]Property 'summary' already exists in node '0b7164'. Skipping!
Property 'summ

In [94]:
# save data
file_dir = f'{data_dir}/qa-warboy_sdk_ragas_v2'

dataset_df = dataset.to_pandas()
dataset_df.to_parquet(f"{file_dir}.parquet")
dataset_df.to_csv(f"{file_dir}.csv")

In [92]:
dataset_df

Unnamed: 0,user_input,reference_contexts,reference,synthesizer_name
0,What is FuriosaAI's role in running MLPerf Inf...,[* [.rst](../_sources/getting_started/furiosa_...,FuriosaAI Software Stack provides the `furiosa...,single_hop_specifc_query_synthesizer
1,Howw doo youu usee Furiosa LLM withh FuriosaAI...,[<1-hop>\n\n* [.rst](../_sources/getting_start...,To use Furiosa LLM with FuriosaAI NPU in a con...,multi_hop_abstract_query_synthesizer
2,What are the installation requirements for Fur...,[<1-hop>\n\nRunning Furiosa LLM in container e...,The installation requirements for Furiosa LLM ...,multi_hop_specific_query_synthesizer


In [10]:
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context, conditional
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# generator with openai models
generator_llm = ChatOpenAI(model="gpt-4o", temperature=0.2)
critic_llm = ChatOpenAI(model="gpt-4o", temperature=0.2)
embeddings = OpenAIEmbeddings()

generator = TestsetGenerator.from_langchain(
    generator_llm,
    critic_llm,
    embeddings
)

distributions = {
    simple: 1,
    # multi_context: 1,
    # reasoning: 0.35,
    # conditional: 0.2,
}

testset = generator.generate_with_llamaindex_docs(docs, 10, distributions) 
testset_df = testset.to_pandas()
testset_df.to_parquet("v2/qa_ragas_2.parquet")
testset_df.to_csv("v2/qa_ragas_2.csv")

ModuleNotFoundError: No module named 'ragas.testset.evolutions'