# QA 데이터셋 만들기 

In [11]:
from dotenv import load_dotenv
load_dotenv()

True

In [1]:
from typing import List
from pydantic import BaseModel, Field, HttpUrl

class Page(BaseModel):
    id: str = Field(..., description="ID of the Page")
    link: HttpUrl = Field(description="Url link of the page")
    name: str = Field(description="Name of the page")
    parent: str = Field(default="", description="ID of the parent page")
    child: List[str] = Field(default=[], description="List of ids of the child pages")
    description: str = Field(default="", description="Description of the page")
    description_clean: str = Field(default="", description="Content markdown")
    html_content: str = Field(default="", description="HTML code of the main content in the page")

    def __hash__(self):
        return hash((self.link, self.name))

    def __eq__(self, other):
        if not isinstance(other, Page):
            return False
        return (self.link, self.name) == (other.link, other.name)

## DB 가지고 오기 

In [2]:
import json

data_dir = '/Users/jwlee-pro/Documents/Workspace_2025/projects/llm-rag-chatbot/data/db-warboy_sdk_v1.json'

with open(data_dir, "r") as f:
    data = json.load(f)
    final_pages = [Page.model_validate_json(page) for page in data["sdk"]]


In [3]:
final_pages[0]



In [4]:
# final_pages의 첫 번째 객체를 출력
print(final_pages[0])

# final_pages 전체 크기 확인
print(f"Total pages loaded: {len(final_pages)}")

Total pages loaded: 23


## QA 데이터셋 만들기 

In [13]:
import os

# openAI key 
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")

version #1

In [60]:
from markdownify import markdownify as md
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field
import pandas as pd

class QAPair(BaseModel):
    question: str = Field(description="Question generated by llm")
    answer: str = Field(description="Answer generated by llm")

output_parser = PydanticOutputParser(pydantic_object=QAPair)


# LLM 및 Prompt 설정
llm = ChatOpenAI(model="gpt-4o", temperature=0.2)

prompt = PromptTemplate(
    template="""As an expert in creating educational Question-Answer datasets, your task is to generate one high-quality QA pair based on the provided markdown content. 

Instructions:
1. Carefully analyze the markdown content in CONTENT section, identifying key concepts, details, and information.
2. Imagine you are a first-time visitor to a website and aim to create a challenging, abstract question that encourages deep engagement with the content.
3. Ensure that the question is specific enough that it can only be answered by referencing the given markdown.
4. Generate a concise, direct answer without introductory phrases like "The markdown says" or "Here is...".
5. Output **only one** QA pair.
6. {instructions}

Desired Format:
- Question: [Your abstract, content-specific question]
- Answer: [Your precise, context-reliant answer]

CONTENT:
{content}
""",
    partial_variables={"instructions": output_parser.get_format_instructions()},
    input_variables=["content"],
)

chain = prompt | llm | output_parser

def generate_qa(pages, qa_per_page=10):
    """
    Generate QA pairs for each page and save them into a DataFrame.

    Args:
        pages (List[Page]): List of Page objects.
        qa_per_page (int): Number of QA pairs to generate per page.

    Returns:
        pd.DataFrame: DataFrame containing generated QA pairs.
    """
    data = []

    for page in pages:
        markdown_content = md(page.html_content, strip=["img"])  # Convert HTML to markdown, stripping <img> tags

        # Generate QA pairs for each page
        for _ in range(qa_per_page):
            try:
                qa_pair = chain.invoke({"content": markdown_content})
                data.append({
                    "page_id": page.id,
                    "link": str(page.link),
                    "question": qa_pair.question,
                    "answer": qa_pair.answer,
                })
            except Exception as e:
                print(f"Error generating QA for page {page.id}: {e}")

    # Convert data to DataFrame
    df = pd.DataFrame(data)
    return df

version #3
- 중복 방지 
- html -> 마크다운 

In [14]:
from markdownify import markdownify as md
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field
import pandas as pd


class QAPair(BaseModel):
    question: str = Field(description="Question generated by llm")
    answer: str = Field(description="Answer generated by llm")


output_parser = PydanticOutputParser(pydantic_object=QAPair)

# LLM 및 Prompt 설정
llm = ChatOpenAI(model="gpt-4o", temperature=0.2)

prompt = PromptTemplate(template="""
As an expert in creating educational Question-Answer datasets, your task is to generate one high-quality QA pair based on the provided markdown content.

Instructions:
1. Carefully analyze the markdown content in CONTENT section, identifying key concepts, details, and information.
2. Avoid generating a question that is similar to any previously generated questions listed in PREVIOUS_QUESTIONS.
3. Imagine you are a first-time visitor to a website and aim to create a challenging, abstract question that encourages deep engagement with the content.
4. Ensure that the question is specific enough that it can only be answered by referencing the given markdown.
5. Generate a concise, direct answer without introductory phrases like "The markdown says" or "Here is...".
6. Output the result in the following JSON format:

```json
{{
    "question": "Your abstract, content-specific question",
    "answer": "Your precise, context-reliant answer"
}}
                        
CONTENT:
{content}

PREVIOUS_QUESTIONS:
{previous_questions}
""",
    input_variables=["content", "previous_questions"],
)

chain = prompt | llm | output_parser

def generate_qa(pages, qa_per_page=10):
    """
    Generate QA pairs for each page and save them into a DataFrame.
    Args:
        pages (List[Page]): List of Page objects.
        qa_per_page (int): Number of QA pairs to generate per page.

    Returns:
        pd.DataFrame: DataFrame containing generated QA pairs.
    """
    data = []

    for page in pages:
        markdown_content = md(page.html_content, strip=["img"])  # Convert HTML to markdown, stripping <img> tags
        previous_questions = set()

        # Generate QA pairs for each page
        for _ in range(qa_per_page):
            try:
                previous_questions_str = "\n".join(previous_questions) if previous_questions else "None"
                qa_pair = chain.invoke({
                    "content": markdown_content,
                    "previous_questions": previous_questions_str,
                })

                # 중복된 질문 방지
                if qa_pair.question in previous_questions:
                    print(f"Duplicate question detected and skipped: {qa_pair.question}")
                    continue

                # 데이터 추가
                data.append({
                    "page_id": page.id,
                    "link": str(page.link),
                    "question": qa_pair.question,
                    "answer": qa_pair.answer,
                })

                # 현재 질문 저장
                previous_questions.add(qa_pair.question)
            except Exception as e:
                print(f"Error generating QA for page {page.id}: {e}")

    # Convert data to DataFrame
    df = pd.DataFrame(data)
    return df


자 이제 QA 데이터셋 생성해보자. 

In [15]:
# Example Usage
qa_per_page = 3  # Number of QA pairs per page
qa_df = generate_qa(final_pages, qa_per_page)
qa_df

Unnamed: 0,page_id,link,question,answer
0,cf227685-cc4e-420e-b21a-e7da166093e5,https://furiosa-ai.github.io/docs/latest/en/so...,What steps are necessary to ensure that a Warb...,"First, enable IOMMU in both BIOS and Linux OS...."
1,cf227685-cc4e-420e-b21a-e7da166093e5,https://furiosa-ai.github.io/docs/latest/en/so...,What is the significance of the PCI BDF in con...,"The PCI BDF (Bus, Device, Function) is a uniqu..."
2,cf227685-cc4e-420e-b21a-e7da166093e5,https://furiosa-ai.github.io/docs/latest/en/so...,What are the specific BIOS and Linux OS config...,"In BIOS, IOMMU and VT-x should be enabled. In ..."
3,333851a4-2ea4-4903-87a2-0d50943faf1f,https://furiosa-ai.github.io/docs/latest/en/so...,How does the optimization of the 'Quantize' op...,The optimization of the 'Quantize' operator us...
4,333851a4-2ea4-4903-87a2-0d50943faf1f,https://furiosa-ai.github.io/docs/latest/en/so...,What are the key considerations and strategies...,Optimizing inference performance in a producti...
...,...,...,...,...
64,f6a8c152-6f4c-4ac0-98ae-fbe3093522c6,https://furiosa-ai.github.io/docs/latest/en/cu...,What steps should be taken if a model compilat...,File a bug report at the FuriosaAI customer se...
65,f6a8c152-6f4c-4ac0-98ae-fbe3093522c6,https://furiosa-ai.github.io/docs/latest/en/cu...,What is the default action recommended by Furi...,File a report to the Bug Report section of the...
66,ca538c67-ce79-412f-bd1b-c0f73dd818e7,https://furiosa-ai.github.io/docs/latest/en/re...,What new support features were added to the Fu...,The 0.7.0 release added Linear and Nearest mod...
67,ca538c67-ce79-412f-bd1b-c0f73dd818e7,https://furiosa-ai.github.io/docs/latest/en/re...,What functionality does the compiler cache int...,The compiler cache stores compiled binaries in...


In [46]:
# data_dir = '/Users/jwlee-pro/Documents/Workspace_2025/projects/llm-rag-chatbot/data'
# file_dir = f'{data_dir}/qa-rngd_sdk.csv'
# qa_df.to_csv(file_dir, encoding="utf-8")

In [47]:
# def remove_duplicates(qa_df):
#     # 중복된 질문 제거
#     qa_df = qa_df.drop_duplicates(subset=["question"], keep="first")
#     return qa_df

# remove_duplicates(qa_df)
# file_dir = f'{data_dir}/qa-rngd_sdk_v2.csv'
# qa_df.to_csv(file_dir, encoding="utf-8")

In [16]:
data_dir = '/Users/jwlee-pro/Documents/Workspace_2025/projects/llm-rag-chatbot/data'
file_dir = f'{data_dir}/qa-warboy_sdk_v3.csv'
qa_df.to_csv(file_dir, encoding="utf-8")

## Generate QA of card page with RAGAS

* Note: I think generating QA pair with RAGAS has low quality (warning)

In [66]:
import os
from dotenv import load_dotenv

load_dotenv()

os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_API_KEY"] = os.getenv("LANGCHAIN_API_KEY")
os.environ["LANGCHAIN_PROJECT"] = "FuriosaAI RNGD Ragas QA Dataset"

In [87]:
from markdownify import markdownify as md
from llama_index.core import Document 

class CustomDocument(Document):
    page_content: str = Field(default="", description="Additional content for the document")

def convert_page_to_llama_index_document(page: Page) -> CustomDocument:
    return CustomDocument(
        doc_id=page.id,
        metadata={
            "source": str(page.link),
            "title": page.name,
            "parent_doc_id": page.parent,
            "child_doc_ids": json.dumps(page.child),
        },
        text=page.description_clean,  # 기본 text
        page_content=page.description_clean,  # 추가 속성
    )

docs = [convert_page_to_llama_index_document(page) for page in final_pages]
docs



In [68]:
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")

Wrapp the LLMs in LangchainLLMWrapper so that it can be used with ragas.

In [69]:
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
generator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))
generator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())

In [82]:
docs[0]



In [93]:
from ragas.testset import TestsetGenerator

generator = TestsetGenerator(llm=generator_llm, embedding_model=generator_embeddings)
dataset = generator.generate_with_langchain_docs(docs, testset_size=50)

Applying HeadlineSplitter:   0%|          | 0/21 [00:00<?, ?it/s]           unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
Applying SummaryExtractor:  41%|████      | 11/27 [00:03<00:02,  6.53it/s]Property 'summary' already exists in node 'fba6c3'. Skipping!
Applying SummaryExtractor:  44%|████▍     | 12/27 [00:03<00:03,  4.75it/s]Property 'summary' already exists in node '5d2c26'. Skipping!
Applying SummaryExtractor:  48%|████▊     | 13/27 [00:03<00:03,  4.38it/s]Property 'summary' already exists in node '8bf36e'. Skipping!
Property 'summary' already exists in node '4c3e39'. Skipping!
Applying SummaryExtractor:  59%|█████▉    | 16/27 [00:04<00:01,  5.89it/s]Property 'summary' already exists in node '0b7164'. Skipping!
Property 'summ

In [94]:
# save data
file_dir = f'{data_dir}/qa-warboy_sdk_ragas_v2'

dataset_df = dataset.to_pandas()
dataset_df.to_parquet(f"{file_dir}.parquet")
dataset_df.to_csv(f"{file_dir}.csv")

In [92]:
dataset_df

Unnamed: 0,user_input,reference_contexts,reference,synthesizer_name
0,What is FuriosaAI's role in running MLPerf Inf...,[* [.rst](../_sources/getting_started/furiosa_...,FuriosaAI Software Stack provides the `furiosa...,single_hop_specifc_query_synthesizer
1,Howw doo youu usee Furiosa LLM withh FuriosaAI...,[<1-hop>\n\n* [.rst](../_sources/getting_start...,To use Furiosa LLM with FuriosaAI NPU in a con...,multi_hop_abstract_query_synthesizer
2,What are the installation requirements for Fur...,[<1-hop>\n\nRunning Furiosa LLM in container e...,The installation requirements for Furiosa LLM ...,multi_hop_specific_query_synthesizer


In [10]:
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context, conditional
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# generator with openai models
generator_llm = ChatOpenAI(model="gpt-4o", temperature=0.2)
critic_llm = ChatOpenAI(model="gpt-4o", temperature=0.2)
embeddings = OpenAIEmbeddings()

generator = TestsetGenerator.from_langchain(
    generator_llm,
    critic_llm,
    embeddings
)

distributions = {
    simple: 1,
    # multi_context: 1,
    # reasoning: 0.35,
    # conditional: 0.2,
}

testset = generator.generate_with_llamaindex_docs(docs, 10, distributions) 
testset_df = testset.to_pandas()
testset_df.to_parquet("v2/qa_ragas_2.parquet")
testset_df.to_csv("v2/qa_ragas_2.csv")

ModuleNotFoundError: No module named 'ragas.testset.evolutions'