# QA 데이터셋 만들기 

In [3]:
from typing import List
from pydantic import BaseModel, Field, HttpUrl

class Page(BaseModel):
    id: str = Field(..., description="ID of the Page")
    link: HttpUrl = Field(description="Url link of the page")
    name: str = Field(description="Name of the page")
    parent: str = Field(default="", description="ID of the parent page")
    child: List[str] = Field(default=[], description="List of ids of the child pages")
    description: str = Field(default="", description="Description of the page")
    description_clean: str = Field(default="", description="Content markdown")
    html_content: str = Field(default="", description="HTML code of the main content in the page")

    def __hash__(self):
        return hash((self.link, self.name))

    def __eq__(self, other):
        if not isinstance(other, Page):
            return False
        return (self.link, self.name) == (other.link, other.name)

## DB 가지고 오기 

In [4]:
import json

data_dir = '/Users/jwlee-pro/Documents/Workspace_2025/projects/llm-rag-chatbot/data/db-rngd_sdk_v2.json'

with open(data_dir, "r") as f:
    data = json.load(f)
    final_pages = [Page.model_validate_json(page) for page in data["sdk"]]


In [5]:
final_pages[0]



In [25]:
# final_pages의 첫 번째 객체를 출력
print(final_pages[0])

# final_pages 전체 크기 확인
print(f"Total pages loaded: {len(final_pages)}")

Total pages loaded: 21


## QA 데이터셋 만들기 

In [17]:
import os

# openAI key 
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")

In [29]:
from markdownify import markdownify as md
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field

class QAPair(BaseModel):
    question: str = Field(description="Question generated by llm")
    answer: str = Field(description="Answer generated by llm")

output_parser = PydanticOutputParser(pydantic_object=QAPair)


# LLM 및 Prompt 설정
llm = ChatOpenAI(model="gpt-4o", temperature=0.2)

prompt = PromptTemplate(
    template="""As an expert in creating educational Question-Answer datasets, your task is to generate one high-quality QA pair based on the provided markdown content. 

Instructions:
1. Carefully analyze the markdown content in CONTENT section, identifying key concepts, details, and information.
2. Imagine you are a first-time visitor to a website and aim to create a challenging, abstract question that encourages deep engagement with the content.
3. Ensure that the question is specific enough that it can only be answered by referencing the given markdown.
4. Generate a concise, direct answer without introductory phrases like "The markdown says" or "Here is...".
5. Output **only one** QA pair.
6. {instructions}

Desired Format:
- Question: [Your abstract, content-specific question]
- Answer: [Your precise, context-reliant answer]

CONTENT:
{content}
""",
    partial_variables={"instructions": output_parser.get_format_instructions()},
    input_variables=["content"],
)

chain = prompt | llm | output_parser

# QA 생성 함수
def generate_qa(pages):
    qa_pair_list = []

    # 각 Page 객체에서 QA 생성
    for page in pages:
        markdown_content = page.description_clean  # 이미 Markdown 데이터 사용
        qa_pair = chain.invoke({"content": markdown_content})
        qa_pair_list.append(qa_pair)

    # QA 데이터프레임 생성
    questions = [pair.question for pair in qa_pair_list]
    answers = [pair.answer for pair in qa_pair_list]

    return questions, answers

In [18]:
from langchain.output_parsers import PydanticOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field

class QAPair(BaseModel):
    question: str = Field(description="Question generated by llm")
    answer: str = Field(description="Answer generated by llm")

output_parser = PydanticOutputParser(pydantic_object=QAPair)

In [24]:
from markdownify import markdownify as md
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from langchain.schema import BaseOutputParser

# Define a simple output parser
class SimpleOutputParser(BaseOutputParser):
    def get_format_instructions(self):
        return "Output should follow this format:\n- Question: [Your question]\n- Answer: [Your answer]"

    def parse(self, text: str):
        lines = text.strip().split("\n")
        question = lines[0].replace("Question: ", "").strip()
        answer = lines[1].replace("Answer: ", "").strip()
        return {"question": question, "answer": answer}

# Initialize the parser
output_parser = SimpleOutputParser()

# Initialize the LLM
llm = ChatOpenAI(model="gpt-4o", temperature=0.2)

# Define the prompt template
prompt = PromptTemplate(
    template="""As an expert in creating educational Question-Answer datasets, your task is to generate one high-quality QA pair based on the provided markdown content. 

Instructions:
1. Carefully analyze the markdown content in CONTENT section, identifying key concepts, details, and information.
2. Imagine you are a first-time visitor to a website and aim to create a challenging, abstract question that encourages deep engagement with the content.
3. Ensure that the question is specific enough that it can only be answered by referencing the given markdown.
4. Generate a concise, direct answer without introductory phrases like "The markdown says" or "Here is...".
5. Output **only one** QA pair.
6. {instructions}

Desired Format:
- Question: [Your abstract, content-specific question]
- Answer: [Your precise, context-reliant answer]

CONTENT:
{content}
""",
    input_variables=["content"],
    partial_variables={"instructions": output_parser.get_format_instructions()},
)

PydanticUserError: A non-annotated attribute was detected: `input_keys = ['content']`. All model fields require a type annotation; if `input_keys` is not meant to be a field, you may be able to resolve this error by annotating it as a `ClassVar` or updating `model_config['ignored_types']`.

For further information visit https://errors.pydantic.dev/2.10/u/model-field-missing-annotation

In [19]:
from markdownify import markdownify as md
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate

llm = ChatOpenAI(model="gpt-4o", temperature=0.2)

prompt = PromptTemplate(template="""As an expert in creating educational Question-Answer datasets, your task is to generate one high-quality QA pair based on the provided markdown content. 

Instructions:
1. Carefully analyze the markdown content in CONTENT section, identifying key concepts, details, and information.
2. Imagine you are a first-time visitor to a website and aim to create a challenging, abstract question that encourages deep engagement with the content.
3. Ensure that the question is specific enough that it can only be answered by referencing the given markdown.
4. Generate a concise, direct answer without introductory phrases like "The markdown says" or "Here is...".
5. Output **only one** QA pair.
6. {instructions}

Desired Format:
- Question: [Your abstract, content-specific question]
- Answer: [Your precise, context-reliant answer]

CONTENT:
{content}
""",
    partial_variables={"instructions": output_parser.get_format_instructions()},
    input_variables=["content"],
)

chain = prompt | llm | output_parser

def generate_qa(df):
    qa_pair_list = []
    for _, row in df.iterrows():
        context_content = [md(page.html_content, strip="img") for page in row["contexts"]]
        qa_pair = chain.invoke({"content": context_content}) 
        qa_pair_list.append(qa_pair)
    questions = [pair.question for pair in qa_pair_list] 
    answers = [pair.answer for pair in qa_pair_list] 
    df["query"] = questions
    df["generation_gt"] = answers

AttributeError: type object 'QAPair' has no attribute 'model_json_schema'

## Generate QA of card page with RAGAS

In [11]:
import os
from dotenv import load_dotenv

load_dotenv()

os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_API_KEY"] = os.getenv("LANGCHAIN_API_KEY")
os.environ["LANGCHAIN_PROJECT"] = "FuriosaAI RNGD Ragas QA Dataset"

In [12]:
from markdownify import markdownify as md
from llama_index.core import Document 
def convert_page_to_llama_index_document(page: Page) -> Document:
    return Document(
        doc_id=page.id,
        metadata={
            "source": str(page.link),
            "title": page.name,
            "parent_doc_id": page.parent,
            "child_doc_ids": json.dumps(page.child),
        },
        text=page.description_clean,
    )    

docs = [convert_page_to_llama_index_document(page) for page in final_pages]
docs



In [13]:
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")

Wrapp the LLMs in LangchainLLMWrapper so that it can be used with ragas.

In [14]:
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings

# generator with openai models
generator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))
critic_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))
embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())

In [15]:
generator.generate_with_llamaindex_docs()

NameError: name 'generator' is not defined

In [34]:
from ragas.testset import TestsetGenerator

generator = TestsetGenerator.from_langchain(
    generator_llm,
    critic_llm,
    embeddings
)

# distributions = {
#     simple: 1,
#     # multi_context: 1,
#     # reasoning: 0.35,
#     # conditional: 0.2,
# }

testset = generator.generate_with_llamaindex_docs(docs, 10) 


NameError: name 'LCDocument' is not defined

## Generate Testset
Now we will run the test generation using the loaded documents and the LLM setup. If you have used llama_index to load documents, please use generate_with_llama_index_docs method instead.

In [28]:
dataset = generator.generate_with_llamaindex_docs(docs, testset_size=3)
dataset

NameError: name 'LCDocument' is not defined

In [26]:
from ragas.testset import TestsetGenerator

generator = TestsetGenerator(llm=generator_llm, embedding_model=generator_embeddings)
dataset = generator.generate_with_langchain_docs(docs, testset_size=10)
dataset

AttributeError: 'Document' object has no attribute 'page_content'

In [15]:
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# generator with openai models
generator_llm = ChatOpenAI(model="gpt-3.5-turbo-16k")
critic_llm = ChatOpenAI(model="gpt-4")
embeddings = OpenAIEmbeddings()

generator = TestsetGenerator.from_langchain(
    generator_llm,
    critic_llm,
    embeddings
)

# generate testset
testset = generator.generate_with_langchain_docs(docs, test_size=10, distributions={simple: 0.5, reasoning: 0.25, multi_context: 0.25})

ModuleNotFoundError: No module named 'ragas.testset.generator'

In [13]:
from ragas.testset

SyntaxError: invalid syntax (3248346490.py, line 1)

In [11]:
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context, conditional

ModuleNotFoundError: No module named 'ragas.testset.evolutions'

In [10]:
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context, conditional
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# documents = load your documents

# generator with openai models
generator_llm = ChatOpenAI(model="gpt-4o", temperature=0.2)
critic_llm = ChatOpenAI(model="gpt-4o", temperature=0.2)
embeddings = OpenAIEmbeddings()

generator = TestsetGenerator.from_langchain(
    generator_llm,
    critic_llm,
    embeddings
)

distributions = {
    simple: 1,
    # multi_context: 1,
    # reasoning: 0.35,
    # conditional: 0.2,
}

testset = generator.generate_with_llamaindex_docs(docs, 10, distributions) 
testset_df = testset.to_pandas()
testset_df.to_parquet("v2/qa_ragas_2.parquet")
testset_df.to_csv("v2/qa_ragas_2.csv")

ModuleNotFoundError: No module named 'ragas.testset.evolutions'

In [16]:
from glob import glob
import pandas as pd

files = glob("./v2/shinhan*.csv")
merged_csv = pd.concat([pd.read_csv(file, index_col=0) for file in files])
merged_csv.to_csv("v2/merged_shinhan_card_qa_ragas.csv")

## 셀프 QA 데이터 생성
- 재귀적으로 페이지를 순회하면서 qa 데이터셋을 만들 context 수집
- 최상단 페이지는 무조건 포함
- 같은 부모 페이지로부터 나온 자식 페이지에서, 1~len(자식 페이지) 크기로 묶음. 묶인 페이지들을 multi-context 카테고리로 질문 생성

In [5]:
import random

def make_retrieval_set_recursively(root_page: Page, pages: List[Page], contexts: list):
    sub_pages = [p for p in pages for cid in root_page.child if p.id == cid]
    if len(sub_pages) == 0:
        return
    selected_page_list = random.sample(sub_pages, random.choice([i+1 for i in range(int((len(sub_pages)/2)))]))
    selected_page = random.sample(sub_pages, 1)
    contexts.append(selected_page)
    contexts.append(selected_page_list)
    print(f"{root_page.name} -> {[p.name for p in sub_pages]}")
    print(len(selected_page_list))
    print(selected_page[0].name)
    print([page.name for page in selected_page_list])
    print("="*20)
    for p in sub_pages:
        make_retrieval_set_recursively(p, pages, contexts)

In [6]:
root_page = [page for page in card_final_pages if page.parent == ""][0]
card_contexts = [[root_page]]
make_retrieval_set_recursively(root_page, card_final_pages, card_contexts)

card-product -> ['consumer-debit-card', 'corporate-debit-card', 'corporate-credit-card', 'card-comparison', 'consumer-credit-card']
1
card-comparison
['consumer-debit-card']
consumer-debit-card -> ['shinhan-pwm-debit-mastercard', 'hutech-shinhan-debit-card', 'visa-international-shinhan-my-sol-debit-card', 'domestic-atm-card', 'shinhan-be-safe-debit-card', 'visa-international-classic-debit-card', 'visa-international-shinhan-lotte-mart-debit-card', 'visa-pwm-international-classic-debit-card']
1
domestic-atm-card
['shinhan-be-safe-debit-card']
corporate-debit-card -> ['korcham-shinhan-corporate-debit-card', 'kocham-shinhan-corporate-debit-card', 'shinhan-visa-corporate-debit-card']
1
korcham-shinhan-corporate-debit-card
['kocham-shinhan-corporate-debit-card']
corporate-credit-card -> ['kocham-shinhan-corporate-credit-card', 'shinhan-corporate-world-credit-card', 'shinhan-visa-corporate-credit-card', 'korean-air-shinhan-corporate-credit-card', 'bizzi-shinhan-corporate-credit-card', 'korcha

In [32]:
import pandas as pd

card_qa_df = pd.DataFrame({"contexts": card_contexts})

In [33]:
card_qa_df.head()

Unnamed: 0,contexts
0,[id='32f049c3-cb5e-464f-8d26-68b46b11bb68' lin...
1,[id='3c960359-94f0-42a4-882e-da4d24e6bb40' lin...
2,[id='55ecf8f8-66a4-40db-affb-a7a112080885' lin...
3,[id='19fff71d-2445-49eb-8a27-ec8d41f4f49d' lin...
4,[id='0c8c9ded-b2be-422e-9b11-a1938d6a7e44' lin...


In [1]:
from langchain.output_parsers import PydanticOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field

class QAPair(BaseModel):
    question: str = Field(description="Question generated by llm")
    answer: str = Field(description="Answer generated by llm")

output_parser = PydanticOutputParser(pydantic_object=QAPair)


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  exec(code_obj, self.user_global_ns, self.user_ns)


In [2]:
from markdownify import markdownify as md
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate

llm = ChatOpenAI(model="gpt-4o", temperature=0.2)

prompt = PromptTemplate(template="""As an expert in creating educational Question-Answer datasets, your task is to generate one high-quality QA pair based on the provided markdown content. 

Instructions:
1. Carefully analyze the markdown content in CONTENT section, identifying key concepts, details, and information.
2. Imagine you are a first-time visitor to a website and aim to create a challenging, abstract question that encourages deep engagement with the content.
3. Ensure that the question is specific enough that it can only be answered by referencing the given markdown.
4. Generate a concise, direct answer without introductory phrases like "The markdown says" or "Here is...".
5. Output **only one** QA pair.
6. {instructions}

Desired Format:
- Question: [Your abstract, content-specific question]
- Answer: [Your precise, context-reliant answer]

CONTENT:
{content}
""",
    partial_variables={"instructions": output_parser.get_format_instructions()},
    input_variables=["content"],
)

chain = prompt | llm | output_parser

def generate_qa(df):
    qa_pair_list = []
    for _, row in df.iterrows():
        context_content = [md(page.html_content, strip="img") for page in row["contexts"]]
        qa_pair = chain.invoke({"content": context_content}) 
        qa_pair_list.append(qa_pair)
    questions = [pair.question for pair in qa_pair_list] 
    answers = [pair.answer for pair in qa_pair_list] 
    df["query"] = questions
    df["generation_gt"] = answers

OpenAIError: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable

In [36]:
generate_qa(card_qa_df)

In [38]:
card_qa_df.to_csv("v2/card_qa_sample.csv", encoding="utf-8")

## Sampling
- 지금까지 만들었던 qa 데이터 중에서 괜찮은 질문 추출