In [2]:
from typing import List
from pydantic import BaseModel, Field, HttpUrl

class Page(BaseModel):
    """A web page record: link, name, parent/child page IDs, description and HTML.

    Hashing and equality are based on ``(link, name)`` only, so two records
    with the same link and name are treated as the same page even when their
    ``id`` or content fields differ.
    """

    id: str = Field(..., description="ID of the Page")
    link: HttpUrl = Field(description="Url link of the page")
    name: str = Field(description="Name of the page")
    parent: str = Field(default="", description="ID of the parent page")
    child: List[str] = Field(default=[], description="List of ids of the child pages")
    description: str = Field(default="", description="Description of the page")
    html_content: str = Field(default="", description="HTML code of the main content in the page")

    def __hash__(self) -> int:
        """Hash on the same ``(link, name)`` key that ``__eq__`` compares."""
        return hash((self.link, self.name))

    def __eq__(self, other: object) -> bool:
        """Return True when ``other`` is a ``Page`` with the same link and name."""
        if not isinstance(other, Page):
            # Let Python try the reflected comparison (and finally fall back to
            # identity) instead of unconditionally answering False.
            return NotImplemented
        return (self.link, self.name) == (other.link, other.name)

In [3]:
import json

def _read_json(path: str) -> dict:
    """Return the parsed JSON content of ``path``."""
    # JSON is UTF-8 by specification; do not depend on the platform default encoding.
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def _parse_pages(raw_pages) -> List[Page]:
    """Validate each JSON-encoded page in ``raw_pages`` into a ``Page``."""
    return [Page.model_validate_json(raw_page) for raw_page in raw_pages]


# Each file is read once and closed before its pages are parsed.
deposit_data = _read_json("deposit_page_with_description.json")
personal_deposit_final_pages = _parse_pages(deposit_data["personal_deposit"])

loan_data = _read_json("loan_page_with_description.json")
personal_loan_final_pages = _parse_pages(loan_data["personal_loan"])

card_data = _read_json("card_page_with_description.json")
card_product_final_pages = _parse_pages(card_data["card_product"])
offers_and_services_final_pages = _parse_pages(card_data["offers_and_services"])
card_promotion_final_pages = _parse_pages(card_data["card_promotion"])

# Plain concatenation keeps the original section order without the quadratic
# behaviour of sum(list_of_lists, []).
card_final_pages = (
    card_product_final_pages
    + offers_and_services_final_pages
    + card_promotion_final_pages
)

In [3]:
# Spot-check the first parsed card page.
card_final_pages[0]

Page(id='beebee06-ad16-40e3-b3a0-59fba08fe81e', link=Url('https://shinhan.com.vn/en/card/kocham-shinhan-corporate-credit-card.html'), name='kocham-shinhan-corporate-credit-card', parent='be56b4cc-b45e-4b8b-9e2a-f0b3d84ce949', child=[], description="The Shinhan Visa Corporate Credit Card is a credit card service offered by Shinhan Bank Vietnam, designed for corporations, organizations, and institutions. The card provides various benefits, including exclusive privileges, special offers, and a convenient bill payment service.\n\nTo be eligible for the card, the applicant must meet certain criteria, such as having a corporate bank account at Shinhan Bank Vietnam and being a member of the Korean Chamber of Commerce in Vietnam (KOCHAM).\n\nThe required documents for application include:\n\n* Card Application\n* Business License/Investment License\n* Tax Code Certificate\n* Seal & Registered Stamp certificate\n* The Company's Charter\n* CEO's ID card or passport\n* Delegator's ID card or pass

## Generate QA pairs for card pages with RAGAS

In [4]:
import os
from dotenv import load_dotenv

load_dotenv()

# load_dotenv() has already exported LANGCHAIN_API_KEY from .env (when present),
# so the key only needs validating. Assigning os.getenv(...) back into
# os.environ was a no-op when the key existed and raised an opaque
# "TypeError: str expected, not NoneType" when it did not.
if os.getenv("LANGCHAIN_API_KEY") is None:
    raise RuntimeError(
        "LANGCHAIN_API_KEY is not set; add it to .env or export it before running this notebook."
    )

# Send LangChain traces to the project dedicated to this dataset.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = "Shinhan Ragas QA Dataset"

In [5]:
from markdownify import markdownify as md
from llama_index.core import Document 
def convert_page_to_llama_index_document(page: Page) -> Document:
    """Build a llama-index ``Document`` from ``page``.

    The page HTML is converted to markdown with ``img`` tags stripped and used
    as the document text. The link, name, parent id, JSON-encoded child ids and
    description are attached as metadata; the page id becomes the ``doc_id``.
    """
    markdown_text = md(page.html_content, strip=["img"])
    page_metadata = {
        "source": str(page.link),
        "title": page.name,
        "parent_doc_id": page.parent,
        # Child ids are stored as one JSON string rather than a list.
        "child_doc_ids": json.dumps(page.child),
        "description": page.description,
    }
    return Document(doc_id=page.id, metadata=page_metadata, text=markdown_text)

# One llama-index Document per card page, in the same order as card_final_pages.
docs = list(map(convert_page_to_llama_index_document, card_final_pages))

In [15]:
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context, conditional
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Generator and critic use the same OpenAI chat model and temperature.
generator_llm = ChatOpenAI(model="gpt-4o", temperature=0.2)
critic_llm = ChatOpenAI(model="gpt-4o", temperature=0.2)
embeddings = OpenAIEmbeddings()

generator = TestsetGenerator.from_langchain(
    generator_llm,
    critic_llm,
    embeddings
)

# Only "simple" questions are generated; the other evolution types are kept
# here, disabled, for reference.
distributions = {
    simple: 1,
    # multi_context: 1,
    # reasoning: 0.35,
    # conditional: 0.2,
}

# Number of QA samples requested per run.
TESTSET_SIZE = 10

testset = generator.generate_with_llamaindex_docs(docs, TESTSET_SIZE, distributions)
testset_df = testset.to_pandas()

# Make sure the output directory exists so a fresh checkout does not fail on write.
os.makedirs("v2", exist_ok=True)
testset_df.to_parquet("v2/shinhan_card_qa_ragas_2.parquet")
testset_df.to_csv("v2/shinhan_card_qa_ragas_2.csv")

embedding nodes:   0%|          | 0/256 [00:00<?, ?it/s]

Filename and doc_id are the same for all nodes.


Generating:   0%|          | 0/10 [00:00<?, ?it/s]

In [16]:
from glob import glob
import pandas as pd

# glob() returns matches in arbitrary order; sort so the merged file is
# identical from run to run.
files = sorted(glob("./v2/shinhan*.csv"))
if not files:
    raise FileNotFoundError(
        "No files match ./v2/shinhan*.csv; run the RAGAS generation cell first."
    )

# Every input CSV carries its own 0..n-1 index, so rebuild a unique index
# instead of keeping duplicate labels in the merged frame.
merged_csv = pd.concat(
    [pd.read_csv(file, index_col=0) for file in files],
    ignore_index=True,
)
# The "merged_" prefix keeps this output outside the "shinhan*" glob above, so
# re-running the cell does not merge the previous result into itself.
merged_csv.to_csv("v2/merged_shinhan_card_qa_ragas.csv")

## 셀프 QA 데이터 생성
- 재귀적으로 페이지를 순회하면서 qa 데이터셋을 만들 context 수집
- 최상단 페이지는 무조건 포함
- 같은 부모 페이지로부터 나온 자식 페이지에서, 1~len(자식 페이지) 크기로 묶음. 묶인 페이지들을 multi-context 카테고리로 질문 생성

In [5]:
import random

def make_retrieval_set_recursively(root_page: "Page", pages: List["Page"], contexts: list):
    """Collect random retrieval contexts by walking the page tree below ``root_page``.

    For every page that has children among ``pages``, two entries are appended
    to ``contexts``: a single randomly chosen child (as a one-element list) and
    a random group of between 1 and ``len(children) // 2`` children (always at
    least one). The function then recurses into every child.

    NOTE(review): the notebook text describes group sizes of up to
    ``len(children)``, while the code caps them at half; the cap is preserved
    here. Confirm which is intended.

    Args:
        root_page: Page whose ``child`` ids select the sub-pages to sample.
        pages: All candidate pages; children are looked up here by ``id``.
        contexts: Output list, extended in place with lists of pages.

    Returns:
        None. Results are written into ``contexts``.
    """
    # Set membership keeps the order of ``pages`` and avoids the nested
    # pages x children scan. A child id listed twice now yields a single
    # sub-page instead of a duplicate.
    child_ids = set(root_page.child)
    sub_pages = [p for p in pages if p.id in child_ids]
    if not sub_pages:
        return

    # With exactly one child, len // 2 is 0 and the original random.choice([])
    # raised IndexError; always allow a group of at least one page.
    max_group_size = max(1, len(sub_pages) // 2)
    group_size = random.choice(range(1, max_group_size + 1))
    selected_page_list = random.sample(sub_pages, group_size)
    selected_page = random.sample(sub_pages, 1)
    contexts.append(selected_page)
    contexts.append(selected_page_list)

    # Trace output for inspecting what was sampled at each level.
    print(f"{root_page.name} -> {[p.name for p in sub_pages]}")
    print(len(selected_page_list))
    print(selected_page[0].name)
    print([page.name for page in selected_page_list])
    print("="*20)

    for p in sub_pages:
        make_retrieval_set_recursively(p, pages, contexts)

In [6]:
# Seed the global RNG so the sampled contexts are reproducible across runs.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# A root page is one without a parent id.
# NOTE(review): only the first root is traversed; if card_final_pages contains
# several parentless pages, the others are skipped. Confirm that is intended.
root_candidates = [page for page in card_final_pages if page.parent == ""]
if not root_candidates:
    raise IndexError("card_final_pages contains no root page (a page whose parent is '').")
root_page = root_candidates[0]

# The root page itself is always the first context.
card_contexts = [[root_page]]
make_retrieval_set_recursively(root_page, card_final_pages, card_contexts)

card-product -> ['consumer-debit-card', 'corporate-debit-card', 'corporate-credit-card', 'card-comparison', 'consumer-credit-card']
1
card-comparison
['consumer-debit-card']
consumer-debit-card -> ['shinhan-pwm-debit-mastercard', 'hutech-shinhan-debit-card', 'visa-international-shinhan-my-sol-debit-card', 'domestic-atm-card', 'shinhan-be-safe-debit-card', 'visa-international-classic-debit-card', 'visa-international-shinhan-lotte-mart-debit-card', 'visa-pwm-international-classic-debit-card']
1
domestic-atm-card
['shinhan-be-safe-debit-card']
corporate-debit-card -> ['korcham-shinhan-corporate-debit-card', 'kocham-shinhan-corporate-debit-card', 'shinhan-visa-corporate-debit-card']
1
korcham-shinhan-corporate-debit-card
['kocham-shinhan-corporate-debit-card']
corporate-credit-card -> ['kocham-shinhan-corporate-credit-card', 'shinhan-corporate-world-credit-card', 'shinhan-visa-corporate-credit-card', 'korean-air-shinhan-corporate-credit-card', 'bizzi-shinhan-corporate-credit-card', 'korcha

In [32]:
import pandas as pd

# One row per retrieval set; each "contexts" cell holds a list of Page objects.
card_qa_df = pd.DataFrame(dict(contexts=card_contexts))

In [33]:
# Preview the first rows of the contexts table.
card_qa_df.head()

Unnamed: 0,contexts
0,[id='32f049c3-cb5e-464f-8d26-68b46b11bb68' lin...
1,[id='3c960359-94f0-42a4-882e-da4d24e6bb40' lin...
2,[id='55ecf8f8-66a4-40db-affb-a7a112080885' lin...
3,[id='19fff71d-2445-49eb-8a27-ec8d41f4f49d' lin...
4,[id='0c8c9ded-b2be-422e-9b11-a1938d6a7e44' lin...


In [34]:
from langchain.output_parsers import PydanticOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field

# Schema for a single question/answer pair produced by the LLM.
# NOTE(review): documented with comments rather than a class docstring, because a
# docstring would presumably be copied into the model's JSON schema and therefore
# into the format instructions sent to the LLM. Confirm before converting.
class QAPair(BaseModel):
    question: str = Field(description="Question generated by llm")
    answer: str = Field(description="Answer generated by llm")

# Parser built from the QAPair schema; it also supplies the format instructions
# that are injected into the prompt.
output_parser = PydanticOutputParser(pydantic_object=QAPair)

In [35]:
from markdownify import markdownify as md
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate

# Chat model used for QA generation.
llm = ChatOpenAI(model="gpt-4o", temperature=0.2)

# Prompt text with two placeholders: {instructions} (parser format instructions)
# and {content} (the markdown to ask about).
# NOTE(review): the "Desired Format" bullets may conflict with the parser's
# format instructions injected as item 6. Confirm the model output still parses
# reliably.
QA_PROMPT_TEMPLATE = """As an expert in creating educational Question-Answer datasets, your task is to generate one high-quality QA pair based on the provided markdown content. 

Instructions:
1. Carefully analyze the markdown content in CONTENT section, identifying key concepts, details, and information.
2. Imagine you are a first-time visitor to a website and aim to create a challenging, abstract question that encourages deep engagement with the content.
3. Ensure that the question is specific enough that it can only be answered by referencing the given markdown.
4. Generate a concise, direct answer without introductory phrases like "The markdown says" or "Here is...".
5. Output **only one** QA pair.
6. {instructions}

Desired Format:
- Question: [Your abstract, content-specific question]
- Answer: [Your precise, context-reliant answer]

CONTENT:
{content}
"""

format_instructions = output_parser.get_format_instructions()

prompt = PromptTemplate(
    template=QA_PROMPT_TEMPLATE,
    input_variables=["content"],
    partial_variables={"instructions": format_instructions},
)

# prompt -> model -> parsed QAPair
chain = prompt | llm | output_parser

def generate_qa(df):
    """Generate one QA pair per row of ``df`` and store the results in place.

    For each row, the HTML of every page in the ``contexts`` cell is converted
    to markdown, the pages are joined into one document, and the QA chain is
    invoked on it.

    Args:
        df: DataFrame with a ``contexts`` column whose cells are iterables of
            objects exposing ``html_content``.

    Returns:
        None. ``df`` gains two columns: ``query`` (questions) and
        ``generation_gt`` (answers).
    """
    questions, answers = [], []
    for pages in df["contexts"]:
        # ``strip`` must be a list of tag names. markdownify tests ``tag in strip``,
        # so the bare string "img" also matched tags such as <i> by substring.
        page_markdowns = [md(page.html_content, strip=["img"]) for page in pages]
        # Pass a single markdown string. A list would be rendered into the prompt
        # as a Python list repr with quoted items and escaped newlines.
        content = "\n\n---\n\n".join(page_markdowns)
        qa_pair = chain.invoke({"content": content})
        questions.append(qa_pair.question)
        answers.append(qa_pair.answer)
    df["query"] = questions
    df["generation_gt"] = answers

In [36]:
# Fill card_qa_df in place with "query" and "generation_gt" columns (one chain call per row).
generate_qa(card_qa_df)

In [38]:
# Save the QA table as a UTF-8 CSV.
# NOTE(review): the "contexts" column holds lists of Page objects, so it is
# presumably written as their string repr and will not round-trip on reload.
# Confirm whether that is intended.
card_qa_df.to_csv("v2/card_qa_sample.csv", encoding="utf-8")

## Sampling
- 지금까지 만들었던 qa 데이터 중에서 괜찮은 질문 추출