# Amazon Bedrock 기반으로 RAGAS 시작하기
- [필수 사항] 이 노트북을 실행하기 전에 setup/README.md를 참고하여 "가상 환경"을 먼저 설치하고, 이 가상 환경을 커널로 설정한 후에 진행하세요.
- 참고 
    - RAGAS Git Repo: [Supercharge Your LLM Application Evaluations](https://github.com/explodinggradients/ragas)

### 환경 확인
- 설치된 패키지 버전이 아래 출력과 일치해야 합니다.

In [7]:
 ! pip list | grep -E "ragas|pydantic|langchain"

langchain                0.3.17
langchain-aws            0.2.11
langchain-community      0.3.16
langchain-core           0.3.33
langchain-openai         0.3.3
langchain-text-splitters 0.3.5
pydantic                 2.10.6
pydantic_core            2.27.2
pydantic-settings        2.7.1
ragas                    0.2.12


## ragas 래핑 모델 생성

In [8]:
import boto3
from datasets import Dataset

from langchain_aws import ChatBedrockConverse
from ragas import evaluate


from ragas.metrics import (
    Faithfulness,
    AnswerRelevancy,
    ContextRecall,
    ContextPrecision
)

from ragas.llms import LangchainLLMWrapper

# Bedrock runtime client, pinned to the us-west-2 region.
bedrock_client = boto3.client(service_name="bedrock-runtime", region_name="us-west-2")

# Chat model that talks to Bedrock through the client above, then wrapped in
# the RAGAS LLM wrapper so it can be passed to RAGAS metrics as `llm`.
bedrock_chat_model = ChatBedrockConverse(
    client=bedrock_client,
    model="anthropic.claude-3-5-haiku-20241022-v1:0",
)
evaluator_llm = LangchainLLMWrapper(bedrock_chat_model)



## 래핑 모델 테스트: 요약 정확도 확인

In [9]:
from ragas import SingleTurnSample
from ragas.metrics import AspectCritic

test_data = {
    "user_input": "summarise given text\nThe company reported an 8% rise in Q3 2024, driven by strong performance in the Asian market. Sales in this region have significantly contributed to the overall growth. Analysts attribute this success to strategic marketing and product localization. The positive trend in the Asian market is expected to continue into the next quarter.",
    "response": "The company experienced an 8% increase in Q3 2024, largely due to effective marketing strategies and product adaptation, with expectations of continued growth in the coming quarter.",
}

metric = AspectCritic(name="summary_accuracy",llm=evaluator_llm, definition="Verify if the summary is accurate.")
test_data = SingleTurnSample(**test_data)
print("test_data: \n", test_data)
await metric.single_turn_ascore(test_data)

test_data: 
 user_input='summarise given text\nThe company reported an 8% rise in Q3 2024, driven by strong performance in the Asian market. Sales in this region have significantly contributed to the overall growth. Analysts attribute this success to strategic marketing and product localization. The positive trend in the Asian market is expected to continue into the next quarter.' retrieved_contexts=None reference_contexts=None response='The company experienced an 8% increase in Q3 2024, largely due to effective marketing strategies and product adaptation, with expectations of continued growth in the coming quarter.' multi_responses=None reference=None rubrics=None


1

## ragas 래핑 임베딩 모델 생성

In [10]:
# BedrockEmbeddings is taken from langchain_aws (already used in this notebook
# for the chat model); the langchain_community class of the same name is
# deprecated in favour of it.
from langchain_aws import BedrockEmbeddings
from ragas.embeddings import LangchainEmbeddingsWrapper

# Bedrock embeddings model, created on the existing Bedrock runtime client.
base_embeddings = BedrockEmbeddings(
    client=bedrock_client,
    model_id="amazon.titan-embed-text-v1",  # or another embedding model
)

# Wrap with the RAGAS embeddings wrapper so it can be passed to RAGAS metrics.
embeddings_wrapper = LangchainEmbeddingsWrapper(base_embeddings)

## RAG 평가

### 데이터 셋 생성

In [11]:
def prepare_evaluation_dataset(examples):
    """Rename the fields of one raw example to the evaluation column names.

    Args:
        examples: Mapping with the keys "question", "generated_answer",
            "retrieved_contexts" and "ground_truth".

    Returns:
        A dict with the keys "question", "answer", "response", "contexts"
        and "ground_truth", in that order. "answer" and "response" both
        carry the generated answer.

    Raises:
        KeyError: If one of the expected keys is missing from a dict input.
    """
    row = {"question": examples["question"]}
    generated = examples["generated_answer"]
    # The generated answer is exposed under two column names.
    row["answer"] = generated
    row["response"] = generated
    row["contexts"] = examples["retrieved_contexts"]
    row["ground_truth"] = examples["ground_truth"]
    return row

# One hand-written example: a question, its reference answer, the retrieved
# context passages, and the answer that was generated.
data = [
    dict(
        question="파이썬이란 무엇인가요?",
        ground_truth="파이썬은 쉽고 간결한 프로그래밍 언어입니다.",
        retrieved_contexts=["파이썬은 프로그래밍 언어입니다.", "파이썬은 읽기 쉽고 간결합니다."],
        generated_answer="파이썬은 읽기 쉽고 간결한 프로그래밍 언어입니다.",
    )
]
dataset = Dataset.from_list(data)

# Convert to the column layout used for evaluation; the original columns are
# dropped so only the mapped ones remain.
original_columns = dataset.column_names
eval_dataset = dataset.map(prepare_evaluation_dataset, remove_columns=original_columns)
eval_dataset

Map: 100%|██████████| 1/1 [00:00<00:00, 288.05 examples/s]


Dataset({
    features: ['question', 'ground_truth', 'answer', 'response', 'contexts'],
    num_rows: 1
})

### 데이터셋 평가

In [12]:
# Score the evaluation dataset with four metrics: Faithfulness,
# AnswerRelevancy, ContextRecall and ContextPrecision. Every metric is judged
# by the evaluator LLM; AnswerRelevancy is also given the embeddings wrapper.
llm = evaluator_llm

metrics = [
    Faithfulness(llm=llm),
    AnswerRelevancy(llm=llm, embeddings=embeddings_wrapper),
    ContextRecall(llm=llm),
    ContextPrecision(llm=llm),
]

try:
    # Run the evaluation.
    results = evaluate(
        eval_dataset,
        metrics=metrics,
    )
except Exception as e:
    print(f"Error occurred: {str(e)}")
    print(f"Error type: {type(e)}")
    # Re-raise so the cell stops here. Otherwise the `results` line below
    # would fail with NameError, or silently display results left over from
    # an earlier run of this cell.
    raise

results

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

Evaluating: 100%|██████████| 4/4 [00:04<00:00,  1.16s/it]


{'faithfulness': 1.0000, 'answer_relevancy': 0.9028, 'context_recall': 1.0000, 'context_precision': 1.0000}