# 6. LangSmith를 이용한 RAG 앱 평가하기

## ⑷ Ragas를 통한 합성 테스트 데이터 생성

In [2]:
import os
from google.colab import userdata

# Export credentials and LangSmith settings as environment variables.
# Both API keys are read from the Colab `userdata` store rather than being
# hard-coded in the notebook.
os.environ["OPENAI_API_KEY"] = userdata.get("OPENAI_API_KEY")
# LangSmith tracing configuration (endpoint, key and project name).
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_API_KEY"] = userdata.get("LANGCHAIN_API_KEY")
os.environ["LANGCHAIN_PROJECT"] = "agent-test"

### 패키지 설치


In [3]:
# Install the notebook's dependencies at exact pinned versions so the
# LangChain / Ragas APIs used below match what this notebook was written for.
!pip install langchain-core==0.2.30 langchain-openai==0.1.21 \
    langchain-community==0.2.12 GitPython==3.1.43 \
    langchain-chroma==0.1.2 chromadb==0.5.3 \
    ragas==0.1.14 nest-asyncio==1.6.0

Collecting langchain-core==0.2.30
  Downloading langchain_core-0.2.30-py3-none-any.whl.metadata (6.2 kB)
Collecting langchain-openai==0.1.21
  Downloading langchain_openai-0.1.21-py3-none-any.whl.metadata (2.6 kB)
Collecting langchain-community==0.2.12
  Downloading langchain_community-0.2.12-py3-none-any.whl.metadata (2.7 kB)
Collecting langchain-chroma==0.1.2
  Downloading langchain_chroma-0.1.2-py3-none-any.whl.metadata (1.3 kB)
Collecting chromadb==0.5.3
  Downloading chromadb-0.5.3-py3-none-any.whl.metadata (6.8 kB)
Collecting ragas==0.1.14
  Downloading ragas-0.1.14-py3-none-any.whl.metadata (5.3 kB)
Collecting tenacity!=8.4.0,<9.0.0,>=8.1.0 (from langchain-core==0.2.30)
  Downloading tenacity-8.5.0-py3-none-any.whl.metadata (1.2 kB)
Collecting tiktoken<1,>=0.7 (from langchain-openai==0.1.21)
  Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community==0.2.12)
  Downloa

### 검색대상 문서 불러오기

In [4]:
from langchain_community.document_loaders import GitLoader


def file_filter(file_path: str) -> bool:
    """Tell whether *file_path* should be loaded, i.e. ends with ``.mdx``.

    The comparison is case-sensitive and looks only at the end of the string.
    """
    wanted_suffix = ".mdx"
    is_mdx = file_path.endswith(wanted_suffix)
    return is_mdx


# Configure a GitLoader for the LangChain repository: the remote URL, the
# local path used for the checkout, the branch, and the filter that keeps
# only `.mdx` files.
loader = GitLoader(
    clone_url="https://github.com/langchain-ai/langchain",
    repo_path="./langchain",
    branch="master",
    file_filter=file_filter,
)

# Load the filtered files as documents and report how many were found.
documents = loader.load()
print(len(documents))

371


### Ragas를 통한 합성 테스트 데이터 생성 구현

In [5]:
# Mirror each document's "source" metadata under a "filename" key.
# NOTE(review): presumably the Ragas testset generator looks documents up by
# `metadata["filename"]` -- confirm against the ragas 0.1.14 documentation.
for document in documents:
    document.metadata["filename"] = document.metadata["source"]

In [7]:
import nest_asyncio
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Patch asyncio so event loops can be nested.
# NOTE(review): presumably required because the notebook kernel already runs
# an event loop while Ragas drives its own -- confirm.
nest_asyncio.apply()

# Build the synthetic test-set generator; the same model (gpt-4o-mini) is
# used both to generate questions and to critique them.
generator = TestsetGenerator.from_langchain(
    generator_llm=ChatOpenAI(model="gpt-4o-mini"),
    critic_llm=ChatOpenAI(model="gpt-4o-mini"),
    embeddings=OpenAIEmbeddings(),
)

# Request 4 test records with a 50% / 25% / 25% split across the simple,
# reasoning and multi_context evolution types.
testset = generator.generate_with_langchain_docs(
    documents,
    test_size=4,
    distributions={simple: 0.5, reasoning: 0.25, multi_context: 0.25},
)

embedding nodes:   0%|          | 0/1154 [00:00<?, ?it/s]

Generating:   0%|          | 0/4 [00:00<?, ?it/s]

In [8]:
testset.to_pandas()

Unnamed: 0,question,contexts,ground_truth,evolution_type,metadata,episode_done
0,What is the purpose of collecting user prompts...,[# Trubrics\n\n\n>[Trubrics](https://trubrics....,The answer to given question is not present in...,simple,[{'source': 'docs/docs/integrations/providers/...,True
1,What role does PromptLayer play in the field o...,[# PromptLayer\n\n>[PromptLayer](https://docs....,PromptLayer is a platform for prompt engineeri...,simple,[{'source': 'docs/docs/integrations/providers/...,True
2,What integrations help MindsDB tailor AI with ...,[# MindsDB\n\nMindsDB is the platform for cust...,MindsDB integrates with nearly 200 data source...,reasoning,[{'source': 'docs/docs/integrations/providers/...,True
3,What AI integration features does MindsDB prov...,[# MindsDB\n\nMindsDB is the platform for cust...,The context does not provide specific details ...,multi_context,[{'source': 'docs/docs/integrations/providers/...,True


### LangSmith의 Dataset 만들기

In [9]:
from langsmith import Client

# Name of the LangSmith dataset that will hold the synthetic test data.
dataset_name = "agent-test"

client = Client()

# Drop any existing dataset with the same name first, so re-running this
# cell always starts from an empty dataset. This deletes remote data.
if client.has_dataset(dataset_name=dataset_name):
    client.delete_dataset(dataset_name=dataset_name)

dataset = client.create_dataset(dataset_name=dataset_name)

### 합성 테스트 데이터 저장

In [10]:
# Three parallel lists, one entry per generated test record:
#   inputs    -> the question given to the app under test
#   outputs   -> the reference contexts and ground-truth answer
#   metadatas -> provenance (source file) and the record's evolution type
inputs, outputs, metadatas = [], [], []

for testset_record in testset.test_data:
    inputs.append({"question": testset_record.question})
    outputs.append(
        {
            "contexts": testset_record.contexts,
            "ground_truth": testset_record.ground_truth,
        }
    )
    # Only the first metadata entry's "source" is recorded.
    metadatas.append(
        {
            "source": testset_record.metadata[0]["source"],
            "evolution_type": testset_record.evolution_type,
        }
    )

In [11]:
# Upload the parallel inputs / outputs / metadata lists as examples of the
# dataset created above (identified by its id).
client.create_examples(
    inputs=inputs,
    outputs=outputs,
    metadata=metadatas,
    dataset_id=dataset.id,
)

## ⑸ LangSmith와 Ragas를 이용한 오프라인 평가 구현

### 사용자 지정 Evaluator 구현

In [12]:
from typing import Any

from langchain_core.embeddings import Embeddings
from langchain_core.language_models import BaseChatModel
from langsmith.schemas import Example, Run
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.llms import LangchainLLMWrapper
from ragas.metrics.base import Metric, MetricWithEmbeddings, MetricWithLLM


class RagasMetricEvaluator:
    """Adapter that lets a single Ragas metric be used as a LangSmith evaluator.

    The bound ``evaluate`` method takes a LangSmith ``Run`` and ``Example`` and
    returns a feedback dict keyed by the metric's name.
    """

    def __init__(self, metric: Metric, llm: BaseChatModel, embeddings: Embeddings):
        """Store *metric* and attach the models it needs.

        The metric object passed in is modified in place: a wrapped *llm* is
        assigned when the metric is a ``MetricWithLLM``, and wrapped
        *embeddings* when it is a ``MetricWithEmbeddings``.
        """
        self.metric = metric

        # Decide what this metric needs, then attach only those dependencies.
        needs_llm = isinstance(metric, MetricWithLLM)
        needs_embeddings = isinstance(metric, MetricWithEmbeddings)

        if needs_llm:
            metric.llm = LangchainLLMWrapper(llm)
        if needs_embeddings:
            metric.embeddings = LangchainEmbeddingsWrapper(embeddings)

    def evaluate(self, run: Run, example: Example) -> dict[str, Any]:
        """Score one run against its reference example.

        Reads ``contexts`` (objects exposing ``page_content``) and ``answer``
        from ``run.outputs``, and ``question`` / ``ground_truth`` from the
        example, then delegates to the metric's ``score`` method.

        Returns:
            ``{"key": <metric name>, "score": <value returned by the metric>}``.
        """
        # The metric receives plain strings, so unwrap the retrieved documents.
        context_texts = [retrieved.page_content for retrieved in run.outputs["contexts"]]

        row = {
            "question": example.inputs["question"],
            "answer": run.outputs["answer"],
            "contexts": context_texts,
            "ground_truth": example.outputs["ground_truth"],
        }
        metric_score = self.metric.score(row)

        return {"key": self.metric.name, "score": metric_score}

In [13]:
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from ragas.metrics import answer_relevancy, context_precision

# Ragas metrics to report for every evaluated run.
metrics = [context_precision, answer_relevancy]

# Models handed to the metrics (used as judge LLM / embedding model where the
# metric requires them).
llm = ChatOpenAI(model="gpt-4o", temperature=0)
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# One evaluator callable per metric: the bound `evaluate` method of a
# RagasMetricEvaluator wrapping that metric.
evaluators = [
    RagasMetricEvaluator(metric, llm, embeddings).evaluate
    for metric in metrics
]

### 추론 기능 구현

In [14]:
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings

# Embed the loaded documents and build the Chroma vector store that the
# retriever below searches.
# NOTE(review): no persist directory or collection name is passed; confirm
# whether re-running this cell adds the same documents again.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
db = Chroma.from_documents(documents, embeddings)

In [15]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_openai import ChatOpenAI

# Prompt (in Korean) asking the model to answer {question} taking the
# supplied {context} into account.
prompt = ChatPromptTemplate.from_template('''\
아래 문맥을 고려하여 질문에 답해 주세요.

문맥: """
{context}
"""

질문: {question}
''')

model = ChatOpenAI(model="gpt-4o-mini", temperature=0)

retriever = db.as_retriever()

# The chain input is the raw question string. It is passed through unchanged
# as "question" and also sent to the retriever to produce "context"; the
# "answer" key is then added by running prompt -> model -> string parser.
# The result is a dict with "question", "context" and "answer".
# NOTE(review): "context" holds the retriever's raw output, so the prompt
# presumably receives the documents' string representation rather than just
# their text -- confirm, and consider joining `page_content` instead.
chain = RunnableParallel(
    {
        "question": RunnablePassthrough(),
        "context": retriever,
    }
).assign(answer=prompt | model | StrOutputParser())

In [16]:
def predict(inputs: dict[str, Any]) -> dict[str, Any]:
    """Run the RAG chain for one dataset example.

    Args:
        inputs: Example inputs; must contain the key ``"question"``.

    Returns:
        A dict with ``"contexts"`` (the chain's ``"context"`` output) and
        ``"answer"`` (the chain's ``"answer"`` output).
    """
    result = chain.invoke(inputs["question"])
    # Rename "context" -> "contexts" to match the key the evaluators read.
    return {"contexts": result["context"], "answer": result["answer"]}

### 오프라인 평가 구현 및 실행

In [17]:
from langsmith.evaluation import evaluate

# Run the offline evaluation: `predict` is called for the examples of the
# named dataset and each evaluator scores the resulting run.
# NOTE(review): the dataset name is repeated here as a literal; it matches
# `dataset_name` from the dataset-creation cell and must be kept in sync
# with it (consider passing that variable instead).
evaluate(
    predict,
    data="agent-test",
    evaluators=evaluators,
)

View the evaluation results for experiment: 'mealy-digestion-66' at:
https://smith.langchain.com/o/78d424ea-1632-42d0-ba8e-4097b0f3ce0a/datasets/b6b9cf72-d865-4bd7-87b7-7021dd81df8d/compare?selectedSessions=3ff9e49a-72fe-4f8e-9692-e52edda5de60




0it [00:00, ?it/s]

Unnamed: 0,inputs.question,outputs.contexts,outputs.answer,error,reference.contexts,reference.ground_truth,feedback.context_precision,feedback.answer_relevancy,execution_time,example_id,id
0,What AI integration features does MindsDB prov...,[page_content='# Motherduck\n\n>[Motherduck](h...,The provided context does not mention any spec...,,[# MindsDB\n\nMindsDB is the platform for cust...,The context does not provide specific details ...,0.0,0.0,2.543492,5d1aef43-323b-4478-a196-aeb3802e06d0,507716dc-64e9-498f-9eee-f35e542d24e0
1,What integrations help MindsDB tailor AI with ...,[page_content='# BagelDB\n\n> [BagelDB](https:...,문맥에서 제공된 정보에는 MindsDB와 관련된 통합 내용이 포함되어 있지 않습니다...,,[# MindsDB\n\nMindsDB is the platform for cust...,MindsDB integrates with nearly 200 data source...,0.0,0.0,2.943428,0c846b6e-52c2-42a3-a0d7-039274a06e60,923f092e-a11c-4e99-ade0-dfecb5845447
2,What is the purpose of collecting user prompts...,[page_content='# Trubrics\n\n\n>[Trubrics](htt...,The purpose of collecting user prompts and fee...,,[# Trubrics\n\n\n>[Trubrics](https://trubrics....,The answer to given question is not present in...,0.0,1.0,2.611276,7ca76473-7012-4a1a-9035-dfeac0968dc3,e42ad69e-d570-4db9-bb25-0df38219aa0f
3,What role does PromptLayer play in the field o...,[page_content='# PromptLayer\n\n>[PromptLayer]...,PromptLayer is a platform designed for prompt ...,,[# PromptLayer\n\n>[PromptLayer](https://docs....,PromptLayer is a platform for prompt engineeri...,1.0,0.836911,3.23603,dce05ccc-57cd-4ec9-b762-c9ad896ab81d,4d90245a-f5df-46b5-8d79-f03fb2fb0863


## ⑹ LangSmith를 이용한 피드백 수집

### 구현하는 피드백 기능에 대해

In [18]:
from uuid import UUID

import ipywidgets as widgets
from IPython.display import display
from langsmith import Client


def display_feedback_buttons(run_id: UUID) -> None:
    """Show "Good" / "Bad" buttons that send feedback for a LangSmith run.

    Clicking a button records feedback under the key ``"thumbs"`` for
    *run_id*, with score 1 for Good and 0 for Bad, and prints a confirmation.

    Args:
        run_id: Identifier of the run the feedback is attached to.
    """
    # Build the two buttons.
    good_button = widgets.Button(description="Good", button_style="success", icon="thumbs-up")
    bad_button = widgets.Button(description="Bad", button_style="danger", icon="thumbs-down")

    def on_button_clicked(button: widgets.Button) -> None:
        """Map the clicked button to its score and submit the feedback."""
        for candidate, value in ((good_button, 1), (bad_button, 0)):
            if button == candidate:
                score = value
                break
        else:
            # Neither known button matched.
            raise ValueError(f"Unknown button: {button}")

        Client().create_feedback(run_id=run_id, key="thumbs", score=score)
        print("피드백을 보냈습니다.")

    # Both buttons share the same click handler.
    for feedback_button in (good_button, bad_button):
        feedback_button.on_click(on_button_clicked)

    # Render the buttons in the cell output.
    display(good_button, bad_button)

### 피드백 버튼 표시

In [19]:
from langchain_core.tracers.context import collect_runs

# Use collect_runs to obtain the LangSmith trace ID (run ID) of the invocation.
with collect_runs() as runs_cb:
    output = chain.invoke("LangChain의 개요에 대하여 ")
    print(output["answer"])
    # Take the id of the first run collected inside this block.
    # NOTE(review): presumably this is the root run of the chain -- confirm.
    run_id = runs_cb.traced_runs[0].id

# Attach Good/Bad feedback buttons to that run.
display_feedback_buttons(run_id)

LangChain은 대형 언어 모델(LLM)을 활용한 애플리케이션 개발을 위한 프레임워크입니다. 이 프레임워크는 LLM 애플리케이션의 라이프사이클의 모든 단계를 간소화하는 데 중점을 두고 있습니다. LangChain의 주요 기능은 다음과 같습니다:

1. **개발**: LangChain의 오픈 소스 빌딩 블록, 구성 요소 및 제3자 통합을 사용하여 애플리케이션을 구축할 수 있습니다. LangGraph를 사용하여 상태를 유지하는 에이전트를 구축하고, 스트리밍 및 인간-루프 지원을 제공합니다.

2. **생산화**: LangSmith를 사용하여 체인을 검사, 모니터링 및 평가함으로써 지속적으로 최적화하고 자신 있게 배포할 수 있습니다.

3. **배포**: LangGraph 애플리케이션을 프로덕션 준비가 완료된 API 및 어시스턴트로 변환할 수 있습니다.

LangChain은 여러 오픈 소스 라이브러리로 구성되어 있으며, 주요 구성 요소로는 `langchain-core`, 다양한 통합 패키지, `langchain`, `langchain-community`, LangGraph, LangServe, LangSmith 등이 있습니다. 이 프레임워크는 Python과 JavaScript 라이브러리 모두에 대한 문서를 제공하며, 사용자는 특정 요구 사항에 맞는 구성 요소를 선택하여 사용할 수 있습니다.

LangChain은 또한 표준화된 구성 요소 인터페이스, 복잡한 애플리케이션을 위한 오케스트레이션, 그리고 애플리케이션의 가시성과 평가를 지원하여 개발자가 더 쉽게 애플리케이션을 구축하고 관리할 수 있도록 돕습니다.


Button(button_style='success', description='Good', icon='thumbs-up', style=ButtonStyle())

Button(button_style='danger', description='Bad', icon='thumbs-down', style=ButtonStyle())

피드백을 보냈습니다.
