In [2]:
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders.recursive_url_loader import RecursiveUrlLoader
from bs4 import BeautifulSoup

In [None]:
def extract_page_text(raw_html):
    """Parse a fetched page with BeautifulSoup and return only its text."""
    return BeautifulSoup(raw_html, "html.parser").text


url = "https://python.langchain.com/docs/expression_language/"

# Recursively load pages starting at `url`; each page is reduced to plain
# text by `extract_page_text` before being stored as a document.
loader = RecursiveUrlLoader(url=url, max_depth=20, extractor=extract_page_text)
docs = loader.load()
len(docs)

In [None]:
# Chunking parameters handed to the splitter below.
CHUNK_SIZE = 1500
CHUNK_OVERLAP = 200

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
splits = text_splitter.split_documents(docs)
len(splits)

In [8]:
# Embed every split with OpenAI embeddings and index them in Chroma,
# then expose the store through the retriever interface used below.
embedding_model = OpenAIEmbeddings()
vectorstore = Chroma.from_documents(documents=splits, embedding=embedding_model)
retriever = vectorstore.as_retriever()

In [None]:
# Smoke-test the retriever with a sample query; the cell displays the result.
sample_query = "What is LCEL?"
retriever.invoke(sample_query)

In [20]:
# RAG
import openai
from langsmith import traceable
from langsmith.wrappers import wrap_openai


class RagBot:
    """Small retrieval-augmented question-answering bot.

    For each question the bot queries the retriever, places the retrieved
    documents in the system prompt, and asks an OpenAI chat model to answer.

    Args:
        retriever: Object exposing ``invoke(question)``; its result is
            interpolated into the system prompt and iterated to build the
            returned ``contexts``.
        model: Name of the chat model passed to ``chat.completions.create``.
    """

    def __init__(self, retriever, model: str = "gpt-4o-mini"):
        self._retriever = retriever
        # The OpenAI client is passed through langsmith's `wrap_openai`.
        self._client = wrap_openai(openai.Client())
        self._model = model

    @traceable
    def get_answer(self, question: str):
        """Answer ``question`` using documents fetched from the retriever.

        Args:
            question: The user's question; used both as the retrieval query
                and as the user message sent to the chat model.

        Returns:
            dict with two keys:
                ``"answer"``: content of the first completion choice.
                ``"contexts"``: ``str()`` of each retrieved document.
        """
        similar = self._retriever.invoke(question)
        response = self._client.chat.completions.create(
            model=self._model,
            messages=[
                {
                    "role": "system",
                    "content": "You are a helpful AI assistant with expertise in LCEL."
                    " Use the following docs to produce a concise code solution to the user question.\n\n"
                    f"## Docs\n\n{similar}",
                },
                # Without this message the model only sees the retrieved docs
                # and never the question it is supposed to answer.
                {"role": "user", "content": question},
            ],
        )

        return {
            "answer": response.choices[0].message.content,
            "contexts": [str(doc) for doc in similar],
        }

rag_bot = RagBot(retriever)

In [None]:
# Ask the bot one sample question and print just the generated answer text.
response = rag_bot.get_answer("What is LCEL?")
answer_text = response["answer"]
print(answer_text)

In [None]:
# RAG dataset
from langsmith import Client

# Each entry pairs an evaluation question with its reference answer.
QA_EXAMPLES = (
    (
        "How can I directly pass a string to a runnable and use it to construct the input needed for my prompt?",
        "Use RunnablePassthrough.",
    ),
    (
        "How can I make the output of my LCEL chain a string?",
        "Use StrOutputParser.",
    ),
    (
        "How can I apply a custom function to one of the inputs of an LCEL chain?",
        "Use RunnableLambda with itemgetter to extract the relevant key.",
    ),
)

# Parallel lists of questions and reference answers, in the same order.
inputs = [question for question, _answer in QA_EXAMPLES]
outputs = [answer for _question, answer in QA_EXAMPLES]

qa_pairs = [
    dict(question=question, answer=answer)
    for question, answer in QA_EXAMPLES
]
qa_pairs

In [24]:
client = Client()
dataset_name = "RAG_test_LCEL"

# Creating a dataset under a name that already exists is rejected by
# LangSmith, so re-running this cell (or the whole notebook) used to fail.
# Reuse the existing dataset instead, and only upload the examples when the
# dataset is newly created so they are not duplicated on re-runs.
if client.has_dataset(dataset_name=dataset_name):
    dataset = client.read_dataset(dataset_name=dataset_name)
else:
    dataset = client.create_dataset(
        dataset_name=dataset_name,
        description="QA pairs about LCEL.",
    )
    client.create_examples(
        inputs=[{"question": q} for q in inputs],
        outputs=[{"answer": a} for a in outputs],
        dataset_id=dataset.id,
    )

## RAG Evaluators

### Answer Accuracy

In [26]:
# RAG chain
def predict_rag_answer(example: dict):
    """Run the RAG bot on ``example["question"]`` and return only its answer.

    Returns:
        dict with the single key ``"answer"``.
    """
    result = rag_bot.get_answer(example["question"])
    return dict(answer=result["answer"])


def predict_rag_answer_with_context(example: dict):
    """Run the RAG bot on ``example["question"]``, keeping the contexts too.

    Returns:
        dict with the keys ``"answer"`` and ``"contexts"`` taken from the
        bot's response.
    """
    result = rag_bot.get_answer(example["question"])
    return {field: result[field] for field in ("answer", "contexts")}

In [None]:
from langsmith.evaluation import LangChainStringEvaluator, evaluate

# Answer accuracy: compare the prediction against the reference answer.
#
# Reference notes (translated from Japanese). Presumably this is the grading
# prompt applied by the "cot_qa" evaluator -- confirm against LangChain:
#
#   You are a teacher grading a quiz.
#   You are given a question, the context the question is about, and the
#   student's answer. You are asked to grade the student's answer as either
#   CORRECT or INCORRECT, based on the context.
#   Write out your reasoning step by step to be sure that your conclusion is
#   correct. Avoid simply stating the correct answer at the outset.
#
#   Example format:
#   QUESTION: question here
#   CONTEXT: context the question is about here
#   STUDENT ANSWER: student's answer here
#   EXPLANATION: step-by-step reasoning here
#   GRADE: CORRECT or INCORRECT here
#
#   Grade the student answers based ONLY on their factual accuracy. Ignore
#   differences in punctuation and phrasing between the student answer and
#   the true answer. It is OK if the student answer contains more information
#   than the true answer, as long as it does not contain any conflicting
#   statements. Begin!

qa_evaluator = [
    LangChainStringEvaluator(
        "cot_qa",
        # NOTE(review): open question from the author -- this mapping may be
        # equivalent to the evaluator's default behaviour; confirm before
        # removing it.
        prepare_data=lambda run, example: {
            "prediction": run.outputs["answer"],
            "reference": example.outputs["answer"],
            "input": example.inputs["question"],
        },
    )
]

dataset_name = "RAG_test_LCEL"
experiment_results = evaluate(
    predict_rag_answer,
    data=dataset_name,
    evaluators=qa_evaluator,
    experiment_prefix="rag-qa-oai",
    # Label the run with the model the bot actually uses; the previous
    # hard-coded "gpt-3.5-turbo" did not match RagBot's configured model.
    metadata={"variant": f"LCEL context, {rag_bot._model}"},
)

### Answer Hallucination

In [34]:
# Hallucination evaluation:
# can the generated answer be derived from the retrieved documents?
from langsmith.evaluation import LangChainStringEvaluator, evaluate

# Scoring rubric sent to the grader. The text (including the indentation of
# its continuation lines) is kept exactly as originally written.
GROUNDEDNESS_RUBRIC = """Is the Assistant's Answer grounded in the Ground Truth documentation? A score of 0 means that the
            Assistant answer contains is not at all based upon / grounded in the Ground Truth documentation. A score of 5 means
            that the Assistant answer contains some information (e.g., a hallucination) that is not captured in the Ground Truth
            documentation. A score of 10 means that the Assistant answer is fully based upon the in the Ground Truth documentation."""


def _hallucination_inputs(run, example):
    """Build the evaluator fields for one run.

    The run's retrieved contexts are used as the reference, so the answer is
    graded against the documents rather than against the dataset's answer.
    """
    return {
        "prediction": run.outputs["answer"],
        "reference": run.outputs["contexts"],
        "input": example.inputs["question"],
    }


answer_hallucination_evaluator = LangChainStringEvaluator(
    "labeled_score_string",
    config={
        "criteria": {"accuracy": GROUNDEDNESS_RUBRIC},
        "normalized_by": 10,
    },
    prepare_data=_hallucination_inputs,
)

In [None]:
dataset_name = "RAG_test_LCEL"

# Run the context-returning predictor over the dataset and score every run
# with the hallucination evaluator.
hallucination_evaluators = [answer_hallucination_evaluator]
experiment_results = evaluate(
    predict_rag_answer_with_context,
    data=dataset_name,
    evaluators=hallucination_evaluators,
    experiment_prefix="rag-qa-oai-hallucination",
)