In [None]:
import os
import asyncio
import pandas as pd
import time
import ast
from tqdm import tqdm
from ragas.dataset_schema import SingleTurnSample
from ragas.metrics import (
    LLMContextPrecisionWithoutReference,
    ContextRelevance,
    AnswerAccuracy,
    SemanticSimilarity,
    Faithfulness
)
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# API key used by both the chat model and the embeddings client.
# Replace the placeholder with a real key before running.
os.environ["OPENAI_API_KEY"] = "your api key"

# Single definition of the OpenAI-compatible endpoint so the chat model and the
# embeddings client cannot drift apart when the placeholder is edited.
BASE_URL = "your base url"

# LangChain chat model instance pointed at the custom base URL.
llm_instance = ChatOpenAI(
    model="deepseek-chat",
    openai_api_key=os.getenv("OPENAI_API_KEY"),
    base_url=BASE_URL,
)

# Wrap the chat model so Ragas metrics can use it as their evaluator LLM.
evaluator_llm = LangchainLLMWrapper(llm_instance)

# LangChain embeddings instance using the text-embedding-3-small model.
evaluator_embedding = OpenAIEmbeddings(
    model="text-embedding-3-small",
    openai_api_key=os.getenv("OPENAI_API_KEY"),
    base_url=BASE_URL,
)

# Wrap the embeddings so Ragas metrics can use them.
evaluator_embeddings_wrapper = LangchainEmbeddingsWrapper(evaluator_embedding)

def _parse_retrieved_contexts(raw):
    """Convert a raw ``Retrievaled_context`` cell into a list of contexts.

    Args:
        raw: The cell value as read from the input file. Expected to be a
            string: either the text of a Python list literal or free text.

    Returns:
        The parsed list when ``raw`` is a string holding a Python list
        literal; otherwise ``[raw]`` (the whole value as a single context).
    """
    if isinstance(raw, str):
        try:
            parsed = ast.literal_eval(raw)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a Python literal: fall back to one free-text context.
            # Only literal_eval's documented errors are caught so that
            # KeyboardInterrupt / task cancellation are never swallowed.
            return [raw]
        if isinstance(parsed, list):
            return parsed
    return [raw]


async def main():
    """Score every row of ``merged_test_dataset.csv`` with five Ragas metrics.

    Reads the columns ``Query``, ``Ground_truth``, ``Response`` and
    ``Retrievaled_context`` from the input CSV, evaluates each row sequentially
    and writes the inputs, the five scores and the per-row wall-clock time (in
    seconds) to ``evaluation_results.csv``.

    If evaluation is interrupted or a metric call raises, the rows scored so
    far are still written to the output file before the exception propagates.
    """
    # Load the input CSV file.
    input_file = 'merged_test_dataset.csv'
    df = pd.read_csv(input_file)

    # Build the scorers once, outside the loop, so they are reused for all rows.
    context_precision = LLMContextPrecisionWithoutReference(llm=evaluator_llm)
    scorer_relevance = ContextRelevance(llm=evaluator_llm)
    scorer_accuracy = AnswerAccuracy(llm=evaluator_llm)
    scorer_similarity = SemanticSimilarity(embeddings=evaluator_embeddings_wrapper)
    scorer_faithfulness = Faithfulness(llm=evaluator_llm)

    output_file = 'evaluation_results.csv'
    results = []
    finished = False

    try:
        for index, row in tqdm(df.iterrows(), total=len(df), desc="评估进度"):
            start_time = time.time()

            user_input = row['Query']
            reference = row['Ground_truth']
            response = row['Response']
            retrieved_context_str = row['Retrievaled_context']
            retrieved_contexts = _parse_retrieved_contexts(retrieved_context_str)

            # Context precision: query + response + retrieved contexts.
            sample1 = SingleTurnSample(
                user_input=user_input,
                response=response,
                retrieved_contexts=retrieved_contexts,
            )
            score1 = await context_precision.single_turn_ascore(sample1)

            # Context relevance: query + retrieved contexts.
            sample2 = SingleTurnSample(
                user_input=user_input,
                retrieved_contexts=retrieved_contexts,
            )
            score2 = await scorer_relevance.single_turn_ascore(sample2)

            # Answer accuracy: query + response + ground truth.
            sample3 = SingleTurnSample(
                user_input=user_input,
                response=response,
                reference=reference
            )
            score3 = await scorer_accuracy.single_turn_ascore(sample3)

            # Semantic similarity: response vs. ground truth (embeddings only).
            sample4 = SingleTurnSample(
                response=response,
                reference=reference
            )
            score4 = await scorer_similarity.single_turn_ascore(sample4)

            # Faithfulness: query + response + retrieved contexts.
            sample5 = SingleTurnSample(
                user_input=user_input,
                response=response,
                retrieved_contexts=retrieved_contexts,
            )
            score5 = await scorer_faithfulness.single_turn_ascore(sample5)

            evaluation_time = time.time() - start_time

            # Record the result for this row.
            results.append({
                'Query': user_input,
                'Ground_truth': reference,
                'Response': response,
                'Retrievaled_context': retrieved_context_str,
                'Context_Precision': score1,
                'Context_Relevance': score2,
                'Answer_Accuracy': score3,
                'Semantic_Similarity': score4,
                'Faithfulness': score5,
                'Evaluation_Time': evaluation_time
            })
        finished = True
    finally:
        # Save on normal completion, and also on failure when at least one row
        # was scored, so already-computed scores are not thrown away. Nothing
        # is written when the run fails before any row finished, which avoids
        # overwriting an earlier results file with an empty one.
        if finished or results:
            output_df = pd.DataFrame(results)
            output_df.to_csv(output_file, index=False)
            print(f"Results saved to {output_file}")

# Script entry point: create the evaluation coroutine and run it to completion.
# Skipped when this code is imported as a module.
if __name__ == "__main__":
    evaluation_coro = main()
    asyncio.run(evaluation_coro)

In [None]:
import os
import asyncio
import pandas as pd
import time
import ast
from tqdm import tqdm
from ragas.dataset_schema import SingleTurnSample
from ragas.metrics import (
    LLMContextPrecisionWithoutReference,
    ContextRelevance,
    AnswerAccuracy,
    SemanticSimilarity,
    Faithfulness
)
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
import openpyxl  # 确保安装 openpyxl 以支持 Excel 追加

# API key used by both the chat model and the embeddings client.
# Replace the placeholder with a real key before running.
os.environ["OPENAI_API_KEY"] = "your api key"

# Single definition of the OpenAI-compatible endpoint so the chat model and the
# embeddings client cannot drift apart when the placeholder is edited.
BASE_URL = "your base url"

# LangChain chat model instance pointed at the custom base URL. JSON output
# mode is forced to fix OutputParserException raised while parsing replies.
# response_format is passed through model_kwargs explicitly: given as a direct
# keyword it is moved there anyway, with a warning asking to confirm the intent.
llm_instance = ChatOpenAI(
    model="deepseek-chat",
    openai_api_key=os.getenv("OPENAI_API_KEY"),
    base_url=BASE_URL,
    model_kwargs={"response_format": {"type": "json_object"}},
)

# Wrap the chat model so Ragas metrics can use it as their evaluator LLM.
evaluator_llm = LangchainLLMWrapper(llm_instance)

# LangChain embeddings instance using the text-embedding-3-small model.
evaluator_embedding = OpenAIEmbeddings(
    model="text-embedding-3-small",
    openai_api_key=os.getenv("OPENAI_API_KEY"),
    base_url=BASE_URL,
)

# Wrap the embeddings so Ragas metrics can use them.
evaluator_embeddings_wrapper = LangchainEmbeddingsWrapper(evaluator_embedding)

def _parse_retrieved_contexts(raw):
    """Convert a raw ``Retrievaled_context`` cell into a list of contexts.

    Args:
        raw: The cell value as read from the input file. Expected to be a
            string: either the text of a Python list literal or free text.

    Returns:
        The parsed list when ``raw`` is a string holding a Python list
        literal; otherwise ``[raw]`` (the whole value as a single context).
    """
    if isinstance(raw, str):
        try:
            parsed = ast.literal_eval(raw)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a Python literal: fall back to one free-text context.
            # Only literal_eval's documented errors are caught so that
            # KeyboardInterrupt / task cancellation are never swallowed.
            return [raw]
        if isinstance(parsed, list):
            return parsed
    return [raw]


async def _append_row_to_excel(result, output_file, row_number):
    """Append one result row to an existing workbook, retrying on PermissionError.

    Args:
        result: Mapping of column name to value for the row to append.
        output_file: Path of an existing ``.xlsx`` file to append to.
        row_number: Row number used only in the progress/error messages.

    Raises:
        PermissionError: If the file still cannot be written after 3 attempts.
    """
    result_df = pd.DataFrame([result])
    retries = 3
    for attempt in range(retries):
        try:
            with pd.ExcelWriter(output_file, mode='a', engine='openpyxl', if_sheet_exists='overlay') as writer:
                sheet = writer.book.active
                # max_row is the 1-based index of the last used row, which is
                # the 0-based index of the first free row expected by startrow.
                # Write to the same sheet the row count was read from, rather
                # than relying on it being named like to_excel's default.
                result_df.to_excel(
                    writer,
                    sheet_name=sheet.title,
                    index=False,
                    header=False,
                    startrow=sheet.max_row,
                )
            print(f"Row {row_number} saved to {output_file}")
            return
        except PermissionError:
            print(f"Permission denied on attempt {attempt + 1} for row {row_number}. Please close the file if it's open. Retrying in 5 seconds...")
            # Non-blocking wait: time.sleep here would stall the event loop.
            await asyncio.sleep(5)
    raise PermissionError(f"Failed to write to {output_file} after {retries} attempts. Ensure the file is closed.")


async def main():
    """Score every row of ``merged_test_dataset_1.xlsx`` with five Ragas metrics.

    Reads the columns ``Query``, ``Ground_truth``, ``Response`` and
    ``Retrievaled_context`` from the input workbook, evaluates each row
    sequentially and appends the inputs, the five scores and the per-row
    wall-clock time (in seconds) to ``evaluation_results_1.xlsx`` as soon as
    the row is finished, so completed rows survive a later failure.

    Note: when the output file already exists, new rows are appended after the
    existing ones; already-evaluated rows are not skipped.

    Raises:
        PermissionError: If a row cannot be written after 3 attempts.
    """
    # Load the input XLSX file.
    input_file = 'merged_test_dataset_1.xlsx'
    df = pd.read_excel(input_file)

    # Build the scorers once, outside the loop, so they are reused for all rows.
    context_precision = LLMContextPrecisionWithoutReference(llm=evaluator_llm)
    scorer_relevance = ContextRelevance(llm=evaluator_llm)
    scorer_accuracy = AnswerAccuracy(llm=evaluator_llm)
    scorer_similarity = SemanticSimilarity(embeddings=evaluator_embeddings_wrapper)
    scorer_faithfulness = Faithfulness(llm=evaluator_llm)

    output_file = 'evaluation_results_1.xlsx'

    # If the output file does not exist yet, create it with only the header row.
    if not os.path.exists(output_file):
        header_df = pd.DataFrame(columns=[
            'Query', 'Ground_truth', 'Response', 'Retrievaled_context',
            'Context_Precision', 'Context_Relevance', 'Answer_Accuracy',
            'Semantic_Similarity', 'Faithfulness', 'Evaluation_Time'
        ])
        header_df.to_excel(output_file, index=False)

    for index, row in tqdm(df.iterrows(), total=len(df), desc="评估进度"):
        start_time = time.time()

        user_input = row['Query']
        reference = row['Ground_truth']
        response = row['Response']
        retrieved_context_str = row['Retrievaled_context']
        retrieved_contexts = _parse_retrieved_contexts(retrieved_context_str)

        # Context precision: query + response + retrieved contexts.
        sample1 = SingleTurnSample(
            user_input=user_input,
            response=response,
            retrieved_contexts=retrieved_contexts,
        )
        score1 = await context_precision.single_turn_ascore(sample1)

        # Context relevance: query + retrieved contexts.
        sample2 = SingleTurnSample(
            user_input=user_input,
            retrieved_contexts=retrieved_contexts,
        )
        score2 = await scorer_relevance.single_turn_ascore(sample2)

        # Answer accuracy: query + response + ground truth.
        sample3 = SingleTurnSample(
            user_input=user_input,
            response=response,
            reference=reference
        )
        score3 = await scorer_accuracy.single_turn_ascore(sample3)

        # Semantic similarity: response vs. ground truth (embeddings only).
        sample4 = SingleTurnSample(
            response=response,
            reference=reference
        )
        score4 = await scorer_similarity.single_turn_ascore(sample4)

        # Faithfulness: query + response + retrieved contexts.
        sample5 = SingleTurnSample(
            user_input=user_input,
            response=response,
            retrieved_contexts=retrieved_contexts,
        )
        score5 = await scorer_faithfulness.single_turn_ascore(sample5)

        evaluation_time = time.time() - start_time

        # Result for this single row.
        result = {
            'Query': user_input,
            'Ground_truth': reference,
            'Response': response,
            'Retrievaled_context': retrieved_context_str,
            'Context_Precision': score1,
            'Context_Relevance': score2,
            'Answer_Accuracy': score3,
            'Semantic_Similarity': score4,
            'Faithfulness': score5,
            'Evaluation_Time': evaluation_time
        }

        # Persist the row immediately so progress is not lost on a later error.
        await _append_row_to_excel(result, output_file, index + 1)

# Script entry point: create the evaluation coroutine and run it to completion.
# Skipped when this code is imported as a module.
if __name__ == "__main__":
    evaluation_coro = main()
    asyncio.run(evaluation_coro)

  from .autonotebook import tqdm as notebook_tqdm
                response_format was transferred to model_kwargs.
                Please confirm that response_format is what you intended.
  if await self.run_code(code, result, async_=asy):
  evaluator_embeddings_wrapper = LangchainEmbeddingsWrapper(evaluator_embedding)
评估进度:   5%|▌         | 1/19 [00:33<09:55, 33.07s/it]

Row 1 saved to evaluation_results_1.xlsx


评估进度:  11%|█         | 2/19 [01:03<08:54, 31.44s/it]

Row 2 saved to evaluation_results_1.xlsx


评估进度:  16%|█▌        | 3/19 [02:03<11:52, 44.55s/it]

Row 3 saved to evaluation_results_1.xlsx


评估进度:  21%|██        | 4/19 [02:37<10:07, 40.50s/it]

Row 4 saved to evaluation_results_1.xlsx


评估进度:  26%|██▋       | 5/19 [03:11<08:51, 37.98s/it]

Row 5 saved to evaluation_results_1.xlsx


评估进度:  32%|███▏      | 6/19 [03:50<08:19, 38.41s/it]

Row 6 saved to evaluation_results_1.xlsx


评估进度:  37%|███▋      | 7/19 [04:24<07:21, 36.81s/it]

Row 7 saved to evaluation_results_1.xlsx


评估进度:  42%|████▏     | 8/19 [04:56<06:29, 35.38s/it]

Row 8 saved to evaluation_results_1.xlsx


评估进度:  47%|████▋     | 9/19 [05:35<06:04, 36.46s/it]

Row 9 saved to evaluation_results_1.xlsx


评估进度:  53%|█████▎    | 10/19 [06:08<05:20, 35.57s/it]

Row 10 saved to evaluation_results_1.xlsx


评估进度:  58%|█████▊    | 11/19 [06:50<05:00, 37.52s/it]

Row 11 saved to evaluation_results_1.xlsx


评估进度:  63%|██████▎   | 12/19 [07:59<05:28, 46.97s/it]

Row 12 saved to evaluation_results_1.xlsx


评估进度:  68%|██████▊   | 13/19 [09:34<06:08, 61.50s/it]

Row 13 saved to evaluation_results_1.xlsx


评估进度:  74%|███████▎  | 14/19 [12:20<07:45, 93.13s/it]

Row 14 saved to evaluation_results_1.xlsx


评估进度:  79%|███████▉  | 15/19 [13:16<05:28, 82.01s/it]

Row 15 saved to evaluation_results_1.xlsx


评估进度:  84%|████████▍ | 16/19 [14:15<03:45, 75.10s/it]

Row 16 saved to evaluation_results_1.xlsx


评估进度:  89%|████████▉ | 17/19 [14:54<02:08, 64.24s/it]

Row 17 saved to evaluation_results_1.xlsx


评估进度:  95%|█████████▍| 18/19 [15:40<00:58, 58.78s/it]

Row 18 saved to evaluation_results_1.xlsx


评估进度: 100%|██████████| 19/19 [17:34<00:00, 55.49s/it]

Row 19 saved to evaluation_results_1.xlsx



