In [None]:
import os
import asyncio
import pandas as pd
import time
import ast
from tqdm import tqdm
from ragas.dataset_schema import SingleTurnSample
from ragas.metrics import (
    LLMContextPrecisionWithoutReference,
    ContextRelevance,
    AnswerAccuracy,
    SemanticSimilarity,
    Faithfulness
)
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Credentials: the API key is written into the process environment.
# NOTE(review): this assignment overwrites any OPENAI_API_KEY already set in
# the environment with the placeholder below — replace it before running.
os.environ["OPENAI_API_KEY"] = "your api key"

# Connection settings shared by the chat client and the embedding client
# (a custom base_url lets both talk to an OpenAI-compatible endpoint).
_openai_connection = {
    "openai_api_key": os.environ["OPENAI_API_KEY"],
    "base_url": "your base url",
}

# Chat model, then its Ragas wrapper (passed as `llm=` to the metrics in main).
llm_instance = ChatOpenAI(model="deepseek-chat", **_openai_connection)
evaluator_llm = LangchainLLMWrapper(llm_instance)

# Embedding model (text-embedding-3-small), then its Ragas wrapper
# (passed as `embeddings=` to SemanticSimilarity in main).
evaluator_embedding = OpenAIEmbeddings(
    model="text-embedding-3-small", **_openai_connection
)
evaluator_embeddings_wrapper = LangchainEmbeddingsWrapper(evaluator_embedding)

def _parse_retrieved_contexts(raw_contexts):
    """Turn one 'Retrievaled_context' cell into a list of context strings.

    Args:
        raw_contexts: The raw cell value. Expected to be either the repr of a
            Python list (e.g. "['ctx a', 'ctx b']") or a plain text passage.

    Returns:
        list[str]: The parsed list when the cell is a list literal (items are
        coerced to ``str``); a one-element list holding the raw text when the
        cell is not a list literal; an empty list when the cell is blank
        (``None`` / NaN).
    """
    if not isinstance(raw_contexts, str):
        # pandas reads a blank cell as NaN (a float that is != itself).
        if raw_contexts is None or (
            isinstance(raw_contexts, float) and raw_contexts != raw_contexts
        ):
            return []
        return [str(raw_contexts)]

    try:
        parsed = ast.literal_eval(raw_contexts)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Not a Python literal: treat the whole cell as a single context.
        # Only the errors literal_eval documents are caught, so that
        # KeyboardInterrupt and friends still propagate.
        return [raw_contexts]

    if not isinstance(parsed, list):
        return [raw_contexts]
    return [item if isinstance(item, str) else str(item) for item in parsed]


async def main():
    """Score every row of ``input.csv`` with five Ragas metrics and save a CSV.

    Reads the columns 'Query', 'Ground_truth', 'Response' and
    'Retrievaled_context', evaluates each row sequentially, and writes the
    inputs together with the five scores and the per-row wall time (seconds)
    to ``evaluation_results.csv``.

    Raises:
        KeyError: If a required column is missing from the input file.
        Any exception raised by the metric calls propagates unchanged; results
        are only written after all rows have been scored.
    """
    input_file = 'input.csv'
    output_file = 'evaluation_results.csv'
    result_columns = [
        'Query', 'Ground_truth', 'Response', 'Retrievaled_context',
        'Context_Precision', 'Context_Relevance', 'Answer_Accuracy',
        'Semantic_Similarity', 'Faithfulness', 'Evaluation_Time',
    ]

    df = pd.read_csv(input_file)

    # Metric objects are built once, outside the loop, and reused for every row.
    context_precision = LLMContextPrecisionWithoutReference(llm=evaluator_llm)
    scorer_relevance = ContextRelevance(llm=evaluator_llm)
    scorer_accuracy = AnswerAccuracy(llm=evaluator_llm)
    scorer_similarity = SemanticSimilarity(embeddings=evaluator_embeddings_wrapper)
    scorer_faithfulness = Faithfulness(llm=evaluator_llm)

    results = []

    for index, row in tqdm(df.iterrows(), total=len(df), desc="评估进度"):
        # perf_counter is monotonic, so the duration cannot go negative if the
        # system clock is adjusted mid-run.
        start_time = time.perf_counter()

        user_input = row['Query']
        reference = row['Ground_truth']
        response = row['Response']
        retrieved_context_str = row['Retrievaled_context']
        retrieved_contexts = _parse_retrieved_contexts(retrieved_context_str)

        # Context precision and faithfulness take the same fields, so they
        # share one sample.
        grounded_sample = SingleTurnSample(
            user_input=user_input,
            response=response,
            retrieved_contexts=retrieved_contexts,
        )
        relevance_sample = SingleTurnSample(
            user_input=user_input,
            retrieved_contexts=retrieved_contexts,
        )
        accuracy_sample = SingleTurnSample(
            user_input=user_input,
            response=response,
            reference=reference,
        )
        similarity_sample = SingleTurnSample(
            response=response,
            reference=reference,
        )

        # Metrics are awaited one after another, in the original order.
        score1 = await context_precision.single_turn_ascore(grounded_sample)
        score2 = await scorer_relevance.single_turn_ascore(relevance_sample)
        score3 = await scorer_accuracy.single_turn_ascore(accuracy_sample)
        score4 = await scorer_similarity.single_turn_ascore(similarity_sample)
        score5 = await scorer_faithfulness.single_turn_ascore(grounded_sample)

        evaluation_time = time.perf_counter() - start_time

        results.append({
            'Query': user_input,
            'Ground_truth': reference,
            'Response': response,
            # The raw cell is stored, not the parsed list, so the output
            # mirrors the input column.
            'Retrievaled_context': retrieved_context_str,
            'Context_Precision': score1,
            'Context_Relevance': score2,
            'Answer_Accuracy': score3,
            'Semantic_Similarity': score4,
            'Faithfulness': score5,
            'Evaluation_Time': evaluation_time
        })

    # Passing the column list keeps the header even when the input has no rows.
    output_df = pd.DataFrame(results, columns=result_columns)
    output_df.to_csv(output_file, index=False)
    print(f"Results saved to {output_file}")

# Entry point: when this code runs as the main module, asyncio.run() drives
# the main() coroutine to completion.
if __name__ == "__main__":
    asyncio.run(main())

In [None]:
import os
import asyncio
import pandas as pd
import time
import ast
from tqdm import tqdm
from ragas.dataset_schema import SingleTurnSample
from ragas.metrics import (
    LLMContextPrecisionWithoutReference,
    ContextRelevance,
    AnswerAccuracy,
    SemanticSimilarity,
    Faithfulness
)
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
import openpyxl  # 确保安装 openpyxl 以支持 Excel 追加

# Set the API key in the process environment.
# NOTE(review): this assignment overwrites any OPENAI_API_KEY already set in
# the environment with the placeholder below — replace it before running.
os.environ["OPENAI_API_KEY"] = "your api key"

# Build the LangChain chat model against a custom base_url and request JSON
# output mode (added by the original author to fix OutputParserException).
# `response_format` is not a declared ChatOpenAI constructor argument; passing
# it directly makes LangChain move it into model_kwargs and emit a warning, so
# it is placed in model_kwargs explicitly here.
llm_instance = ChatOpenAI(
    model="deepseek-chat",
    openai_api_key=os.getenv("OPENAI_API_KEY"),
    base_url="your base url",
    model_kwargs={"response_format": {"type": "json_object"}},
)

# Wrap the chat model for Ragas.
evaluator_llm = LangchainLLMWrapper(llm_instance)

# Build the LangChain embedding client using the text-embedding-3-small model.
evaluator_embedding = OpenAIEmbeddings(
    model="text-embedding-3-small",
    openai_api_key=os.getenv("OPENAI_API_KEY"),
    base_url="your base url",
)

# Wrap the embedding client for Ragas.
evaluator_embeddings_wrapper = LangchainEmbeddingsWrapper(evaluator_embedding)

def _parse_retrieved_contexts(raw_contexts):
    """Turn one 'Retrievaled_context' cell into a list of context strings.

    Args:
        raw_contexts: The raw cell value. Expected to be either the repr of a
            Python list (e.g. "['ctx a', 'ctx b']") or a plain text passage.

    Returns:
        list[str]: The parsed list when the cell is a list literal (items are
        coerced to ``str``); a one-element list holding the raw text when the
        cell is not a list literal; an empty list when the cell is blank
        (``None`` / NaN).
    """
    if not isinstance(raw_contexts, str):
        # pandas reads a blank cell as NaN (a float that is != itself).
        if raw_contexts is None or (
            isinstance(raw_contexts, float) and raw_contexts != raw_contexts
        ):
            return []
        return [str(raw_contexts)]

    try:
        parsed = ast.literal_eval(raw_contexts)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Not a Python literal: treat the whole cell as a single context.
        # Only the errors literal_eval documents are caught, so that
        # KeyboardInterrupt and friends still propagate.
        return [raw_contexts]

    if not isinstance(parsed, list):
        return [raw_contexts]
    return [item if isinstance(item, str) else str(item) for item in parsed]


async def main():
    """Score every row of the input workbook and append each result to Excel.

    Reads 'Query', 'Ground_truth', 'Response' and 'Retrievaled_context' from
    ``merged_test_dataset_1.xlsx``, evaluates each row with five Ragas metrics,
    and appends one row per sample to ``evaluation_results_1.xlsx`` as soon as
    it is scored, so rows already written survive a later failure.

    If the output file already exists, new rows are appended below its current
    content; it is not cleared.

    Raises:
        KeyError: If a required column is missing from the input file.
        PermissionError: If the output file cannot be written after 3 attempts
            (for example because it is open in another program).
    """
    input_file = 'merged_test_dataset_1.xlsx'
    df = pd.read_excel(input_file)

    # Metric objects are built once, outside the loop, and reused for every row.
    context_precision = LLMContextPrecisionWithoutReference(llm=evaluator_llm)
    scorer_relevance = ContextRelevance(llm=evaluator_llm)
    scorer_accuracy = AnswerAccuracy(llm=evaluator_llm)
    scorer_similarity = SemanticSimilarity(embeddings=evaluator_embeddings_wrapper)
    scorer_faithfulness = Faithfulness(llm=evaluator_llm)

    output_file = 'evaluation_results_1.xlsx'

    # First run: create the workbook with only the header row.
    if not os.path.exists(output_file):
        header_df = pd.DataFrame(columns=[
            'Query', 'Ground_truth', 'Response', 'Retrievaled_context',
            'Context_Precision', 'Context_Relevance', 'Answer_Accuracy',
            'Semantic_Similarity', 'Faithfulness', 'Evaluation_Time'
        ])
        header_df.to_excel(output_file, index=False)

    for index, row in tqdm(df.iterrows(), total=len(df), desc="评估进度"):
        # perf_counter is monotonic, so the duration cannot go negative if the
        # system clock is adjusted mid-run.
        start_time = time.perf_counter()

        user_input = row['Query']
        reference = row['Ground_truth']
        response = row['Response']
        retrieved_context_str = row['Retrievaled_context']
        retrieved_contexts = _parse_retrieved_contexts(retrieved_context_str)

        # Context precision and faithfulness take the same fields, so they
        # share one sample.
        grounded_sample = SingleTurnSample(
            user_input=user_input,
            response=response,
            retrieved_contexts=retrieved_contexts,
        )
        relevance_sample = SingleTurnSample(
            user_input=user_input,
            retrieved_contexts=retrieved_contexts,
        )
        accuracy_sample = SingleTurnSample(
            user_input=user_input,
            response=response,
            reference=reference,
        )
        similarity_sample = SingleTurnSample(
            response=response,
            reference=reference,
        )

        # Metrics are awaited one after another, in the original order.
        score1 = await context_precision.single_turn_ascore(grounded_sample)
        score2 = await scorer_relevance.single_turn_ascore(relevance_sample)
        score3 = await scorer_accuracy.single_turn_ascore(accuracy_sample)
        score4 = await scorer_similarity.single_turn_ascore(similarity_sample)
        score5 = await scorer_faithfulness.single_turn_ascore(grounded_sample)

        evaluation_time = time.perf_counter() - start_time

        result = {
            'Query': user_input,
            'Ground_truth': reference,
            'Response': response,
            # The raw cell is stored, not the parsed list, so the output
            # mirrors the input column.
            'Retrievaled_context': retrieved_context_str,
            'Context_Precision': score1,
            'Context_Relevance': score2,
            'Answer_Accuracy': score3,
            'Semantic_Similarity': score4,
            'Faithfulness': score5,
            'Evaluation_Time': evaluation_time
        }

        # Append this single row to the workbook, retrying on PermissionError
        # (raised, for instance, while the file is open elsewhere).
        result_df = pd.DataFrame([result])
        retries = 3
        for attempt in range(retries):
            try:
                with pd.ExcelWriter(output_file, mode='a', engine='openpyxl', if_sheet_exists='overlay') as writer:
                    sheet = writer.book.active
                    # Write below the last used row of the same sheet that was
                    # measured, without repeating the header.
                    result_df.to_excel(
                        writer,
                        sheet_name=sheet.title,
                        index=False,
                        header=False,
                        startrow=sheet.max_row,
                    )
                print(f"Row {index + 1} saved to {output_file}")
                break
            except PermissionError as exc:
                if attempt == retries - 1:
                    # Out of attempts: fail now instead of announcing a retry
                    # that will never happen.
                    raise PermissionError(f"Failed to write to {output_file} after {retries} attempts. Ensure the file is closed.") from exc
                print(f"Permission denied on attempt {attempt + 1} for row {index + 1}. Please close the file if it's open. Retrying in 5 seconds...")
                # Non-blocking wait: time.sleep() would stall the event loop.
                await asyncio.sleep(5)

# Entry point: when this code runs as the main module, asyncio.run() drives
# the main() coroutine to completion.
if __name__ == "__main__":
    asyncio.run(main())

  from .autonotebook import tqdm as notebook_tqdm
                response_format was transferred to model_kwargs.
                Please confirm that response_format is what you intended.
  if await self.run_code(code, result, async_=asy):
  evaluator_embeddings_wrapper = LangchainEmbeddingsWrapper(evaluator_embedding)
评估进度:   3%|▎         | 1/35 [01:42<58:15, 102.80s/it]

Row 1 saved to evaluation_results_1.xlsx


评估进度:   6%|▌         | 2/35 [03:12<52:13, 94.97s/it] 

Row 2 saved to evaluation_results_1.xlsx


评估进度:   9%|▊         | 3/35 [04:23<44:56, 84.26s/it]

Row 3 saved to evaluation_results_1.xlsx


评估进度:  11%|█▏        | 4/35 [05:26<39:06, 75.68s/it]

Row 4 saved to evaluation_results_1.xlsx


评估进度:  14%|█▍        | 5/35 [06:16<33:16, 66.54s/it]

Row 5 saved to evaluation_results_1.xlsx


评估进度:  17%|█▋        | 6/35 [07:37<34:33, 71.51s/it]

Row 6 saved to evaluation_results_1.xlsx


评估进度:  20%|██        | 7/35 [08:36<31:22, 67.24s/it]

Row 7 saved to evaluation_results_1.xlsx


评估进度:  23%|██▎       | 8/35 [09:26<27:48, 61.79s/it]

Row 8 saved to evaluation_results_1.xlsx


评估进度:  26%|██▌       | 9/35 [10:35<27:46, 64.09s/it]

Row 9 saved to evaluation_results_1.xlsx


评估进度:  29%|██▊       | 10/35 [11:28<25:19, 60.76s/it]

Row 10 saved to evaluation_results_1.xlsx


评估进度:  31%|███▏      | 11/35 [13:04<28:35, 71.49s/it]

Row 11 saved to evaluation_results_1.xlsx


评估进度:  34%|███▍      | 12/35 [14:34<29:30, 76.99s/it]

Row 12 saved to evaluation_results_1.xlsx


评估进度:  37%|███▋      | 13/35 [15:52<28:24, 77.47s/it]

Row 13 saved to evaluation_results_1.xlsx


评估进度:  40%|████      | 14/35 [17:29<29:11, 83.42s/it]

Row 14 saved to evaluation_results_1.xlsx


评估进度:  43%|████▎     | 15/35 [18:47<27:12, 81.62s/it]

Row 15 saved to evaluation_results_1.xlsx


评估进度:  46%|████▌     | 16/35 [19:35<22:38, 71.50s/it]

Row 16 saved to evaluation_results_1.xlsx


评估进度:  49%|████▊     | 17/35 [20:37<20:37, 68.74s/it]

Row 17 saved to evaluation_results_1.xlsx


评估进度:  51%|█████▏    | 18/35 [21:34<18:25, 65.02s/it]

Row 18 saved to evaluation_results_1.xlsx


评估进度:  54%|█████▍    | 19/35 [22:42<17:36, 66.02s/it]

Row 19 saved to evaluation_results_1.xlsx


评估进度:  57%|█████▋    | 20/35 [23:26<14:50, 59.37s/it]

Row 20 saved to evaluation_results_1.xlsx


评估进度:  60%|██████    | 21/35 [24:27<13:57, 59.83s/it]

Row 21 saved to evaluation_results_1.xlsx


评估进度:  63%|██████▎   | 22/35 [25:26<12:53, 59.51s/it]

Row 22 saved to evaluation_results_1.xlsx


评估进度:  66%|██████▌   | 23/35 [27:33<16:00, 80.04s/it]

Row 23 saved to evaluation_results_1.xlsx


评估进度:  69%|██████▊   | 24/35 [28:30<13:24, 73.10s/it]

Row 24 saved to evaluation_results_1.xlsx


评估进度:  71%|███████▏  | 25/35 [29:30<11:30, 69.09s/it]

Row 25 saved to evaluation_results_1.xlsx


评估进度:  74%|███████▍  | 26/35 [30:18<09:24, 62.67s/it]

Row 26 saved to evaluation_results_1.xlsx


评估进度:  77%|███████▋  | 27/35 [31:46<09:22, 70.34s/it]

Row 27 saved to evaluation_results_1.xlsx


评估进度:  80%|████████  | 28/35 [32:43<07:43, 66.21s/it]

Row 28 saved to evaluation_results_1.xlsx


评估进度:  83%|████████▎ | 29/35 [33:35<06:12, 62.15s/it]

Row 29 saved to evaluation_results_1.xlsx


评估进度:  86%|████████▌ | 30/35 [34:36<05:08, 61.77s/it]

Row 30 saved to evaluation_results_1.xlsx


评估进度:  89%|████████▊ | 31/35 [35:34<04:02, 60.60s/it]

Row 31 saved to evaluation_results_1.xlsx


评估进度:  91%|█████████▏| 32/35 [37:24<03:46, 75.40s/it]

Row 32 saved to evaluation_results_1.xlsx


评估进度:  94%|█████████▍| 33/35 [38:32<02:26, 73.35s/it]

Row 33 saved to evaluation_results_1.xlsx


评估进度:  97%|█████████▋| 34/35 [40:01<01:17, 77.82s/it]

Row 34 saved to evaluation_results_1.xlsx


评估进度: 100%|██████████| 35/35 [41:04<00:00, 70.40s/it]

Row 35 saved to evaluation_results_1.xlsx



