In [None]:
import os
import asyncio
import pandas as pd
import time
import ast
from tqdm import tqdm
from ragas.dataset_schema import SingleTurnSample
from ragas.metrics import (
    LLMContextPrecisionWithoutReference,
    ContextRelevance,
    AnswerAccuracy,
    SemanticSimilarity,
    Faithfulness
)
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Configure the API key; the base URL is handed to each client below
os.environ["OPENAI_API_KEY"] = "your api key"

# Connection settings shared by the chat model and the embedding model
_openai_connection = {
    "openai_api_key": os.environ["OPENAI_API_KEY"],
    "base_url": "your base url",
}

# LangChain chat model (custom base_url supported), wrapped as the Ragas LLM
llm_instance = ChatOpenAI(model="deepseek-chat", **_openai_connection)
evaluator_llm = LangchainLLMWrapper(llm_instance)

# LangChain embeddings using text-embedding-3-small, wrapped as the Ragas embeddings
evaluator_embedding = OpenAIEmbeddings(model="text-embedding-3-small", **_openai_connection)
evaluator_embeddings_wrapper = LangchainEmbeddingsWrapper(evaluator_embedding)

def _parse_retrieved_contexts(raw_contexts):
    """Turn one ``Retrievaled_context`` cell into the list used as ``retrieved_contexts``.

    The cell is parsed as a Python literal. If it parses to a list, that list
    is returned; in every other case (parse failure, or a literal that is not
    a list) the raw cell value is returned wrapped in a one-element list.
    """
    try:
        parsed = ast.literal_eval(raw_contexts)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Exactly the failures ast.literal_eval documents for malformed input.
        # A bare ``except`` here would also swallow KeyboardInterrupt/SystemExit.
        return [raw_contexts]
    if isinstance(parsed, list):
        return parsed
    return [raw_contexts]


async def main():
    """Score every row of ``input.csv`` with five Ragas metrics and save a CSV.

    Reads the columns ``Query``, ``Ground_truth``, ``Response`` and
    ``Retrievaled_context``, evaluates each row one metric after another, and
    writes all rows plus their scores and per-row evaluation time to
    ``evaluation_results.csv`` once the loop has finished.

    Any exception raised by a metric propagates to the caller; in that case
    nothing is written.
    """
    # Read the input CSV file
    input_file = 'input.csv'
    df = pd.read_csv(input_file)

    # Create the scorers once, outside the loop, so they are reused for every row
    context_precision = LLMContextPrecisionWithoutReference(llm=evaluator_llm)
    scorer_relevance = ContextRelevance(llm=evaluator_llm)
    scorer_accuracy = AnswerAccuracy(llm=evaluator_llm)
    scorer_similarity = SemanticSimilarity(embeddings=evaluator_embeddings_wrapper)
    scorer_faithfulness = Faithfulness(llm=evaluator_llm)

    results = []

    for _, row in tqdm(df.iterrows(), total=len(df), desc="评估进度"):
        # perf_counter is monotonic, so the measured duration cannot be
        # distorted by wall-clock adjustments during a long evaluation.
        start_time = time.perf_counter()

        user_input = row['Query']
        reference = row['Ground_truth']
        response = row['Response']
        retrieved_context_str = row['Retrievaled_context']

        # Parse the context cell into a list, falling back to a single-item list
        retrieved_contexts = _parse_retrieved_contexts(retrieved_context_str)

        # Build one sample per metric and score it
        sample1 = SingleTurnSample(
            user_input=user_input,
            response=response,
            retrieved_contexts=retrieved_contexts,
        )
        score1 = await context_precision.single_turn_ascore(sample1)

        sample2 = SingleTurnSample(
            user_input=user_input,
            retrieved_contexts=retrieved_contexts,
        )
        score2 = await scorer_relevance.single_turn_ascore(sample2)

        sample3 = SingleTurnSample(
            user_input=user_input,
            response=response,
            reference=reference
        )
        score3 = await scorer_accuracy.single_turn_ascore(sample3)

        sample4 = SingleTurnSample(
            response=response,
            reference=reference
        )
        score4 = await scorer_similarity.single_turn_ascore(sample4)

        sample5 = SingleTurnSample(
            user_input=user_input,
            response=response,
            retrieved_contexts=retrieved_contexts,
        )
        score5 = await scorer_faithfulness.single_turn_ascore(sample5)

        evaluation_time = time.perf_counter() - start_time

        # Record the row; the context column keeps the raw, unparsed cell value
        results.append({
            'Query': user_input,
            'Ground_truth': reference,
            'Response': response,
            'Retrievaled_context': retrieved_context_str,
            'Context_Precision': score1,
            'Context_Relevance': score2,
            'Answer_Accuracy': score3,
            'Semantic_Similarity': score4,
            'Faithfulness': score5,
            'Evaluation_Time': evaluation_time
        })

    # Save all results to a new CSV file
    output_df = pd.DataFrame(results)
    output_file = 'evaluation_results.csv'
    output_df.to_csv(output_file, index=False)
    print(f"Results saved to {output_file}")

if __name__ == "__main__":
    # Create the coroutine first, then let asyncio.run drive it to completion
    evaluation_run = main()
    asyncio.run(evaluation_run)

In [None]:
import os
import asyncio
import pandas as pd
import time
import ast
from tqdm import tqdm
from ragas.dataset_schema import SingleTurnSample
from ragas.metrics import (
    LLMContextPrecisionWithoutReference,
    ContextRelevance,
    AnswerAccuracy,
    SemanticSimilarity,
    Faithfulness
)
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
import openpyxl  # 确保安装 openpyxl 以支持 Excel 追加

# Configure the API key; the base URL is handed to each client below
os.environ["OPENAI_API_KEY"] = "your api key"

# Create the LangChain chat model with a custom base_url and force JSON output
# (works around OutputParserException when parsing model replies).
# response_format is passed through model_kwargs explicitly: given as a direct
# constructor argument it is only relocated into model_kwargs by LangChain,
# which emits a "response_format was transferred to model_kwargs" warning.
llm_instance = ChatOpenAI(
    model="deepseek-chat",
    openai_api_key=os.getenv("OPENAI_API_KEY"),
    base_url="your base url",
    model_kwargs={"response_format": {"type": "json_object"}},
)

# Wrap as the Ragas LLM
evaluator_llm = LangchainLLMWrapper(llm_instance)

# Create the LangChain embeddings instance using text-embedding-3-small
evaluator_embedding = OpenAIEmbeddings(
    model="text-embedding-3-small",
    openai_api_key=os.getenv("OPENAI_API_KEY"),
    base_url="your base url",
)

# Wrap as the Ragas embeddings
evaluator_embeddings_wrapper = LangchainEmbeddingsWrapper(evaluator_embedding)

def _parse_retrieved_contexts(raw_contexts):
    """Turn one ``Retrievaled_context`` cell into the list used as ``retrieved_contexts``.

    The cell is parsed as a Python literal. If it parses to a list, that list
    is returned; in every other case (parse failure, or a literal that is not
    a list) the raw cell value is returned wrapped in a one-element list.
    """
    try:
        parsed = ast.literal_eval(raw_contexts)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Exactly the failures ast.literal_eval documents for malformed input.
        # A bare ``except`` here would also swallow KeyboardInterrupt/SystemExit.
        return [raw_contexts]
    if isinstance(parsed, list):
        return parsed
    return [raw_contexts]


async def _append_result_row(result, output_file, row_number, retries=3, retry_delay=5):
    """Append one result dict as a new row below the existing rows of the workbook.

    Args:
        result: Mapping of column name to value for a single evaluated row.
        output_file: Path of an existing ``.xlsx`` file to append to.
        row_number: 1-based row number, used only in the progress messages.
        retries: Maximum number of write attempts.
        retry_delay: Seconds to wait between attempts.

    Raises:
        PermissionError: If every attempt fails with ``PermissionError``.
    """
    result_df = pd.DataFrame([result])
    for attempt in range(1, retries + 1):
        try:
            with pd.ExcelWriter(output_file, mode='a', engine='openpyxl', if_sheet_exists='overlay') as writer:
                sheet = writer.book.active
                # max_row is the 1-based index of the last used row, which is
                # the 0-based startrow of the first free row. The sheet name is
                # pinned to the sheet the row count came from, so the offset
                # and the write target always refer to the same sheet.
                result_df.to_excel(
                    writer,
                    sheet_name=sheet.title,
                    index=False,
                    header=False,
                    startrow=sheet.max_row,
                )
        except PermissionError as exc:
            if attempt == retries:
                # No retry left: fail now instead of announcing a retry and sleeping
                raise PermissionError(f"Failed to write to {output_file} after {retries} attempts. Ensure the file is closed.") from exc
            print(f"Permission denied on attempt {attempt} for row {row_number}. Please close the file if it's open. Retrying in {retry_delay} seconds...")
            # Non-blocking wait: time.sleep would stall the event loop this coroutine runs on
            await asyncio.sleep(retry_delay)
        else:
            print(f"Row {row_number} saved to {output_file}")
            return


async def main():
    """Score every row of ``merged_test_dataset_1.xlsx`` and append results to Excel.

    Reads the columns ``Query``, ``Ground_truth``, ``Response`` and
    ``Retrievaled_context``, evaluates each row with five Ragas metrics one
    after another, and appends the row plus its scores and evaluation time to
    ``evaluation_results_1.xlsx`` immediately, so completed rows are already
    on disk if a later row fails.

    The output file is created with a header row only when it does not exist;
    if it already exists, new rows are appended below its current content.

    Raises:
        PermissionError: If a row cannot be written after all retry attempts.
    """
    # Read the input XLSX file
    input_file = 'merged_test_dataset_1.xlsx'
    df = pd.read_excel(input_file)

    # Create the scorers once, outside the loop, so they are reused for every row
    context_precision = LLMContextPrecisionWithoutReference(llm=evaluator_llm)
    scorer_relevance = ContextRelevance(llm=evaluator_llm)
    scorer_accuracy = AnswerAccuracy(llm=evaluator_llm)
    scorer_similarity = SemanticSimilarity(embeddings=evaluator_embeddings_wrapper)
    scorer_faithfulness = Faithfulness(llm=evaluator_llm)

    output_file = 'evaluation_results_1.xlsx'

    # If the output file does not exist, create it containing only the header row
    if not os.path.exists(output_file):
        header_df = pd.DataFrame(columns=[
            'Query', 'Ground_truth', 'Response', 'Retrievaled_context',
            'Context_Precision', 'Context_Relevance', 'Answer_Accuracy',
            'Semantic_Similarity', 'Faithfulness', 'Evaluation_Time'
        ])
        header_df.to_excel(output_file, index=False)

    for index, row in tqdm(df.iterrows(), total=len(df), desc="评估进度"):
        # perf_counter is monotonic, so the measured duration cannot be
        # distorted by wall-clock adjustments during a long evaluation.
        start_time = time.perf_counter()

        user_input = row['Query']
        reference = row['Ground_truth']
        response = row['Response']
        retrieved_context_str = row['Retrievaled_context']

        # Parse the context cell into a list, falling back to a single-item list
        retrieved_contexts = _parse_retrieved_contexts(retrieved_context_str)

        # Build one sample per metric and score it
        sample1 = SingleTurnSample(
            user_input=user_input,
            response=response,
            retrieved_contexts=retrieved_contexts,
        )
        score1 = await context_precision.single_turn_ascore(sample1)

        sample2 = SingleTurnSample(
            user_input=user_input,
            retrieved_contexts=retrieved_contexts,
        )
        score2 = await scorer_relevance.single_turn_ascore(sample2)

        sample3 = SingleTurnSample(
            user_input=user_input,
            response=response,
            reference=reference
        )
        score3 = await scorer_accuracy.single_turn_ascore(sample3)

        sample4 = SingleTurnSample(
            response=response,
            reference=reference
        )
        score4 = await scorer_similarity.single_turn_ascore(sample4)

        sample5 = SingleTurnSample(
            user_input=user_input,
            response=response,
            retrieved_contexts=retrieved_contexts,
        )
        score5 = await scorer_faithfulness.single_turn_ascore(sample5)

        evaluation_time = time.perf_counter() - start_time

        # Single-row result; the context column keeps the raw, unparsed cell value
        result = {
            'Query': user_input,
            'Ground_truth': reference,
            'Response': response,
            'Retrievaled_context': retrieved_context_str,
            'Context_Precision': score1,
            'Context_Relevance': score2,
            'Answer_Accuracy': score3,
            'Semantic_Similarity': score4,
            'Faithfulness': score5,
            'Evaluation_Time': evaluation_time
        }

        # Append this row to the Excel file right away, retrying on PermissionError
        await _append_result_row(result, output_file, index + 1)

if __name__ == "__main__":
    # Create the coroutine first, then let asyncio.run drive it to completion
    evaluation_run = main()
    asyncio.run(evaluation_run)

                response_format was transferred to model_kwargs.
                Please confirm that response_format is what you intended.
  if await self.run_code(code, result, async_=asy):
  evaluator_embeddings_wrapper = LangchainEmbeddingsWrapper(evaluator_embedding)
评估进度:   3%|▎         | 1/32 [01:40<51:57, 100.57s/it]

Row 1 saved to evaluation_results_1.xlsx


评估进度:   6%|▋         | 2/32 [02:47<40:24, 80.81s/it] 

Row 2 saved to evaluation_results_1.xlsx


评估进度:   9%|▉         | 3/32 [03:26<29:55, 61.92s/it]

Row 3 saved to evaluation_results_1.xlsx


评估进度:  12%|█▎        | 4/32 [04:31<29:21, 62.92s/it]

Row 4 saved to evaluation_results_1.xlsx


评估进度:  16%|█▌        | 5/32 [05:01<23:00, 51.12s/it]

Row 5 saved to evaluation_results_1.xlsx


评估进度:  19%|█▉        | 6/32 [05:42<20:34, 47.49s/it]

Row 6 saved to evaluation_results_1.xlsx


评估进度:  22%|██▏       | 7/32 [06:13<17:38, 42.35s/it]

Row 7 saved to evaluation_results_1.xlsx


评估进度:  25%|██▌       | 8/32 [06:48<15:54, 39.77s/it]

Row 8 saved to evaluation_results_1.xlsx


评估进度:  28%|██▊       | 9/32 [07:26<15:03, 39.27s/it]

Row 9 saved to evaluation_results_1.xlsx


评估进度:  31%|███▏      | 10/32 [08:01<13:58, 38.13s/it]

Row 10 saved to evaluation_results_1.xlsx


评估进度:  34%|███▍      | 11/32 [08:53<14:50, 42.40s/it]

Row 11 saved to evaluation_results_1.xlsx


评估进度:  38%|███▊      | 12/32 [09:28<13:22, 40.12s/it]

Row 12 saved to evaluation_results_1.xlsx


评估进度:  41%|████      | 13/32 [09:55<11:22, 35.91s/it]

Row 13 saved to evaluation_results_1.xlsx


评估进度:  44%|████▍     | 14/32 [10:39<11:32, 38.47s/it]

Row 14 saved to evaluation_results_1.xlsx


评估进度:  47%|████▋     | 15/32 [11:14<10:35, 37.35s/it]

Row 15 saved to evaluation_results_1.xlsx


评估进度:  50%|█████     | 16/32 [11:33<08:30, 31.92s/it]

Row 16 saved to evaluation_results_1.xlsx


评估进度:  53%|█████▎    | 17/32 [11:57<07:23, 29.57s/it]

Row 17 saved to evaluation_results_1.xlsx


评估进度:  56%|█████▋    | 18/32 [12:23<06:38, 28.43s/it]

Row 18 saved to evaluation_results_1.xlsx


评估进度:  59%|█████▉    | 19/32 [12:55<06:23, 29.53s/it]

Row 19 saved to evaluation_results_1.xlsx


评估进度:  62%|██████▎   | 20/32 [13:35<06:31, 32.65s/it]

Row 20 saved to evaluation_results_1.xlsx


评估进度:  66%|██████▌   | 21/32 [13:57<05:25, 29.55s/it]

Row 21 saved to evaluation_results_1.xlsx


评估进度:  69%|██████▉   | 22/32 [14:25<04:50, 29.09s/it]

Row 22 saved to evaluation_results_1.xlsx


评估进度:  72%|███████▏  | 23/32 [14:55<04:23, 29.32s/it]

Row 23 saved to evaluation_results_1.xlsx


评估进度:  75%|███████▌  | 24/32 [16:16<05:58, 44.76s/it]

Row 24 saved to evaluation_results_1.xlsx


评估进度:  78%|███████▊  | 25/32 [17:00<05:10, 44.42s/it]

Row 25 saved to evaluation_results_1.xlsx


评估进度:  81%|████████▏ | 26/32 [18:09<05:11, 51.89s/it]

Row 26 saved to evaluation_results_1.xlsx


评估进度:  84%|████████▍ | 27/32 [19:07<04:28, 53.63s/it]

Row 27 saved to evaluation_results_1.xlsx


评估进度:  88%|████████▊ | 28/32 [19:39<03:09, 47.32s/it]

Row 28 saved to evaluation_results_1.xlsx


评估进度:  91%|█████████ | 29/32 [20:27<02:22, 47.54s/it]

Row 29 saved to evaluation_results_1.xlsx


评估进度:  94%|█████████▍| 30/32 [20:49<01:19, 39.82s/it]

Row 30 saved to evaluation_results_1.xlsx


评估进度:  97%|█████████▋| 31/32 [21:11<00:34, 34.56s/it]

Row 31 saved to evaluation_results_1.xlsx


评估进度: 100%|██████████| 32/32 [22:26<00:00, 42.07s/it]

Row 32 saved to evaluation_results_1.xlsx



