In [None]:
import os
import asyncio
import pandas as pd
import time
import ast
from tqdm import tqdm
from ragas.dataset_schema import SingleTurnSample
from ragas.metrics import (
    LLMContextPrecisionWithoutReference,
    ContextRelevance,
    AnswerAccuracy,
    SemanticSimilarity,
    Faithfulness
)
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Configure the API key.
# setdefault keeps a key that is already exported in the environment instead of
# overwriting it with the literal below; the literal is only used as a fallback
# when OPENAI_API_KEY is not set at all.
os.environ.setdefault("OPENAI_API_KEY", "your api key")

# Base URL of the OpenAI-compatible endpoint, shared by the chat and embedding clients.
OPENAI_BASE_URL = "your base url"

# Build the LangChain chat model against the custom base URL.
llm_instance = ChatOpenAI(
    model="deepseek-chat",
    openai_api_key=os.getenv("OPENAI_API_KEY"),
    base_url=OPENAI_BASE_URL,
)

# Wrap the chat model so ragas metrics can use it as their LLM.
evaluator_llm = LangchainLLMWrapper(llm_instance)

# Build the LangChain embedding client using the text-embedding-3-small model.
evaluator_embedding = OpenAIEmbeddings(
    model="text-embedding-3-small",
    openai_api_key=os.getenv("OPENAI_API_KEY"),
    base_url=OPENAI_BASE_URL,
)

# Wrap the embedding client so ragas metrics can use it for embeddings.
evaluator_embeddings_wrapper = LangchainEmbeddingsWrapper(evaluator_embedding)

def _parse_retrieved_contexts(raw_contexts):
    """Convert one 'Retrievaled_context' cell into a list of context strings.

    Args:
        raw_contexts: The raw cell value. Usually a string, but pandas yields
            NaN for empty cells and numbers for numeric cells.

    Returns:
        list[str]:
            * a list (or a string holding a Python list literal) -> its items,
              each converted with ``str``;
            * any other string -> ``[raw_contexts]``;
            * ``None`` / NaN -> ``[]``;
            * any other scalar -> ``[str(raw_contexts)]``.
    """
    if isinstance(raw_contexts, list):
        return [str(item) for item in raw_contexts]

    if not isinstance(raw_contexts, str):
        # Empty cells are read by pandas as NaN; treat them as "no contexts".
        if raw_contexts is None or pd.isna(raw_contexts):
            return []
        return [str(raw_contexts)]

    try:
        parsed = ast.literal_eval(raw_contexts)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Not a Python literal: the whole cell is one context passage.
        return [raw_contexts]

    if isinstance(parsed, list):
        return [str(item) for item in parsed]
    # A literal that is not a list (number, dict, tuple, ...) is kept as raw text.
    return [raw_contexts]


async def main():
    """Score every row of ``input.csv`` with five ragas metrics and save the results.

    Reads the columns 'Query', 'Ground_truth', 'Response' and
    'Retrievaled_context' from ``input.csv``, awaits each metric's
    ``single_turn_ascore`` one after another for every row, and writes one
    output row per input row (inputs, the five scores and the elapsed wall-clock
    seconds) to ``evaluation_results.csv``.

    The output file is written only after all rows have been scored, so an
    exception raised while scoring propagates without writing anything.
    """
    # Load the input CSV.
    input_file = 'input.csv'
    df = pd.read_csv(input_file)

    # Build the metric objects once, outside the loop, so they are reused for every row.
    context_precision = LLMContextPrecisionWithoutReference(llm=evaluator_llm)
    scorer_relevance = ContextRelevance(llm=evaluator_llm)
    scorer_accuracy = AnswerAccuracy(llm=evaluator_llm)
    scorer_similarity = SemanticSimilarity(embeddings=evaluator_embeddings_wrapper)
    scorer_faithfulness = Faithfulness(llm=evaluator_llm)

    results = []

    for _, row in tqdm(df.iterrows(), total=len(df), desc="评估进度"):
        start_time = time.time()

        user_input = row['Query']
        reference = row['Ground_truth']
        response = row['Response']
        retrieved_context_str = row['Retrievaled_context']
        retrieved_contexts = _parse_retrieved_contexts(retrieved_context_str)

        # Context precision and faithfulness take the same fields, so one sample serves both.
        grounded_sample = SingleTurnSample(
            user_input=user_input,
            response=response,
            retrieved_contexts=retrieved_contexts,
        )
        score1 = await context_precision.single_turn_ascore(grounded_sample)

        relevance_sample = SingleTurnSample(
            user_input=user_input,
            retrieved_contexts=retrieved_contexts,
        )
        score2 = await scorer_relevance.single_turn_ascore(relevance_sample)

        accuracy_sample = SingleTurnSample(
            user_input=user_input,
            response=response,
            reference=reference,
        )
        score3 = await scorer_accuracy.single_turn_ascore(accuracy_sample)

        similarity_sample = SingleTurnSample(
            response=response,
            reference=reference,
        )
        score4 = await scorer_similarity.single_turn_ascore(similarity_sample)

        score5 = await scorer_faithfulness.single_turn_ascore(grounded_sample)

        evaluation_time = time.time() - start_time

        # Record the row; the context column keeps the raw cell value, not the parsed list.
        results.append({
            'Query': user_input,
            'Ground_truth': reference,
            'Response': response,
            'Retrievaled_context': retrieved_context_str,
            'Context_Precision': score1,
            'Context_Relevance': score2,
            'Answer_Accuracy': score3,
            'Semantic_Similarity': score4,
            'Faithfulness': score5,
            'Evaluation_Time': evaluation_time
        })

    # Save all results to a new CSV file.
    output_df = pd.DataFrame(results)
    output_file = 'evaluation_results.csv'
    output_df.to_csv(output_file, index=False)
    print(f"Results saved to {output_file}")

# Entry point: create the evaluation coroutine and run it to completion on a
# new event loop when this code executes as the main module.
# NOTE(review): asyncio.run raises RuntimeError when an event loop is already
# running in the current thread (as in a notebook kernel) unless the loop has
# been patched to allow nesting - confirm how this cell is meant to be run.
if __name__ == "__main__":
    evaluation_run = main()
    asyncio.run(evaluation_run)

In [None]:
import os
import asyncio
import pandas as pd
import time
import ast
from tqdm import tqdm
from ragas.dataset_schema import SingleTurnSample
from ragas.metrics import (
    LLMContextPrecisionWithoutReference,
    ContextRelevance,
    AnswerAccuracy,
    SemanticSimilarity,
    Faithfulness
)
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
import openpyxl  # 确保安装 openpyxl 以支持 Excel 追加

# Configure the API key.
# setdefault keeps a key that is already exported in the environment instead of
# overwriting it with the literal below; the literal is only used as a fallback
# when OPENAI_API_KEY is not set at all.
os.environ.setdefault("OPENAI_API_KEY", "your api key")

# Base URL of the OpenAI-compatible endpoint, shared by the chat and embedding clients.
OPENAI_BASE_URL = "your base url"

# Build the LangChain chat model against the custom base URL and force JSON
# output mode (added to fix OutputParserException).
# response_format is passed through model_kwargs explicitly: given as a direct
# constructor argument, LangChain moves it into model_kwargs anyway and emits a
# "response_format was transferred to model_kwargs" warning.
# NOTE(review): model_kwargs apply to every request made through this model, so
# every metric sharing evaluator_llm runs in JSON mode - confirm that is intended.
llm_instance = ChatOpenAI(
    model="deepseek-chat",
    openai_api_key=os.getenv("OPENAI_API_KEY"),
    base_url=OPENAI_BASE_URL,
    model_kwargs={"response_format": {"type": "json_object"}},
)

# Wrap the chat model so ragas metrics can use it as their LLM.
evaluator_llm = LangchainLLMWrapper(llm_instance)

# Build the LangChain embedding client using the text-embedding-3-small model.
evaluator_embedding = OpenAIEmbeddings(
    model="text-embedding-3-small",
    openai_api_key=os.getenv("OPENAI_API_KEY"),
    base_url=OPENAI_BASE_URL,
)

# Wrap the embedding client so ragas metrics can use it for embeddings.
evaluator_embeddings_wrapper = LangchainEmbeddingsWrapper(evaluator_embedding)

def _parse_retrieved_contexts(raw_contexts):
    """Convert one 'Retrievaled_context' cell into a list of context strings.

    Args:
        raw_contexts: The raw cell value. Usually a string, but pandas yields
            NaN for empty cells and numbers for numeric cells.

    Returns:
        list[str]:
            * a list (or a string holding a Python list literal) -> its items,
              each converted with ``str``;
            * any other string -> ``[raw_contexts]``;
            * ``None`` / NaN -> ``[]``;
            * any other scalar -> ``[str(raw_contexts)]``.
    """
    if isinstance(raw_contexts, list):
        return [str(item) for item in raw_contexts]

    if not isinstance(raw_contexts, str):
        # Empty cells are read by pandas as NaN; treat them as "no contexts".
        if raw_contexts is None or pd.isna(raw_contexts):
            return []
        return [str(raw_contexts)]

    try:
        parsed = ast.literal_eval(raw_contexts)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Not a Python literal: the whole cell is one context passage.
        return [raw_contexts]

    if isinstance(parsed, list):
        return [str(item) for item in parsed]
    # A literal that is not a list (number, dict, tuple, ...) is kept as raw text.
    return [raw_contexts]


async def main():
    """Score every row of the input workbook and append each result to the output workbook.

    Reads the columns 'Query', 'Ground_truth', 'Response' and
    'Retrievaled_context' from ``merged_test_dataset_1.xlsx``, awaits each
    metric's ``single_turn_ascore`` one after another for every row, and appends
    one row (inputs, the five scores and the elapsed wall-clock seconds) to
    ``evaluation_results_1.xlsx`` as soon as that row has been scored.

    If the output workbook does not exist it is created with a header row
    first; if it already exists, new rows are appended below its current
    content.

    Raises:
        PermissionError: If a row cannot be written after three attempts.
    """
    # Load the input XLSX file.
    input_file = 'merged_test_dataset_1.xlsx'
    df = pd.read_excel(input_file)

    # Build the metric objects once, outside the loop, so they are reused for every row.
    context_precision = LLMContextPrecisionWithoutReference(llm=evaluator_llm)
    scorer_relevance = ContextRelevance(llm=evaluator_llm)
    scorer_accuracy = AnswerAccuracy(llm=evaluator_llm)
    scorer_similarity = SemanticSimilarity(embeddings=evaluator_embeddings_wrapper)
    scorer_faithfulness = Faithfulness(llm=evaluator_llm)

    output_file = 'evaluation_results_1.xlsx'

    # Create the output workbook with only a header row when it does not exist yet.
    if not os.path.exists(output_file):
        header_df = pd.DataFrame(columns=[
            'Query', 'Ground_truth', 'Response', 'Retrievaled_context',
            'Context_Precision', 'Context_Relevance', 'Answer_Accuracy',
            'Semantic_Similarity', 'Faithfulness', 'Evaluation_Time'
        ])
        header_df.to_excel(output_file, index=False)

    for index, row in tqdm(df.iterrows(), total=len(df), desc="评估进度"):
        start_time = time.time()

        user_input = row['Query']
        reference = row['Ground_truth']
        response = row['Response']
        retrieved_context_str = row['Retrievaled_context']
        retrieved_contexts = _parse_retrieved_contexts(retrieved_context_str)

        # Context precision and faithfulness take the same fields, so one sample serves both.
        grounded_sample = SingleTurnSample(
            user_input=user_input,
            response=response,
            retrieved_contexts=retrieved_contexts,
        )
        score1 = await context_precision.single_turn_ascore(grounded_sample)

        relevance_sample = SingleTurnSample(
            user_input=user_input,
            retrieved_contexts=retrieved_contexts,
        )
        score2 = await scorer_relevance.single_turn_ascore(relevance_sample)

        accuracy_sample = SingleTurnSample(
            user_input=user_input,
            response=response,
            reference=reference,
        )
        score3 = await scorer_accuracy.single_turn_ascore(accuracy_sample)

        similarity_sample = SingleTurnSample(
            response=response,
            reference=reference,
        )
        score4 = await scorer_similarity.single_turn_ascore(similarity_sample)

        score5 = await scorer_faithfulness.single_turn_ascore(grounded_sample)

        evaluation_time = time.time() - start_time

        # Single-row result; the context column keeps the raw cell value, not the parsed list.
        result = {
            'Query': user_input,
            'Ground_truth': reference,
            'Response': response,
            'Retrievaled_context': retrieved_context_str,
            'Context_Precision': score1,
            'Context_Relevance': score2,
            'Answer_Accuracy': score3,
            'Semantic_Similarity': score4,
            'Faithfulness': score5,
            'Evaluation_Time': evaluation_time
        }

        # Append the row to the workbook (openpyxl engine in append/overlay mode),
        # retrying when the file cannot be opened for writing.
        result_df = pd.DataFrame([result])
        retries = 3
        for attempt in range(retries):
            try:
                with pd.ExcelWriter(output_file, mode='a', engine='openpyxl', if_sheet_exists='overlay') as writer:
                    # max_row of the active sheet is the number of rows already used,
                    # which is the zero-based row at which to_excel writes next.
                    startrow = writer.book.active.max_row
                    # Append the data without repeating the header.
                    result_df.to_excel(writer, index=False, header=False, startrow=startrow)
                print(f"Row {index + 1} saved to {output_file}")
                break
            except PermissionError:
                print(f"Permission denied on attempt {attempt + 1} for row {index + 1}. Please close the file if it's open. Retrying in 5 seconds...")
                # Await instead of time.sleep so the event loop is not blocked during the wait.
                await asyncio.sleep(5)
        else:
            # The loop finished without a successful write.
            raise PermissionError(f"Failed to write to {output_file} after {retries} attempts. Ensure the file is closed.")

# Entry point: create the evaluation coroutine and run it to completion on a
# new event loop when this code executes as the main module.
# NOTE(review): asyncio.run raises RuntimeError when an event loop is already
# running in the current thread (as in a notebook kernel) unless the loop has
# been patched to allow nesting - confirm how this cell is meant to be run.
if __name__ == "__main__":
    evaluation_run = main()
    asyncio.run(evaluation_run)

  from .autonotebook import tqdm as notebook_tqdm
                response_format was transferred to model_kwargs.
                Please confirm that response_format is what you intended.
  if await self.run_code(code, result, async_=asy):
  evaluator_embeddings_wrapper = LangchainEmbeddingsWrapper(evaluator_embedding)
评估进度:   1%|▏         | 1/72 [01:11<1:24:31, 71.43s/it]

Row 1 saved to evaluation_results_1.xlsx


评估进度:   3%|▎         | 2/72 [02:56<1:46:26, 91.23s/it]

Row 2 saved to evaluation_results_1.xlsx


评估进度:   4%|▍         | 3/72 [03:40<1:20:11, 69.74s/it]

Row 3 saved to evaluation_results_1.xlsx


评估进度:   6%|▌         | 4/72 [04:20<1:05:41, 57.97s/it]

Row 4 saved to evaluation_results_1.xlsx


评估进度:   7%|▋         | 5/72 [04:59<56:54, 50.96s/it]  

Row 5 saved to evaluation_results_1.xlsx


评估进度:   8%|▊         | 6/72 [05:38<51:51, 47.14s/it]

Row 6 saved to evaluation_results_1.xlsx


评估进度:  10%|▉         | 7/72 [06:07<44:34, 41.14s/it]

Row 7 saved to evaluation_results_1.xlsx


评估进度:  11%|█         | 8/72 [06:20<34:22, 32.23s/it]

Row 8 saved to evaluation_results_1.xlsx


评估进度:  12%|█▎        | 9/72 [06:40<29:34, 28.17s/it]

Row 9 saved to evaluation_results_1.xlsx


评估进度:  14%|█▍        | 10/72 [07:12<30:24, 29.42s/it]

Row 10 saved to evaluation_results_1.xlsx


评估进度:  15%|█▌        | 11/72 [07:25<24:50, 24.44s/it]

Row 11 saved to evaluation_results_1.xlsx


评估进度:  17%|█▋        | 12/72 [08:17<32:53, 32.89s/it]

Row 12 saved to evaluation_results_1.xlsx


评估进度:  18%|█▊        | 13/72 [08:48<31:41, 32.23s/it]

Row 13 saved to evaluation_results_1.xlsx


评估进度:  19%|█▉        | 14/72 [09:07<27:17, 28.23s/it]

Row 14 saved to evaluation_results_1.xlsx


评估进度:  21%|██        | 15/72 [09:20<22:34, 23.77s/it]

Row 15 saved to evaluation_results_1.xlsx


评估进度:  22%|██▏       | 16/72 [09:36<20:00, 21.43s/it]

Row 16 saved to evaluation_results_1.xlsx


评估进度:  24%|██▎       | 17/72 [09:53<18:17, 19.95s/it]

Row 17 saved to evaluation_results_1.xlsx


评估进度:  25%|██▌       | 18/72 [10:08<16:47, 18.66s/it]

Row 18 saved to evaluation_results_1.xlsx


评估进度:  26%|██▋       | 19/72 [10:27<16:21, 18.51s/it]

Row 19 saved to evaluation_results_1.xlsx


评估进度:  28%|██▊       | 20/72 [10:44<15:47, 18.23s/it]

Row 20 saved to evaluation_results_1.xlsx


评估进度:  29%|██▉       | 21/72 [10:58<14:18, 16.83s/it]

Row 21 saved to evaluation_results_1.xlsx


评估进度:  31%|███       | 22/72 [11:12<13:23, 16.06s/it]

Row 22 saved to evaluation_results_1.xlsx


评估进度:  32%|███▏      | 23/72 [11:26<12:39, 15.50s/it]

Row 23 saved to evaluation_results_1.xlsx


评估进度:  33%|███▎      | 24/72 [11:45<13:06, 16.38s/it]

Row 24 saved to evaluation_results_1.xlsx


评估进度:  35%|███▍      | 25/72 [12:01<12:53, 16.46s/it]

Row 25 saved to evaluation_results_1.xlsx


评估进度:  36%|███▌      | 26/72 [12:21<13:21, 17.43s/it]

Row 26 saved to evaluation_results_1.xlsx


评估进度:  38%|███▊      | 27/72 [13:00<18:01, 24.03s/it]

Row 27 saved to evaluation_results_1.xlsx


评估进度:  39%|███▉      | 28/72 [13:13<15:11, 20.71s/it]

Row 28 saved to evaluation_results_1.xlsx


评估进度:  40%|████      | 29/72 [13:29<13:47, 19.25s/it]

Row 29 saved to evaluation_results_1.xlsx


评估进度:  42%|████▏     | 30/72 [14:00<15:50, 22.63s/it]

Row 30 saved to evaluation_results_1.xlsx


评估进度:  43%|████▎     | 31/72 [14:38<18:42, 27.37s/it]

Row 31 saved to evaluation_results_1.xlsx


评估进度:  44%|████▍     | 32/72 [15:10<19:13, 28.83s/it]

Row 32 saved to evaluation_results_1.xlsx


评估进度:  46%|████▌     | 33/72 [15:36<18:01, 27.73s/it]

Row 33 saved to evaluation_results_1.xlsx


评估进度:  47%|████▋     | 34/72 [15:54<15:53, 25.09s/it]

Row 34 saved to evaluation_results_1.xlsx


评估进度:  49%|████▊     | 35/72 [16:17<15:02, 24.40s/it]

Row 35 saved to evaluation_results_1.xlsx


评估进度:  50%|█████     | 36/72 [16:51<16:23, 27.33s/it]

Row 36 saved to evaluation_results_1.xlsx


评估进度:  51%|█████▏    | 37/72 [17:04<13:20, 22.87s/it]

Row 37 saved to evaluation_results_1.xlsx


评估进度:  53%|█████▎    | 38/72 [17:31<13:43, 24.23s/it]

Row 38 saved to evaluation_results_1.xlsx


评估进度:  54%|█████▍    | 39/72 [18:02<14:24, 26.19s/it]

Row 39 saved to evaluation_results_1.xlsx


评估进度:  56%|█████▌    | 40/72 [18:22<13:00, 24.39s/it]

Row 40 saved to evaluation_results_1.xlsx


评估进度:  57%|█████▋    | 41/72 [18:56<14:00, 27.11s/it]

Row 41 saved to evaluation_results_1.xlsx


评估进度:  58%|█████▊    | 42/72 [19:20<13:08, 26.29s/it]

Row 42 saved to evaluation_results_1.xlsx


评估进度:  60%|█████▉    | 43/72 [19:36<11:09, 23.09s/it]

Row 43 saved to evaluation_results_1.xlsx


评估进度:  61%|██████    | 44/72 [19:55<10:15, 21.99s/it]

Row 44 saved to evaluation_results_1.xlsx


评估进度:  62%|██████▎   | 45/72 [20:11<09:07, 20.29s/it]

Row 45 saved to evaluation_results_1.xlsx


评估进度:  64%|██████▍   | 46/72 [20:25<07:52, 18.16s/it]

Row 46 saved to evaluation_results_1.xlsx


评估进度:  65%|██████▌   | 47/72 [20:45<07:47, 18.71s/it]

Row 47 saved to evaluation_results_1.xlsx


评估进度:  67%|██████▋   | 48/72 [20:57<06:44, 16.85s/it]

Row 48 saved to evaluation_results_1.xlsx


评估进度:  68%|██████▊   | 49/72 [21:24<07:38, 19.91s/it]

Row 49 saved to evaluation_results_1.xlsx


评估进度:  69%|██████▉   | 50/72 [21:42<07:05, 19.32s/it]

Row 50 saved to evaluation_results_1.xlsx


评估进度:  71%|███████   | 51/72 [22:17<08:25, 24.08s/it]

Row 51 saved to evaluation_results_1.xlsx


评估进度:  72%|███████▏  | 52/72 [22:40<07:51, 23.58s/it]

Row 52 saved to evaluation_results_1.xlsx


评估进度:  74%|███████▎  | 53/72 [23:13<08:23, 26.51s/it]

Row 53 saved to evaluation_results_1.xlsx


评估进度:  75%|███████▌  | 54/72 [24:02<09:56, 33.12s/it]

Row 54 saved to evaluation_results_1.xlsx


评估进度:  76%|███████▋  | 55/72 [24:19<08:03, 28.43s/it]

Row 55 saved to evaluation_results_1.xlsx


评估进度:  78%|███████▊  | 56/72 [24:54<08:04, 30.30s/it]

Row 56 saved to evaluation_results_1.xlsx


评估进度:  79%|███████▉  | 57/72 [25:07<06:19, 25.30s/it]

Row 57 saved to evaluation_results_1.xlsx


评估进度:  81%|████████  | 58/72 [25:26<05:24, 23.18s/it]

Row 58 saved to evaluation_results_1.xlsx


评估进度:  82%|████████▏ | 59/72 [25:43<04:38, 21.39s/it]

Row 59 saved to evaluation_results_1.xlsx


评估进度:  83%|████████▎ | 60/72 [26:09<04:34, 22.86s/it]

Row 60 saved to evaluation_results_1.xlsx


评估进度:  85%|████████▍ | 61/72 [26:24<03:44, 20.38s/it]

Row 61 saved to evaluation_results_1.xlsx


评估进度:  86%|████████▌ | 62/72 [26:42<03:17, 19.73s/it]

Row 62 saved to evaluation_results_1.xlsx


评估进度:  88%|████████▊ | 63/72 [27:02<02:58, 19.88s/it]

Row 63 saved to evaluation_results_1.xlsx


评估进度:  89%|████████▉ | 64/72 [27:24<02:43, 20.44s/it]

Row 64 saved to evaluation_results_1.xlsx


评估进度:  90%|█████████ | 65/72 [27:44<02:22, 20.35s/it]

Row 65 saved to evaluation_results_1.xlsx


评估进度:  92%|█████████▏| 66/72 [28:02<01:58, 19.68s/it]

Row 66 saved to evaluation_results_1.xlsx


评估进度:  93%|█████████▎| 67/72 [28:20<01:35, 19.16s/it]

Row 67 saved to evaluation_results_1.xlsx


评估进度:  94%|█████████▍| 68/72 [28:51<01:30, 22.74s/it]

Row 68 saved to evaluation_results_1.xlsx


评估进度:  96%|█████████▌| 69/72 [29:31<01:23, 27.81s/it]

Row 69 saved to evaluation_results_1.xlsx


评估进度:  97%|█████████▋| 70/72 [29:58<00:55, 27.60s/it]

Row 70 saved to evaluation_results_1.xlsx


评估进度:  99%|█████████▊| 71/72 [30:47<00:34, 34.06s/it]

Row 71 saved to evaluation_results_1.xlsx


评估进度: 100%|██████████| 72/72 [31:10<00:00, 25.98s/it]

Row 72 saved to evaluation_results_1.xlsx



