In [None]:
%pip install knext
import os
import json
import pandas as pd
from tqdm import tqdm

# EVA 평가를 위한 모듈 임포트 (evaFor2wiki.py와 동일한 Evaluate 사용)
from kag.common.benchmarks.evaluate import Evaluate

# LangChain 관련
from langchain_huggingface import HuggingFaceEmbeddings  # 업데이트된 패키지 사용
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores.faiss import FAISS
from langchain.schema import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.messages import HumanMessage

# EnsembleRetriever, KiwiBM25Retriever 추가
from langchain.retrievers import EnsembleRetriever
from langchain_teddynote.retrievers import KiwiBM25Retriever

# Load the deepseek_r1 model (assumed to be defined in ollama_model_load.py)
from ollama_model_load import deepseek_r1

# Expose only GPU 0 to CUDA-backed libraries used by this process.
# NOTE(review): this assignment runs after the imports above; confirm that
# none of them initializes CUDA at import time, otherwise it may have no effect.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"


# 1) RAG 검색 수행 함수 (EnsembleRetriever 적용)
# Ensemble retrievers that have already been built, keyed by
# (text_file, chunk_size, chunk_overlap, device). Building one means loading the
# embedding model and indexing the whole corpus, so it must not be repeated
# for every question.
_RETRIEVER_CACHE = {}


def _build_retriever(text_file: str, chunk_size: int, chunk_overlap: int, device: str):
    """Build the BM25 + FAISS ensemble retriever over the corpus in *text_file*."""
    # The corpus file is read as raw text and split as-is.
    # NOTE(review): the default corpus is a .json file, so chunks contain raw
    # JSON syntax; confirm this is intended rather than parsing the JSON first.
    with open(text_file, "r", encoding="utf-8") as f:
        full_text = f.read()

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunks = [Document(page_content=t) for t in splitter.split_text(full_text)]

    embeddings = HuggingFaceEmbeddings(
        model_name="intfloat/multilingual-e5-large",
        model_kwargs={"device": device},
        encode_kwargs={"normalize_embeddings": True},
    )

    # Dense retriever: FAISS index over the chunk embeddings, top-2 per query.
    db_faiss = FAISS.from_documents(chunks, embedding=embeddings)
    faiss_retriever = db_faiss.as_retriever(search_kwargs={"k": 2})

    # Sparse retriever: BM25 over the same chunks.
    kiwi_bm25_retriever = KiwiBM25Retriever.from_documents(chunks)

    # Equal-weight combination of the sparse and dense retrievers.
    # NOTE(review): `search_type` is kept from the original call; verify that
    # EnsembleRetriever actually honors it (it may be silently ignored).
    return EnsembleRetriever(
        retrievers=[kiwi_bm25_retriever, faiss_retriever],
        weights=[0.5, 0.5],
        search_type="mmr",
    )


# 1) RAG retrieval function (uses EnsembleRetriever)
def perform_rag(
    question: str,
    text_file: str = "2wiki_corpus.json",
    chunk_size: int = 100,
    chunk_overlap: int = 50,
    device: str = "cuda"
) -> str:
    """Retrieve context for *question* from the corpus and return it as one string.

    The retriever (text splitting, embedding model, FAISS index, BM25 index) is
    built once per distinct ``(text_file, chunk_size, chunk_overlap, device)``
    combination and reused on later calls, instead of being rebuilt for every
    question. The cache lives for the lifetime of the process; changes made to
    ``text_file`` on disk after the first call are not picked up.

    Args:
        question: Query passed to the ensemble retriever.
        text_file: Path of the corpus file, read as UTF-8 text.
        chunk_size: Chunk size given to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: Chunk overlap given to ``RecursiveCharacterTextSplitter``.
        device: Device string forwarded to the embedding model.

    Returns:
        The ``page_content`` of the retrieved documents joined by blank lines.
    """
    cache_key = (text_file, chunk_size, chunk_overlap, device)
    retriever = _RETRIEVER_CACHE.get(cache_key)
    if retriever is None:
        retriever = _build_retriever(text_file, chunk_size, chunk_overlap, device)
        _RETRIEVER_CACHE[cache_key] = retriever

    context_docs = retriever.invoke(question)
    return "\n\n".join(doc.page_content for doc in context_docs)


# 2) LLM query function (includes the RAG prompt) - uses the deepseek_r1 model
def query_llm(context: str, question: str) -> str:
    """Send a RAG prompt built from *context* and *question* to deepseek_r1.

    The prompt template is filled in, wrapped in a single ``HumanMessage`` and
    passed to ``deepseek_r1.invoke``.

    Returns:
        The ``content`` of the model response with surrounding whitespace stripped.
    """
    template_text = """
    아래 정보(context)를 참고하여 사용자 질문에 답해주세요:
    {context}

    질문:
    {question}

    답변 시, 질문의 핵심만 파악하여 간결하게 1~2문장으로 답변하고, 
    불필요한 설명은 피합니다.

    답변:
    """
    rendered = ChatPromptTemplate.from_template(template_text).format(
        context=context,
        question=question,
    )
    reply = deepseek_r1.invoke([HumanMessage(content=rendered)])
    return reply.content.strip()


# 3) 전체 평가 함수: JSON 파일("2wiki_qa.json")에서 _id, type, question, answer를 읽고 EVA 평가(em, f1, answer_similarity)를 계산 후 CSV에 기록
# Label stored in the first column of the trailing summary ("overall average") row.
_SUMMARY_LABEL = "전체 평균"
# Column order of every per-sample row written to the result CSV.
_RESULT_COLUMNS = [
    "_id", "type", "question", "answer", "model_response",
    "em", "f1", "answer_similarity",
]
# Columns that are averaged into the summary row.
_METRIC_COLUMNS = ("em", "f1", "answer_similarity")


def _append_result_rows(rows, output_file: str, processed_count: int) -> None:
    """Write *rows* to the CSV, appending when earlier rows are already on disk."""
    append = processed_count > 0 and os.path.exists(output_file)
    pd.DataFrame(rows).to_csv(
        output_file,
        mode='a' if append else 'w',
        index=False,
        header=not append,
        encoding='utf-8-sig',
    )


def _write_summary_row(output_file: str) -> pd.DataFrame:
    """Append a summary row averaging the metrics of ALL rows in the CSV."""
    results_df = pd.read_csv(output_file, encoding='utf-8-sig')
    # Never average a stale summary row into the new summary.
    if not results_df.empty and _SUMMARY_LABEL in str(results_df.iloc[-1, 0]):
        results_df = results_df.iloc[:-1]

    summary_row = {
        "_id": _SUMMARY_LABEL,
        "type": "",
        "question": "",
        "answer": "",
        "model_response": "",
    }
    for column in _METRIC_COLUMNS:
        mean_value = float("nan")
        if column in results_df.columns:
            mean_value = pd.to_numeric(results_df[column], errors="coerce").mean()
        # No usable values (e.g. no rows at all) -> report 0, as before.
        summary_row[column] = 0 if pd.isna(mean_value) else float(mean_value)

    final_df = pd.concat([results_df, pd.DataFrame([summary_row])], ignore_index=True)
    final_df.to_csv(output_file, index=False, encoding='utf-8-sig')
    return final_df


# 3) Full evaluation: read _id, type, question, answer from the QA JSON file,
#    compute the EVA metrics (em, f1, answer_similarity) and record them in a CSV.
def evaluate_model_responses(
    json_file: str = "2wiki_qa.json",
    text_file: str = "2wiki_corpus.json",
    output_file: str = "2wiki_result_rag.csv",
    batch_size: int = 5,
    chunk_size: int = 100,
    chunk_overlap: int = 50,
    device: str = "cuda"
):
    """Run RAG + LLM on every QA sample, score it, and write results to a CSV.

    The run is resumable: rows already present in ``output_file`` are counted
    as processed and evaluation continues with the next sample. The final
    "전체 평균" summary row is always computed from every per-sample row in the
    CSV, so it stays correct when the run was resumed, and it is restored when
    the function is called again after all samples were already processed.

    Args:
        json_file: Path of a JSON list of samples with the keys ``_id``,
            ``type``, ``question`` and ``answer``.
        text_file: Corpus path forwarded to ``perform_rag``.
        output_file: CSV file that receives one row per sample plus a trailing
            summary row.
        batch_size: Number of results buffered before they are written to disk.
        chunk_size: Forwarded to ``perform_rag``.
        chunk_overlap: Forwarded to ``perform_rag``.
        device: Forwarded to ``perform_rag``.

    Returns:
        A DataFrame with all per-sample rows followed by the summary row. If
        there are no samples and no output file, an empty DataFrame with the
        result columns is returned and nothing is written.
    """
    processed_count = 0
    if os.path.exists(output_file):
        try:
            existing_df = pd.read_csv(output_file, encoding='utf-8-sig')
        except Exception as e:
            # Best effort, as before: an unreadable result file is reported and
            # the evaluation restarts from the first sample (overwriting it).
            print(f"출력 파일 읽기 오류: {e}")
            existing_df = None

        if existing_df is not None and not existing_df.empty:
            if _SUMMARY_LABEL in str(existing_df.iloc[-1, 0]):
                # Drop the old summary so new rows are appended after the last
                # real sample; the summary is rewritten at the end.
                existing_df = existing_df.iloc[:-1]
                existing_df.to_csv(output_file, index=False, encoding='utf-8-sig')
            processed_count = len(existing_df)

    with open(json_file, "r", encoding="utf-8") as f:
        qa_list = json.load(f)

    total_rows = len(qa_list)
    if processed_count >= total_rows:
        print("이미 모든 행이 처리되었습니다.")
        if not os.path.exists(output_file):
            return pd.DataFrame(columns=_RESULT_COLUMNS)
        # The summary row may just have been stripped above; put it back.
        return _write_summary_row(output_file)

    pending_rows = []
    evalObj = Evaluate()

    try:
        for idx in tqdm(range(processed_count, total_rows), desc="평가 진행 중"):
            sample = qa_list[idx]
            question = sample["question"]
            gold = sample["answer"]

            context = perform_rag(
                question=question,
                text_file=text_file,
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap,
                device=device
            )
            generated_response = query_llm(context, question)

            metrics = evalObj.getBenchMark([generated_response], [gold])

            pending_rows.append({
                "_id": sample["_id"],
                "type": sample["type"],
                "question": question,
                "answer": gold,
                "model_response": generated_response,
                "em": metrics["em"],
                "f1": metrics["f1"],
                "answer_similarity": metrics["answer_similarity"]
            })

            # `>=` instead of `%` so a batch_size of 0 cannot divide by zero.
            if len(pending_rows) >= batch_size or idx == total_rows - 1:
                _append_result_rows(pending_rows, output_file, processed_count)
                pending_rows = []
                processed_count = idx + 1
    finally:
        # On an error or interrupt, keep the samples that were already
        # evaluated so a later run can resume right after them.
        if pending_rows:
            _append_result_rows(pending_rows, output_file, processed_count)

    final_df = _write_summary_row(output_file)

    print(f"평가 완료! 결과는 '{output_file}'에 저장되었습니다.")
    return final_df

# 4) 메인 실행 예시
if __name__ == "__main__":
    # Example run: evaluate the 2wiki QA set against the 2wiki corpus and
    # show the first rows of the resulting table.
    run_config = {
        "json_file": "2wiki_qa.json",
        "text_file": "2wiki_corpus.json",
        "output_file": "2wiki_result_rag.csv",
        "batch_size": 5,
        "chunk_size": 100,
        "chunk_overlap": 50,
        "device": "cuda",
    }
    final_df = evaluate_model_responses(**run_config)
    print(final_df.head())


Note: you may need to restart the kernel to use updated packages.


평가 진행 중:  46%|████▌     | 206/450 [2:22:10<3:06:07, 45.77s/it]