In [7]:
import pandas as pd
import json
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch


In [8]:
# 유사도 계산 함수 정의
def compute_similarity_cross_encoder(question, documents, tokenizer, model, top_k=3):
    scores = []

    for doc in documents:
        # 질문과 문서를 입력으로 토크나이즈
        inputs = tokenizer(
            question,
            doc,
            padding=True,
            truncation=True,
            return_tensors="pt",
            max_length=512,
        ).to("cuda")

        # 모델 추론으로 유사도 점수 계산
        with torch.no_grad():
            logits = model(**inputs).logits
            score = torch.sigmoid(logits).squeeze().item()  # 시그모이드로 점수 변환
            scores.append(score)

    # Top-k 문서 선택
    top_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:top_k]
    top_documents = [{"content": documents[i], "score": scores[i]} for i in top_indices]

    return top_documents


In [9]:
# 모델과 토크나이저 로드
model_name = "dragonkue/bge-reranker-v2-m3-ko"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model.eval()
model.to("cuda")  # GPU 사용




XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 1024, padding_idx=1)
      (position_embeddings): Embedding(8194, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=1024, out_f

In [10]:
# test.csv 파일 로드
test_file_path = "../rag_data/test.csv"  # test.csv 파일 경로
test_df = pd.read_csv(test_file_path)


In [11]:
# test.csv에서 질문 추출
test_questions = test_df["problems"].apply(lambda x: eval(x)["question"]).tolist()


In [12]:
# Wiki 데이터 로드
wiki_file_path = "../rag_data/wiki_test.jsonl"  # Wiki JSONL 파일 경로
wiki_data = []
with open(wiki_file_path, "r", encoding="utf-8") as file:
    for line in file:
        wiki_data.append(json.loads(line))


In [13]:
# Wiki 데이터에서 문서 내용 추출
example_documents = [doc["content"] for doc in wiki_data]

In [14]:
# 질문 리스트에서 wiki_data와 유사한 문서 추출
results = []

for question in test_questions:
    # 각 질문에 대해 유사한 문서 검색
    top_documents = compute_similarity_cross_encoder(question, example_documents, tokenizer, model, top_k=3)
    # 검색 결과 저장
    results.append({
        "question": question,
        "top_documents": top_documents
    })

# 결과를 DataFrame으로 변환
final_results = []
for result in results:
    question = result["question"]
    for doc in result["top_documents"]:
        final_results.append({
            "question": question,
            "document_content": doc["content"],
            "score": doc["score"]
        })

results_df = pd.DataFrame(final_results)

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3526, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipykernel_850658/608308242.py", line 6, in <module>
    top_documents = compute_similarity_cross_encoder(question, example_documents, tokenizer, model, top_k=3)
  File "/tmp/ipykernel_850658/1021219155.py", line 7, in compute_similarity_cross_encoder
    inputs = tokenizer(
  File "/opt/conda/lib/python3.10/site-packages/transformers/tokenization_utils_base.py", line 3055, in __call__
    verbose: bool = True,
  File "/opt/conda/lib/python3.10/site-packages/transformers/tokenization_utils_base.py", line 3163, in _call_one
    stride: int = 0,
  File "/opt/conda/lib/python3.10/site-packages/transformers/tokenization_utils_base.py", line 3237, in encode_plus
    max_length: Optional[int] = None,
  File "/opt/conda/lib/python3.10/site-packages/transformers/tokenization_utils_fast.py",

In [None]:
# 결과 저장 및 출력
results_df.to_csv("retrieval_results.csv", index=False)  # CSV 파일로 저장
print(results_df.head())  # 결과 미리보기