## 번역 성능 평가 

Kor - Eng - Jpn

In [19]:
import pandas as pd
import numpy as np
from datasets import load_dataset
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer
from bert_score import score

## 번역 시스템 준비 

In [10]:
# Define LLM
import os
from llama_index.llms.ollama import Ollama
from llama_index.core.prompts import PromptTemplate

# Ollama-served Llama 3.1 70B client used by translate() below.
# temperature=0 presumably aims at repeatable translations; request_timeout=600 is
# assumed to be in seconds (confirm against the llama-index Ollama documentation).
llm = Ollama(model="llama3.1:70b", request_timeout=600,temperature=0)
# Smoke test: one trivial completion to confirm the model responds.
llm.complete("hello")

CompletionResponse(text='Hello! How can I assist you today?', additional_kwargs={'model': 'llama3.1:70b', 'created_at': '2024-12-28T06:41:47.65819851Z', 'done': True, 'done_reason': 'stop', 'context': [128009, 128006, 882, 128007, 271, 15339, 128009, 128006, 78191, 128007, 271, 9906, 0, 2650, 649, 358, 7945, 499, 3432, 30], 'total_duration': 823681499, 'load_duration': 42072342, 'prompt_eval_count': 12, 'prompt_eval_duration': 81689000, 'eval_count': 10, 'eval_duration': 654783000}, raw={'model': 'llama3.1:70b', 'created_at': '2024-12-28T06:41:47.65819851Z', 'response': 'Hello! How can I assist you today?', 'done': True, 'done_reason': 'stop', 'context': [128009, 128006, 882, 128007, 271, 15339, 128009, 128006, 78191, 128007, 271, 9906, 0, 2650, 649, 358, 7945, 499, 3432, 30], 'total_duration': 823681499, 'load_duration': 42072342, 'prompt_eval_count': 12, 'prompt_eval_duration': 81689000, 'eval_count': 10, 'eval_duration': 654783000}, logprobs=None, delta=None)

In [71]:
# Translation prompt with three placeholders: {source_lang}, {target_lang}, {source_text}.
# The trailing backslash on the first template line is a line continuation inside the
# string, so the first two sentences are emitted on one line.
# NOTE(review): translate() is called with codes such as 'kor' / 'eng' / 'jpn', so the
# model literally reads "This is an kor to eng translation"; consider passing full
# language names — confirm intent.
prompt = PromptTemplate(
    template="""This is an {source_lang} to {target_lang} translation, please provide the {target_lang} translation for this text in as polite a tone as possible. \
Do not provide any explanations or text apart from the translation.
The translation result must be written in {target_lang}.

{source_lang}: {source_text}

{target_lang}:"""
)

# prompt = PromptTemplate(
#     template="""This is an {source_lang} to {target_lang} translation, please provide the {target_lang} translation for this text in as polite a tone as possible. \
# The original text sentence structure must be preserved.
# Do not provide any explanations or text apart from the translation.
# The translation result must be written in {target_lang}.

# {source_lang}: {source_text}

# {target_lang}:"""
# )

In [26]:
# Translation helper
def translate(source_text, source_lang, target_lang, prompt):
    """Translate ``source_text`` using the notebook-level ``llm`` client.

    Args:
        source_text: Text to translate.
        source_lang: Source language label substituted into the prompt.
        target_lang: Target language label substituted into the prompt.
        prompt: Template object whose ``format`` accepts ``source_lang``,
            ``target_lang`` and ``source_text`` keyword arguments.

    Returns:
        The ``text`` attribute of the completion returned by ``llm.complete``.
    """
    rendered_prompt = prompt.format(
        source_text=source_text,
        source_lang=source_lang,
        target_lang=target_lang,
    )
    return llm.complete(rendered_prompt).text

In [28]:
# Smoke test: translate one short Korean sentence into English and print the model output.
result = translate("안녕하세요. 좋은 아침입니다.", 'kor', 'eng', prompt)
print(result)

Good morning, hello.


## 평가 시스템 준비

In [62]:
# 언어별 토크나이저
from nltk.tokenize import word_tokenize
from janome.tokenizer import Tokenizer
from kiwipiepy import Kiwi

# Tokenizer instances are created lazily and reused: constructing a Kiwi or Janome
# tokenizer for every sentence repeats the same setup work on each call.
_TOKENIZERS = {}


def tokenize(text, lang):
    """Split ``text`` into tokens with a language-specific tokenizer.

    Args:
        text: Sentence to tokenize.
        lang: ``"eng"`` (NLTK ``word_tokenize``), ``"kor"`` (Kiwi morphological
            analyzer, token ``form``) or ``"jpn"`` (Janome, token ``surface``).

    Returns:
        A list of token strings.

    Raises:
        ValueError: If ``lang`` is not one of the supported codes.
    """
    if lang == "eng":
        return word_tokenize(text)

    if lang == "kor":
        if "kor" not in _TOKENIZERS:
            _TOKENIZERS["kor"] = Kiwi()
        # Keep only the surface form of each morpheme.
        return [token.form for token in _TOKENIZERS["kor"].tokenize(text)]

    if lang == "jpn":
        if "jpn" not in _TOKENIZERS:
            _TOKENIZERS["jpn"] = Tokenizer()
        return [token.surface for token in _TOKENIZERS["jpn"].tokenize(text)]

    raise ValueError(f"Unsupported language: {lang}")

In [92]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer
from bert_score import score
import numpy as np

# Evaluation helpers

# Loaded BERTScore models, reused across evaluate() calls so the checkpoint is not
# reloaded for every sentence.
_BERT_SCORERS = {}

# bert_score documents two-letter language codes; map this notebook's three-letter codes.
_BERT_LANG_CODES = {"eng": "en", "kor": "ko", "jpn": "ja"}


def _get_bert_scorer(bert_model, tgt_lang, device):
    """Return a cached ``BERTScorer`` for the given model, language and device."""
    from bert_score import BERTScorer  # same package as the `score` helper imported in this cell

    bert_lang = _BERT_LANG_CODES.get(tgt_lang, tgt_lang)
    # When a model is named explicitly, the language is only needed by bert_score for
    # default-model selection / baseline rescaling (neither is used here), so one
    # scorer is shared across target languages.
    cache_key = (bert_model, None if bert_model else bert_lang, device)
    if cache_key not in _BERT_SCORERS:
        _BERT_SCORERS[cache_key] = BERTScorer(
            model_type=bert_model,
            lang=bert_lang,
            device=device,
        )
    return _BERT_SCORERS[cache_key]


def evaluate(
    reference: str,
    candidate: str,
    tgt_lang: str,
    bert_model="microsoft/deberta-xlarge-mnli",
    device=None,
):
    """Score one candidate translation against one reference.

    Args:
        reference: Reference translation.
        candidate: System output to score.
        tgt_lang: Language code understood by ``tokenize`` ("eng", "kor", "jpn").
        bert_model: Model name passed to BERTScore.
        device: Device for the BERTScore model. ``None`` (default) lets bert_score
            pick CUDA when available and fall back to CPU otherwise; pass "cuda"
            to force the GPU.

    Returns:
        dict with keys "BLEU", "METEOR" and "BERT" (BERTScore F1), each rounded
        to 4 decimal places.

    Raises:
        ValueError: Propagated from ``tokenize`` for an unsupported ``tgt_lang``.
    """
    # NOTE(review): the default BERTScore checkpoint looks English-centric; confirm
    # that its scores are meaningful for "kor"/"jpn" targets or pass a multilingual model.
    # NOTE(review): NLTK METEOR's stem/synonym matching is presumably English-oriented;
    # for other languages it likely reduces to exact token matches — confirm.
    reference_tokens = tokenize(reference, tgt_lang)
    candidate_tokens = tokenize(candidate, tgt_lang)

    # Sentence-level BLEU with smoothing.
    bleu = sentence_bleu(
        [reference_tokens],
        candidate_tokens,
        smoothing_function=SmoothingFunction().method1,
    )

    # METEOR takes a list of tokenized references and one tokenized hypothesis.
    meteor = meteor_score([reference_tokens], candidate_tokens)

    # BERTScore works on the raw strings, not on the tokens above.
    scorer = _get_bert_scorer(bert_model, tgt_lang, device)
    _, _, f1 = scorer.score([candidate], [reference])

    return {
        "BLEU": round(bleu, 4),
        "METEOR": round(meteor, 4),
        "BERT": round(f1.item(), 4),
    }

## 데이터셋 FLORES-Plus 준비

In [7]:
# Text file loader
def load_text_file(file_path):
    """Read a UTF-8 text file and return its lines with surrounding whitespace removed.

    Args:
        file_path: Path of the file to read.

    Returns:
        A list with one stripped string per line of the file, in file order.
    """
    stripped_lines = []
    with open(file_path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            stripped_lines.append(raw_line.strip())
    return stripped_lines

# Data location. Defaults to the original machine-specific directory; set the
# FLORES_DIR environment variable to read the devtest files from elsewhere.
# (`os` is imported in the LLM setup cell.)
data_dir = os.environ.get("FLORES_DIR", "/home/dudaji/Jun/llm-rag-chatbot/data/flores/")

# os.path.join avoids the doubled slash produced by f"{data_dir}/..." when
# data_dir already ends with a separator.
data_eng = load_text_file(os.path.join(data_dir, "devtest.eng_Latn"))
data_kor = load_text_file(os.path.join(data_dir, "devtest.kor_Hang"))
data_jpn = load_text_file(os.path.join(data_dir, "devtest.jpn_Jpan"))

# Sentences are paired across languages by line index, so the three files must
# have the same number of lines.
if not (len(data_eng) == len(data_kor) == len(data_jpn)):
    raise ValueError(
        "FLORES files are not line-aligned: "
        f"eng={len(data_eng)}, kor={len(data_kor)}, jpn={len(data_jpn)}"
    )

print(data_eng[0])
print(data_kor[0])
print(data_jpn[0])

"We now have 4-month-old mice that are non-diabetic that used to be diabetic," he added.
"그는 ""현재 4개월 된 당뇨병에서 치료된 생쥐가 있다""고 덧붙였다."
「我々が飼っている生後4か月のマウスはかつて糖尿病でしたが現在は糖尿病ではない、」と彼は付け加えました。


In [75]:
# Spot-check one Korean -> English translation and score it against the English reference.
# NOTE(review): the saved output of this cell lists a 'ROUGE' key that evaluate(), as
# defined in this notebook, does not return — the output is stale; re-run to refresh.
test_idx = 1
print(f'original: {data_kor[test_idx]}')
result = translate(data_kor[test_idx], 'kor', 'eng', prompt)
print(f'translated: {result}')

eval_result = evaluate(data_eng[test_idx], result, 'eng')
print(eval_result)

original: 노바스코샤주 핼리팩스의 댈하우지대학교 의과 교수이자 캐나다 당뇨 협회 임상과학부 의장인 Ehud Ur 박사는 이 연구가 아직 초기 단계라고 경고했습니다.
translated: Dr. Ehud Ur, a professor of medicine at Dalhousie University in Halifax, Nova Scotia and chair of the Clinical and Scientific Section of the Canadian Diabetes Association, warned that this study is still in its early stages.




{'BLEU': 0.5855, 'METEOR': 0.8431, 'ROUGE': 0.8493, 'BERT': 0.9479}


In [90]:
# Spot-check English -> Korean on sentence index 1, scored against the Korean reference.
print(f'original: {data_eng[1]}')
result = translate(data_eng[1], 'eng', 'kor', prompt)
print(f'translated: {result}')

eval_result = evaluate(data_kor[1], result, 'kor')
print(eval_result)

original: Dr. Ehud Ur, professor of medicine at Dalhousie University in Halifax, Nova Scotia and chair of the clinical and scientific division of the Canadian Diabetes Association cautioned that the research is still in its early days.
translated: 노바스코샤 주 핼리팩스에 있는 달하우스 대학교 의과대학 교수이자 캐나다 당뇨병 협회의 임상 및 과학 부문 위원장인 에후드 어 박사는 이 연구가 아직 초기 단계에 있기 때문에 주의를 요한다고 경고했다.




{'BLEU': 0.2444, 'METEOR': 0.5935, 'ROUGE': 0.0, 'BERT': 0.8518}


In [91]:
# Spot-check English -> Japanese on sentence index 1, scored against the Japanese reference.
print(f'original: {data_eng[1]}')
result = translate(data_eng[1], 'eng', 'jpn', prompt)
print(f'translated: {result}')

eval_result = evaluate(data_jpn[1], result, 'jpn')
print(eval_result)

original: Dr. Ehud Ur, professor of medicine at Dalhousie University in Halifax, Nova Scotia and chair of the clinical and scientific division of the Canadian Diabetes Association cautioned that the research is still in its early days.
translated: ノバスコシア州ハリファックスのダルハウジー大学医学部教授であり、カナダ糖尿病協会臨床・科学部門の委員長であるエフード・アーウル博士は、この研究がまだ初期段階にあることを警告した。




{'BLEU': 0.4396, 'METEOR': 0.7287, 'ROUGE': 0.0, 'BERT': 0.9035}


## 번역 평가 코드 

In [97]:
# Number of lines loaded from the English devtest file.
len(data_eng)

1012

In [94]:
import time

# Each entry: (source language, target language, source sentences, reference translations).
pairs = [
    ("eng", "kor", data_eng, data_kor),
    ("eng", "jpn", data_eng, data_jpn),
    ("kor", "eng", data_kor, data_eng),
    ("kor", "jpn", data_kor, data_jpn),
    ("jpn", "eng", data_jpn, data_eng),
    ("jpn", "kor", data_jpn, data_kor),
]

# Optional cap on the number of sentences evaluated per language pair.
# None (the default) evaluates every sentence; set a small integer for a quick trial.
# NOTE(review): the saved output of this cell shows 4 rows per pair, so it came from
# a capped run rather than from the full data.
max_samples = None

# Run the evaluation
results = []
timings = []  # elapsed time per language pair

for source_lang, target_lang, sources, references in pairs:
    # zip() silently drops unmatched trailing items, so fail loudly on misaligned corpora.
    if len(sources) != len(references):
        raise ValueError(
            f"Misaligned data for {source_lang}->{target_lang}: "
            f"{len(sources)} sources vs {len(references)} references"
        )

    start_time = time.perf_counter()  # monotonic clock, intended for measuring durations
    for source, reference in zip(sources[:max_samples], references[:max_samples]):
        candidate = translate(source, source_lang, target_lang, prompt)
        evaluation = evaluate(reference, candidate, target_lang)
        results.append({
            "Source Language": source_lang,
            "Target Language": target_lang,
            "Source": source,
            "Reference": reference,
            "Candidate": candidate,
            **evaluation,
        })
    elapsed_time = time.perf_counter() - start_time
    timings.append({
        "Source Language": source_lang,
        "Target Language": target_lang,
        "Elapsed Time (s)": round(elapsed_time, 4)
    })

# Build the results DataFrame
df = pd.DataFrame(results)

# Build the timings DataFrame
timing_df = pd.DataFrame(timings)

# Show the processing times
print("\nProcessing Times:")
print(timing_df)

# Save the evaluation results
df.to_csv("translation_evaluation_results.csv", index=False, encoding="utf-8")

# Save the processing times
timing_df.to_csv("translation_processing_times.csv", index=False, encoding="utf-8")

print("Results saved:")
print("1. Translation evaluation results: translation_evaluation_results.csv")
print("2. Processing times: translation_processing_times.csv")



   Source Language Target Language  \
0              eng             kor   
1              eng             kor   
2              eng             kor   
3              eng             kor   
4              eng             jpn   
5              eng             jpn   
6              eng             jpn   
7              eng             jpn   
8              kor             eng   
9              kor             eng   
10             kor             eng   
11             kor             eng   
12             kor             jpn   
13             kor             jpn   
14             kor             jpn   
15             kor             jpn   
16             jpn             eng   
17             jpn             eng   
18             jpn             eng   
19             jpn             eng   
20             jpn             kor   
21             jpn             kor   
22             jpn             kor   
23             jpn             kor   

                                               So

In [96]:
# Plain-text dump of the results frame; wide text columns are truncated in this view.
print(df)

   Source Language Target Language  \
0              eng             kor   
1              eng             kor   
2              eng             kor   
3              eng             kor   
4              eng             jpn   
5              eng             jpn   
6              eng             jpn   
7              eng             jpn   
8              kor             eng   
9              kor             eng   
10             kor             eng   
11             kor             eng   
12             kor             jpn   
13             kor             jpn   
14             kor             jpn   
15             kor             jpn   
16             jpn             eng   
17             jpn             eng   
18             jpn             eng   
19             jpn             eng   
20             jpn             kor   
21             jpn             kor   
22             jpn             kor   
23             jpn             kor   

                                               So

In [None]:
# FIXME(review): stale draft cell — it cannot run against the definitions in this notebook:
#   * `data` is not defined in any visible cell;
#   * translate() takes (source_text, source_lang, target_lang, prompt), but only `source` is passed;
#   * tokenize() raises ValueError for "ko" (it accepts "eng" / "kor" / "jpn").
# It also rebinds `results` and `df`, which hold the multi-pair evaluation results.
# Consider removing this cell.
# Run the evaluation
results = []
for _, row in data:  # evaluate each sample
    source = row["source"]
    reference = row["target"]

    # Translate
    candidate = translate(source)

    # Evaluate
    evaluation = evaluate(reference, candidate, "ko")
    results.append(
        {
            "Source": source,
            "Reference": reference,
            "Candidate": candidate,
            **evaluation,
        }
    )

# Build the results DataFrame
df = pd.DataFrame(results)
print(df)


In [None]:

# Save results.
# NOTE(review): this writes the same file name as the evaluation cell earlier in the
# notebook, overwriting that file with whatever `df` currently holds.
df.to_csv("translation_evaluation_results.csv", index=False)
