## 번역 성능 평가 

Kor - Eng - Jpn

In [1]:
import pandas as pd
import numpy as np
from datasets import load_dataset
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer
from bert_score import score

## 번역 시스템 준비 

In [7]:
# Define LLM
import os
from llama_index.llms.ollama import Ollama
from llama_index.core.prompts import PromptTemplate

# Larger alternative model, kept for reference:
# llm = Ollama(model="llama3.1:70b", request_timeout=600,temperature=0)

# Ollama-served Llama 3.1 client used by every translation call in this notebook.
# temperature=0 is requested; request_timeout is presumably in seconds (TODO confirm).
llm = Ollama(
    model="llama3.1",
    request_timeout=600,
    temperature=0,
)

# Smoke test: the bare expression makes the notebook display the raw response.
llm.complete("hello")

CompletionResponse(text='Hello! How are you today? Is there something I can help you with or would you like to chat?', additional_kwargs={'model': 'llama3.1', 'created_at': '2024-12-28T15:03:14.654431234Z', 'done': True, 'done_reason': 'stop', 'context': [128009, 128006, 882, 128007, 271, 15339, 128009, 128006, 78191, 128007, 271, 9906, 0, 2650, 527, 499, 3432, 30, 2209, 1070, 2555, 358, 649, 1520, 499, 449, 477, 1053, 499, 1093, 311, 6369, 30], 'total_duration': 24668128819, 'load_duration': 24382624673, 'prompt_eval_count': 12, 'prompt_eval_duration': 20482000, 'eval_count': 23, 'eval_duration': 262927000}, raw={'model': 'llama3.1', 'created_at': '2024-12-28T15:03:14.654431234Z', 'response': 'Hello! How are you today? Is there something I can help you with or would you like to chat?', 'done': True, 'done_reason': 'stop', 'context': [128009, 128006, 882, 128007, 271, 15339, 128009, 128006, 78191, 128007, 271, 9906, 0, 2650, 527, 499, 3432, 30, 2209, 1070, 2555, 358, 649, 1520, 499, 44

In [3]:
# Instruction text for the translation request. It has three placeholders:
# {source_lang}, {target_lang} and {source_text}. The trailing backslash on the
# first line joins it with the next line inside the string literal.
translation_template = """This is an {source_lang} to {target_lang} translation, please provide the {target_lang} translation for this text in as polite a tone as possible. \
Do not provide any explanations or text apart from the translation.
The translation result must be written in {target_lang}.

{source_lang}: {source_text}

{target_lang}:"""

prompt = PromptTemplate(template=translation_template)

# prompt = PromptTemplate(
#     template="""This is an {source_lang} to {target_lang} translation, please provide the {target_lang} translation for this text in as polite a tone as possible. \
# The original text sentence structure must be preserved.
# Do not provide any explanations or text apart from the translation.
# The translation result must be written in {target_lang}.

# {source_lang}: {source_text}

# {target_lang}:"""
# )

In [4]:
# Translation helper
def translate(source_text, source_lang, target_lang, prompt):
    """Translate ``source_text`` with the notebook-level ``llm``.

    Args:
        source_text: Text to translate.
        source_lang: Label of the source language, inserted into the prompt.
        target_lang: Label of the target language, inserted into the prompt.
        prompt: Template object whose ``format`` accepts ``source_lang``,
            ``target_lang`` and ``source_text`` keyword arguments.

    Returns:
        The ``text`` attribute of the completion returned by ``llm.complete``.
    """
    completion = llm.complete(
        prompt.format(
            source_text=source_text,
            source_lang=source_lang,
            target_lang=target_lang,
        )
    )
    return completion.text

In [9]:
# Issue the same request ten times in a row; only the final response is printed.
for _ in range(10):
    result = translate("안녕하세요. 좋은 아침입니다.", 'kor', 'eng', prompt)
print(result)

Hello. Good morning.


## 평가 시스템 준비

In [10]:
# 언어별 토크나이저
from nltk.tokenize import word_tokenize
from janome.tokenizer import Tokenizer
from kiwipiepy import Kiwi

from functools import lru_cache


@lru_cache(maxsize=None)
def _get_kiwi():
    """Create the Kiwi analyzer once and reuse it on later calls."""
    return Kiwi()


@lru_cache(maxsize=None)
def _get_janome():
    """Create the Janome tokenizer once and reuse it on later calls."""
    return Tokenizer()


def tokenize(text, lang):
    """Split ``text`` into a list of token strings for the given language.

    Args:
        text: Sentence to tokenize.
        lang: One of ``"eng"``, ``"kor"`` or ``"jpn"``.

    Returns:
        A list of tokens: NLTK ``word_tokenize`` output for English, the
        ``form`` of each Kiwi token for Korean, and the ``surface`` of each
        Janome token for Japanese.

    Raises:
        ValueError: If ``lang`` is not one of the supported codes.
    """
    if lang == "eng":
        # English: NLTK word_tokenize
        return word_tokenize(text)
    if lang == "kor":
        # Korean: Kiwi morphological analyzer. The analyzer is cached so it is
        # not rebuilt for every sentence.
        return [token.form for token in _get_kiwi().tokenize(text)]
    if lang == "jpn":
        # Japanese: Janome morphological analyzer (cached for the same reason).
        return [token.surface for token in _get_janome().tokenize(text)]
    raise ValueError(f"Unsupported language: {lang}")

In [11]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer
from bert_score import score
import numpy as np

# Evaluation function
from functools import lru_cache

# Three-letter codes used in this notebook mapped to the two-letter codes that
# bert_score uses for its ``lang`` argument.
_BERTSCORE_LANG = {"eng": "en", "kor": "ko", "jpn": "ja"}


@lru_cache(maxsize=None)
def _get_bert_scorer(model_type, lang, device):
    """Build a BERTScorer once per (model, language, device) and reuse it."""
    from bert_score import BERTScorer

    return BERTScorer(model_type=model_type, lang=lang, device=device)


def evaluate(
    reference: str,
    candidate: str,
    tgt_lang: str,
    bert_model="microsoft/deberta-xlarge-mnli",
    device=None,
):
    """Score one candidate translation against one reference.

    Args:
        reference: Reference translation.
        candidate: System translation to score.
        tgt_lang: Language of both texts (``"eng"``, ``"kor"`` or ``"jpn"``);
            selects the tokenizer and is forwarded to BERTScore.
        bert_model: Model name handed to BERTScore.
            NOTE(review): the same default is applied to every target
            language; confirm it is appropriate for Korean and Japanese.
        device: Device for the BERTScore model. ``None`` lets bert_score pick
            (CUDA when available, otherwise CPU); pass ``"cuda"`` to force GPU.

    Returns:
        Dict with ``"BLEU"``, ``"METEOR"`` and ``"BERT"`` (BERTScore F1), each
        rounded to 4 decimals.
    """
    # Tokenize both texts with the target-language tokenizer
    reference_tokens = tokenize(reference, tgt_lang)
    candidate_tokens = tokenize(candidate, tgt_lang)

    # Sentence-level BLEU with smoothing
    bleu = sentence_bleu(
        [reference_tokens],
        candidate_tokens,
        smoothing_function=SmoothingFunction().method1,
    )

    # METEOR on the pre-tokenized lists
    meteor = meteor_score([reference_tokens], candidate_tokens)

    # BERTScore on the raw strings. The scorer is cached so the model is not
    # reloaded for every sentence.
    scorer = _get_bert_scorer(
        bert_model,
        _BERTSCORE_LANG.get(tgt_lang, tgt_lang),
        device,
    )
    P, R, F1 = scorer.score([candidate], [reference])

    return {
        "BLEU": round(bleu, 4),
        "METEOR": round(meteor, 4),
        "BERT": round(F1.item(), 4),
    }

## 데이터셋 FLORES-Plus 준비

In [12]:
# Text file loader
def load_text_file(file_path):
    """Read a UTF-8 text file and return its lines with surrounding whitespace stripped.

    Args:
        file_path: Path of the file to read.

    Returns:
        A list with one stripped string per line of the file.
    """
    stripped_lines = []
    with open(file_path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            stripped_lines.append(raw_line.strip())
    return stripped_lines

# Data location
data_dir = "/home/dudaji/Jun/llm-rag-chatbot/data/flores/"

data_eng = load_text_file(f"{data_dir}/devtest.eng_Latn")
data_kor = load_text_file(f"{data_dir}/devtest.kor_Hang")
data_jpn = load_text_file(f"{data_dir}/devtest.jpn_Jpan")

# The three lists are paired by line index when sentences are evaluated, so a
# length mismatch would silently pair sources with the wrong references.
if not (len(data_eng) == len(data_kor) == len(data_jpn)):
    raise ValueError(
        "Parallel files are not aligned: "
        f"eng={len(data_eng)}, kor={len(data_kor)}, jpn={len(data_jpn)} lines"
    )

# Show the first sentence of each language as a visual sanity check.
print(data_eng[0])
print(data_kor[0])
print(data_jpn[0])

"We now have 4-month-old mice that are non-diabetic that used to be diabetic," he added.
"그는 ""현재 4개월 된 당뇨병에서 치료된 생쥐가 있다""고 덧붙였다."
「我々が飼っている生後4か月のマウスはかつて糖尿病でしたが現在は糖尿病ではない、」と彼は付け加えました。


In [13]:
# Single-sentence sanity check: Korean -> English, scored against the English reference.
test_idx = 1
source_sentence = data_kor[test_idx]
print(f'original: {source_sentence}')

result = translate(source_sentence, 'kor', 'eng', prompt)
print(f'translated: {result}')

eval_result = evaluate(data_eng[test_idx], result, 'eng')
print(eval_result)

original: 노바스코샤주 핼리팩스의 댈하우지대학교 의과 교수이자 캐나다 당뇨 협회 임상과학부 의장인 Ehud Ur 박사는 이 연구가 아직 초기 단계라고 경고했습니다.
translated: Dr. Ehud Ur, a professor at Dalhousie University's medical school in Halifax, Nova Scotia and president of the Canadian Diabetes Association's clinical science department, has warned that this research is still in its early stages.




{'BLEU': 0.4075, 'METEOR': 0.6594, 'BERT': 0.9297}


In [14]:
# Single-sentence sanity check: English -> Korean, scored against the Korean reference.
source_sentence = data_eng[1]
print(f'original: {source_sentence}')

result = translate(source_sentence, 'eng', 'kor', prompt)
print(f'translated: {result}')

eval_result = evaluate(data_kor[1], result, 'kor')
print(eval_result)

original: Dr. Ehud Ur, professor of medicine at Dalhousie University in Halifax, Nova Scotia and chair of the clinical and scientific division of the Canadian Diabetes Association cautioned that the research is still in its early days.
translated: 드. 에후드 우르(Dr. Ehud Ur) 교수는 다로시 대학교의 의학 교수이자 캐나다 당뇨병 협회 임상 및 과학 부서 위원장입니다. 그는 연구가 아직 초기 단계에 있음을 경고했습니다.

( Note: I corrected the translation to reflect that "Dr. Ehud Ur" is a person's name, and not a title. Also, I translated "cautioned" as "" which means "warned" or "advised", but in a polite tone.)




{'BLEU': 0.1072, 'METEOR': 0.5105, 'BERT': 0.7465}


In [15]:
# Single-sentence sanity check: English -> Japanese, scored against the Japanese reference.
source_sentence = data_eng[1]
print(f'original: {source_sentence}')

result = translate(source_sentence, 'eng', 'jpn', prompt)
print(f'translated: {result}')

eval_result = evaluate(data_jpn[1], result, 'jpn')
print(eval_result)

original: Dr. Ehud Ur, professor of medicine at Dalhousie University in Halifax, Nova Scotia and chair of the clinical and scientific division of the Canadian Diabetes Association cautioned that the research is still in its early days.
translated: ドクター・エフード・ウール博士は、ハリファックスにあるノバスコティア州のダルハウス大学医学部教授であり、カナダ糖尿病協会臨床科学部門委員長として、研究がまだ初期段階であることを注意した。
{'BLEU': 0.2401, 'METEOR': 0.5821, 'BERT': 0.867}


## 번역 평가 코드 

In [None]:
import os
import time
import pandas as pd

# Number of sentences evaluated per language pair
nTest = 200

# (source language, target language, source sentences, reference sentences)
pairs = [
    ("eng", "kor", data_eng[:nTest], data_kor[:nTest]),
    ("eng", "jpn", data_eng[:nTest], data_jpn[:nTest]),
    ("kor", "eng", data_kor[:nTest], data_eng[:nTest]),
    ("kor", "jpn", data_kor[:nTest], data_jpn[:nTest]),
    ("jpn", "eng", data_jpn[:nTest], data_eng[:nTest]),
    ("jpn", "kor", data_jpn[:nTest], data_kor[:nTest]),
]

# Output directory. Create it up front so that a missing directory cannot
# fail the CSV write after a whole language pair has been translated.
save_dir = "../../data/translate_flores"
os.makedirs(save_dir, exist_ok=True)
all_results = []  # rows from every language pair

for source_lang, target_lang, sources, references in pairs:
    pair_results = []  # rows for the current language pair
    for ic, (source, reference) in enumerate(zip(sources, references), start=1):
        # Progress marker every 20 sentences
        if ic % 20 == 0:
            print(ic)

        # Translate and time the call. perf_counter is monotonic, so the
        # measurement is not affected by system clock adjustments.
        start_time = time.perf_counter()
        candidate = translate(source, source_lang, target_lang, prompt)
        elapsed_time = time.perf_counter() - start_time

        # Score the candidate against the reference
        evaluation = evaluate(reference, candidate, target_lang)

        # Collect one row per sentence
        pair_results.append({
            "Source Language": source_lang,
            "Target Language": target_lang,
            "Source": source,
            "Reference": reference,
            "Candidate": candidate,
            "Translation Time (s)": round(elapsed_time, 4),
            **evaluation,
        })

    # Save the current language pair as soon as it is finished
    pair_df = pd.DataFrame(pair_results)
    pair_filename = f"{save_dir}/Eval_results_{source_lang}_to_{target_lang}.csv"
    pair_df.to_csv(pair_filename, index=False, encoding="utf-8")
    print(f"Results for {source_lang} to {target_lang} saved: {pair_filename}")

    # Accumulate into the combined result list
    all_results.extend(pair_results)

# Save all language pairs combined in one file
final_df = pd.DataFrame(all_results)
final_filename = f"{save_dir}/Eval_results_flores_all.csv"
final_df.to_csv(final_filename, index=False, encoding="utf-8")
print(f"All results saved: {final_filename}")



20
40
60
