## 번역 성능 평가 

Kor - Eng - Jpn

In [19]:
import pandas as pd
import numpy as np
from datasets import load_dataset
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer
from bert_score import score

## 번역 시스템 준비 

In [10]:
# Define LLM
import os
from llama_index.llms.ollama import Ollama
from llama_index.core.prompts import PromptTemplate

llm = Ollama(model="llama3.1:70b", request_timeout=600,temperature=0)
llm.complete("hello")

CompletionResponse(text='Hello! How can I assist you today?', additional_kwargs={'model': 'llama3.1:70b', 'created_at': '2024-12-28T06:41:47.65819851Z', 'done': True, 'done_reason': 'stop', 'context': [128009, 128006, 882, 128007, 271, 15339, 128009, 128006, 78191, 128007, 271, 9906, 0, 2650, 649, 358, 7945, 499, 3432, 30], 'total_duration': 823681499, 'load_duration': 42072342, 'prompt_eval_count': 12, 'prompt_eval_duration': 81689000, 'eval_count': 10, 'eval_duration': 654783000}, raw={'model': 'llama3.1:70b', 'created_at': '2024-12-28T06:41:47.65819851Z', 'response': 'Hello! How can I assist you today?', 'done': True, 'done_reason': 'stop', 'context': [128009, 128006, 882, 128007, 271, 15339, 128009, 128006, 78191, 128007, 271, 9906, 0, 2650, 649, 358, 7945, 499, 3432, 30], 'total_duration': 823681499, 'load_duration': 42072342, 'prompt_eval_count': 12, 'prompt_eval_duration': 81689000, 'eval_count': 10, 'eval_duration': 654783000}, logprobs=None, delta=None)

In [9]:
prompt = PromptTemplate(
    template="""This is an {source_lang} to {target_lang} translation, please provide the {target_lang} translation for this text in as polite a tone as possible. \
The original text sentence structure must be preserved.
Do not provide any explanations or text apart from the translation.
The translation result must be written in {target_lang}.

{source_lang}: {source_text}

{target_lang}:"""
)

In [26]:
# 번역함수 
def translate(source_text, source_lang, target_lang, prompt):
    full_prompt = prompt.format(source_lang=source_lang, target_lang=target_lang, source_text=source_text)
    result = llm.complete(full_prompt)
    return result.text

In [28]:
result = translate("안녕하세요. 좋은 아침입니다.", 'kor', 'eng', prompt)
print(result)

Good morning, hello.


## 평가 시스템 준비

In [39]:
# 언어별 토크나이저
from nltk.tokenize import word_tokenize
from konlpy.tag import Mecab
from janome.tokenizer import Tokenizer

def tokenize(text, lang):
    if lang == "eng":
        # 영어: NLTK word_tokenize
        return word_tokenize(text)
    elif lang == "kor":
        # 한국어: Mecab 형태소 분석기
        mecab = Mecab()
        return mecab.morphs(text)
    elif lang == "jpn":
        # 일본어: Janome 형태소 분석기
        tokenizer = Tokenizer()
        return [token.surface for token in tokenizer.tokenize(text)]
    else:
        raise ValueError(f"Unsupported language: {lang}")

In [40]:
from nltk.tokenize import word_tokenize

def evaluate(
    reference: str,
    candidate: str,
    tgt_lang: str,
    bert_model="microsoft/deberta-xlarge-mnli",
):
    # 텍스트를 토큰화
    reference_tokens = tokenize(reference, tgt_lang)
    candidate_tokens = tokenize(candidate, tgt_lang)

    # BLEU 점수 계산 (스무딩 적용)
    bleu = sentence_bleu(
        [reference_tokens],
        candidate_tokens,
        smoothing_function=SmoothingFunction().method1,
    )

    # METEOR 점수 계산
    meteor = meteor_score([reference_tokens], candidate_tokens)

    # ROUGE 점수 계산
    scorer = rouge_scorer.RougeScorer(["rouge1", "rougeL"], use_stemmer=True)
    scores = scorer.score(reference, candidate)

    # BERTScore 계산
    P, R, F1 = score(
        [candidate],
        [reference],
        lang=tgt_lang,
        model_type=bert_model,
        device="cuda",  # GPU 사용
    )

    return {
        "BLEU": round(bleu, 4),
        "METEOR": round(meteor, 4),
        "ROUGE": round(
            np.mean(np.mean([score.precision for score in scores.values()])), 4
        ),
        "BERT": round(F1.item(), 4),
    }

In [None]:
def evaluate(reference: str, candidate: str, tgt_lang: str, bert_model="bert-base-multilingual-cased"):
    # Tokenize Japanese sentences
    reference_tokens = mecab_tokenize(reference)
    candidate_tokens = mecab_tokenize(candidate)
    
    # Calculate BLEU
    bleu = sentence_bleu([reference_tokens], candidate_tokens)
    
    # Calculate ROUGE
    scorer = rouge_scorer.RougeScorer(["rouge1", "rougeL"], use_stemmer=False)
    scores = scorer.score(" ".join(reference_tokens), " ".join(candidate_tokens))
    
    # Calculate BERT Score
    P, R, F1 = score(
        [candidate],
        [reference],
        lang=tgt_lang,
        model_type=bert_model,
        device="cuda",
    )
    
    return {
        "BLEU": round(bleu, 4),
        "ROUGE1": round(scores["rouge1"].fmeasure, 4),
        "ROUGEL": round(scores["rougeL"].fmeasure, 4),
        "BERT": round(F1.item(), 4),
    }

데이터셋 FLORES-Plus 준비

In [7]:
# 텍스트 파일 로더
def load_text_file(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()
    return [line.strip() for line in lines]

# 데이터 위치 
data_dir = "/home/dudaji/Jun/llm-rag-chatbot/data/flores/"

data_eng = load_text_file(f"{data_dir}/devtest.eng_Latn")
data_kor = load_text_file(f"{data_dir}/devtest.kor_Hang")
data_jpn = load_text_file(f"{data_dir}/devtest.jpn_Jpan")

print(data_eng[0])
print(data_kor[0])
print(data_jpn[0])

"We now have 4-month-old mice that are non-diabetic that used to be diabetic," he added.
"그는 ""현재 4개월 된 당뇨병에서 치료된 생쥐가 있다""고 덧붙였다."
「我々が飼っている生後4か月のマウスはかつて糖尿病でしたが現在は糖尿病ではない、」と彼は付け加えました。


In [41]:
print(f'original: {data_kor[0]}')
result = translate(data_kor[0], 'kor', 'eng', prompt)
print(f'translated: {result}')

eval_result = evaluate(data_eng[0], result, 'eng')
print(eval_result)

original: "그는 ""현재 4개월 된 당뇨병에서 치료된 생쥐가 있다""고 덧붙였다."
translated: "He added that there are currently four-month-old mice cured of diabetes."




{'BLEU': 0.0163, 'METEOR': 0.3156, 'ROUGE': 0.4615, 'BERT': 0.7634}


In [42]:
print(f'original: {data_eng[0]}')
result = translate(data_eng[0], 'eng', 'kor', prompt)
print(f'translated: {result}')

eval_result = evaluate(data_eng[0], result, 'kor')
print(eval_result)

original: "We now have 4-month-old mice that are non-diabetic that used to be diabetic," he added.
translated: "우리는 이제 당뇨병이었던 것이 아닌, 당뇨병이 아니었던 4개월 된 생쥐들을 가지고 있습니다."


Exception: Install MeCab in order to use it: http://konlpy.org/en/latest/install/

In [20]:
full_prompt = prompt.format(source_lang='kor', target_lang='eng', source_text="안녕하세요. 좋은 아침입니다.")
result = llm.complete(full_prompt)
print(result)

eval_result = evaluate("안녕하세요. 좋은 아침입니다.", result.text, 'kor')
print(eval_result)

Good morning, hello.


AttributeError: 'CompletionResponse' object has no attribute 'split'

In [17]:
result.text

'Good morning, hello.'

In [None]:
result.

evaluate(data_eng[0], data_kor[0])

In [None]:
# 평가 수행
results = []
for _, row in data:  # 각 샘플에 대해 평가 수행
    source = row["source"]
    reference = row["target"]

    # 번역 수행
    candidate = translate(source)

    # 평가 실행
    evaluation = evaluate(reference, candidate, "ko")
    results.append(
        {
            "Source": source,
            "Reference": reference,
            "Candidate": candidate,
            **evaluation,
        }
    )

# 결과 데이터프레임 생성
df = pd.DataFrame(results)
print(df)


In [None]:

# 결과 저장 
df.to_csv("translation_evaluation_results.csv", index=False)
