# 번역 성능 평가 

In [10]:
import pandas as pd
import numpy as np
import re
from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer
from bert_score import score


def preprocess_text(text: str) -> str:
    """
    텍스트 전처리: 특수 문자 제거 및 소문자 변환
    """
    text = re.sub(r"[^\w\s]", "", text)  # 특수 문자 제거
    return text.lower().strip()  # 소문자 변환 및 양쪽 공백 제거


def tokenize_text(text: str) -> list:
    """
    NLTK를 활용한 정교한 토큰화
    """
    return word_tokenize(text)


def evaluate(
    reference: str,
    candidate: str,
    tgt_lang: str,
    bert_model="microsoft/deberta-xlarge-mnli",
):
    # 텍스트 전처리
    reference = preprocess_text(reference)
    candidate = preprocess_text(candidate)

    # 텍스트 토큰화
    reference_tokens = tokenize_text(reference)
    candidate_tokens = tokenize_text(candidate)

    # BLEU 점수 계산
    bleu = sentence_bleu([reference_tokens], candidate_tokens)

    # METEOR 점수 계산 (토큰화된 리스트를 사용)
    meteor = meteor_score([reference_tokens], candidate_tokens)

    # ROUGE 점수 계산
    scorer = rouge_scorer.RougeScorer(["rouge1", "rougeL"], use_stemmer=True)
    scores = scorer.score(reference, candidate)

    # BERTScore 계산
    P, R, F1 = score(
        [candidate],
        [reference],
        lang=tgt_lang,
        model_type=bert_model,
        device="cuda",
    )

    return {
        "BLEU": round(bleu, 4),
        "METEOR": round(meteor, 4),
        "ROUGE": round(
            np.mean(np.mean([score.precision for score in scores.values()])), 4
        ),
        "BERT": round(F1.item(), 4),
    }

테스트

In [9]:
reference_text = "This is a correct translation."
candidate_text = "This is an accurate translation."
target_language = "en"

result = evaluate(
    reference=reference_text,
    candidate=candidate_text,
    tgt_lang=target_language,
)

print(result)

{'BLEU': 0.0, 'METEOR': 0.5111, 'ROUGE': 0.6, 'BERT': 0.9825}


In [11]:
# 텍스트 전처리
reference = preprocess_text(reference_text)
candidate = preprocess_text(candidate_text)

# 텍스트 토큰화
reference_tokens = tokenize_text(reference)
candidate_tokens = tokenize_text(candidate)

# BLEU 점수 계산
bleu = sentence_bleu([reference_tokens], candidate_tokens)

In [19]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
# BLEU 점수 계산
bleu = sentence_bleu(
    [reference_tokens],  # 참조 텍스트 리스트
    candidate_tokens,    # 후보 텍스트 리스트
    weights=(0.5, 0.5),  # 1-gram과 2-gram에 동일한 가중치
    smoothing_function=SmoothingFunction().method1,  # 스무딩 함수 적용
)

print("BLEU:", bleu)

TypeError: object of type 'float' has no len()

In [20]:
# 1-gram BLEU 점수 계산
bleu = sentence_bleu(
    [reference_tokens],  # 참조 텍스트 리스트
    candidate_tokens,    # 후보 텍스트 리스트
    weights=(1.0, 0, 0, 0)  # 1-gram만 고려
)

print("1-gram BLEU:", bleu)

1-gram BLEU: 0.6


In [22]:
# 2-gram BLEU 점수 계산
bleu = sentence_bleu(
    [reference_tokens],  # 참조 텍스트 리스트
    candidate_tokens,    # 후보 텍스트 리스트
    weights=(0, 1, 0, 0)  # 1-gram만 고려
)

print("2-gram BLEU:", bleu)

2-gram BLEU: 0.25


In [23]:
# 2-gram BLEU 점수 계산
bleu = sentence_bleu(
    [reference_tokens],  # 참조 텍스트 리스트
    candidate_tokens,    # 후보 텍스트 리스트
    weights=(0.5, .5, 0, 0)  # 1-gram만 고려
)

print("2-gram BLEU:", bleu)

2-gram BLEU: 0.3872983346207417


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [26]:
# 2-gram BLEU 점수 계산
bleu = sentence_bleu(
    [reference_tokens],  # 참조 텍스트 리스트
    candidate_tokens,    # 후보 텍스트 리스트
    weights=(0.25, 0.25, 0.25, 0.25),  # 1-gram만 고려
    smoothing_function=SmoothingFunction().method1
)

print("BLEU:", bleu)

BLEU: 0.12574334296829354


In [29]:
from datasets import load_dataset

# 데이터셋 로드
dataset = load_dataset("openlanguagedata/flores_plus")

KeyError: 'tags'

In [27]:
from datasets import load_dataset

# 데이터셋 로드
dataset = load_dataset("openlanguagedata/flores_plus")

# 데이터셋 정보 출력
print(dataset)

KeyError: 'tags'