In [None]:
# Install the libraries this notebook needs (model, data, and metric packages).
!pip install transformers datasets sentencepiece torch
!pip install evaluate sacrebleu

In [3]:
import csv

import pandas as pd
from datasets import Dataset

# 1. Download the raw data file directly.
data_url = "https://huggingface.co/datasets/Helsinki-NLP/tatoeba_mt/resolve/main/test/tatoeba-test.eng-spa.tsv"

# The file is plain tab-separated text made of natural-language sentences, so:
#   * quoting=csv.QUOTE_NONE  - treat '"' as an ordinary character; with the
#     default quoting a sentence that starts with a double quote opens a
#     "quoted field" and can swallow the following rows.
#   * keep_default_na=False   - keep sentences such as "NA" or "null" as text
#     instead of turning them into NaN (which would later break `.strip()`).
#   * dtype=str               - never let a purely numeric sentence be parsed
#     as a number.
df = pd.read_csv(
    data_url,
    sep="\t",
    header=None,
    names=["src_lang", "tgt_lang", "sourceString", "targetString"],
    dtype=str,
    quoting=csv.QUOTE_NONE,
    keep_default_na=False,
)

# 2. Convert to Hugging Face Dataset
base = Dataset.from_pandas(df)

# 3. Shuffle with a fixed seed (42) so the same N sentences are picked on every run
N = 1000
subset = base.shuffle(seed=42).select(range(N))

# 4. Preprocess: swap columns to match the Spanish -> English task.
# In the 'eng-spa' file, sourceString is English and targetString is Spanish,
# while the model is used with Spanish as input and English as target.
dataset = subset.map(
    lambda ex: {
        "src": ex["targetString"].strip(),  # Spanish (model input)
        "tgt": ex["sourceString"].strip()   # English (reference translation)
    },
    remove_columns=subset.column_names
)

print(f"Test Set Size: {len(dataset)}")
print("Example Input (Spa):", dataset[0]["src"])
print("Example Target (Eng):", dataset[0]["tgt"])

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Test Set Size: 1000
Example Input (Spa): El niño está bebiendo leche.
Example Target (Eng): The boy is drinking milk.


In [None]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

# Hugging Face Hub checkpoint used for both the tokenizer and the model.
model_name = "facebook/mbart-large-50-many-to-many-mmt"

# Load the model weights first, then the matching tokenizer.
model = MBartForConditionalGeneration.from_pretrained(model_name)
tokenizer = MBart50TokenizerFast.from_pretrained(model_name)

# Declare Spanish as the source language for tokenization.
tokenizer.src_lang = "es_XX"

In [5]:
import torch

def translate_spa_to_eng(text):
    """Translate Spanish text into English with the notebook-level mBART model.

    Uses the module-level ``tokenizer`` and ``model``. The source language is
    taken from ``tokenizer.src_lang``, which this notebook sets to ``"es_XX"``
    when the tokenizer is loaded.

    Args:
        text: The Spanish text to translate.

    Returns:
        The first decoded output sequence, with special tokens removed.
    """
    # Convert the text to token ids and place the tensors on the same device
    # as the model; without this the call fails once the model has been moved
    # to the GPU elsewhere in the notebook.
    inputs = tokenizer(text, return_tensors="pt", padding=True).to(model.device)

    # Generate the translation, forcing English as the first decoded token.
    generated_tokens = model.generate(
        **inputs,
        forced_bos_token_id=tokenizer.lang_code_to_id["en_XX"]
    )

    # Turn the generated token ids back into readable text.
    return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]

# Quick smoke test of the translation helper
print(translate_spa_to_eng("Hola, ¿cómo estás?"))
# Expected output: Hello, how are you?

Hello, how are you?


In [6]:
from tqdm import tqdm
import torch

# Spanish inputs of the evaluation subset.
spanish_sentences = dataset["src"]

print(f"{len(spanish_sentences)} cümle çevriliyor...")

# Inference only: switch to eval mode and use the GPU when one is available.
model.eval()
model.to("cuda" if torch.cuda.is_available() else "cpu")

# The forced first token (English) is the same for every sentence.
en_token_id = tokenizer.lang_code_to_id["en_XX"]

# Translate one sentence at a time, collecting the decoded English text.
model_translations = []
with torch.no_grad():
    for sentence in tqdm(spanish_sentences):
        encoded = tokenizer(sentence, return_tensors="pt", padding=True).to(model.device)
        output_ids = model.generate(**encoded, forced_bos_token_id=en_token_id)
        model_translations.append(
            tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]
        )

print("Çeviriler tamamlandı!")

1000 cümle çevriliyor...


100%|██████████| 1000/1000 [05:02<00:00,  3.31it/s]

Çeviriler tamamlandı!





In [11]:
import evaluate

# Load the BLEU metric from the `evaluate` library.
bleu = evaluate.load("bleu")

# Original English sentences from the dataset serve as references.
english_references = dataset["tgt"]

# The metric expects a list of reference lists (one inner list per prediction).
formatted_references = []
for ref in english_references:
    formatted_references.append([ref])

results = bleu.compute(
    predictions=model_translations,
    references=formatted_references,
)
# Scale the score by 100 for reporting.
bleu_score = 100 * results["bleu"]

# Summary banner.
separator = "=" * 30
print("\n" + separator)
print(f"PROJECT RESULT (N={len(dataset)})")
print(f"Model: {model_name}")
print(f"BLEU Score: {bleu_score:.2f}")
print(separator)


PROJECT RESULT (N=1000)
Model: facebook/mbart-large-50-many-to-many-mmt
BLEU Score: 44.99


In [10]:
import pandas as pd

# Compare the first 10 examples side by side: input, reference, prediction.
first_ten = slice(10)
analysis_columns = {
    "İspanyolca (Input)": dataset["src"][first_ten],
    "Gerçek İngilizce (Reference)": dataset["tgt"][first_ten],
    "Model Çevirisi (Prediction)": model_translations[first_ten],
}
analysis_df = pd.DataFrame(analysis_columns)

print("\nÖrnek Çeviri Kıyaslaması:")
print(analysis_df)


Örnek Çeviri Kıyaslaması:
                                  İspanyolca (Input)  \
0                       El niño está bebiendo leche.   
1  Tom me llevó a un restaurante en donde puedes ...   
2                           Mis gatas son mis hijas.   
3      Su autocomplacencia no le dará muchos amigos.   
4                   Me tengo que ir temprano mañana.   
5            Observaré tu avance desde mi ordenador.   
6                   Gasté 3000 yenes en un nuevo CD.   
7                 ¿Por qué no intentamos hacer algo?   
8            Su casa está demasiado lejos de la mía.   
9                           Juro que no hice trampa.   

                        Gerçek İngilizce (Reference)  \
0                          The boy is drinking milk.   
1  Tom took me to a restaurant where you can eat ...   
2                           My cats are my children.   
3       His complacency won't make him many friends.   
4                    I have to leave early tomorrow.   
5       I'll monitor