In [None]:
import datasets
# Show which `datasets` release is currently importable in this kernel.
print(f"{datasets.__version__}")


In [None]:
!pip uninstall -y datasets
!pip install -q datasets==2.19.2


In [None]:
import os, shutil

cache_path = os.path.expanduser("~/.cache/huggingface/datasets")
# Delete only the cached tatoeba_mt folders; every other cache entry is left alone.
# On a fresh machine the cache directory does not exist yet and os.listdir would
# raise FileNotFoundError, so the scan is skipped in that case.
if os.path.isdir(cache_path):
    for name in os.listdir(cache_path):
        if "tatoeba_mt" in name.lower():
            # ignore_errors keeps the cleanup best-effort (e.g. entries that are not directories).
            shutil.rmtree(os.path.join(cache_path, name), ignore_errors=True)

print("Cache cleaned.")


In [None]:
from datasets import load_dataset

# Load the "eng-spa" configuration of Tatoeba MT, forcing a fresh download
# and skipping the dataset verification checks.
ds = load_dataset(
    "Helsinki-NLP/tatoeba_mt",
    "eng-spa",
    verification_mode="no_checks",
    download_mode="force_redownload",
    trust_remote_code=True,
)

print(ds)

# Use the validation split when there is no train split.
if "train" in ds:
    base_split = "train"
else:
    base_split = "validation"
base = ds[base_split]

print("Using split:", base_split)
print("Base size:", len(base))
if "test" in ds:
    print("Test size:", len(ds["test"]))
else:
    print("Test size:", "N/A")


In [None]:
# Which splits are available?
print(ds.keys())

# Use the validation split when there is no train split.
base_split = "train" if "train" in ds else "validation"
base = ds[base_split]

print("Using split:", base_split)
print("Base size:", len(base))


In [None]:
# Number of examples to evaluate on.
N = 1000
# Cap the request at the split size: Dataset.select raises IndexError when asked
# for indices beyond the end of the dataset.
sample_size = min(N, len(base))
# Fixed seed so the same examples are drawn on every run.
subset = base.shuffle(seed=42).select(range(sample_size))

# With the "eng-spa" config:
#   sourceString = English, targetString = Spanish
# The task here is ES->EN, so src = Spanish (targetString) and tgt = English (sourceString).

def preprocess(ex):
    """Turn one raw example into an ES->EN pair.

    Reads ``ex["targetString"]`` and ``ex["sourceString"]``, strips surrounding
    whitespace from both, and returns ``{"src": <Spanish>, "tgt": <English>}``.
    """
    spanish = ex["targetString"].strip()
    english = ex["sourceString"].strip()
    return {"src": spanish, "tgt": english}

# Apply preprocess to every example and drop all original columns,
# leaving only the "src"/"tgt" fields it returns.
processed = subset.map(
    function=preprocess,
    remove_columns=subset.column_names,
)

print(processed[0])


In [None]:
# Print the first five source/target pairs for a visual check.
for idx in range(5):
    pair = processed[idx]
    print("\nSRC:", pair["src"])
    print("TGT:", pair["tgt"])


In [None]:
!pip install -q transformers sentencepiece accelerate


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_name = "Helsinki-NLP/opus-mt-es-en"

# Run on the GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Tokenizer and seq2seq model come from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model.to(device)

print("Device:", device)


In [None]:
def translate_one(text, max_len=128, num_beams=4):
    """Translate a single text with the module-level ``model`` and ``tokenizer``.

    Args:
        text: Input passed to the tokenizer.
        max_len: Used both as the tokenizer truncation length and as the
            ``max_length`` given to ``model.generate``.
        num_beams: Beam count given to ``model.generate``.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=max_len,
    )
    encoded = encoded.to(device)
    # Generation only; no gradients are needed.
    with torch.no_grad():
        generated = model.generate(**encoded, max_length=max_len, num_beams=num_beams)
    best = generated[0]
    return tokenizer.decode(best, skip_special_tokens=True)

# Quick check: translate the first five examples and show them next to the references.
for idx in range(5):
    example = processed[idx]
    src, ref = example["src"], example["tgt"]
    pred = translate_one(src)
    print("\nSRC:", src)
    print("PRED:", pred)
    print("REF:", ref)


In [None]:
from tqdm.auto import tqdm

def translate_batch(texts, batch_size=16, max_len=128, num_beams=4):
    """Translate a list of texts in consecutive batches.

    Args:
        texts: Sliceable sequence of inputs for the tokenizer.
        batch_size: Number of texts sent through the model per step.
        max_len: Used both as the tokenizer truncation length and as the
            ``max_length`` given to ``model.generate``.
        num_beams: Beam count given to ``model.generate``.

    Returns:
        A list with the decoded outputs of all batches, in input order.
    """
    translations = []
    batch_starts = range(0, len(texts), batch_size)
    for start in tqdm(batch_starts):
        chunk = texts[start:start + batch_size]
        # Pad within the batch so the inputs can be stacked into one tensor.
        encoded = tokenizer(
            chunk,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_len,
        ).to(device)
        # Generation only; no gradients are needed.
        with torch.no_grad():
            generated = model.generate(**encoded, max_length=max_len, num_beams=num_beams)
        translations += tokenizer.batch_decode(generated, skip_special_tokens=True)
    return translations

# Read each column in one go instead of indexing the dataset row by row;
# list(...) makes sure plain Python lists are handed to the code below.
src_texts = list(processed["src"])
refs = list(processed["tgt"])

preds = translate_batch(src_texts, batch_size=16)
# The metrics computed later pair predictions with references by position,
# so a length mismatch must fail loudly here rather than be silently truncated.
assert len(preds) == len(refs), "number of predictions and references differ"
print("Preds:", len(preds), "Refs:", len(refs))
print("\nExample:\n", src_texts[0], "\n=>", preds[0])


In [None]:
import pandas as pd

# One row per example: Spanish source, English reference, model output.
df = pd.DataFrame(
    dict(
        src_es=src_texts,
        ref_en=refs,
        pred_marian_es_en=preds,
    )
)

# Save the table without the index column, then display the first rows.
df.to_csv("marian_es_en_preds.csv", encoding="utf-8", index=False)
df.head()


In [None]:
!pip install -q sacrebleu


In [None]:
import sacrebleu

# Corpus-level BLEU; the references are passed as a list holding one reference list.
bleu = sacrebleu.corpus_bleu(
    preds,
    [refs],
)
print("BLEU score:", bleu.score)


In [None]:
!pip install -q nltk


In [None]:
import nltk
# Download the NLTK resources used for METEOR scoring below.
for resource in ("wordnet", "omw-1.4"):
    nltk.download(resource)

from nltk.translate.meteor_score import meteor_score

# Sentence-level METEOR on whitespace-split tokens, one score per (reference, prediction) pair.
meteor_scores = []
for ref, pred in zip(refs, preds):
    meteor_scores.append(meteor_score([ref.split()], pred.split()))

# Plain arithmetic mean over all sentence scores, also shown as a percentage.
meteor_avg = sum(meteor_scores) / len(meteor_scores)
print("METEOR (avg):", meteor_avg)
print("METEOR (%):", meteor_avg * 100)


In [None]:
!pip install -q bert-score


In [None]:
from bert_score import score

# BERTScore of the predictions against the references, English setting.
# Only the third returned value (F1) is reported, averaged over all pairs.
P, R, F1 = score(cands=preds, refs=refs, lang="en", verbose=True)

print("BERTScore F1 (avg):", F1.mean().item())
