In [None]:
# 1. Load and Clean Data from Hugging Face (Tatoeba)
from datasets import load_dataset
import pandas as pd

def clean_text(text):
    """Normalize a value into lowercase text with single-space separators.

    The value is converted with ``str()``, lowercased, and split on runs of
    whitespace; the pieces are then re-joined with one space each. Leading
    and trailing whitespace disappears because a no-argument ``split()``
    never yields empty pieces.
    """
    words = str(text).lower().split()
    return " ".join(words)

# Load Tatoeba from Hugging Face (filtered to Arabic-English)
dataset = load_dataset("tatoeba", lang1="ar", lang2="en", split="train", trust_remote_code=True)

# Convert to pandas and clean: each row's "translation" entry is indexed by
# language code, and both sides go through clean_text().
df = pd.DataFrame(dataset)
df = df.dropna(subset=["translation"])
df['source'] = df['translation'].apply(lambda x: clean_text(x['ar']))
df['target'] = df['translation'].apply(lambda x: clean_text(x['en']))
df = df.drop(columns=["translation"])

# Use a subset for demo. The size is clamped to the number of available rows
# because DataFrame.sample() raises ValueError when asked for more rows than
# exist (sampling is without replacement).
df = df.sample(min(2000, len(df)), random_state=42).reset_index(drop=True)

Downloading data:   0%|          | 0.00/912k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [2]:
# 2. Preprocessing for MarianMT
from datasets import Dataset as HFDataset
from transformers import MarianTokenizer

# Checkpoint identifier shared by the tokenizer here and the model loaded later.
model_name = "Helsinki-NLP/opus-mt-ar-en"
tokenizer = MarianTokenizer.from_pretrained(model_name)

# Rename the cleaned text columns to their language codes, then wrap the
# frame in a Hugging Face Dataset.
dataset = HFDataset.from_pandas(
    df.rename({"source": "ar", "target": "en"}, axis="columns")
)

def preprocess(example):
    """Tokenize one Arabic/English pair into model inputs and labels.

    Args:
        example: A mapping with the Arabic text under ``"ar"`` and the English
            text under ``"en"``.

    Returns:
        The tokenizer output for the Arabic text, with the English token ids
        stored under ``"labels"``. Both sides are truncated to 128 tokens.

    Notes:
        Sequences are deliberately NOT padded here. Padding labels to a fixed
        length with the tokenizer's pad id makes the loss count every padding
        position as a real target. Padding is left to the
        ``DataCollatorForSeq2Seq`` given to the trainer, which pads each batch
        dynamically and fills label padding with -100 so it is ignored by the
        loss.
    """
    # `text_target` encodes the English side with the target-language settings
    # and stores its ids under "labels"; it replaces the deprecated
    # `as_target_tokenizer()` context manager.
    model_inputs = tokenizer(
        example["ar"],
        text_target=example["en"],
        truncation=True,
        max_length=128,
    )
    return model_inputs

# Run `preprocess` over the dataset one example at a time (batched=False).
tokenized_dataset = dataset.map(function=preprocess, batched=False)

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


ImportError: 
MarianTokenizer requires the SentencePiece library but it was not found in your environment. Checkout the instructions on the
installation page of its repo: https://github.com/google/sentencepiece#installation and follow the ones
that match your environment. Please note that you may need to restart your runtime after installation.


In [None]:
# 3. Load Pretrained Model
from transformers import MarianMTModel, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq

# Load the pretrained weights for the checkpoint named by `model_name`.
model = MarianMTModel.from_pretrained(model_name)

# Batch collator built from the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
# 4. Training Arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="marian-ar-en",
    per_device_train_batch_size=8,
    num_train_epochs=3,
    logging_steps=10,
    save_steps=100,
    # No evaluation runs during training. "no" is the library default, so the
    # setting is left implicit: the keyword was renamed from
    # `evaluation_strategy` to `eval_strategy`, and passing the old name fails
    # on recent transformers releases while the new name fails on older ones.
    save_total_limit=1,  # keep only the most recent checkpoint on disk
    remove_unused_columns=True,
    fp16=False,
    report_to=[]  # empty list: do not report to any logging integration
)

In [None]:
# 5. Trainer Setup
# Wire the model, arguments, tokenized data, tokenizer and collator together.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)

# 6. Train the Model
trainer.train()

In [None]:

# 7. Evaluate the Fine-Tuned Model
from sacrebleu import corpus_bleu
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


def translate_texts(texts, model, tokenizer):
    """Translate a batch of texts with a seq2seq model.

    Args:
        texts: List of source-language strings.
        model: Model exposing ``generate``, ``device``, ``training``,
            ``eval()`` and ``train()``.
        tokenizer: Tokenizer used both to encode ``texts`` and to decode the
            generated ids.

    Returns:
        A list of decoded strings, one per generated sequence, with special
        tokens stripped. An empty list/tuple of inputs yields ``[]``.
    """
    # Nothing to translate; skip tokenizing an empty batch.
    if isinstance(texts, (list, tuple)) and len(texts) == 0:
        return []

    batch = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=128)
    # The model may live on a GPU (e.g. after training); the encoded inputs
    # must be on the same device or generate() fails with a device mismatch.
    batch = batch.to(model.device)

    # Generate in eval mode so dropout is disabled, then restore whatever mode
    # the caller had the model in.
    was_training = model.training
    model.eval()
    try:
        generated = model.generate(**batch)
    finally:
        if was_training:
            model.train()

    return [tokenizer.decode(ids, skip_special_tokens=True) for ids in generated]

# Sample test set
# NOTE(review): these rows are drawn from the same `df` that was tokenized for
# training, so the scores below are measured on sentences the fine-tuned model
# has already seen. Hold out a split before training for an unbiased estimate.
test_samples = df.sample(min(100, len(df)), random_state=1)
sources = test_samples['source'].tolist()
targets = test_samples['target'].tolist()

# The references in `targets` were lowercased by clean_text(), while model
# output may be cased. BLEU is case-sensitive by default, so both scores are
# computed with lowercase=True to avoid penalising correct but capitalised
# translations.

# Fine-tuned predictions
preds_finetuned = translate_texts(sources, model, tokenizer)
bleu_finetuned = corpus_bleu(preds_finetuned, [targets], lowercase=True)
print(f"[Fine-Tuned] BLEU Score: {bleu_finetuned.score:.2f}")

# Pretrained predictions (fresh copy of the same checkpoint used for fine-tuning)
model_pretrained = MarianMTModel.from_pretrained(model_name)
pretrained_preds = translate_texts(sources, model_pretrained, tokenizer)
bleu_pretrained = corpus_bleu(pretrained_preds, [targets], lowercase=True)
print(f"[Pretrained] BLEU Score: {bleu_pretrained.score:.2f}")

# Additional Metrics
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
smooth = SmoothingFunction().method4

# Word-level comparison for accuracy and F1
def flatten(nested):
    """Concatenate an iterable of iterables into one flat list."""
    return [item for sublist in nested for item in sublist]

# Pair words position-by-position *within each sentence*. zip() stops at the
# shorter side, so a length mismatch only trims that one sentence; flattening
# first and truncating globally would shift every later word out of alignment.
# Predictions are lowercased because the references were lowercased during
# cleaning.
# NOTE(review): positional word matching is still a crude proxy for
# translation quality; treat BLEU as the primary metric.
aligned_pairs = flatten(
    zip(t.split(), p.lower().split()) for t, p in zip(targets, preds_finetuned)
)
true_words = [ref_word for ref_word, _ in aligned_pairs]
pred_words = [hyp_word for _, hyp_word in aligned_pairs]

acc = accuracy_score(true_words, pred_words)
# zero_division=0 scores classes with no predicted/true samples as 0 without
# emitting an UndefinedMetricWarning for each of them.
prec, recall, f1, _ = precision_recall_fscore_support(
    true_words, pred_words, average='macro', zero_division=0
)

print(f"\nWord-level Evaluation:")
print(f"Accuracy: {acc:.2f}")
print(f"Precision: {prec:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-score: {f1:.2f}")

# BLEU Score Comparison Chart
labels = ["Pretrained", "Fine-Tuned"]
scores = [bleu_pretrained.score, bleu_finetuned.score]

# Draw on an explicit Axes; the y-axis is pinned to 0-100.
fig, ax = plt.subplots(figsize=(6, 4))
sns.barplot(x=labels, y=scores, palette="Blues", ax=ax)
ax.set_title("BLEU Score Comparison")
ax.set_ylabel("BLEU Score")
ax.set_ylim(0, 100)
plt.show()

# Translate Sample
example = "أنا أحب البرمجة"
# A one-element input list gives back a one-element list; unpack it directly.
(translated,) = translate_texts([example], model, tokenizer)
print(f"\nArabic: {example}\nEnglish (Fine-tuned): {translated}")


## Evaluation 