In [1]:
# Requirements:
# !pip install datasets transformers evaluate accelerate
# !pip install sacrebleu
# !pip install accelerate bitsandbytes
# !pip install -U bitsandbytes
import numpy as np
from datasets import load_dataset
from evaluate import load as load_metric
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq
)
import warnings
import tkinter as tk
from tkinter import ttk
from tkinter import scrolledtext
warnings.filterwarnings("ignore", message="Was asked to gather along dimension 0")


In [2]:
MODEL_NAME    = "Helsinki-NLP/opus-mt-ar-en"
MAX_SRC_LEN   = 128
MAX_TGT_LEN   = 128
BATCH_TRAIN   = 16
BATCH_EVAL    = 16
NUM_EPOCHS    = 2
LEARNING_RATE = 5e-5
OUTPUT_DIR    = "./opus_mt_ar_en_full"

In [3]:
train_ds = load_dataset("opus100", "ar-en", split="train")
eval_ds  = load_dataset("opus100", "ar-en", split="validation")


README.md:   0%|          | 0.00/65.4k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/214k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/99.3M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/979k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/1000000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [4]:
print("=== Raw example ===")
print(train_ds[0])  

=== Raw example ===
{'translation': {'ar': 'و هذه؟', 'en': 'And this?'}}


In [5]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model     = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/917k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.13M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [6]:
def preprocess(batch):
    inputs = [ex["ar"] for ex in batch["translation"]]
    targets = [ex["en"] for ex in batch["translation"]]
    model_inputs = tokenizer(
        inputs,
        max_length=MAX_SRC_LEN,
        truncation=True,
        padding='max_length'
    )
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets,
            max_length=MAX_TGT_LEN,
            truncation=True,
            padding='max_length'
        )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [7]:
train_tokenized = train_ds.map(
    preprocess,
    batched=True,
    remove_columns=["translation"]
)
eval_tokenized = eval_ds.map(
    preprocess,
    batched=True,
    remove_columns=["translation"]
)

Map:   0%|          | 0/1000000 [00:00<?, ? examples/s]



model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [9]:
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
)

In [10]:
batch = train_tokenized.select(range(4))   # first 4 examples
collated = data_collator(batch)

print("\n=== Collated batch shapes ===")
for k, v in collated.items():
    print(f"{k:15}: {v.shape}")


=== Collated batch shapes ===
input_ids      : torch.Size([4, 128])
attention_mask : torch.Size([4, 128])
labels         : torch.Size([4, 128])
decoder_input_ids: torch.Size([4, 128])


In [9]:
bleu_metric = load_metric("sacrebleu")


Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [10]:
def compute_metrics(eval_preds):
    preds, labels = eval_preds

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    bleu = bleu_metric.compute(
        predictions=decoded_preds,
        references=[[ref] for ref in decoded_labels]
    )["score"]
    em_list = [int(p.strip() == l.strip()) for p, l in zip(decoded_preds, decoded_labels)]
    acc = float(np.mean(em_list))
    return {"bleu": bleu, "exact_match_accuracy": acc}

In [11]:
tokenizer.model_max_length = MAX_SRC_LEN
training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_TRAIN,
    per_device_eval_batch_size=BATCH_EVAL,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,

    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    generation_max_length=MAX_TGT_LEN,
    generation_num_beams=5,
    load_best_model_at_end=True,
    metric_for_best_model="bleu",
    report_to="none",
    fp16=True,
)

In [12]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=eval_tokenized,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


  trainer = Seq2SeqTrainer(


In [13]:
trainer.train()
final_metrics = trainer.evaluate()
print("Final evaluation metrics:", final_metrics)

Epoch,Training Loss,Validation Loss,Bleu,Exact Match Accuracy
1,0.1604,0.152615,39.837895,0.12
2,0.1406,0.149022,43.117302,0.1295


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.encoder.embed_positions.weight', 'model.decoder.embed_tokens.weight', 'model.decoder.embed_positions.weight', 'lm_head.weight'].


Final evaluation metrics: {'eval_loss': 0.14902229607105255, 'eval_bleu': 43.11730207587384, 'eval_exact_match_accuracy': 0.1295, 'eval_runtime': 163.9332, 'eval_samples_per_second': 12.2, 'eval_steps_per_second': 0.384, 'epoch': 2.0}


In [14]:
def translate_ar_to_en(text: str) -> str:
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_SRC_LEN
    ).to(model.device)
    outputs = model.generate(
        **inputs,
        max_length=MAX_TGT_LEN,
        num_beams=5,
        early_stopping=True
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


In [15]:
print(translate_ar_to_en("التعرف على الأنماط"))


Pattern recognition.


In [2]:
save_path = "/kaggle/working/helsenki_finetuned"

trainer.save_model(save_path)
tokenizer.save_pretrained(save_path)
print(f"Model and tokenizer saved to {save_path}")


NameError: name 'trainer' is not defined

In [3]:
def load_mt_model(model_dir: str = 'models/helsenki_finetuned'):
    tok = AutoTokenizer.from_pretrained(model_dir)
    mdl = AutoModelForSeq2SeqLM.from_pretrained(model_dir)
    return tok, mdl

In [6]:
def translate_ar_to_en(text: str, tokenizer, model) -> str:
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_SRC_LEN
    ).to(model.device)
    outputs = model.generate(
        **inputs,
        max_length=MAX_TGT_LEN,
        num_beams=5,
        early_stopping=True
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [23]:
tok, mdl = load_mt_model()
print(translate_ar_to_en("التعرف على الأنماط", tok, mdl))


Pattern recognition.


In [17]:
!zip -r file.zip /kaggle/working/helsenki_finetuned
from IPython.display import FileLink
FileLink(r'file.zip')

'zip' is not recognized as an internal or external command,
operable program or batch file.


In [7]:
from tkinter import messagebox


class TranslatorApp(tk.Tk):
    def __init__(self):
        super().__init__()
        self.title("Arabic to English Translator")
        self.geometry("700x500")
        self.resizable(False, False)

        try:
            self.tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
            self.model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_DIR)
            self.model.eval()
        except Exception as e:
            messagebox.showerror("Model Load Error", f"Failed to load model: {e}")
            self.destroy()
            return

        self._build_ui()

    def _build_ui(self):
        style = ttk.Style(self)
        style.theme_use('clam')
        style.configure('TButton', font=("Helvetica", 10), padding=6)
        style.configure('TLabel', font=("Helvetica", 11))

        input_frame = ttk.LabelFrame(self, text="Arabic Text", padding=(10, 10))
        input_frame.place(x=20, y=20, width=660, height=180)
        self.input_text = scrolledtext.ScrolledText(input_frame, wrap=tk.WORD, font=("Segoe UI", 10))
        self.input_text.pack(fill=tk.BOTH, expand=True)

        output_frame = ttk.LabelFrame(self, text="English Translation", padding=(10, 10))
        output_frame.place(x=20, y=260, width=660, height=180)
        self.output_text = scrolledtext.ScrolledText(output_frame, wrap=tk.WORD, font=("Segoe UI", 10), state='disabled')
        self.output_text.pack(fill=tk.BOTH, expand=True)

        btn_translate = ttk.Button(self, text="Translate", command=self._on_translate)
        btn_translate.place(x=300, y=215)
        btn_clear = ttk.Button(self, text="Clear", command=self._on_clear)
        btn_clear.place(x=380, y=215)

    def _on_translate(self):
        arabic_text = self.input_text.get("1.0", tk.END).strip()
        print(f"GUI Input: {repr(arabic_text)}")  # Add this to see the exact input
        if not arabic_text:
            messagebox.showinfo("Input Required", "Please enter Arabic text to translate.")
            return
        try:
            translation = translate_ar_to_en(arabic_text, self.tokenizer, self.model)
            self._display_translation(translation)
        except Exception as e:
            messagebox.showerror("Translation Error", str(e))

    def _display_translation(self, text):
        self.output_text.config(state='normal')
        self.output_text.delete("1.0", tk.END)
        self.output_text.insert(tk.END, text)
        self.output_text.config(state='disabled')

    def _on_clear(self):
        self.input_text.delete("1.0", tk.END)
        self.output_text.config(state='normal')
        self.output_text.delete("1.0", tk.END)
        self.output_text.config(state='disabled')



In [11]:
if __name__ == "__main__":
    MODEL_DIR = 'models/helsenki_finetuned'

    app = TranslatorApp()
    app.mainloop()

GUI Input: 'فوزى'
