In [None]:
# إذا كنت في Google Colab
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# نزّل الداتا
!kaggle datasets download -d samirmoustafa/arabic-to-english-translation-sentences

# فك الضغط
!unzip arabic-to-english-translation-sentences.zip


Dataset URL: https://www.kaggle.com/datasets/samirmoustafa/arabic-to-english-translation-sentences
License(s): copyright-authors
Archive:  arabic-to-english-translation-sentences.zip
  inflating: ara_eng.txt             


In [None]:
!pip install datasets transformers evaluate sacrebleu
!pip install -U transformers evaluate rich


Collecting rich
  Downloading rich-14.0.0-py3-none-any.whl.metadata (18 kB)
Downloading rich-14.0.0-py3-none-any.whl (243 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m243.2/243.2 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rich
  Attempting uninstall: rich
    Found existing installation: rich 13.9.4
    Uninstalling rich-13.9.4:
      Successfully uninstalled rich-13.9.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.1.0 requires rich<14,>=12.4.4, but you have rich 14.0.0 which is incompatible.[0m[31m
[0mSuccessfully installed rich-14.0.0


In [None]:
import evaluate
import numpy as np
import torch
from datasets import Dataset
from transformers import MBart50TokenizerFast, MBartForConditionalGeneration, MBartTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments

# 1. Load the data from a txt file
def load_data_txt(file_path):
    """Read a tab-separated parallel corpus into a `Dataset`.

    Each line of the UTF-8 file is stripped and split on tab characters.
    Lines that do not yield exactly two fields are skipped.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A `Dataset` with two columns: "en" (first field of each kept line)
        and "ar" (second field).
    """
    pairs = []
    with open(file_path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            fields = raw_line.strip().split("\t")
            if len(fields) != 2:
                # Not a clean two-field pair; ignore the line.
                continue
            pairs.append((fields[0], fields[1]))

    first_column = [first for first, _ in pairs]
    second_column = [second for _, second in pairs]
    return Dataset.from_dict({"en": first_column, "ar": second_column})

# Load the sentence pairs extracted from the Kaggle archive.
dataset = load_data_txt("ara_eng.txt")

# 2. Split the data: 80% train / 20% evaluation.
# A fixed seed keeps the split (and therefore the reported metrics)
# reproducible across kernel restarts.
dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = dataset["train"]
eval_dataset = dataset["test"]

# 3. Model and tokenizer setup
model_name = "facebook/mbart-large-50-many-to-many-mmt"

source_lang = "en_XX"
target_lang = "ar_AR"

# mBART-50 checkpoints must be paired with the MBart50 tokenizer: it encodes
# text as "[lang_code] tokens [eos]", whereas MBartTokenizer (original mBART)
# appends the language code after the text.
tokenizer = MBart50TokenizerFast.from_pretrained(
    model_name, src_lang=source_lang, tgt_lang=target_lang
)
model = MBartForConditionalGeneration.from_pretrained(model_name)

# Set the language pair explicitly as well, so the direction is obvious here.
tokenizer.src_lang = source_lang
tokenizer.tgt_lang = target_lang

# The many-to-many checkpoint needs the target language code forced as the
# first generated token; without it, generation during evaluation is free to
# start in any language.
model.generation_config.forced_bos_token_id = tokenizer.convert_tokens_to_ids(target_lang)

# 4. Tokenization
MAX_LENGTH = 256

def preprocess_function(examples):
    """Tokenize English sources and Arabic targets for seq2seq training.

    Works for both a single example (`examples["en"]` is a string) and a
    batch (`examples["en"]` is a list of strings).

    Args:
        examples: Mapping with an "en" entry (source text) and an "ar" entry
            (target text).

    Returns:
        The tokenizer output for the source text, padded/truncated to
        MAX_LENGTH, with a "labels" entry holding the target token ids.
        Padding positions in the labels are replaced by -100 so they do not
        contribute to the training loss.
    """
    # `text_target` tokenizes the targets in target-language mode and stores
    # them under "labels"; it replaces the deprecated as_target_tokenizer().
    model_inputs = tokenizer(
        examples["en"],
        text_target=examples["ar"],
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH,
    )

    pad_id = tokenizer.pad_token_id
    labels = model_inputs["labels"]
    if labels and isinstance(labels[0], list):
        # Batched call: one list of ids per example.
        model_inputs["labels"] = [
            [tok if tok != pad_id else -100 for tok in seq] for seq in labels
        ]
    else:
        # Single example: a flat list of ids.
        model_inputs["labels"] = [tok if tok != pad_id else -100 for tok in labels]
    return model_inputs

# Tokenize both splits. batched=True hands the tokenizer many rows per call
# instead of one; because every row is padded to a fixed length, the result is
# the same as row-by-row mapping, only faster.
train_tokenized = train_dataset.map(preprocess_function, batched=True)
eval_tokenized = eval_dataset.map(preprocess_function, batched=True)

# 5. BLEU metric
bleu_metrics = evaluate.load("sacrebleu")

def compute_metrics(eval_preds):
    """Compute corpus BLEU and exact-match accuracy for generated translations.

    Args:
        eval_preds: A `(predictions, labels)` pair of token-id arrays.
            `predictions` may also be a tuple whose first element holds the
            generated ids.

    Returns:
        A dict with "bleu" (sacreBLEU score) and "exact_match_accuracy"
        (fraction of predictions equal to their reference after stripping
        surrounding whitespace).
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks ignored positions and is not a valid token id; map it back to
    # the pad id before decoding (for predictions as well as labels).
    pad_id = tokenizer.pad_token_id
    preds = np.asarray(preds)
    labels = np.asarray(labels)
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # sacreBLEU expects a list of references per prediction.
    bleu = bleu_metrics.compute(
        predictions=decoded_preds,
        references=[[reference] for reference in decoded_labels],
    )["score"]

    exact_matches = [
        int(pred.strip() == reference.strip())
        for pred, reference in zip(decoded_preds, decoded_labels)
    ]
    # Guard against an empty evaluation set (np.mean of [] is NaN).
    acc = float(np.mean(exact_matches)) if exact_matches else 0.0
    return {"bleu": bleu, "exact_match_accuracy": acc}

# 6. Training configuration
training_args = Seq2SeqTrainingArguments(
    output_dir="./mbart-ar-en",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    learning_rate=3e-5,
    # Effective train batch size per device = 2 * 4 = 8.
    gradient_accumulation_steps=4,

    num_train_epochs=2,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    # metric_for_best_model only has an effect together with
    # load_best_model_at_end; BLEU is a higher-is-better metric.
    load_best_model_at_end=True,
    metric_for_best_model="bleu",
    greater_is_better=True,
    # Run generate() during evaluation so BLEU is computed on real output.
    predict_with_generate=True,
    save_total_limit=2,
    # Mixed precision requires a CUDA device; fall back to fp32 on CPU.
    fp16=torch.cuda.is_available(),
    report_to="none"
)

# 7. Training
# Wire together the model, the tokenized splits, the tokenizer and the metric
# callback defined above.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=train_tokenized,
    eval_dataset=eval_tokenized,
)

# Fine-tune the model, then run one last evaluation pass and report it.
trainer.train()

final_metrics = trainer.evaluate()
print("Final evaluation metrics:", final_metrics)

Epoch,Training Loss,Validation Loss,Bleu,Exact Match Accuracy
1,0.1604,0.152615,39.837895,0.12
2,0.1406,0.149022,43.117302,0.1295


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.encoder.embed_positions.weight', 'model.decoder.embed_tokens.weight', 'model.decoder.embed_positions.weight', 'lm_head.weight'].


Final evaluation metrics: {'eval_loss': 0.14902229607105255, 'eval_bleu': 43.11730207587384, 'eval_exact_match_accuracy': 0.1295, 'eval_runtime': 163.9332, 'eval_samples_per_second': 12.2, 'eval_steps_per_second': 0.384, 'epoch': 2.0}


In [None]:
import torch

sentence = "I am very tired."

# Select the device: GPU when available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the model, then the tokenized inputs, onto that same device.
model = model.to(device)
inputs = {
    name: tensor.to(device)
    for name, tensor in tokenizer(sentence, return_tensors="pt").items()
}

# Force Arabic as the output language.
inputs["forced_bos_token_id"] = tokenizer.lang_code_to_id["ar_AR"]

# Translate and print the decoded text without special tokens.
outputs = model.generate(max_length=128, **inputs)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


In [None]:
import os

from huggingface_hub import login

# Never paste the access token into the notebook: it would be saved with the
# file. Read it from the HF_TOKEN environment variable instead; when that is
# unset (or empty), pass None so login() prompts for the token interactively.
login(token=os.environ.get("HF_TOKEN") or None)


In [None]:
# Name of the target repository on the Hub (change it as you like).
model_name = "mbart-en-ar-translation"

# Upload the model first, then the tokenizer, to the same repository.
for artifact in (model, tokenizer):
    artifact.push_to_hub(model_name)


# **Load the Model from the Hugging Face Hub**

In [None]:
!pip install gradio transformers --quiet

from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
import gradio as gr
import torch

# 1. Load the model from Hugging Face (replace with your own repository)
model_name = "anassaleh218/mbart-en-ar-translation"  # <-- change to your own name

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = MBartForConditionalGeneration.from_pretrained(model_name)
# NOTE(review): loading this repository logs a tokenizer-class mismatch warning
# (the checkpoint declares 'MBartTokenizer') — confirm that MBart50TokenizerFast
# matches the tokenizer used for fine-tuning.
tokenizer = MBart50TokenizerFast.from_pretrained(model_name)

model = model.to(device)

# 2. Translation function
def translate_text(text, direction="English to Arabic"):
    """Translate `text` between English and Arabic.

    Args:
        text: The text to translate.
        direction: "English to Arabic" (default) translates en -> ar; any
            other value translates ar -> en.

    Returns:
        The translated string, or "" when `text` is empty or whitespace-only.
    """
    # Nothing to translate: skip the model call entirely.
    if not text or not text.strip():
        return ""

    if direction == "English to Arabic":
        src_lang, tgt_lang = "en_XX", "ar_AR"
    else:
        src_lang, tgt_lang = "ar_AR", "en_XX"

    # The source language must be set BEFORE tokenizing: the tokenizer adds
    # the source language code to the encoded input.
    tokenizer.src_lang = src_lang
    encoded = tokenizer(text, return_tensors="pt", truncation=True)
    encoded = {key: val.to(device) for key, val in encoded.items()}

    # Force the target language code as the first generated token.
    output_tokens = model.generate(
        **encoded,
        forced_bos_token_id=tokenizer.convert_tokens_to_ids(tgt_lang),
        max_length=100,
    )
    return tokenizer.decode(output_tokens[0], skip_special_tokens=True)

# 3. Gradio interface
# translate_text takes two arguments (text, direction), so the interface must
# expose two inputs; with only the textbox, the callback would be invoked with
# a single argument.
iface = gr.Interface(
    fn=translate_text,
    inputs=[
        gr.Textbox(lines=3, label="Enter Text"),
        gr.Radio(
            ["English to Arabic", "Arabic to English"],
            value="English to Arabic",
            label="Translation Direction",
        ),
    ],
    outputs=gr.Textbox(label="Translated Text"),
    title="mBART Translation",
    description="Translate between English and Arabic using mBART"
)

# debug=True keeps the cell running so errors and logs show up in the notebook.
iface.launch(debug=True)


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.1/54.1 MB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m322.9/322.9 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.2/95.2 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.5/11.5 MB[0m [31m25.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.0/72.0 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.5/62.5 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.37k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/256 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/11.0k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/516 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/993 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'MBartTokenizer'. 
The class this function is called from is 'MBart50Tokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'MBartTokenizer'. 
The class this function is called from is 'MBart50TokenizerFast'.


It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://df6fdc9f84828848b6.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


