In [3]:
pip install evaluate


Note: you may need to restart the kernel to use updated packages.


In [4]:
!pip install wandb -qU

In [1]:
import os
import logging
import wandb
import pandas as pd
from datasets import load_dataset
import evaluate
import re
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    set_seed,    EarlyStoppingCallback
)

In [2]:
# Set your WandB API key directly
os.environ["WANDB_API_KEY"] = "4cf3591f262cd568777e73fcda947286ee03b410"
os.environ["WANDB_MODE"] = "online"
# Initialize WandB
wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mabdelaziz67[0m ([33mabdelaziz67-ain-shams-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [3]:
# 🎯 Start a new WandB run
wandb.init(
    project="egyptian-english-translation-finetuning",
    entity="abdelaziz67-ain-shams-university",  # Your WandB username or organization name
    config={
        "model_checkpoint": "Helsinki-NLP/opus-mt-en-ar",
        # "model_checkpoint": "facebook/mbart-large-50-many-to-many-mmt",
        "learning_rate": 4e-5,  # Lower LR for stable fine-tuning
        "batch_size": 12,  # Increased batch size
        "num_train_epochs": 35,  # Extended training duration
        "seed": 42,
        "label_smoothing": 0.1  # Prevent overconfidence
    }
)

In [4]:
max_length = 164

In [5]:
# 🪵 Initialize logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [6]:
# Set a seed for reproducibility
set_seed(wandb.config.seed)


In [7]:
logger.info("Loading dataset from Hugging Face Hub...")
dataset = load_dataset("HeshamHaroon/ArzEn-MultiGenre")
logger.info("Dataset loaded: %s", dataset)


INFO:__main__:Loading dataset from Hugging Face Hub...
INFO:__main__:Dataset loaded: DatasetDict({
    train: Dataset({
        features: ['EGY', 'ENG'],
        num_rows: 26047
    })
})


In [8]:
# Check the cleaned dataset
print(dataset)
import pandas as pd
dt=pd.DataFrame(dataset)
dt.head(20)

DatasetDict({
    train: Dataset({
        features: ['EGY', 'ENG'],
        num_rows: 26047
    })
})


Unnamed: 0,train
0,"{'EGY': '‫لحق؟‬', 'ENG': 'Already?'}"
1,"{'EGY': '‫معلش يا جماعة أخرتكم.‬', 'ENG': 'Sor..."
2,"{'EGY': '‫لا، ولا يهمك.‬', 'ENG': 'No problem.'}"
3,"{'EGY': '‫بس الsystem down.‬', 'ENG': 'The sys..."
4,{'EGY': '‫طيب. خلاص إحنا كدا قفلنا الjoint acc...
5,"{'EGY': '‫كل ده والsystem down؟‬', 'ENG': 'All..."
6,{'EGY': '‫لا إنتي بتصنعي المعجزات يا سها طول ع...
7,"{'EGY': '‫لا خالص. إحنا بس اتعودنا.‬', 'ENG': ..."
8,{'EGY': '‫من كتر حالات الطلاق ومشاكل النفقة.‬ ...
9,{'EGY': '‫أنا بس عشان عارف إن علا مش بتشتغل‬ ‫...


In [9]:
def clean_text(text):
    text = re.sub(r"[^\w\s.,!?؛،]", "", text)  # Remove special characters
    text = re.sub(r"\s+", " ", text).strip()  # Normalize spaces
    return text

In [10]:
# Function to check if a value is None or only numbers
def is_invalid(value):
    return value is None or (isinstance(value, str) and value.strip().isdigit())

# Function to filter dataset
def filter_rows(example):
    return not any(is_invalid(v) for v in example.values())

# Apply filter to all splits
dataset_cleaned = dataset.filter(filter_rows)

# Check the cleaned dataset
print(dataset_cleaned)

DatasetDict({
    train: Dataset({
        features: ['EGY', 'ENG'],
        num_rows: 22466
    })
})


In [11]:
df=pd.DataFrame(dataset_cleaned)
df.head(20)


Unnamed: 0,train
0,"{'EGY': '‫لحق؟‬', 'ENG': 'Already?'}"
1,"{'EGY': '‫معلش يا جماعة أخرتكم.‬', 'ENG': 'Sor..."
2,"{'EGY': '‫لا، ولا يهمك.‬', 'ENG': 'No problem.'}"
3,"{'EGY': '‫بس الsystem down.‬', 'ENG': 'The sys..."
4,{'EGY': '‫طيب. خلاص إحنا كدا قفلنا الjoint acc...
5,"{'EGY': '‫كل ده والsystem down؟‬', 'ENG': 'All..."
6,{'EGY': '‫لا إنتي بتصنعي المعجزات يا سها طول ع...
7,"{'EGY': '‫لا خالص. إحنا بس اتعودنا.‬', 'ENG': ..."
8,{'EGY': '‫من كتر حالات الطلاق ومشاكل النفقة.‬ ...
9,{'EGY': '‫أنا بس عشان عارف إن علا مش بتشتغل‬ ‫...


In [12]:
# Split the dataset into training and validation sets if not already split.
if "train" not in dataset_cleaned.keys() or "validation" not in dataset_cleaned.keys():
    dataset_cleaned = dataset_cleaned["train"].train_test_split(test_size=0.1, seed=42)
    train_dataset = dataset_cleaned["train"]
    val_dataset = dataset_cleaned["test"]
else:
    train_dataset = dataset_cleaned["train"]
    val_dataset = dataset_cleaned["validation"]

In [13]:
# Show a few examples (e.g., first 5)
for idx, example in enumerate(train_dataset.select(range(100,110))):
    print(f"Example {idx+1}:")
    print("Source:", example.get("EGY", "Field 'tgt' not found"))
    print("Target:", example.get("ENG", "Field 'src' not found"))
    print("-" * 50)

Example 1:
Source: بصلي بطريقة غريبة، زي ما أكون خليته يحس بشوية إشمئزاز. وقالي بطريقة أقرب للعنف إن في كل الأحوال شهادة مدير وموظفين الدار هتتسمع وإن "ده ممكن يقلب عليا التربيزة بطريقة وحشة". 
Target: He gave me a queer look, as if I slightly revolted him; then informed me, in an almost hostile tone, that in any case the head of the Home and some of the staff would be cited as witnesses. And that might do you a very nasty turn, he concluded. 
--------------------------------------------------
Example 2:
Source: مكنش مفروض أقول اللي قولته ده خالص.
Target: I shouldn't have said those things.
--------------------------------------------------
Example 3:
Source: اطلعيلي فوق عشان أرشق الشوكه فيكي. 
Target: Come up easy and let me put the harpoon into you.
--------------------------------------------------
Example 4:
Source: رفعت
Target: Refaat!
--------------------------------------------------
Example 5:
Source: المركب كانت خفيفه دلوقتي وماكانش عنده أي أفكار ولا مشاعر من أي نوع. 
Target: 

In [18]:
logger.info("Dataset loaded successfully!")
logger.info(f"Training samples: {len(train_dataset)}")
logger.info(f"Validation samples: {len(val_dataset)}")

INFO:__main__:Dataset loaded successfully!
INFO:__main__:Training samples: 20219
INFO:__main__:Validation samples: 2247


In [15]:
# Choose a pre-trained translation model
model_checkpoint = wandb.config.model_checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)



In [16]:
model.config.dropout = 0.2
model.config.attention_dropout = 0.2
model.config.activation_dropout = 0.2


In [17]:
# Define source and target languages (adjust codes as needed)
source_lang = "ar_EG" # for English (if translating from English to Egyptian Arabic)
target_lang = "en_XX"# Egyptian Arabic variant (if supported)
# Note: If your target dialect is not explicitly supported, you can still fine-tune the model
# with your dialect-specific data.

In [None]:
def preprocess_function(examples):
    # Ensure inputs are strings
    inputs = [clean_text(str(text)) for text in examples["EGY"]]
    targets = [clean_text(str(text)) for text in examples["ENG"]]
    
    # Tokenize the input texts
    model_inputs = tokenizer(inputs, max_length=max_length, truncation=True)
    
    # Tokenize the target texts using the new text_target argument
    labels = tokenizer(text_target=targets, max_length=max_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Before tokenizing, set the source and target language codes on the tokenizer.
# For mBART50, for example:
tokenizer.src_lang = "ar_EG"  # assuming source is English
tokenizer.tgt_lang = "en_XX"  # assuming target is Arabic; change if you have a dialect-specific code


In [19]:

tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_val = val_dataset.map(preprocess_function, batched=True)


In [20]:

# Data collator for dynamic padding
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)


In [21]:
# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    eval_strategy="steps",
    eval_steps=500,
    logging_strategy="steps",
    logging_steps=500,
    save_steps=500,
    save_total_limit=3,  # Keep only the last 2 checkpoints,
    save_only_model=True,   # Save only the model
    per_device_train_batch_size=wandb.config.batch_size,
    per_device_eval_batch_size=wandb.config.batch_size,
    gradient_accumulation_steps=4,
    num_train_epochs=wandb.config.num_train_epochs,
    learning_rate=wandb.config.learning_rate,
    lr_scheduler_type="linear",  # Learning rate decay,
    weight_decay=0.03,
    predict_with_generate=True,
    generation_max_length=256,  # Limit generation length
    load_best_model_at_end=True,
    metric_for_best_model="bleu",
    greater_is_better=True,
    label_smoothing_factor=wandb.config.label_smoothing,
    report_to=["wandb"],
    run_name="optimized-egyptian-arabic-translation",
    fp16=True  # Enable mixed precision training
)

In [26]:
pip install sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.1.1-py3-none-any.whl.metadata (8.6 kB)
Collecting tabulate>=0.8.9 (from sacrebleu)
  Using cached tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
Using cached tabulate-0.9.0-py3-none-any.whl (35 kB)
Downloading portalocker-3.1.1-py3-none-any.whl (19 kB)
Installing collected packages: tabulate, portalocker, sacrebleu
Successfully installed portalocker-3.1.1 sacrebleu-2.5.1 tabulate-0.9.0
Note: you may need to restart the kernel to use updated packages.


In [22]:
# Load the sacreBLEU metric using the evaluate library.
bleu_metric = evaluate.load("sacrebleu")

In [23]:
def compute_metrics(eval_preds):
    preds, labels = eval_preds

    # Safely convert predictions to integers
    preds = preds.astype(object)
    labels = labels.astype(object)

    # Decode the generated texts and labels directly using the tokenizer
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace all -100 in labels with the padding token ID
    labels = [[(int(token) if token >= 0 else tokenizer.pad_token_id) for token in label] for label in labels]
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Prepare references in the required format
    decoded_labels = [[label] for label in decoded_labels]

    # Compute BLEU score using sacrebleu
    result = bleu_metric.compute(predictions=decoded_preds, references=decoded_labels)

    # Log BLEU score to wandb
    wandb.log({"BLEU score": result["score"]})

    return {"bleu": result["score"]}


In [29]:
#model.gradient_checkpointing_enable()


In [24]:
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [31]:
# import gc

# def clear_memory():
#     gc.collect()
#     torch.cuda.empty_cache()

# # Call this after each evaluation step or at the end of training
# clear_memory()

In [25]:

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,  # Already handles tokenization and padding
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)] 
)


In [27]:
tokenized_val

Dataset({
    features: ['EGY', 'ENG', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 2247
})

In [None]:
# Start fine-tuning
logger.info("Starting fine-tuning...")
trainer.train()


INFO:__main__:Starting fine-tuning...


Step,Training Loss,Validation Loss


In [None]:
# Evaluate the model
logger.info("Evaluating the model on the validation set...")
evaluation_results = trainer.evaluate()
logger.info("Evaluation results: %s", evaluation_results)

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

In [None]:
# Save the final model and tokenizer
model.save_pretrained("./final_model")
tokenizer.save_pretrained("./final_model")
logger.info("Model and tokenizer saved.")


In [None]:
# 🚀 Log the model as a WandB artifact
artifact = wandb.Artifact("translation_model", type="model")
artifact.add_dir("./final_model")
wandb.log_artifact(artifact)
logger.info("Model saved and logged to WandB!")

[34m[1mwandb[0m: Adding directory to artifact (./final_model)... Done. 1.1s


In [None]:

import torch


def generate_translation(text,model,tokenizer):
    # Move the model to the correct device (GPU if available)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Tokenize the input text and move to the same device as the model
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(device)

    # Generate the output using the model
    outputs = model.generate(**inputs)

    # Decode the generated text
    translation = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
    return translation
    # Tokenize the input text
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    # Generate the output using the model
    outputs = model.generate(**inputs)
    # Decode the generated text
    translation = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
    return translation

# Example predictions
examples = [
    "Hello, how are you?",
    "The weather today is beautiful.",
    "Artificial Intelligence is changing the world.",
]

print("Example Translations:")
for text in examples:
    translation = generate_translation(text,model,tokenizer)
    print(f"English: {text}")
    print(f"Arabic: {translation}")
    print("-" * 50)




Example Translations:
English: Hello, how are you?
Arabic: ألو، إيه الأخبار
--------------------------------------------------
English: The weather today is beautiful.
Arabic: الجو النهارده جميل
--------------------------------------------------
English: Artificial Intelligence is changing the world.
Arabic: لا يعني إن العالم كله بيعدي من customicial cream
--------------------------------------------------


In [None]:
examples = [
    # Casual Conversation
    "Good morning! How was your weekend?",
    "I'm feeling great today, thank you!",
    "Can you help me with this problem?",
    
    # Technical Text
    "Machine learning algorithms can significantly improve predictive accuracy.",
    "The software update includes several bug fixes and performance improvements.",
    
    # News and Current Events
    "The president addressed the nation last night regarding the economic crisis.",
    "Scientists have discovered a new species of fish in the Amazon River.",
    
    # Quotes and Sayings
    "The early bird catches the worm.",
    "Actions speak louder than words.",
    "Knowledge is power.",
    
    # Educational Content
    "The water cycle involves evaporation, condensation, and precipitation.",
    "Photosynthesis is the process by which plants convert sunlight into energy.",
    
    # Scientific Statements
    "Quantum physics explores the behavior of particles at the smallest scales.",
    "Genetic engineering allows scientists to modify the DNA of living organisms.",
    
    # Cultural References
    "Egypt is known for its ancient pyramids and the Nile River.",
    "The traditional dance at Egyptian weddings is vibrant and joyful.",
]

print("Example Translations:")
for text in examples:
    translation = generate_translation(text,model,tokenizer)
    print(f"English: {text}")
    print(f"Arabic: {translation}")
    print("-" * 50)

Example Translations:
English: Good morning! How was your weekend?
Arabic: صباح الخير، إيه أخبار الويك اند
--------------------------------------------------
English: I'm feeling great today, thank you!
Arabic: أنا حاسس إني تمام النهاردة، شكرا.
--------------------------------------------------
English: Can you help me with this problem?
Arabic: هو أنت ممكن تساعدني في المشكلة دي
--------------------------------------------------
English: Machine learning algorithms can significantly improve predictive accuracy.
Arabic: على فكرة إن التعلب هيعمل coolnessive يحسن.
--------------------------------------------------
English: The software update includes several bug fixes and performance improvements.
Arabic: في clientie clientes، collections و climetings.
--------------------------------------------------
English: The president addressed the nation last night regarding the economic crisis.
Arabic: جو الصباحية، الوردة خاطبوا جوه جوة جوه الأزمة.
-----------------------------------------------

In [None]:
# Example predictions
examples = [
    # Casual Greetings
    "Hey! What's up?",
    "Good morning! Did you sleep well?",
    "I'm so happy to see you!",

    # Daily Life Conversations
    "Can we go to the mall later?",
    "I forgot my phone at home!",
    "Let's grab a coffee together.",

    # Family & Friends
    "Mom made my favorite food today!",
    "My little brother keeps annoying me!",
    "Are you coming to the party tonight?",

    # Food & Ordering
    "I want a burger with extra cheese.",
    "Do you like spicy food?",
    "This pizza is amazing!",

    # Travel & Directions
    "Where is the nearest metro station?",
    "How much does a taxi to downtown cost?",
    "I need a ticket to Cairo, please.",

    # Social Media & Tech
    "I just posted a new picture on Instagram!",
    "Can you send me that video?",
    "My phone battery is almost dead!",

    # Emotions & Feelings
    "I'm really tired today.",
    "That movie made me cry!",
    "I can't stop laughing at this joke!",

    # Shopping & Money
    "How much is this dress?",
    "Do you accept credit cards?",
    "I got a great discount on my new shoes!",

    # Weather & Plans
    "It's so hot today!",
    "Let's go to the beach this weekend.",
    "It's raining, so I'll stay home.",
]


print("\nExample Translations:")
for text in examples:
    translation = generate_translation(text,model,tokenizer)
    print(f"English: {text}")
    print(f"Arabic: {translation}")
    print("-" * 50)


Example Translations:
English: Hey! What's up?
Arabic: إيه يا علا
--------------------------------------------------
English: Good morning! Did you sleep well?
Arabic: صباح الخير، نمتي كويس
--------------------------------------------------
English: I'm so happy to see you!
Arabic: أنا عايزة أشوفك بجد.
--------------------------------------------------
English: Can we go to the mall later?
Arabic: ممكن بعدين نروح جو الصباحية
--------------------------------------------------
English: I forgot my phone at home!
Arabic: أنا نسيت تليفوني في البيت.
--------------------------------------------------
English: Let's grab a coffee together.
Arabic: يالا بينا ناخد قهوه مع بعض
--------------------------------------------------
English: Mom made my favorite food today!
Arabic: النهاردة ماما عملا الأكل اللي بحبه
--------------------------------------------------
English: My little brother keeps annoying me!
Arabic: أخويا الصغير كل يوم يزعجني.
--------------------------------------------------
Eng

In [None]:
# Finish the wandb run
wandb.finish()

# Sync offline runs with wandb using os.system()
os.system("wandb sync ./wandb/offline-run-*")

0,1
BLEU score,▁▂▂▃▃▄▄▄▅▅▅▆▆▆▇▇▇▇▇▇▇████████████████
eval/bleu,▁▂▂▃▃▄▄▄▅▅▅▆▆▆▇▇▇▇▇▇▇████████████████
eval/loss,█▄▃▂▂▁▁▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃
eval/runtime,▂▄█▁▃▂▁▁▃▃▂▃▃▃▂▃▃▃▃▂▃▃▃▃▃▃▃▃▂▄▂▃▃▃▂▃▃
eval/samples_per_second,▇▅▁█▅▇██▆▆▇▆▆▆▆▆▆▆▆▇▆▆▆▆▆▅▆▆▆▅▇▅▆▆▇▆▆
eval/steps_per_second,▇▅▁█▅▇██▆▆▇▅▆▆▆▆▆▆▆▇▆▆▆▆▆▅▆▅▆▅▇▅▆▆▇▆▆
train/epoch,▁▁▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇████
train/global_step,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇█████
train/grad_norm,█▇██▇▇▆▆▆▆▆▅▆▅▅▄▄▃▅▃▃▄▄▂▃▃▂▂▃▁▂▁▂▂▁▁
train/learning_rate,███▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁

0,1
BLEU score,21.30826
eval/bleu,21.30826
eval/loss,3.69418
eval/runtime,106.7895
eval/samples_per_second,24.394
eval/steps_per_second,0.768
total_flos,2.362819849224192e+16
train/epoch,99.45839
train/global_step,18300.0
train/grad_norm,353346.84375


512

In [None]:
# # Initialize WandB run
# wandb.init(project="egyptian-arabic-translation-finetuning", entity="abdelaziz67-ain-shams-university")

# # Download the artifact
# artifact = wandb.use_artifact("abdelaziz67-ain-shams-university/egyptian-arabic-translation-finetuning/translation_model:latest", type="model")
# artifact_dir = artifact.download()

# # Load the model and tokenizer
# model_l = AutoModelForSeq2SeqLM.from_pretrained(artifact_dir)
# tokenizer_l = AutoTokenizer.from_pretrained(artifact_dir)

# print("Model and tokenizer loaded from WandB!")

[34m[1mwandb[0m: Downloading large artifact translation_model:latest, 295.08MB. 8 files... 
[34m[1mwandb[0m:   8 of 8 files downloaded.  
Done. 0:0:0.4


Model and tokenizer loaded from WandB!


