## Preprocessing 

In [1]:
import os
import warnings
from datasets import load_dataset
import pandas as pd
import re 
import numpy as np
import matplotlib.pyplot as plt
from transformers import AutoTokenizer
from datasets import Dataset, DatasetDict
import nltk
from nltk.corpus import stopwords
from nltk.stem.isri import ISRIStemmer
import emoji
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from datetime import datetime
from tensorflow.keras.layers import Input, Embedding, LSTM, Bidirectional, Dense, Attention
from tensorflow.keras.models import Model


None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


ModuleNotFoundError: No module named 'gensim'

In [None]:

# 1. Initial Setup with Detailed Logging
def log_step(message):
    """Announce a pipeline step on standard output.

    Prints an empty line, then ``[YYYY-MM-DD HH:MM:SS] message`` stamped with
    the current local time, then a rule made of 60 dashes.
    """
    now = datetime.now()
    timestamp = now.strftime("%Y-%m-%d %H:%M:%S")
    # A single print with a newline separator emits the same two lines
    # as two consecutive print calls.
    print(f"\n[{timestamp}] {message}", "-" * 60, sep="\n")


# Suppress the Hugging Face Hub symlink warning and all Python warnings so
# the notebook output stays limited to the pipeline's own log lines.
os.environ['HF_HUB_DISABLE_SYMLINKS_WARNING'] = '1'
warnings.filterwarnings('ignore')

log_step("Starting Arabic-English Dataset Preprocessing Pipeline")

# 2. NLTK Resources Setup with Progress
log_step("Setting up NLTK resources")
try:
    # nltk.download() reports a failed download through its boolean return
    # value instead of raising, so the result has to be checked explicitly;
    # otherwise the success message is printed even when nothing was fetched.
    for resource in ('punkt', 'stopwords'):
        if not nltk.download(resource, quiet=True):
            raise RuntimeError(f"download of NLTK resource '{resource}' failed")
    print("✓ Successfully downloaded NLTK resources")
except Exception as e:
    # Later cells cannot run without these resources, so stop here.
    print(f"✗ Error downloading NLTK resources: {e}")
    exit()


In [None]:

# 3. Load Stopwords with Verification
# Builds the two module-level stopword sets that clean_text() reads when it
# filters tokens. Any failure is reported and the run is aborted.
log_step("Loading stopwords")
try:
    # Stored as sets because clean_text() tests every token for membership.
    arabic_stopwords = set(stopwords.words('arabic'))
    english_stopwords = set(stopwords.words('english'))
    print(f"✓ Loaded {len(arabic_stopwords)} Arabic stopwords")
    print(f"✓ Loaded {len(english_stopwords)} English stopwords")
except Exception as e:
    print(f"✗ Error loading stopwords: {e}")
    exit()

# 4. Dataset Loading with Detailed Stats
# Loads the Arabic-English Tatoeba pairs and, when the rows carry a nested
# 'translation' field, adds flat 'ar' and 'en' columns next to it.
log_step("Loading Tatoeba dataset")
try:
    start_time = datetime.now()
    # NOTE(review): trust_remote_code=True presumably lets the dataset's own
    # loading script run locally — confirm the source is trusted.
    dataset = load_dataset("tatoeba", lang1="ar", lang2="en", trust_remote_code=True)

    if 'translation' in dataset['train'].features:
        # Copy the two languages out of the nested dict into top-level columns.
        dataset = dataset.map(lambda x: {'ar': x['translation']['ar'], 'en': x['translation']['en']})

    load_time = (datetime.now() - start_time).total_seconds()
    print(f"✓ Successfully loaded dataset in {load_time:.2f} seconds")
    print(f"• Total samples: {len(dataset['train']):,}")
    # Only the first 50 characters of the first pair are shown as a sanity check.
    print(f"• First Arabic sample: {dataset['train'][0]['ar'][:50]}...")
    print(f"• First English sample: {dataset['train'][0]['en'][:50]}...")
except Exception as e:
    print(f"✗ Error loading dataset: {e}")
    exit()



In [None]:

# 5. Enhanced Text Cleaning
# Resources shared by every clean_text() call; built lazily on first use so
# the stemmer and the normalised stopword set are not rebuilt per sentence.
_ARABIC_STEMMER = None
_NORMALIZED_ARABIC_STOPWORDS = None


def _normalize_arabic(text):
    """Map alef variants to bare alef and alef maqsura to ya."""
    text = re.sub(r'[إأٱآا]', 'ا', text)
    return re.sub(r'ى', 'ي', text)


def _arabic_resources():
    """Return the shared ISRI stemmer and the normalised Arabic stopword set."""
    global _ARABIC_STEMMER, _NORMALIZED_ARABIC_STOPWORDS
    if _ARABIC_STEMMER is None:
        _ARABIC_STEMMER = ISRIStemmer()
    if _NORMALIZED_ARABIC_STOPWORDS is None:
        # The text is normalised before stopwords are removed, so the
        # stopwords must go through the same normalisation; otherwise any
        # entry written with a hamza-carrying alef or alef maqsura can never
        # equal a token of the cleaned text.
        _NORMALIZED_ARABIC_STOPWORDS = arabic_stopwords | {
            _normalize_arabic(word) for word in arabic_stopwords
        }
    return _ARABIC_STEMMER, _NORMALIZED_ARABIC_STOPWORDS


def clean_text(text, lang, show_progress=False):
    """Normalise one sentence and strip stopwords from it.

    Args:
        text: Sentence to clean. Non-string values are converted with ``str``.
        lang: ``'ar'`` selects the Arabic path (letter normalisation, Arabic
            stopwords, ISRI stemming); any other value selects the English
            path (lower-casing, English stopwords).
        show_progress: When true, print the first 30 characters of the input
            before processing.

    Returns:
        The cleaned sentence, or ``None`` when fewer than two characters
        remain after cleaning.
    """
    # Coerce first so the progress preview below can slice safely.
    if not isinstance(text, str):
        text = str(text)

    if show_progress:
        print(f"Processing {lang} text: {text[:30]}...")

    if lang == 'ar':
        text = _normalize_arabic(text)
        text = re.sub(r'[^\w\s\u0600-\u06FF]', ' ', text)
        stemmer, stopwords_set = _arabic_resources()
    else:
        text = text.lower()
        text = re.sub(r'[^\w\s]', ' ', text)
        stemmer, stopwords_set = None, english_stopwords

    text = re.sub(r'\s+', ' ', text).strip()

    words = [word for word in text.split() if word not in stopwords_set]
    # Reduce Arabic words to their root so inflected forms share one token.
    if stemmer is not None:
        words = [stemmer.stem(word) for word in words]

    text = ' '.join(words)
    text = emoji.replace_emoji(text, replace='')

    return text if len(text) > 1 else None


# Apply clean_text() to both languages of every pair, then drop pairs in
# which either side was reduced to None by the cleaning.
log_step("Cleaning dataset")
try:
    start_time = datetime.now()
    cleaned_dataset = dataset.map(lambda x: {
        'ar': clean_text(x['ar'], 'ar'),
        'en': clean_text(x['en'], 'en')
    }).filter(lambda x: x['ar'] is not None and x['en'] is not None)

    clean_time = (datetime.now() - start_time).total_seconds()
    original_count = len(dataset['train'])
    cleaned_count = len(cleaned_dataset['train'])
    removed_count = original_count - cleaned_count

    print(f"✓ Cleaning completed in {clean_time:.2f} seconds")
    print(f"• Original samples: {original_count:,}")
    # NOTE(review): the ratio divides by original_count; an empty dataset
    # would raise here and be reported by the except branch below.
    print(f"• Removed samples: {removed_count:,} ({removed_count / original_count:.2%})")
    print(f"• Remaining samples: {cleaned_count:,}")
    print("\nSample after cleaning:")
    print(f"Arabic: {cleaned_dataset['train'][0]['ar']}")
    print(f"English: {cleaned_dataset['train'][0]['en']}")
except Exception as e:
    print(f"✗ Error during cleaning: {e}")
    exit()


In [None]:

# 6. Length Filtering
# Keeps only pairs whose cleaned Arabic and English sides both contain
# between 3 and 50 whitespace-separated words, then prints length statistics.
log_step("Filtering by sentence length")
try:
    start_time = datetime.now()


    def filter_by_length(example):
        """Return True when both sides of the pair have 3 to 50 words."""
        ar_len = len(example['ar'].split())
        en_len = len(example['en'].split())
        return 3 <= ar_len <= 50 and 3 <= en_len <= 50


    filtered_dataset = cleaned_dataset.filter(filter_by_length)
    filter_time = (datetime.now() - start_time).total_seconds()

    before_count = len(cleaned_dataset['train'])
    after_count = len(filtered_dataset['train'])
    removed = before_count - after_count

    print(f"✓ Filtering completed in {filter_time:.2f} seconds")
    print(f"• Samples before filtering: {before_count:,}")
    print(f"• Samples removed: {removed:,} ({removed / before_count:.2%})")
    print(f"• Samples remaining: {after_count:,}")

    # Word counts of the surviving pairs, used only for the summary below.
    ar_lengths = [len(x['ar'].split()) for x in filtered_dataset['train']]
    en_lengths = [len(x['en'].split()) for x in filtered_dataset['train']]

    print("\nLength distribution after filtering:")
    # NOTE(review): min()/max() raise on an empty list, so a filter that
    # removes every pair ends up in the except branch below.
    print(f"• Arabic - Avg: {np.mean(ar_lengths):.1f}, Min: {min(ar_lengths)}, Max: {max(ar_lengths)}")
    print(f"• English - Avg: {np.mean(en_lengths):.1f}, Min: {min(en_lengths)}, Max: {max(en_lengths)}")
except Exception as e:
    print(f"✗ Error during filtering: {e}")
    exit()


In [None]:

# 7. Dataset Splitting
# Splits the filtered pairs 70/15/15 into train/validation/test with a fixed
# seed and wraps the three frames in a DatasetDict named final_dataset.
log_step("Splitting dataset")
try:
    start_time = datetime.now()
    df = pd.DataFrame(filtered_dataset['train'])

    # 30% is held out first, then halved into validation and test.
    train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42)
    val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

    # train_test_split shuffles the rows, leaving a non-contiguous pandas
    # index. preserve_index=False keeps that index from being stored as an
    # extra '__index_level_0__' column in every saved split.
    final_dataset = DatasetDict({
        'train': Dataset.from_pandas(train_df, preserve_index=False),
        'validation': Dataset.from_pandas(val_df, preserve_index=False),
        'test': Dataset.from_pandas(test_df, preserve_index=False)
    })

    split_time = (datetime.now() - start_time).total_seconds()
    print(f"✓ Splitting completed in {split_time:.2f} seconds")
    print(f"• Training set: {len(final_dataset['train']):,} samples")
    print(f"• Validation set: {len(final_dataset['validation']):,} samples")
    print(f"• Test set: {len(final_dataset['test']):,} samples")
except Exception as e:
    print(f"✗ Error during splitting: {e}")
    exit()


In [None]:

# 8. Tokenization
# Tokenizes every split with the multilingual BERT tokenizer. The module-level
# name `tokenizer` bound here is read again by the training cells.
log_step("Tokenizing data")
try:
    start_time = datetime.now()
    tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")


    def tokenize_function(examples):
        """Tokenize a batch: Arabic as input, English as ``text_target``.

        Every sequence is padded or truncated to 64 ids and returned as
        NumPy data.
        """
        # The training cells read the tokenized targets back from a 'labels'
        # column — presumably produced by text_target; verify against the
        # tokenizer documentation.
        return tokenizer(
            examples['ar'],
            text_target=examples['en'],
            padding='max_length',
            truncation=True,
            max_length=64,
            return_tensors='np'
        )


    # The raw text columns are dropped from the tokenized copy; final_dataset
    # itself still holds them.
    tokenized_dataset = final_dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=['ar', 'en']
    )

    tokenize_time = (datetime.now() - start_time).total_seconds()
    print(f"\n✓ Tokenization completed in {tokenize_time:.2f} seconds")
except Exception as e:
    print(f"✗ Error during tokenization: {e}")
    exit()

# 9. Saving Data
# Persists both the text-level splits and their tokenized counterpart under
# processed_data/ so the training and evaluation cells can reload them.
log_step("Saving processed data")
try:
    start_time = datetime.now()
    # exist_ok=True makes re-running this cell safe when the folder exists.
    os.makedirs("processed_data", exist_ok=True)

    final_dataset.save_to_disk("processed_data/final_dataset")
    tokenized_dataset.save_to_disk("processed_data/tokenized_dataset")

    save_time = (datetime.now() - start_time).total_seconds()
    print(f"✓ Data saved in {save_time:.2f} seconds")
except Exception as e:
    print(f"✗ Error during saving: {e}")
    exit()


## Training 

In [None]:

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, Bidirectional, Attention
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from datasets import load_from_disk

log_step("Starting Training Pipeline")

# 1. Load Tokenized Data
# Reloads the tokenized splits written by the preprocessing section; on
# failure the error is printed and the run is aborted.
try:
    tokenized_dataset = load_from_disk("processed_data/tokenized_dataset")
    print("✓ Tokenized data loaded successfully")
except Exception as e:
    print(f"✗ Error loading tokenized data: {e}")
    exit()


# 2. Prepare Data for Training
def prepare_data(dataset, max_length=64, pad_token_id=None):
    """Build fixed-width input and label id matrices from tokenized examples.

    Args:
        dataset: Iterable of mappings, each providing ``'input_ids'`` and
            ``'labels'`` sequences of token ids.
        max_length: Width of every output row. Longer sequences are cut to
            this length and shorter ones are right-padded.
        pad_token_id: Id used for padding. When ``None`` (the default) the
            module-level ``tokenizer.pad_token_id`` is used, as before; pass
            it explicitly to avoid depending on that global.

    Returns:
        Tuple ``(input_ids, labels)`` of NumPy arrays, each with shape
        ``(number_of_examples, max_length)`` — also for an empty dataset.
    """
    if pad_token_id is None:
        pad_token_id = tokenizer.pad_token_id

    def _fit(sequence):
        # Cut to max_length, then right-pad up to exactly max_length.
        clipped = np.asarray(sequence[:max_length])
        return np.pad(clipped, (0, max_length - len(clipped)),
                      'constant', constant_values=pad_token_id)

    input_rows = []
    label_rows = []
    for example in dataset:
        input_rows.append(_fit(example['input_ids']))
        label_rows.append(_fit(example['labels']))

    # The reshape is a no-op for non-empty data and turns the empty case
    # from shape (0,) into (0, max_length).
    input_ids = np.array(input_rows).reshape(len(input_rows), max_length)
    labels = np.array(label_rows).reshape(len(label_rows), max_length)
    return input_ids, labels


# Converts the train and validation splits into NumPy matrices and derives
# the decoder inputs from the labels.
log_step("Preparing training data")
try:
    train_input_ids, train_labels = prepare_data(tokenized_dataset['train'])
    val_input_ids, val_labels = prepare_data(tokenized_dataset['validation'])

    # Decoder input = labels shifted one step to the right. np.roll wraps the
    # last column around to the front, so column 0 is overwritten with the
    # padding id.
    train_decoder_input = np.roll(train_labels, 1, axis=1)
    train_decoder_input[:, 0] = tokenizer.pad_token_id

    val_decoder_input = np.roll(val_labels, 1, axis=1)
    val_decoder_input[:, 0] = tokenizer.pad_token_id

    print("✓ Data prepared successfully")
    print(f"• Training data shape: {train_input_ids.shape}")
    print(f"• Validation data shape: {val_input_ids.shape}")
except Exception as e:
    print(f"✗ Error preparing data: {e}")
    exit()


In [None]:

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, Bidirectional, Attention
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from datasets import load_from_disk

log_step("Starting Training Pipeline")

# Load Tokenized Data
tokenized_dataset = load_from_disk("processed_data/tokenized_dataset")

# Use a small subset for quick experiments. The requested sizes are capped at
# the size of each split, because select() raises when asked for row indices
# that do not exist.
train_split = tokenized_dataset['train']
val_split = tokenized_dataset['validation']
sample_train = train_split.select(range(min(5000, len(train_split))))
sample_val = val_split.select(range(min(1000, len(val_split))))

# Data preparation
def prepare_data(dataset, max_length=32, pad_token_id=None):
    """Build fixed-width input and label id matrices from tokenized examples.

    Args:
        dataset: Iterable of mappings, each providing ``'input_ids'`` and
            ``'labels'`` sequences of token ids.
        max_length: Width of every output row. Longer sequences are cut to
            this length and shorter ones are right-padded.
        pad_token_id: Id used for padding. When ``None`` (the default) the
            module-level ``tokenizer.pad_token_id`` is used, as before; pass
            it explicitly to avoid depending on that global.

    Returns:
        Tuple ``(input_ids, labels)`` of NumPy arrays, each with shape
        ``(number_of_examples, max_length)`` — also for an empty dataset.
    """
    if pad_token_id is None:
        pad_token_id = tokenizer.pad_token_id

    def _fit(sequence):
        # Cut to max_length, then right-pad up to exactly max_length.
        clipped = np.asarray(sequence[:max_length])
        return np.pad(clipped, (0, max_length - len(clipped)), 'constant', constant_values=pad_token_id)

    input_rows, label_rows = [], []
    for example in dataset:
        input_rows.append(_fit(example['input_ids']))
        label_rows.append(_fit(example['labels']))

    # The reshape is a no-op for non-empty data and turns the empty case
    # from shape (0,) into (0, max_length).
    input_ids = np.array(input_rows).reshape(len(input_rows), max_length)
    labels = np.array(label_rows).reshape(len(label_rows), max_length)
    return input_ids, labels

# Turn the sampled subsets into NumPy matrices (rows cut/padded to 32 ids).
train_input_ids, train_labels = prepare_data(sample_train)
val_input_ids, val_labels = prepare_data(sample_val)

# Decoder input = labels shifted one step to the right; np.roll wraps the
# last column to the front, so column 0 is reset to the padding id.
train_decoder_input = np.roll(train_labels, 1, axis=1)
train_decoder_input[:, 0] = tokenizer.pad_token_id

val_decoder_input = np.roll(val_labels, 1, axis=1)
val_decoder_input[:, 0] = tokenizer.pad_token_id


In [None]:

# Build the model with lightweight settings
def build_translation_model(vocab_size, max_length=32):
    """Build and compile the encoder-decoder translation model.

    The encoder is a bidirectional LSTM over embedded source ids. The decoder
    is a unidirectional LSTM over embedded target-side input ids, projected
    to the encoder's width. Decoder states then attend over the encoder
    outputs, and a softmax layer predicts one token per decoder position.

    Args:
        vocab_size: Size of the shared token vocabulary (embedding rows and
            softmax width).
        max_length: Fixed length of both input sequences.

    Returns:
        A compiled Keras ``Model`` taking ``[encoder_ids, decoder_ids]`` and
        producing per-position token probabilities.
    """
    encoder_inputs = Input(shape=(max_length,))
    encoder_embedding = Embedding(vocab_size, 256)(encoder_inputs)
    # 2 x 128 units -> 256 features per source position.
    encoder_lstm = Bidirectional(LSTM(128, return_sequences=True))(encoder_embedding)

    decoder_inputs = Input(shape=(max_length,))
    decoder_embedding = Embedding(vocab_size, 256)(decoder_inputs)
    decoder_lstm = LSTM(128, return_sequences=True)(decoder_embedding)
    # Project to 256 so decoder states and encoder outputs share a dimension.
    decoder_dense_projection = Dense(256)(decoder_lstm)

    # Keras Attention takes [query, value]. The decoder state must be the
    # query and the encoder outputs the value, so each target position
    # gathers source context. With the order reversed, every output position
    # would mix decoder states from all time steps, including ones that have
    # already consumed the token being predicted.
    attention = Attention()([decoder_dense_projection, encoder_lstm])
    decoder_output = Dense(vocab_size, activation='softmax')(attention)

    model = Model([encoder_inputs, decoder_inputs], decoder_output)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# The softmax width follows the vocabulary size of the global `tokenizer`.
model = build_translation_model(vocab_size=tokenizer.vocab_size)

# Callbacks
# Stop after 3 epochs without improvement and keep the best weights both in
# memory and on disk (best_model.keras is reloaded by the evaluation section).
callbacks = [
    EarlyStopping(patience=3, restore_best_weights=True),
    ModelCheckpoint("best_model.keras", save_best_only=True)
]

# Training
# Labels get a trailing axis of size 1 before being passed to fit().
model.fit(
    [train_input_ids, train_decoder_input],
    np.expand_dims(train_labels, -1),
    validation_data=([val_input_ids, val_decoder_input], np.expand_dims(val_labels, -1)),
    epochs=10,
    batch_size=32,
    callbacks=callbacks
)


In [None]:

log_step("Previewing model outputs after training")
# Preview up to five training samples. The decoder input is the same
# right-shifted label row used during training (teacher forcing), taken from
# train_decoder_input. Rolling the 1-D label row along axis=1 would raise,
# because a 1-D array has no axis 1.
preview_count = min(5, len(train_input_ids))
model_output = model.predict(
    [train_input_ids[:preview_count], train_decoder_input[:preview_count]]
)
# Most probable token id at every position of every previewed sample.
predicted_sequences = np.argmax(model_output, axis=-1)

for i in range(preview_count):
    input_sample = train_input_ids[i]  # Source token ids
    true_output = train_labels[i]  # Reference token ids
    predicted_sequence = predicted_sequences[i]

    # Turn the id sequences back into text, dropping special tokens
    decoded_input = tokenizer.decode(input_sample, skip_special_tokens=True)
    decoded_true_output = tokenizer.decode(true_output, skip_special_tokens=True)
    decoded_predicted_output = tokenizer.decode(predicted_sequence, skip_special_tokens=True)

    print(f"Sample {i+1}:")
    print("Input (Arabic):", decoded_input)
    print("True Output (English):", decoded_true_output)
    print("Predicted Output (English):", decoded_predicted_output)
    print("-" * 50)



In [None]:

# === Training Visualization ===
log_step("Plotting training history")

# Per-epoch metric lists recorded by the last model.fit() call.
history = model.history.history

plt.figure(figsize=(12, 5))

# One panel per metric: (train key, val key, train label, val label, title, y label).
panels = (
    ('loss', 'val_loss', 'Train Loss', 'Val Loss', "Loss over Epochs", "Loss"),
    ('accuracy', 'val_accuracy', 'Train Accuracy', 'Val Accuracy', "Accuracy over Epochs", "Accuracy"),
)
for panel_index, (train_key, val_key, train_label, val_label, panel_title, y_label) in enumerate(panels, start=1):
    # Left panel shows the loss curves, right panel the accuracy curves.
    plt.subplot(1, 2, panel_index)
    plt.plot(history[train_key], label=train_label)
    plt.plot(history[val_key], label=val_label)
    plt.title(panel_title)
    plt.xlabel("Epoch")
    plt.ylabel(y_label)
    plt.legend()

plt.tight_layout()
plt.show()


## Final Evaluation — Hugging Face vs Custom Keras Model
### In this evaluation, we perform translation on the same data using the same tokenizer (AutoTokenizer) to ensure a fair comparison

In [None]:
# Final Evaluation Script — Hugging Face vs Custom Keras Model
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from datasets import load_from_disk
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import tensorflow as tf

# Load data
tokenized_dataset = load_from_disk("processed_data/tokenized_dataset")
final_dataset = load_from_disk("processed_data/final_dataset")
# Slicing a Dataset returns a dict that maps each column name to a list of
# values. Iterating over that dict yields column names, not rows, so the
# columns are read from the slice directly.
test_subset = final_dataset['test'][:100]
test_texts = test_subset['ar']
test_refs = test_subset['en']

In [None]:
# Load HF model and tokenizer
hf_model_name = "Helsinki-NLP/opus-mt-ar-en"
# The pretrained model's tokenizer gets its own name. Rebinding the global
# `tokenizer` here would make the Keras helpers further down encode and
# decode with a vocabulary the Keras model was not trained on.
hf_tokenizer = AutoTokenizer.from_pretrained(hf_model_name)
hf_model = AutoModelForSeq2SeqLM.from_pretrained(hf_model_name)


# Translate using Hugging Face
def translate_hf(texts):
    """Translate Arabic sentences with the pretrained Hugging Face model.

    Args:
        texts: Sequence of Arabic sentences.

    Returns:
        List of decoded translations, one per input sentence. An empty input
        yields an empty list.
    """
    texts = list(texts)
    if not texts:
        return []
    # Inputs are padded to the longest sentence and truncated to 64 tokens.
    inputs = hf_tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=64)
    outputs = hf_model.generate(**inputs)
    return hf_tokenizer.batch_decode(outputs, skip_special_tokens=True)


hf_preds = translate_hf(test_texts)



In [None]:
# Load custom trained Keras model
keras_model = tf.keras.models.load_model("best_model.keras")

# The Keras model was trained on ids from the multilingual BERT tokenizer
# (see the tokenization step), so inference must encode and decode with that
# same vocabulary. It is bound to its own name here so this cell does not
# depend on what the global `tokenizer` currently refers to.
keras_tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")


# Prepare inputs for Keras
def prepare_for_keras(texts, max_length=32):
    """Encode sentences into the fixed-width id matrix the Keras model expects.

    Args:
        texts: Sequence of source sentences.
        max_length: Width of every row; sentences are truncated or padded to it.

    Returns:
        NumPy array of token ids with shape ``(len(texts), max_length)``.
    """
    texts = list(texts)
    if not texts:
        return np.empty((0, max_length), dtype=np.int64)
    encoded = keras_tokenizer(texts, truncation=True, padding='max_length', max_length=max_length)
    return np.array(encoded['input_ids'])


keras_inputs = prepare_for_keras(test_texts)
# Decoder input consisting only of padding ids.
# NOTE(review): training fed the right-shifted reference translation as the
# decoder input; here the model predicts all positions in a single pass from
# padding alone, so no previously generated token is ever fed back.
keras_decoder_input = np.full_like(keras_inputs, keras_tokenizer.pad_token_id)

keras_raw_preds = keras_model.predict([keras_inputs, keras_decoder_input])
# Most probable token id per position, then back to text.
keras_token_ids = np.argmax(keras_raw_preds, axis=-1)
keras_preds = [keras_tokenizer.decode(ids, skip_special_tokens=True) for ids in keras_token_ids]



In [None]:
# BLEU Score Comparison
# Corpus-level BLEU of each system against the single reference set;
# the references are wrapped in a list because corpus_bleu is given a list
# of reference sets.
from sacrebleu import corpus_bleu
hf_bleu = corpus_bleu(hf_preds, [test_refs])
keras_bleu = corpus_bleu(keras_preds, [test_refs])

# Word-level comparison
# NOTE(review): `smooth` is not referenced anywhere in the visible code.
smooth = SmoothingFunction().method4

def word_level_metrics(preds, refs):
    """Compare predictions and references word by word, position by position.

    Words are aligned inside each (prediction, reference) pair and the longer
    side of a pair is cut to the shorter one. Aligning per pair keeps a
    length mismatch in one sentence from shifting the comparison of every
    sentence after it.

    Args:
        preds: Sequence of predicted sentences.
        refs: Sequence of reference sentences, parallel to ``preds``.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)`` with macro-averaged
        precision, recall and F1. All four are ``0.0`` when there is no word
        to compare.
    """
    true_words = []
    pred_words = []
    for pred, ref in zip(preds, refs):
        # zip stops at the shorter sentence of the pair.
        for ref_word, pred_word in zip(ref.split(), pred.split()):
            true_words.append(ref_word)
            pred_words.append(pred_word)

    if not true_words:
        return 0.0, 0.0, 0.0, 0.0

    acc = accuracy_score(true_words, pred_words)
    # zero_division=0 scores words that were never predicted (or never
    # expected) as 0.
    prec, rec, f1, _ = precision_recall_fscore_support(
        true_words, pred_words, average='macro', zero_division=0
    )
    return acc, prec, rec, f1

# Word-level scores of each system against the same references.
hf_acc, hf_prec, hf_rec, hf_f1 = word_level_metrics(hf_preds, test_refs)
keras_acc, keras_prec, keras_rec, keras_f1 = word_level_metrics(keras_preds, test_refs)

# Print Results
print("\n[BLEU Scores]")
print(f"Hugging Face Model BLEU: {hf_bleu.score:.2f}")
print(f"Keras Custom Model BLEU: {keras_bleu.score:.2f}")

print("\n[Word-Level Metrics — Hugging Face]")
print(f"Accuracy: {hf_acc:.2f}, Precision: {hf_prec:.2f}, Recall: {hf_rec:.2f}, F1: {hf_f1:.2f}")

print("\n[Word-Level Metrics — Keras Model]")
print(f"Accuracy: {keras_acc:.2f}, Precision: {keras_prec:.2f}, Recall: {keras_rec:.2f}, F1: {keras_f1:.2f}")

# Visualization
# Side-by-side bars of the two BLEU scores on a fixed 0-100 axis.
plt.figure(figsize=(10,5))
sns.barplot(x=["HF BLEU", "Keras BLEU"], y=[hf_bleu.score, keras_bleu.score], palette='Blues')
plt.title("BLEU Score Comparison")
plt.ylabel("BLEU Score")
plt.ylim(0, 100)
plt.show()

# Print Sample Translations
# NOTE(review): assumes at least five test sentences were loaded.
print("\n[Translation Samples from Test Set]")
for i in range(5):
    print("\n--- Example", i+1)
    print("Arabic:", test_texts[i])
    print("Reference:", test_refs[i])
    print("HF Prediction:", hf_preds[i])
    print("Keras Prediction:", keras_preds[i])


## Extra Manual Testing

In [None]:
# Extra Testing: Translate New Inputs
# Runs three hand-written Arabic sentences through both systems and prints
# the translations side by side.
print("\n[Manual Test Examples]")
new_arabic_inputs = [
    "أين تقع القاهرة؟",
    "أريد تعلم البرمجة",
    "كيف حالك اليوم؟"
]

new_hf_preds = translate_hf(new_arabic_inputs)
# NOTE(review): the ids fed to keras_model and the decode calls below must
# use the vocabulary the Keras model was trained with — verify which
# tokenizer the global `tokenizer` is bound to at this point.
new_keras_inputs = prepare_for_keras(new_arabic_inputs)
# Decoder input: zeros everywhere, with column 0 set to the padding id.
new_keras_decoder_input = np.zeros_like(new_keras_inputs)
new_keras_decoder_input[:, 0] = tokenizer.pad_token_id
new_keras_raw_preds = keras_model.predict([new_keras_inputs, new_keras_decoder_input])
# Most probable token id per position, then back to text.
new_keras_token_ids = np.argmax(new_keras_raw_preds, axis=-1)
new_keras_preds = [tokenizer.decode(ids, skip_special_tokens=True) for ids in new_keras_token_ids]

for ar, hf_pred, kr_pred in zip(new_arabic_inputs, new_hf_preds, new_keras_preds):
    print("\nInput:", ar)
    print("HF Translation:", hf_pred)
    print("Keras Translation:", kr_pred)
