In [1]:
import os
import re
import tempfile

import numpy as np
import sentencepiece as spm
import tensorflow as tf
from google.colab import drive
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import Input, Embedding, Dense, LSTM, Bidirectional
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.optimizers import Adam


In [2]:
# Mount Google Drive; every path below lives under the mounted "My Drive" folder.
drive.mount('/content/drive', force_remount=True)

# Paths
MODEL_PATH = "/content/drive/My Drive/arabic_model/model.keras"  # saved Keras model (loaded or written by main())
TOKENIZER_PATH = "/content/drive/My Drive/arabic_model/tokenizer.model"  # SentencePiece model file
DATASET_PATH = "/content/drive/My Drive/dataset-nlp"  # walked recursively for .txt files by load_data()

# Create directories if they don't exist
# (MODEL_PATH and TOKENIZER_PATH share this directory, so one call covers both.)
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)

# Model parameters
MAX_SEQUENCE_LENGTH = 50  # model input length: contexts are truncated / zero-padded to this many token ids
VOCAB_SIZE = 16000  # SentencePiece vocab size; also the Embedding input size and softmax output size
EMBEDDING_DIM = 256  # Embedding output size
HIDDEN_DIM = 512  # units of each LSTM and of the hidden Dense layer
DROPOUT_RATE = 0.3  # NOTE(review): not referenced in the visible cells; build_model() hard-codes dropout=0.2
BATCH_SIZE = 64  # batch size of the tf.data pipelines
EPOCHS = 15  # upper bound on epochs; training also uses EarlyStopping
BEAM_WIDTH = 3  # number of top-probability tokens returned by prediction (no beam search is performed)

Mounted at /content/drive


In [3]:
def normalize_arabic(text):
    """Normalize Arabic text before tokenization.

    Steps, applied in order:
      1. Fold the alef variants (إ أ آ) into bare alef (ا).
      2. Fold alef maqsura (ى) into yeh (ي).
      3. Fold teh marbuta (ة) into heh (ه).
      4. Fold gaf (گ) into kaf (ك).
      5. Remove the combining marks in U+064B..U+065F.
      6. Delete tatweel (U+0640).
      7. Collapse every whitespace run into one space and trim both ends.

    Args:
        text: Raw input string.

    Returns:
        The normalized string (possibly empty).
    """
    text = re.sub(r'[إأآا]', 'ا', text)
    text = re.sub(r'[ىي]', 'ي', text)
    text = re.sub(r'[ةه]', 'ه', text)
    text = re.sub(r'[گك]', 'ك', text)
    text = re.sub(r'[\u064B-\u065F]', '', text)
    # Tatweel is a purely typographic elongation *inside* a word. It must be
    # deleted, not turned into a space: replacing it with whitespace would
    # split one elongated word into two separate tokens.
    text = text.replace('\u0640', '')
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [4]:
def load_data():
    """Load normalized sentences from every ``.txt`` file under DATASET_PATH.

    Each non-blank line is stripped and passed through ``normalize_arabic``;
    only sentences with at least three whitespace-separated words are kept.

    Directories and files are visited in sorted order. ``os.walk`` yields
    entries in an arbitrary, filesystem-dependent order, so sorting makes the
    returned list identical from run to run.

    Returns:
        list[str]: normalized sentences, in deterministic file order.
    """
    sentences = []
    for root, _, files in sorted(os.walk(DATASET_PATH)):
        for file in sorted(files):
            if not file.endswith(".txt"):
                continue
            path = os.path.join(root, file)
            # errors="ignore" silently drops bytes that are not valid UTF-8.
            with open(path, "r", encoding="utf-8", errors="ignore") as f:
                for line in f:
                    stripped = line.strip()
                    if not stripped:
                        continue
                    sentence = normalize_arabic(stripped)
                    # Filter while reading so short sentences are never accumulated.
                    if len(sentence.split()) >= 3:
                        sentences.append(sentence)
    return sentences


In [5]:
def get_tokenizer(sentences):
    """Load the SentencePiece tokenizer, training it first if it is missing.

    If TOKENIZER_PATH exists it is loaded as-is and ``sentences`` is not used.
    Otherwise a model is trained on ``sentences`` (one per line) and then
    loaded.

    Args:
        sentences: Iterable of normalized sentences used as the training
            corpus when no tokenizer file exists yet.

    Returns:
        spm.SentencePieceProcessor: the loaded tokenizer.

    Raises:
        ValueError: if the tokenizer must be trained but ``sentences`` is empty.
    """
    sp = spm.SentencePieceProcessor()
    if os.path.exists(TOKENIZER_PATH):
        sp.Load(TOKENIZER_PATH)
        return sp

    if not sentences:
        raise ValueError(
            f"No tokenizer found at {TOKENIZER_PATH} and no sentences were "
            "provided to train one"
        )

    model_dir = os.path.dirname(TOKENIZER_PATH)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)
    # The trainer writes "<prefix>.model"; derive the prefix from
    # TOKENIZER_PATH so the file produced is the file loaded below.
    model_prefix = os.path.splitext(TOKENIZER_PATH)[0]

    # Use a unique temp file (not a fixed name in the working directory) and
    # remove it even when training fails.
    fd, corpus_path = tempfile.mkstemp(prefix="spm_corpus_", suffix=".txt")
    try:
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            f.write("\n".join(sentences))

        spm.SentencePieceTrainer.train(
            input=corpus_path,
            model_prefix=model_prefix,
            vocab_size=VOCAB_SIZE,
            character_coverage=1.0,
            split_by_unicode_script=True,
            # pad_id=0 matches the zero padding / mask_zero=True used by the model.
            pad_id=0,
            unk_id=1,
            bos_id=2,
            eos_id=3
        )
    finally:
        os.remove(corpus_path)

    sp.Load(TOKENIZER_PATH)
    return sp

In [6]:
def _build_prefix_samples(encoded_sequences, max_len):
    """Expand token sequences into (right-padded context, next token) arrays.

    For a sequence ``s`` and every ``i`` in ``1..len(s)-1`` one sample is
    produced: the context is the last ``max_len`` ids of ``s[:i]``, padded on
    the right with 0, and the label is ``s[i]``.

    Returns:
        (contexts, targets): int32 arrays of shape (n, max_len) and (n,).
    """
    n_samples = sum(max(len(seq) - 1, 0) for seq in encoded_sequences)
    # Preallocate instead of growing a Python list of lists.
    contexts = np.zeros((n_samples, max_len), dtype=np.int32)
    targets = np.zeros(n_samples, dtype=np.int32)
    row = 0
    for seq in encoded_sequences:
        for i in range(1, len(seq)):
            context = seq[max(0, i - max_len):i]
            contexts[row, :len(context)] = context
            targets[row] = seq[i]
            row += 1
    return contexts, targets


def create_dataset(sentences, tokenizer, seed=42):
    """Build batched train/validation ``tf.data`` pipelines for next-token prediction.

    Sentences are split 80/20 into train and validation *before* they are
    expanded into prefix samples. Splitting after expansion would put
    overlapping prefixes of one sentence on both sides, so the validation
    loss (which drives EarlyStopping / ModelCheckpoint) would be measured on
    contexts the model has effectively already seen.

    Args:
        sentences: Normalized sentences.
        tokenizer: Object exposing ``EncodeAsIds(str) -> list[int]``.
        seed: Seed for the split and the sample shuffle, so runs are
            reproducible.

    Returns:
        (train_ds, val_ds): batched, prefetched datasets of
        ``(context_ids, next_token_id)``.

    Raises:
        ValueError: if fewer than two sentences encode to two or more tokens.
    """
    encoded = [ids for ids in (tokenizer.EncodeAsIds(s) for s in sentences)
               if len(ids) >= 2]
    if len(encoded) < 2:
        raise ValueError(
            "Need at least two sentences with two or more tokens to build "
            "training and validation sets"
        )

    train_seqs, val_seqs = train_test_split(
        encoded, test_size=0.2, shuffle=True, random_state=seed
    )

    X_train, y_train = _build_prefix_samples(train_seqs, MAX_SEQUENCE_LENGTH)
    X_val, y_val = _build_prefix_samples(val_seqs, MAX_SEQUENCE_LENGTH)

    # Samples of one sentence are contiguous after expansion; permute them once
    # so the small streaming shuffle buffer below still sees well-mixed data.
    order = np.random.default_rng(seed).permutation(len(X_train))
    X_train, y_train = X_train[order], y_train[order]

    train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train))
    val_ds = tf.data.Dataset.from_tensor_slices((X_val, y_val))

    return (
        train_ds.shuffle(1000).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE),
        val_ds.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
    )

In [7]:
def build_model():
    """Assemble and compile the next-token classifier.

    Architecture: masked Embedding -> two stacked bidirectional LSTMs (the
    first returns the full sequence, the second only its final output) ->
    ReLU Dense -> softmax over VOCAB_SIZE.

    Returns:
        A compiled Keras ``Model`` (Adam, sparse categorical cross-entropy,
        accuracy metric).
    """
    token_ids = Input(shape=(MAX_SEQUENCE_LENGTH,))
    # mask_zero=True: id 0 is treated as padding by the layers that follow.
    embedded = Embedding(VOCAB_SIZE, EMBEDDING_DIM, mask_zero=True)(token_ids)

    # use_cudnn=False is kept from the original code, which added it to avoid
    # a masking error.
    lstm_options = dict(use_bias=True, dropout=0.2, use_cudnn=False)
    per_step = Bidirectional(
        LSTM(HIDDEN_DIM, return_sequences=True, **lstm_options)
    )(embedded)
    summary = Bidirectional(LSTM(HIDDEN_DIM, **lstm_options))(per_step)

    hidden = Dense(HIDDEN_DIM, activation='relu')(summary)
    probabilities = Dense(VOCAB_SIZE, activation='softmax')(hidden)

    network = Model(token_ids, probabilities)
    network.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return network

In [8]:
def _rank_top_tokens(probabilities, tokenizer, k):
    """Return the ``k`` most probable ``(piece, probability)`` pairs, best first."""
    if k <= 0:
        # Guard: a slice of [-0:] would return the whole array, not nothing.
        return []
    top_ids = np.argsort(probabilities)[-k:][::-1]
    return [(tokenizer.IdToPiece(int(idx)), float(probabilities[idx]))
            for idx in top_ids]


def predict_next_words(text, model, tokenizer, max_predictions=3):
    """Suggest the most likely next tokens for ``text``.

    Args:
        text: Raw user input; it is normalized with ``normalize_arabic``.
        model: Keras model mapping a (1, MAX_SEQUENCE_LENGTH) id array to a
            probability vector over the vocabulary.
        tokenizer: Object exposing ``EncodeAsIds`` and ``IdToPiece``.
        max_predictions: Number of suggestions to return. (This parameter
            was previously ignored in favour of the global BEAM_WIDTH.)

    Returns:
        list[tuple[str, float]]: ``(piece, probability)`` pairs, most probable
        first. An empty list if ``text`` encodes to no tokens. If both
        prediction attempts fail, ``[("prediction failed", 0.0)]``.
    """
    text = normalize_arabic(text)
    encoded = tokenizer.EncodeAsIds(text)
    if not encoded:
        # Nothing to condition on: the input would be pure padding.
        return []

    # Right padding (to match training data pattern)
    context = encoded[-MAX_SEQUENCE_LENGTH:]
    padded = context + [0] * (MAX_SEQUENCE_LENGTH - len(context))
    input_seq = np.array([padded])

    try:
        preds = model.predict(input_seq, verbose=0)[0]
        return _rank_top_tokens(preds, tokenizer, max_predictions)
    except Exception as e:
        print(f"Error during prediction: {e}")

    # Fallback: call the model directly with eager execution enabled.
    try:
        tf.config.run_functions_eagerly(True)
        try:
            preds = model(input_seq, training=False).numpy()[0]
        finally:
            # Always switch the global eager flag back off, even when the
            # call above raises.
            tf.config.run_functions_eagerly(False)
        return _rank_top_tokens(preds, tokenizer, max_predictions)
    except Exception as e2:
        print(f"Second attempt failed: {e2}")
        # Return a default message
        return [(("prediction failed"), 0.0)]

In [9]:
def interactive_prompt(model, tokenizer):
    """Run a console loop that prints next-word suggestions for typed text.

    Reads lines with ``input()`` until the user types ``exit`` (any case,
    surrounding whitespace ignored). For each line the suggestions from
    ``predict_next_words`` are printed with their confidence.

    Args:
        model: Trained model, passed through to ``predict_next_words``.
        tokenizer: Tokenizer, passed through to ``predict_next_words``.
    """
    # "\n" (not "\\n"): the doubled backslash printed a literal "\n" on screen.
    print("\nArabic Autocomplete System (type 'exit' to quit)")
    while True:
        text = input("\nInput text: ")
        if text.strip().lower() == 'exit':
            break

        predictions = predict_next_words(text, model, tokenizer)

        print("\nTop Suggestions:")
        rank = 0
        for pred, score in predictions:
            # Hide the "▁" (U+2581) piece marker from the user and skip
            # pieces that are empty without it, as the Gradio cell does.
            word = pred.replace("\u2581", "").strip()
            if not word:
                continue
            rank += 1
            print(f"{rank}. {text} {word} (confidence: {score:.4f})")

In [10]:
def _load_saved_model():
    """Load the model stored at MODEL_PATH and compile it explicitly."""
    model = load_model(MODEL_PATH, compile=False)
    # Recompile the model with the custom configuration
    model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    return model


def _train_and_save_model(sentences, tokenizer):
    """Train a new model on ``sentences``, save it to MODEL_PATH and return it."""
    train_ds, val_ds = create_dataset(sentences, tokenizer)
    model = build_model()

    callbacks = [
        EarlyStopping(patience=2, restore_best_weights=True),
        ModelCheckpoint(MODEL_PATH, save_best_only=True)
    ]

    model.fit(
        train_ds,
        validation_data=val_ds,
        epochs=EPOCHS,
        callbacks=callbacks
    )

    # This final save writes to the same path the checkpoint callback uses,
    # so the file ends up holding the in-memory model as of the end of fit().
    model.save(MODEL_PATH)
    print(f"Model saved to {MODEL_PATH}")
    return model


def main():
    """Load the data and tokenizer, obtain a model, and start the console prompt.

    A saved model at MODEL_PATH is reused when it loads successfully. If the
    file is missing or loading fails, a new model is trained and saved. Errors
    raised by the interactive prompt are reported rather than propagated.
    """
    # Load sentences
    sentences = load_data()
    print(f"Loaded {len(sentences)} sentences")

    # Get tokenizer
    tokenizer = get_tokenizer(sentences)

    # Try to load existing model or train new one
    model = None
    if os.path.exists(MODEL_PATH):
        try:
            model = _load_saved_model()
            print("Loaded existing model")
        except Exception as e:
            print(f"Error loading model: {e}")
            print("Training a new model...")

    if model is None:
        model = _train_and_save_model(sentences, tokenizer)

    try:
        interactive_prompt(model, tokenizer)
    except Exception as e:
        print(f"Error during interactive prompt: {e}")
        print("Please check your model configuration and try again.")

In [11]:
# Entry point: run the load / train-or-load / prompt pipeline when this cell is
# executed as the main module.
if __name__ == "__main__":
    main()

Loaded 40000 sentences
Loaded existing model
\nArabic Autocomplete System (type 'exit' to quit)
\nInput text: exite
\nTop Suggestions:
1. exite ، (confidence: 0.0873)
2. exite . (confidence: 0.0464)
3. exite ▁في (confidence: 0.0374)
\nInput text: exit


In [15]:
# Install the pinned Gradio release (3.50.2) used by the UI cell below; --quiet trims pip's output.
!pip install gradio==3.50.2 --quiet

In [13]:
def get_tokenizer(sentences):
    """Load the already-trained SentencePiece model from TOKENIZER_PATH.

    This redefinition replaces the earlier ``get_tokenizer`` in the notebook
    namespace and never trains a tokenizer.

    Args:
        sentences: Unused; accepted only so the call signature matches the
            earlier definition.

    Returns:
        spm.SentencePieceProcessor: the loaded tokenizer.
    """
    processor = spm.SentencePieceProcessor()
    processor.Load(TOKENIZER_PATH)
    return processor

def predict_next_words(text):
    """Gradio callback: format the top next-token suggestions for ``text``.

    Uses the module-level ``model`` and ``tokenizer``. This redefinition
    replaces the earlier three-argument ``predict_next_words`` in the notebook
    namespace.

    Args:
        text: Raw user input; it is normalized with ``normalize_arabic``.

    Returns:
        str: one suggestion per line, formatted as
        ``"<text> <token> (confidence: <p>)"``, most probable first. Returns
        an empty string when ``text`` encodes to no tokens.
    """
    text = normalize_arabic(text)
    encoded = tokenizer.EncodeAsIds(text)
    if not encoded:
        # Empty / whitespace-only input would be fed to the model as pure
        # padding; return nothing instead of suggestions with no context.
        return ""

    context = encoded[-MAX_SEQUENCE_LENGTH:]
    padded = context + [0] * (MAX_SEQUENCE_LENGTH - len(context))
    input_seq = np.array([padded])

    preds = model.predict(input_seq, verbose=0)[0]
    top_ids = np.argsort(preds)[-BEAM_WIDTH:][::-1]

    prefix = text.strip()
    results = []
    for idx in top_ids:
        # Drop the "▁" piece marker; skip pieces that are empty without it.
        token = tokenizer.IdToPiece(int(idx)).replace("▁", "").strip()
        if token:
            results.append(f"{prefix} {token} (confidence: {float(preds[idx]):.4f})")
    return "\n".join(results)

# --- Load model and tokenizer ---
# `gr` was used below without ever being imported, so this cell raised
# NameError on a fresh kernel. Imported here rather than in the first cell
# because Gradio is installed by the `pip install` cell just above.
import gradio as gr

# Load without the saved compile state, then compile explicitly.
model = load_model(MODEL_PATH, compile=False)
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# NOTE(review): the get_tokenizer defined in this cell ignores `sentences`;
# the corpus is still loaded here so the `sentences` global keeps existing.
sentences = load_data()
tokenizer = get_tokenizer(sentences)

# --- Gradio UI ---
# share=True publishes the app on a temporary public gradio.live URL.
gr.Interface(
    fn=predict_next_words,
    inputs=gr.Textbox(lines=2, placeholder="Type Arabic text here..."),
    outputs="text",
    title="Arabic Autocomplete",
    description="Enter the beginning of an Arabic sentence and get word suggestions."
).launch(share=True)

IMPORTANT: You are using gradio version 3.50.2, however version 4.44.1 is available, please upgrade.
--------
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://b5b0d6482cb231c9f2.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


