<a href="https://colab.research.google.com/github/Yussof-Waleed/nlp-arabic-autocomplete/blob/main/NLP_Arabic_autocomplete_BLSTM_Notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [None]:
# Load the raw dataset from the Colab working directory (/content) into a DataFrame.
df = pd.read_csv(r"/content/arabic_dataset_classifiction.csv")

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1717 entries, 0 to 1716
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   text    1717 non-null   object 
 1   targe   1716 non-null   float64
dtypes: float64(1), object(1)
memory usage: 27.0+ KB


In [None]:
df.shape

(1717, 2)

In [None]:
df.head()

Unnamed: 0,text,targe
0,بين أستوديوهات ورزازات وصحراء مرزوكة وآثار ولي...,0.0
1,قررت النجمة الأمريكية أوبرا وينفري ألا يقتصر ع...,0.0
2,أخبارنا المغربية الوزاني تصوير الشملالي ألهب ا...,0.0
3,اخبارنا المغربية قال ابراهيم الراشدي محامي سعد...,0.0
4,تزال صناعة الجلود في المغرب تتبع الطريقة التقل...,0.0


In [None]:
# Function to clean and normalize Arabic text
def clean_text(text):
    """Strip non-letter characters from ``text`` and fold Arabic letter variants.

    Every character that is neither alphabetic (``str.isalpha``) nor
    whitespace (``str.isspace``) is dropped, which removes digits,
    punctuation and combining diacritics. The hamza/madda forms of alef are
    then folded to bare alef, alef maqsura to yeh, and teh marbuta to heh.

    Args:
        text: The string to clean. Non-string values (for example the float
            ``NaN`` pandas uses for a missing cell, or ``None``) are treated
            as empty text.

    Returns:
        str: The cleaned, normalized string; ``""`` for non-string input.
    """
    # A missing CSV cell arrives as a float NaN; iterating over it would raise
    # TypeError, so anything that is not a string becomes empty text.
    if not isinstance(text, str):
        return ""

    # Keep letters and whitespace only.
    letters_and_spaces = ''.join(
        char for char in text if char.isalpha() or char.isspace()
    )

    # Single translate() pass instead of five chained replace() scans.
    normalization = str.maketrans({
        "\u0623": "\u0627",  # alef with hamza above -> alef
        "\u0625": "\u0627",  # alef with hamza below -> alef
        "\u0622": "\u0627",  # alef with madda       -> alef
        "\u0649": "\u064a",  # alef maqsura          -> yeh
        "\u0629": "\u0647",  # teh marbuta           -> heh
    })
    return letters_and_spaces.translate(normalization)

In [None]:
# Run clean_text over every row; the raw `text` column is overwritten in place.
df['text'] = df['text'].apply(clean_text)

In [None]:
# Hold out 20% of the cleaned texts for validation; the fixed random_state keeps the split reproducible.
train_texts, val_texts = train_test_split(df['text'], test_size=0.2, random_state=42)

In [None]:
print(f"Number of training samples: {len(train_texts)}")
print(f"Number of validation samples: {len(val_texts)}")

Number of training samples: 1373
Number of validation samples: 344


In [None]:
from transformers import AutoTokenizer

In [None]:
# Load the pretrained tokenizer published under this model id; its files are fetched on first use.
tokenizer = AutoTokenizer.from_pretrained("asafaya/bert-base-arabic")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/62.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/491 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/334k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
# Tokenize and encode the training and validation texts
def tokenize_and_encode(texts, tokenizer, max_length=128):
    """
    Convert a collection of texts into fixed-width token-ID sequences.

    Args:
        texts: Iterable of text strings; it is materialized with ``list``
            before being handed to the tokenizer.
        tokenizer: Callable tokenizer that accepts the batch plus the
            ``padding`` / ``truncation`` / ``max_length`` /
            ``return_tensors`` keyword arguments and returns a mapping with
            an ``"input_ids"`` entry.
        max_length (int): Length every sequence is padded or truncated to.
    Returns:
        The ``"input_ids"`` entry of the tokenizer output (requested as
        PyTorch tensors through ``return_tensors="pt"``).
    """
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": max_length,
        "return_tensors": "pt",
    }
    batch = list(texts)
    return tokenizer(batch, **tokenizer_options)["input_ids"]


In [None]:
# Encode both splits with the shared tokenizer, using tokenize_and_encode's
# default max_length; each result is the token-ID tensor that function returns.
train_encodings = tokenize_and_encode(train_texts, tokenizer)
val_encodings = tokenize_and_encode(val_texts, tokenizer)

In [None]:
# Display the shape of the encoded data
print(f"Training encodings shape: {train_encodings.shape}")
print(f"Validation encodings shape: {val_encodings.shape}")

Training encodings shape: torch.Size([1373, 128])
Validation encodings shape: torch.Size([344, 128])


In [None]:
# Create PyTorch Dataset class
class ArabicTextDataset(torch.utils.data.Dataset):
    """Dataset wrapper around a tensor of encoded token IDs, one row per sample."""

    def __init__(self, encodings):
        # Stored as-is; the first dimension is treated as the sample axis.
        self.encodings = encodings

    def __len__(self):
        """Return how many samples the wrapped tensor holds."""
        sample_count = self.encodings.size(0)
        return sample_count

    def __getitem__(self, idx):
        """Return the encoded sample stored at position ``idx``."""
        sample = self.encodings[idx]
        return sample

# Wrap the encoded tensors so a DataLoader can index them sample by sample.
train_dataset = ArabicTextDataset(train_encodings)
val_dataset = ArabicTextDataset(val_encodings)

# Batch the data in groups of 32. Only the training set is shuffled;
# validation keeps its original order.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=32, shuffle=False)

# Report how many batches each loader yields per pass.
print(f"Number of training batches: {len(train_loader)}")
print(f"Number of validation batches: {len(val_loader)}")

Number of training batches: 43
Number of validation batches: 11


In [None]:
class BiLSTMAutocomplete(nn.Module):
    """Embedding -> bidirectional LSTM -> dropout -> linear head over the vocabulary.

    ``forward`` returns one logit vector per input position. ``generate``
    repeatedly appends the arg-max token of the last position (greedy
    decoding).

    NOTE(review): the LSTM is bidirectional, so the output at position ``t``
    is computed from tokens on both sides of ``t``. If the model is trained to
    predict token ``t+1`` from position ``t``, the backward direction has
    already read that token from its input, while during generation no future
    tokens exist. Consider a unidirectional LSTM for autoregressive use.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, num_layers=1, dropout=0.5, pad_token_id=0):
        """Build the layers.

        Args:
            vocab_size: Number of rows in the embedding table.
            embedding_dim: Size of each token embedding.
            hidden_dim: LSTM hidden size per direction.
            output_dim: Size of the output logits (last dimension).
            num_layers: Number of stacked LSTM layers.
            dropout: Dropout probability after the LSTM, and between LSTM
                layers when ``num_layers > 1``.
            pad_token_id: Token ID used as the embedding ``padding_idx``.
        """
        super(BiLSTMAutocomplete, self).__init__()
        self.pad_token_id = pad_token_id
        # Embedding layer - the padding index gets a fixed zero vector.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_token_id)
        # Bidirectional LSTM layer
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            bidirectional=True,
            batch_first=True,  # Input shape: (batch, seq_len, features)
            dropout=dropout if num_layers > 1 else 0  # Inter-layer dropout needs at least two layers
        )
        # Dropout applied to the LSTM output
        self.dropout = nn.Dropout(dropout)
        # Output layer: (batch, seq_len, hidden_dim * 2) -> (batch, seq_len, output_dim)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)  # *2 for bidirectional

    def forward(self, input_ids, attention_mask=None):
        """Return logits of shape (batch, seq_len, output_dim).

        Args:
            input_ids: Integer token IDs of shape (batch, seq_len).
            attention_mask: Accepted for interface compatibility; not used.
        """
        embedded = self.embedding(input_ids)  # (batch, seq_len, embedding_dim)
        lstm_out, _ = self.lstm(embedded)     # (batch, seq_len, hidden_dim * 2)
        return self.fc(self.dropout(lstm_out))

    def generate(self,
                 input_ids: torch.LongTensor,
                 attention_mask: torch.LongTensor = None,
                 max_new_tokens: int = 20,
                 eos_token_id: int = None):
        """Greedily extend ``input_ids`` by up to ``max_new_tokens`` tokens.

        Switches the module to eval mode (and leaves it there) and runs
        without gradient tracking.

        Args:
            input_ids: Prompt token IDs of shape (batch, seq_len), seq_len >= 1.
            attention_mask: Accepted for interface compatibility; not used.
            max_new_tokens: Upper bound on the number of appended tokens.
            eos_token_id: If given, a sequence stops growing once it emits
                this ID; later steps append ``pad_token_id`` to that row.
                Decoding ends early when every row has emitted it.

        Returns:
            Tensor of shape (batch, seq_len + k) with 0 <= k <= max_new_tokens,
            whose first ``seq_len`` columns equal ``input_ids``.

        Raises:
            ValueError: If ``input_ids`` is not 2-D or holds no tokens.
        """
        if input_ids.dim() != 2 or input_ids.size(1) == 0:
            raise ValueError("input_ids must have shape (batch, seq_len) with seq_len >= 1")

        self.eval()
        generated = input_ids
        # Per-row flag: True once that sequence has produced the EOS token.
        finished = torch.zeros(
            input_ids.size(0), 1, dtype=torch.bool, device=input_ids.device
        )
        with torch.no_grad():
            for _ in range(max_new_tokens):
                logits = self(generated)                              # (batch, seq_len, vocab)
                next_id = logits[:, -1, :].argmax(dim=-1, keepdim=True)
                if eos_token_id is not None:
                    # Rows that already ended only receive padding.
                    next_id = next_id.masked_fill(finished, self.pad_token_id)
                    finished = finished | (next_id == eos_token_id)
                generated = torch.cat([generated, next_id], dim=1)
                # `.all()` works for any batch size, unlike `.item()` on next_id.
                if eos_token_id is not None and bool(finished.all()):
                    break
        return generated

# Define model hyper-parameters and build the model.
# The whole setup is skipped (and `model` set to None) when `tokenizer` is falsy.
if tokenizer:
    vocab_size = tokenizer.vocab_size  # Vocabulary size reported by the tokenizer
    pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0 # Fall back to 0 when the tokenizer defines no pad token
    embedding_dim = 128  # Size of word embeddings
    hidden_dim = 256  # Number of LSTM units in each direction
    output_dim = vocab_size  # One logit per vocabulary entry
    num_layers = 2  # Number of stacked LSTM layers
    dropout = 0.3  # Dropout rate

    # Initialize the model (arguments are passed positionally, in constructor order)
    model = BiLSTMAutocomplete(vocab_size, embedding_dim, hidden_dim, output_dim, num_layers, dropout, pad_token_id)

    # Display the model architecture
    print("\nModel Architecture:")
    print(model)
else:
    print("Cannot initialize model parameters without a tokenizer.")
    model = None


Model Architecture:
BiLSTMAutocomplete(
  (embedding): Embedding(32000, 128, padding_idx=0)
  (lstm): LSTM(128, 256, num_layers=2, batch_first=True, dropout=0.3, bidirectional=True)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=512, out_features=32000, bias=True)
)


In [None]:
# Training configuration
num_epochs = 10  # Number of epochs
train_losses = []  # Containers intended for per-epoch training losses
val_losses = []    # Containers intended for per-epoch validation losses

# Define loss function and optimizer.
# Every sequence is padded to a fixed length, so padding targets are excluded
# from the loss; otherwise the average is dominated by trivially predictable
# pad positions and the reported loss says little about real tokens.
criterion = nn.CrossEntropyLoss(ignore_index=pad_token_id)
optimizer = optim.Adam(model.parameters())
# Define device: use the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model.to(device)

cpu


BiLSTMAutocomplete(
  (embedding): Embedding(32000, 128, padding_idx=0)
  (lstm): LSTM(128, 256, num_layers=2, batch_first=True, dropout=0.3, bidirectional=True)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=512, out_features=32000, bias=True)
)

In [None]:
# Next-token training loop.
# Relies on names created in earlier cells: model, criterion, optimizer,
# device, num_epochs, train_loader, val_loader, train_losses, val_losses.
# Each loader batch is a (B, T) tensor of token IDs.

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch.to(device)                 # (B, T)

        # Shift by one so that position t is scored against token t+1.
        inputs  = input_ids[:, :-1]                  # (B, T-1)
        targets = input_ids[:, 1:]                   # (B, T-1)

        optimizer.zero_grad()
        logits = model(inputs)                       # (B, T-1, V)
        B, Tm1, V = logits.size()
        # Flatten batch and time so every position is one classification example.
        loss = criterion(
            logits.reshape(-1, V),                   # (B*(T-1), V)
            targets.reshape(-1)                      # (B*(T-1),)
        )
        loss.backward()
        # Cap the global gradient norm before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        total_loss += loss.item()

    avg_train = total_loss / len(train_loader)
    # Keep the per-epoch history so the loss curves can be inspected or plotted later.
    train_losses.append(avg_train)

    # validation: same shifted objective, no gradient tracking, no updates
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch.to(device)
            inputs  = input_ids[:, :-1]
            targets = input_ids[:, 1:]
            logits  = model(inputs)
            B, Tm1, V = logits.size()
            loss = criterion(
                logits.reshape(-1, V),
                targets.reshape(-1)
            )
            val_loss += loss.item()
    avg_val = val_loss / len(val_loader)
    val_losses.append(avg_val)

    print(f"Epoch {epoch+1}/{num_epochs}: "
          f"Train Loss = {avg_train:.4f}, Val Loss = {avg_val:.4f}")

Epoch 1/10: Train Loss = 8.2996, Val Loss = 7.5669
Epoch 2/10: Train Loss = 7.2505, Val Loss = 6.9960
Epoch 3/10: Train Loss = 6.6331, Val Loss = 6.3924
Epoch 4/10: Train Loss = 5.9790, Val Loss = 5.7627


In [None]:
# Persist only the learned parameters (state_dict), not the module object;
# reloading therefore requires rebuilding the model with the same hyper-parameters.
torch.save(model.state_dict(), "bilstm_autocomplete.pt")
# Store the tokenizer files alongside so the same vocabulary can be restored later.
tokenizer.save_pretrained("my_tokenizer_dir")

('my_tokenizer_dir/tokenizer_config.json',
 'my_tokenizer_dir/special_tokens_map.json',
 'my_tokenizer_dir/vocab.txt',
 'my_tokenizer_dir/added_tokens.json',
 'my_tokenizer_dir/tokenizer.json')

In [None]:
import torch

# 1) set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 2) rebuild model with the same hyper-parameters you trained with;
#    load_state_dict below fails on any shape mismatch
vocab_size   = tokenizer.vocab_size
output_dim   = vocab_size   # for LM head
pad_token_id = tokenizer.pad_token_id or 0

embedding_dim = 128
hidden_dim    = 256
num_layers    = 2
dropout       = 0.3

model = BiLSTMAutocomplete(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    output_dim=output_dim,
    num_layers=num_layers,
    dropout=dropout,
    pad_token_id=pad_token_id
).to(device)

# 3) load your checkpoint
# The file holds a plain state_dict, so restrict unpickling to tensors and
# basic containers instead of allowing arbitrary pickled objects.
state = torch.load("bilstm_autocomplete.pt", map_location=device, weights_only=True)
model.load_state_dict(state)

# Inference mode: disables dropout
model.eval()

BiLSTMAutocomplete(
  (embedding): Embedding(32000, 128, padding_idx=0)
  (lstm): LSTM(128, 256, num_layers=2, batch_first=True, dropout=0.3, bidirectional=True)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=512, out_features=32000, bias=True)
)

In [None]:
def autocomplete(text, max_new_tokens=10):
    """Greedily continue ``text`` with the trained model and return the decoded string.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        text: Prompt to continue.
        max_new_tokens: Maximum number of tokens to append to the prompt.

    Returns:
        str: Prompt plus continuation, decoded with special tokens removed.
    """
    enc = tokenizer(
        text,
        return_tensors="pt",
        padding=False,
        truncation=True
    ).to(device)

    input_ids = enc["input_ids"]
    attention_mask = enc.get("attention_mask")

    # BERT-style tokenizers close every encoded text with a separator token.
    # Leaving it on the prompt asks the model to continue *after* an end
    # marker, so drop it before generating.
    sep_id = getattr(tokenizer, "sep_token_id", None)
    if sep_id is not None and input_ids.size(1) > 1 and input_ids[0, -1].item() == sep_id:
        input_ids = input_ids[:, :-1]
        if attention_mask is not None:
            attention_mask = attention_mask[:, :-1]

    # Tokenizers without a dedicated EOS token report None; fall back to the
    # separator, which is what terminates the encoded sequences.
    eos_id = getattr(tokenizer, "eos_token_id", None)
    if eos_id is None:
        eos_id = sep_id

    with torch.no_grad():
        out_ids = model.generate(
            input_ids      = input_ids,
            attention_mask = attention_mask,
            max_new_tokens = max_new_tokens,
            eos_token_id   = eos_id
        )
    return tokenizer.decode(out_ids[0], skip_special_tokens=True)

print(autocomplete("هذا اختبار"))

هذا اختبار ستوكر خلت الكوكيز ومجلس للانسان ومجلس وشاهد بتعديل البطالة الساحرة


In [None]:
print(autocomplete("مصر دولة قوية ومهمة"))

مصر دولة قوية ومهمة ستوكر كتب ومجلس جز مرجع ェ اساءادور المقاولات اساء


In [None]:
print(autocomplete("عمر طالب حاسبات"))

عمر طالب حاسبات دستوريةη يقع2013 وتنز دستوريةختار وتنز دستورية اطفالنا
