In [1]:
import os
import re
import random
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, TimeDistributed
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import Dropout, LayerNormalization

In [2]:
def normalize_arabic(text):
    text = re.sub(r'[ููููููููู]', '', text)
    text = re.sub(r'[ุฅุฃุขุง]', 'ุง', text)
    text = re.sub(r'ู', 'ู', text)
    text = re.sub(r'ุค', 'ู', text)
    text = re.sub(r'ุฆ', 'ู', text)
    text = re.sub(r'ุฉ', 'ู', text)
    return text

def split_into_sentences(text):
    return [sentence.strip() for sentence in re.split(r'[.!ุ\n]', text) if sentence.strip()]

In [3]:
arabic_keyboard = {
    'ุง': ['ุฃ', 'ุฅ', 'ุก', 'ู'],
    'ุจ': ['ู', 'ุช'],
    'ุช': ['ุจ', 'ู'],
    'ุซ': ['ุช', 'ุณ'],
    'ุฌ': ['ุญ', 'ุฎ'],
    'ุญ': ['ุฌ', 'ุฎ'],
    'ุฎ': ['ุญ', 'ุฌ'],
    'ุฏ': ['ุฐ'],
    'ุฐ': ['ุฏ', 'ุฑ'],
    'ุฑ': ['ุฐ', 'ุฒ'],
    'ุฒ': ['ุฑ', 'ุณ'],
    'ุณ': ['ุด', 'ุต'],
    'ุด': ['ุณ', 'ุต'],
    'ุต': ['ุณ', 'ุด', 'ุถ'],
    'ุถ': ['ุต', 'ุท'],
    'ุท': ['ุถ', 'ุธ'],
    'ุน': ['ุบ'],
    'ุบ': ['ุน'],
    'ู': ['ู'],
    'ู': ['ู'],
    'ู': ['ู'],
    'ู': ['ู'],
    'ู': ['ู'],
    'ู': ['ู', 'ุจ'],
    'ู': ['ุฉ'],
    'ุฉ': ['ู'],
    'ู': ['ู', 'ุจ'],
    'ู': ['ู']
}

def keyboard_substitute(char):
    if char in arabic_keyboard:
        return random.choice(arabic_keyboard[char])
    return char

def insert_typo(word):
    if len(word) < 2:
        return word
    # typo_type = random.choice(['delete', 'insert', 'substitute'])
    typo_type = 'substitute'
    i = random.randint(0, len(word) - 1)
    arabic_chars = 'ุงุจุชุซุฌุญุฎุฏุฐุฑุฒุณุดุตุถุทุธุนุบููููููููู'

    if typo_type == 'delete':
        return word[:i] + word[i+1:]
    elif typo_type == 'insert':
        return word[:i] + random.choice(arabic_chars) + word[i:]
    elif typo_type == 'substitute':
        return word[:i] + keyboard_substitute(word[i]) + word[i+1:]
    return word

def corrupt_sentence(sentence, error_probability=0.1):
    words = sentence.split()
    corrupted = [insert_typo(w) if random.random() < error_probability else w for w in words]
    return ' '.join(corrupted)

In [4]:
df = pd.read_csv('unbalanced_reviews.tsv', sep='\t', header=None)

texts = df[4].astype(str).tolist()

all_sentences = []
for text in texts:
    norm_text = normalize_arabic(text)
    all_sentences.extend(split_into_sentences(norm_text))

clean_sentences = [s for s in all_sentences if 5 < len(s) < 100]
clean_sentences = list(set(clean_sentences))  # Remove duplicates
noisy_sentences = [corrupt_sentence(s) for s in clean_sentences]

print(f"Clean samples: {len(clean_sentences)}")

Clean samples: 2182769


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 510599 entries, 0 to 510598
Data columns (total 5 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   0       510599 non-null  int64 
 1   1       510599 non-null  int64 
 2   2       510599 non-null  int64 
 3   3       510599 non-null  int64 
 4   4       510599 non-null  object
dtypes: int64(4), object(1)
memory usage: 19.5+ MB


In [6]:
df.head()

Unnamed: 0,0,1,2,3,4
0,4,1682581870,57098525,13637412,ุตุฑุงุน ุงูุฌุฐูุฑ ูุงูุงูุชูุงุกุ ุนููุฉ ุณุงู ุงูุฎูุฒุงู ุชูุงุฆู ...
1,5,1682385404,56693085,13637412,ูุชุงุจ ุฑุงุฆุน. ุงุนุชูุฏ ุงู ุงูุฑูุงูู ูููุง ุชูุฎุตุช ุจุฌููู ู...
2,4,1682039752,30836455,13637412,ุฑูุงูุฉ ุชูุงูุณ ุงูุฑูุญ ุจุนูููุงุ ูุฎูุฑุฉ ุงูู ุงุฎูุฑุง ูููุช...
3,5,1681553886,6680940,13637412,ุฑูุงูุฉ ูุญููุฉ ุจูู ุงุฎุชุตุงุฑ. ููุงู ุงูุฌุฒุก ุงูููุถู ุจุงูู...
4,3,1681248984,19011044,13637412,ูุฐุง ุงููุชุงุจ ูุญุฒู ูุฑุงุ ุธูู ูุงุถุทูุงุฏ ุนูุณู ุจูุง ุฐูุจ ...


In [7]:
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(clean_sentences + noisy_sentences)

vocab_size = len(tokenizer.word_index) + 1
max_len = max(len(s) for s in clean_sentences)

def encode(sentences):
    sequences = tokenizer.texts_to_sequences(sentences)
    return pad_sequences(sequences, maxlen=max_len, padding='post')

X = encode(noisy_sentences)
Y = encode(clean_sentences)
Y = Y[..., None] 

In [8]:
embedding_dim = 300
lstm_units = 256
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len, input_shape=(max_len,)),
    Bidirectional(LSTM(lstm_units, return_sequences=True)),
    Dropout(0.3),
    Bidirectional(LSTM(128, return_sequences=True)),
    TimeDistributed(Dense(vocab_size, activation='softmax'))
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 99, 300)           50400     
_________________________________________________________________
bidirectional (Bidirectional (None, 99, 512)           1140736   
_________________________________________________________________
dropout (Dropout)            (None, 99, 512)           0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 99, 256)           656384    
_________________________________________________________________
time_distributed (TimeDistri (None, 99, 168)           43176     
Total params: 1,890,696
Trainable params: 1,890,696
Non-trainable params: 0
_________________________________________________________________


In [9]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.models import load_model

reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=2,
    min_lr=1e-6
)

early_stop = EarlyStopping(
    patience=5,
    restore_best_weights=True,
    monitor='val_accuracy',
    mode='max'
)

checkpoint = ModelCheckpoint(
    filepath='best_model.keras',
    monitor='val_accuracy',
    save_best_only=True,
    mode='max',
    verbose=1
)

model.fit(
    X_train, Y_train,
    epochs=1,
    batch_size=64,
    validation_data=(X_test, Y_test),
    callbacks=[checkpoint, early_stop, reduce_lr]
)


Epoch 00001: val_accuracy improved from -inf to 0.99774, saving model to best_model.keras


<keras.callbacks.History at 0x2160a1c3be0>

In [10]:
model = load_model('best_model.keras')

def decode_sequence(pred):
    pred_ids = np.argmax(pred, axis=-1)
    index_word = {v: k for k, v in tokenizer.word_index.items()}
    return ''.join([index_word.get(i, '') for i in pred_ids])

def autocorrect(input_text):
    input_text = normalize_arabic(input_text)
    seq = encode([input_text])
    pred = model.predict(seq)
    return decode_sequence(pred[0])

In [11]:
sentences_with_typos = [
    "ุฃูุง ุฃุฌุจ ุงููุฏุฑุณุฉ",     # ุฃุญุจ โ ุฃุฌุจ
    "ูู ููุฃ ูุชุงุจูุง",       # ููุฑุฃ โ ููุฃ
    "ุงูุญู ุฌููู ุงูููู",     # ุงูุฌู โ ุงูุญู
    "ุฐูุจุช ุฅูู ุงูุณูู",      # ุงูุณูู โ ุงูุณูู
    "ูู ุชุทุจุฎ ุงูุทุนุงู",      # ุงูุทุนุงู โ ุงูุทุนุงู
    "ุงููุท ูุญูุณ ููุงู",      # ูุฌูุณ โ ูุญูุณ
    "ุงูููุฏ ููุชุจ ุงููุงุฎุจ",   # ุงููุงุฌุจ โ ุงููุงุฎุจ
    "ุฃูู ุญุฎูุจุชูุ",         # ุญููุจุชู โ ุญุฎูุจุชู
    "ุฃูุง ูุชุนุจ ูุจุจูุงู",     # ููููุงู โ ูุจุจูุงู
    "ูุญู ููุนุจ ูู ุงูุฌุฏููุฉ", # ุงูุญุฏููุฉ โ ุงูุฌุฏููุฉ
    "ุงูุณูุงุก ุฒุฑูุงุก ุตุงูุจุฉ",  # ุตุงููุฉ โ ุตุงูุจุฉ
    "ูู ุฃููุช ุงููุจูุฑุ",     # ุงููุทูุฑ โ ุงููุจูุฑ
    "ุฃุฑูุฏ ุดุฑุจ ุงููุงุก",      # ุงููุงุก โ ุงููุงุก
    "ุงูุณูุงุฑุฉ ุณุฑูุนุฉ ุฌุฐุงู",  # ุฌุฏุงู โ ุฌุฐุงู
    "ุงูููุช ูุชุฃุญุฑ ุงูุขู",    # ูุชุฃุฎุฑ โ ูุชุฃุญุฑ
    "ุฃุญุจ ูุฑุฉ ุงููุฏู",       # ุงููุฏู โ ุงููุฏู
    "ูู ูุนูู ุจุญุฏ",         # ุจุฌุฏ โ ุจุญุฏ
    "ุงูุทูู ูุงุฆู",          # ูุงุฆู โ ูุงุฆู
    "ุฃูู ุงููุบุงุชูุญุ",       # ุงูููุงุชูุญ โ ุงููุบุงุชูุญ
    "ุงูุดุงุฑุน ูุฒุฐุญู"         # ูุฒุฏุญู โ ูุฒุฐุญู
]
for test_input in sentences_with_typos:
    print("Noisy input:    ", test_input)
    print("Autocorrected:  ", autocorrect(test_input))

Noisy input:     ุฃูุง ุฃุฌุจ ุงููุฏุฑุณุฉ
Autocorrected:   ุงูุง ุงุญุจ ุงููุฏุฑุณู
Noisy input:     ูู ููุฃ ูุชุงุจูุง
Autocorrected:   ูู ููุง ูุชุงุจุง
Noisy input:     ุงูุญู ุฌููู ุงูููู
Autocorrected:   ุงูุฌู ุฌููู ุงูููู
Noisy input:     ุฐูุจุช ุฅูู ุงูุณูู
Autocorrected:   ุฐูุจุช ุงูู ุงูุณูู
Noisy input:     ูู ุชุทุจุฎ ุงูุทุนุงู
Autocorrected:   ูู ุชุทุจุฎ ุงูุทุนุงู
Noisy input:     ุงููุท ูุญูุณ ููุงู
Autocorrected:   ุงููุท ูุญูุณ ููุงู
Noisy input:     ุงูููุฏ ููุชุจ ุงููุงุฎุจ
Autocorrected:   ุงูููุฏ ููุชุจ ุงููุงุญุจ
Noisy input:     ุฃูู ุญุฎูุจุชูุ
Autocorrected:   ุงูู ุญุฎูุจุชู
Noisy input:     ุฃูุง ูุชุนุจ ูุจุจูุงู
Autocorrected:   ุงูุง ูุชุนุจ ูุจุจูุง
Noisy input:     ูุญู ููุนุจ ูู ุงูุฌุฏููุฉ
Autocorrected:   ูุญู ููุนุจ ูู ุงูุญุฏููู
Noisy input:     ุงูุณูุงุก ุฒุฑูุงุก ุตุงูุจุฉ
Autocorrected:   ุงูุณูุงุก ุฒุฑูุงุก ุตุงู

In [12]:
from difflib import SequenceMatcher

def sentence_similarity(a, b):
    return SequenceMatcher(None, a, b).ratio()

scores = [sentence_similarity(clean, autocorrect(noisy)) for clean, noisy in zip(clean_sentences[:100], noisy_sentences[:100])]
print(f"Avg sentence similarity: {np.mean(scores):.4f}")

Avg sentence similarity: 0.9947


In [13]:
def evaluate_sentence_level_accuracy(clean_list, noisy_list):
    correct_count = 0
    total = len(clean_list)
    for clean, noisy in zip(clean_list, noisy_list):
        prediction = autocorrect(noisy)
        if prediction == clean:
            correct_count +=1
        print(clean)
        print(noisy)
    print(f"Sentence-level accuracy: {correct_count / total:.4f}")

evaluate_sentence_level_accuracy(clean_sentences[500:700], noisy_sentences[500:700])

ููุง ูุญู ููุฏูุง ุงูุณุงููุชูุง ููููุชูุง ูุตุฑูุง ุจูุง ูุถูู
ููุง ูุญู ููุฏูุง ุงูุณุงููุชูุง ููููุชูุง ูุตุฑูุง ุจูุก ูุถูู
ู ูููู ุงุญูุงูุง ููุณู ูุฐุง ุงููููุฌ ุญูู ูุณุชุฑุณู ูู ุงููุชุงุจู ู ูุณุฌ ุงูููุฑู
ู ูููู ุงุญูุงูุง ููุณู ูุฐุง ุงููููุฌ ุญูู ูุณุชุฑุณู ูู ุงููุชุงุจู ู ูุณุฌ ุงูููุฑู
ูุชุงุจ ุฑุงูุน ู ูููุฏ ู ุฌุงูุน ูููุณุฑ ู ุงูุจุณุงุทู ูู ุงูุทุฑุญ ู ูุถูุญ ุงูุงุณููุจ
ูุชุงุจ ุฒุงูุน ู ูููุฏ ู ุฌุงูุน ูููุณุฑ ู ุงูุจุณุงุทู ูู ุงูุทุฑุญ ู ูุถูุญ ุงูุงุณููุจ
ูุชุงุจ ุฌููู ุงูุตุญ ุจูุฑุงุกุชู ููุฎุต ุงููุซูุฑ ูู ุงูุงููุงุฑ
ูุชุงุจ ุฌููู ุงูุตุญ ุจูุฑุงุกุชู ููุฎุต ุงููุซูุฑ ูู ุงููููุงุฑ
ุจูุญุงุฐุงู ุงูุดุงุทู ูุตูุน ููุงุณูู ููู ุงูุงุจุญุงุฑ ูุตูุน ูุตู ุญูุงู
ุจูุญุงุฐุงู ุงูุดุงุทู ูุตูุน ููุงุณูู ููู ุงูุงุจุญุงุฑ ูุตูุน ูุตู ุญูุงู
ูู

In [14]:
def evaluate_autocorrect_model(clean_sentences, noisy_sentences, verbose=True):
    assert len(clean_sentences) == len(noisy_sentences), "Mismatched input lengths."
    
    sentence_correct = 0
    word_accuracies = []
    
    for clean, noisy in zip(clean_sentences, noisy_sentences):
        predicted = autocorrect(noisy)
        
        if predicted == clean:
            sentence_correct += 1
        
        clean_words = clean.split()
        predicted_words = predicted.split()
        correct_words = sum(1 for cw, pw in zip(clean_words, predicted_words) if cw == pw)
        word_accuracy = correct_words / max(len(clean_words), 1)
        word_accuracies.append(word_accuracy)

        print("๐ธ Noisy:     ", noisy)
        print("๐ Predicted:", predicted)
        print("โ Target:   ", clean)
        if predicted == clean and noisy != clean: 
            print("Corrected a mistake!")
        if predicted == clean and noisy == clean: 
            print("Correct, but no mistake was found")
        print()

    total = len(clean_sentences)
    print(f"\n๐ Sentence-level accuracy: {sentence_correct}/{total} = {sentence_correct/total:.2%}")
    print(f"๐ Avg. word-level accuracy: {np.mean(word_accuracies):.2%}")

In [19]:
sentence1 = "ุฃูุง ุฃุฌุจ ุงููุฏุฑุตุฉ"
sentence2 = "ุฃูุง ุฃุดุนุฑ ุจุงูุญูุน"
sentence3 = "ุงูุญุฏููุฉ ุญูููุฉ ุฌุฏูุง"
sentence4 = "ุฃุฏุฑุณ ุงููุบุฉ ุงูุนุฒุจูุฉ"
sentence5 = "ุฃุญุจ ุฃุชุบูู ุงูุจุฑูุฌุฉ"
sentence6 = "ุงูุณูุงุก ุฒุฒูุงุก ุตุงููุฉ"
sentence7 = "ูุบุงูุฌุฉ ุงููุบุงุช ุงูุทุจูุนูุฉ"
print(autocorrect(sentence1))

ุงูุง ุงุญุจ ุงููุฏุฑุณู


In [18]:
import tkinter as tk
def correct_sentence():
    input_text = entry.get()
    corrected = autocorrect(input_text)
    result_label.config(text=" ุงูุชุตุญูุญ: " + corrected)

root = tk.Tk()
root.title("Arabic Sentence Autocorrector")
root.geometry("500x200")

label = tk.Label(root, text="ุฃุฏุฎู ุงูุฌููุฉ ุงูุนุฑุจูุฉ:", font=("Arial", 14))
label.pack(pady=10)

entry = tk.Entry(root, font=("Arial", 14), justify='right')
entry.pack(fill='x', padx=20)

button = tk.Button(root, text="ุชุตุญูุญ", font=("Arial", 12), command=correct_sentence)
button.pack(pady=10)

result_label = tk.Label(root, text="", font=("Arial", 14), fg="green", wraplength=480, justify='right')
result_label.pack(pady=10)

root.mainloop()