In [5]:
# imports lib
import re
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Conv1D, MaxPooling1D, Flatten

In [6]:
# Define the ArabicCleaning class for text preprocessing
class ArabicCleaning():
    """Reduce raw text to whitespace-separated words of Arabic-script characters.

    The patterns are compiled once at class level so repeated calls to
    ``clean`` (e.g. through ``Series.apply``) do not rebuild them.
    """

    # Matches a single character inside the Arabic-script Unicode ranges.
    _ARABIC_CHAR = re.compile(r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]')
    # Matches anything that is not a word character, whitespace or Arabic script.
    _NON_ARABIC_SYMBOL = re.compile(r'[^\w\s\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]')
    # ASCII digits and Latin letters are word characters, so they need their own pass.
    _ASCII_ALNUM = re.compile(r'[0-9a-zA-Z]')
    # Arabic punctuation that lies inside the Arabic ranges but must not be kept.
    _EXCLUDED_MARKS = ("؟", "؛", "،")

    def __init__(self):
        pass

    def clean(self, text):
        """Return ``text`` with everything but Arabic-script words removed.

        Processing steps, in order:
          1. drop every whole word that contains ``#`` or ``_``;
          2. delete characters that are neither word characters, whitespace
             nor Arabic script;
          3. delete ASCII digits and Latin letters;
          4. within each remaining word keep only Arabic-range characters,
             excluding the Arabic question mark, semicolon and comma;
          5. discard words that became empty and join the rest with single
             spaces.

        Args:
            text: The raw string to clean. Any non-string value (``None``,
                a float NaN, ...) is treated as missing text.

        Returns:
            The cleaned string, or ``''`` when nothing is left or ``text``
            is not a string.
        """
        # Missing values would otherwise fail on ``.split()``.
        if not isinstance(text, str):
            return ''

        kept_words = [word for word in text.split() if '#' not in word and '_' not in word]
        text = ' '.join(kept_words)
        text = self._NON_ARABIC_SYMBOL.sub('', text)
        text = self._ASCII_ALNUM.sub('', text)

        cleaned_words = []
        for word in text.split():
            arabic_only = ''.join(
                char for char in word
                if self._ARABIC_CHAR.match(char) and char not in self._EXCLUDED_MARKS
            )
            # A word made only of excluded or non-Arabic characters collapses to
            # '' here; skipping it avoids doubled, leading or trailing spaces.
            if arabic_only:
                cleaned_words.append(arabic_only)

        return ' '.join(cleaned_words)



In [29]:
import pandas as pd
import numpy as np
# Widen pandas' display limits so long rows/texts are not truncated when inspected.
for display_option in ('display.max_rows', 'display.max_colwidth'):
    pd.set_option(display_option, 500)

# Load the already-cleaned dataset; later cells read its 'text' and 'dialect' columns.
df = pd.read_csv('cleaned_data.csv')

In [48]:
# Tokenization: build the word index from every text in the dataset.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['text'])

# Label Encoding: map each dialect label to an integer class id.
encoder = LabelEncoder()
encoder.fit(df['dialect'])

# Persist both fitted objects so they can be reloaded without refitting.
for fitted_object, pickle_path in ((tokenizer, 'tokenizer.pkl'), (encoder, 'encoder.pkl')):
    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump(fitted_object, pickle_file)


In [14]:
# Reload the fitted tokenizer and label encoder from disk.
# NOTE: unpickling executes arbitrary code, so only load pickle files that
# this notebook (or another trusted source) produced.
with open('tokenizer.pkl', 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

with open('encoder.pkl', 'rb') as encoder_file:
    encoder = pickle.load(encoder_file)

# Features: texts -> integer sequences -> one padded 2-D array.
X = pad_sequences(tokenizer.texts_to_sequences(df['text']))

# Targets: dialect labels -> integer class ids.
y = encoder.transform(df['dialect'])

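**Note:** the tokenizer and label encoder above are persisted with `pickle`. See https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations for the security and maintainability limitations of this approach (only unpickle files from a trusted source, and reload them with the same library versions that created them).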


In [1]:
# Hold out 20% of the samples, then split that hold-out in half to obtain
# the validation and test sets; the fixed seed makes both splits repeatable.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    random_state=42,
    test_size=0.2,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    random_state=42,
    test_size=0.5,
)


In [13]:
# Report the feature/label array shapes of every split as a sanity check.
for split_name, split_features, split_labels in (
    ('train', X_train, y_train),
    ('val', X_val, y_val),
    ('test', X_test, y_test),
):
    print(f'X_{split_name} shape:', split_features.shape)
    print(f'y_{split_name} shape:', split_labels.shape)


X_train shape: (76510, 65)
y_train shape: (76510,)
X_val shape: (9564, 65)
y_val shape: (9564,)
X_test shape: (9564, 65)
y_test shape: (9564,)


# **ANN**

In [14]:
# Model Definition
# Size the output layer from the fitted label encoder so the network emits
# one probability per dialect class instead of assuming exactly five.
num_classes = len(encoder.classes_)

model = Sequential()
# input_dim is vocabulary size + 1 because sequences are padded with index 0.
model.add(Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=100, input_length=X.shape[1]))
# Flatten the (sequence, embedding) output into one vector for the dense layers.
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))

# Model Compilation
# The sparse loss is used because y holds integer class ids, not one-hot vectors.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Model Training
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10, batch_size=1024)

# Evaluate the model on the held-out test split
loss, accuracy = model.evaluate(X_test, y_test)
print("Loss:", loss)
print("Accuracy:", accuracy)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Loss: 0.5082119703292847
Accuracy: 0.865432858467102


# **LSTM**

In [15]:
# Model Definition
# Size the output layer from the fitted label encoder so the network emits
# one probability per dialect class instead of assuming exactly five.
num_classes = len(encoder.classes_)

model = Sequential()
# input_dim is vocabulary size + 1 because sequences are padded with index 0.
model.add(Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=100, input_length=X.shape[1]))
# A single LSTM layer reads the embedded sequence; only its last output feeds the classifier.
model.add(LSTM(128))
model.add(Dense(num_classes, activation='softmax'))

# Model Compilation
# The sparse loss is used because y holds integer class ids, not one-hot vectors.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Model Training
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10, batch_size=1024)

# Evaluate the model on the held-out test split
loss, accuracy = model.evaluate(X_test, y_test)
print("Loss:", loss)
print("Accuracy:", accuracy)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Loss: 0.5292786359786987
Accuracy: 0.8655374050140381


# **CNN WITH LSTM**

In [16]:
# Model Definition
# Size the output layer from the fitted label encoder so the network emits
# one probability per dialect class instead of assuming exactly five.
num_classes = len(encoder.classes_)

model = Sequential()
# input_dim is vocabulary size + 1 because sequences are padded with index 0.
model.add(Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=100, input_length=X.shape[1]))
# Convolution over windows of 5 tokens, then pooling to shorten the sequence
# before it is passed to the LSTM.
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPooling1D(5))
model.add(LSTM(128))
model.add(Dense(num_classes, activation='softmax'))

# Model Compilation
# The sparse loss is used because y holds integer class ids, not one-hot vectors.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Model Training
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10, batch_size=1024)

# Evaluate the model on the held-out test split
loss, accuracy = model.evaluate(X_test, y_test)
print("Loss:", loss)
print("Accuracy:", accuracy)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Loss: 0.7656872272491455
Accuracy: 0.8403387665748596
