# NLP Project (Arabic Dialect Classification)-ML

### Importing necessary libraries

In [None]:
import re
from functools import lru_cache

import emoji
import nltk
import numpy as np
import pandas as pd
import tashaphyne.normalize as normalize
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import precision_recall_fscore_support
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.linear_model import RidgeClassifier

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ihabn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Reading the data and labels files

In [None]:
# Load the tweet texts and the dialect labels from their CSV files, then
# join the two tables on the shared `id` column.
texts_path = r"C:\Users\ihabn\id_text.csv"
labels_path = r"C:\Users\ihabn\id_dialect.csv"
df1 = pd.read_csv(texts_path)
df2 = pd.read_csv(labels_path)
df = df1.merge(df2, on='id')
# Preview the first five merged rows.
df.head()

Unnamed: 0,id,text,dialect
0,1009754958479151232,@toha_Altomy @gy_yah قليلين ادب ومنافقين. لو ا...,LY
1,1009794751548313600,@AlmFaisal 😂😂 الليبيين متقلبين!!!\nبس بالنسبة ...,LY
2,1019989115490787200,@smsm071990 @ALMOGRBE كل 20 تانيه شاب ليبي بير...,LY
3,1035479791758135168,@AboryPro @lyranoo85 رانيا عقليتك متخلفة. اولا...,LY
4,1035481122921164800,@lyranoo85 شكلك متعقدة علشان الراجل لي تحبيه ا...,LY


### Our 5 Classes

In [None]:
# List the distinct dialect labels present in the merged data.
df['dialect'].unique()

array(['LY', 'MA', 'EG', 'LB', 'SD'], dtype=object)

### Preprocessing

##### Replace new lines from the data with a space

In [None]:
def replace_newlines(txt):
    """Return ``txt`` with every line-feed character swapped for one space."""
    pieces = txt.split('\n')
    return ' '.join(pieces)

##### Removing tags (@user) and any consecutive spaces

In [None]:
def remove_tag(txt):
    """Strip ``@mention`` tokens together with the whitespace that follows them.

    A mention is ``@`` followed by one or more word characters.
    """
    mention_pattern = re.compile(r'@\w+\s*')
    return mention_pattern.sub('', txt)

##### Remove links and any consecutive spaces

In [None]:
def remove_links(txt):
    """Drop every token that starts with ``http``/``https`` plus trailing whitespace."""
    link_pattern = re.compile(r'https?\S+\s*')
    return link_pattern.sub('', txt)

##### Removing English (Latin-letter) words and any whitespace that follows them

In [None]:
def remove_english(txt):
    """Delete runs of ASCII letters (a-z, A-Z) and any whitespace right after them."""
    latin_run = re.compile(r'[a-zA-Z]+\s*')
    return latin_run.sub('', txt)

##### Remove all emojis

In [None]:
def remove_emoji(txt):
    """Return ``txt`` with every emoji replaced by the empty string.

    Delegates to ``emoji.replace_emoji``; surrounding whitespace is left as is.
    """
    return emoji.replace_emoji(txt, replace='')

##### Remove punctuation marks and underscores <br>(these marks may also be used for old-style text emoticons)

In [None]:
def remove_punctuation(txt):
    """Remove underscores and every character that is neither a word character nor whitespace."""
    unwanted = re.compile(r'_|[^\w\s]')
    return unwanted.sub('', txt)

##### Normalize laughter sounds ("ههه", "هههه") to a single instance ("هه")

In [None]:
def map_laughter(txt):
    """Collapse any run of three or more consecutive 'ه' letters down to 'هه'."""
    long_laugh = re.compile(r'ه{3,}')
    return long_laugh.sub('هه', txt)

##### Remove repeated letters

In [None]:
def remove_repeated_letters(txt):
    """Collapse any character repeated three or more times in a row to a single copy.

    Runs of exactly two identical characters are kept unchanged.
    """
    def keep_first(match):
        return match.group(1)

    return re.sub(r'(.)\1\1+', keep_first, txt)

##### Remove numbers

In [None]:
def remove_numbers(txt):
    """Delete every decimal digit (anything matched by ``\\d``) from ``txt``."""
    digit = re.compile(r'\d')
    return digit.sub('', txt)

##### Character normalization

In [None]:
def normalize_arabic(txt):
    """Return ``txt`` after Tashaphyne's search-text normalization.

    Thin wrapper around ``tashaphyne.normalize.normalize_searchtext``.
    NOTE(review): presumably this strips diacritics/tatweel and unifies letter
    variants -- confirm against the Tashaphyne documentation.
    """
    normalized = normalize.normalize_searchtext(txt)
    return normalized

##### Remove stop words

In [None]:
def remove_stop_words(txt, stop_words):
    """Tokenize ``txt`` with NLTK and re-join the tokens that are not in ``stop_words``.

    Tokens are joined with single spaces, so the original spacing is not kept.
    """
    kept_tokens = (token for token in word_tokenize(txt) if token not in stop_words)
    return " ".join(kept_tokens)

##### Remove repeated spaces

In [None]:
def remove_repeated_spaces(txt):
    """Squash runs of two or more whitespace characters into one space and trim the ends.

    A lone whitespace character (e.g. a single tab) in the middle is left as is.
    """
    squashed = re.compile(r'\s{2,}').sub(' ', txt)
    return squashed.strip()

# Preprocessing function 

In [None]:
@lru_cache(maxsize=1)
def _normalized_arabic_stop_words():
    """Return NLTK's Arabic stop words, normalized, as a frozenset.

    Built once and cached: loading and normalizing the stop-word list for
    every document is loop-invariant work, and a set gives O(1) membership
    tests where the previous list gave O(n).
    """
    return frozenset(normalize_arabic(word) for word in stopwords.words('arabic'))


def Preprocessing(text):
    """Clean one raw tweet and return the normalized text.

    Steps, in order: newlines -> spaces, drop @mentions, drop links, drop
    ASCII-letter words, drop emojis, drop punctuation, shorten laughter,
    normalize Arabic characters, collapse repeated characters, drop digits,
    drop (normalized) Arabic stop words, and finally squash whitespace.

    Parameters
    ----------
    text : str
        The raw document.

    Returns
    -------
    str
        The cleaned document.
    """
    # The order matters: e.g. punctuation is removed before laughter mapping,
    # and normalization runs before repeated characters are collapsed.
    cleaning_steps = (
        replace_newlines,
        remove_tag,
        remove_links,
        remove_english,
        remove_emoji,
        remove_punctuation,
        map_laughter,
        normalize_arabic,
        remove_repeated_letters,
        remove_numbers,
    )
    for step in cleaning_steps:
        text = step(text)

    # Stop words are normalized the same way as the text so that they match.
    text = remove_stop_words(text, _normalized_arabic_stop_words())
    return remove_repeated_spaces(text)

# Data Preparation

In [None]:
# Encode each dialect code as an integer class id.
dialect_to_id = {'EG': 1, 'LY': 2, 'LB': 3, 'SD': 4, 'MA': 5}
df['dialect'] = df['dialect'].replace(dialect_to_id)

In [None]:
# The encoded dialect column is the classification target; call it `label`.
df = df.rename(columns={'dialect': 'label'})

In [None]:
# Features are the raw tweet texts; targets are the integer dialect ids.
X, y = df['text'], df['label']

# Hold out 20% of the rows for testing; the seed fixes the shuffle.
t_size = 0.20
random_state = 42
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=t_size,
    random_state=random_state,
    shuffle=True,
)

In [None]:
# Report the size of each split (features and targets).
print(f"Train shape: {X_train.shape} {y_train.shape}")
print(f"Test shape: {X_test.shape} {y_test.shape}")

Train shape: (118180,) (118180,)
Test shape: (29545,) (29545,)


In [None]:
# Map every class id found in the training labels to its relative frequency
# (class count / number of training labels), ordered by class id.
# Computed with pandas only, and for whatever labels are actually present,
# instead of assuming the ids are exactly 1..5.
# NOTE(review): these weights are proportional to class frequency, so the most
# common class gets the largest weight. Balancing schemes normally use the
# inverse frequency (e.g. class_weight='balanced') -- confirm this is intended.
class_weights_dict = y_train.value_counts(normalize=True).sort_index().to_dict()
class_weights_dict

{1: 0.39052293112201725,
 2: 0.2473430360467084,
 3: 0.18648671518023355,
 4: 0.0973261127094263,
 5: 0.07832120494161449}

# Pipeline with different classifiers

In [None]:
class PreprocessingClass(TransformerMixin, BaseEstimator):
    """Stateless scikit-learn transformer that cleans every document with ``Preprocessing``.

    Inherits ``BaseEstimator`` (listed last, after the mixin) so the step
    supports ``get_params``/``set_params``, cloning and a readable repr
    like the other estimators in the pipeline.
    """

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned; return ``self`` to satisfy the estimator API."""
        return self

    def transform(self, X, **transform_params):
        """Return a list holding the cleaned version of each text in ``X``."""
        return [Preprocessing(text) for text in X]

# Candidate classifiers, all trained on the same TF-IDF features.
# MultinomialNB is the only one that does not receive the class-weight mapping.
classifiers = {
    'LogisticRegression': LogisticRegression(max_iter=1000, class_weight=class_weights_dict),
    'MultinomialNB': MultinomialNB(),
    'LinearSVC': LinearSVC(max_iter=1000, class_weight=class_weights_dict),
    'RandomForestClassifier': RandomForestClassifier(class_weight=class_weights_dict),
}

# results: summary metrics per classifier (percentages, 2 decimals).
# pipeline_dict: full report, fitted pipeline and test predictions per classifier.
results, pipeline_dict = {}, {}

for name, clf in classifiers.items():
    pipeline = Pipeline(steps=[
        ('preprocessing', PreprocessingClass()),
        ('tfidf', TfidfVectorizer()),
        ('clf', clf),
    ])

    # Train on the training split, then predict the held-out split.
    pipeline.fit(X_train, y_train)
    y_pred_test = pipeline.predict(X_test)

    # Macro averages weigh every class equally, regardless of its support.
    report_test = classification_report(y_test, y_pred_test, output_dict=True)
    macro_avg = report_test['macro avg']
    accuracy_test = round(100 * report_test['accuracy'], 2)
    precision_test = round(100 * macro_avg['precision'], 2)
    recall_test = round(100 * macro_avg['recall'], 2)
    f1_test = round(100 * macro_avg['f1-score'], 2)

    results[name] = {
        'accuracy_test': accuracy_test,
        'precision_test': precision_test,
        'recall_test': recall_test,
        'f1_test-score': f1_test,
    }
    pipeline_dict[name] = {
        'Report_test': report_test,
        'pipeline': pipeline,
        'y_pred_test': y_pred_test,
    }

# One row per classifier, one column per metric.
df_Result = pd.DataFrame(results).T
print(df_Result)


                        accuracy_test  precision_test  recall_test  \
LogisticRegression              69.74           84.25        51.93   
MultinomialNB                   72.13           87.56        56.09   
LinearSVC                       81.06           85.42        72.30   
RandomForestClassifier          73.69           78.69        63.04   

                        f1_test-score  
LogisticRegression              54.29  
MultinomialNB                   60.12  
LinearSVC                       76.51  
RandomForestClassifier          67.34  


In [None]:
df_Result

Unnamed: 0,accuracy_test,precision_test,recall_test,f1_test-score
LogisticRegression,69.74,84.25,51.93,54.29
MultinomialNB,72.13,87.56,56.09,60.12
LinearSVC,81.06,85.42,72.3,76.51
RandomForestClassifier,73.69,78.69,63.04,67.34


# Linear SVC and Ridge classifier

In [None]:
class PreprocessingClass(TransformerMixin, BaseEstimator):
    """Stateless scikit-learn transformer that cleans every document with ``Preprocessing``.

    Inherits ``BaseEstimator`` (listed last, after the mixin) so the step
    supports ``get_params``/``set_params``, cloning and a readable repr
    like the other estimators in the pipeline.
    """

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned; return ``self`` to satisfy the estimator API."""
        return self

    def transform(self, X, **transform_params):
        """Return a list holding the cleaned version of each text in ``X``."""
        return [Preprocessing(text) for text in X]


# Compare the two linear models on the same TF-IDF features,
# both using the class-weight mapping.
classifiers = {
    'LinearSVC': LinearSVC(max_iter=1000, class_weight=class_weights_dict),
    'Ridge': RidgeClassifier(class_weight=class_weights_dict),
}

# results: summary metrics per classifier (percentages, 2 decimals).
# pipeline_dict: full report, fitted pipeline and test predictions per classifier.
results, pipeline_dict = {}, {}

for name, clf in classifiers.items():
    pipeline = Pipeline(steps=[
        ('preprocessing', PreprocessingClass()),
        ('tfidf', TfidfVectorizer()),
        ('clf', clf),
    ])

    # Train on the training split, then predict the held-out split.
    pipeline.fit(X_train, y_train)
    y_pred_test = pipeline.predict(X_test)

    # Macro averages weigh every class equally, regardless of its support.
    report_test = classification_report(y_test, y_pred_test, output_dict=True)
    macro_avg = report_test['macro avg']
    accuracy_test = round(100 * report_test['accuracy'], 2)
    precision_test = round(100 * macro_avg['precision'], 2)
    recall_test = round(100 * macro_avg['recall'], 2)
    f1_test = round(100 * macro_avg['f1-score'], 2)

    results[name] = {
        'accuracy_test': accuracy_test,
        'precision_test': precision_test,
        'recall_test': recall_test,
        'f1_test-score': f1_test,
    }
    pipeline_dict[name] = {
        'Report_test': report_test,
        'pipeline': pipeline,
        'y_pred_test': y_pred_test,
    }

# One row per classifier, one column per metric.
df_Result = pd.DataFrame(results).T
df_Result


Unnamed: 0,accuracy_test,precision_test,recall_test,f1_test-score
LinearSVC,81.06,85.42,72.3,76.51
Ridge,74.93,85.97,61.03,65.91


## **Mazaj Vectorizer with Linear SVC**

In [None]:
class PreprocessingClass(TransformerMixin, BaseEstimator):
    """Stateless scikit-learn transformer that cleans every document with ``Preprocessing``.

    Inherits ``BaseEstimator`` (listed last, after the mixin) so the step
    supports ``get_params``/``set_params``, cloning and a readable repr
    like the other estimators in the pipeline.
    """

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned; return ``self`` to satisfy the estimator API."""
        return self

    def transform(self, X, **transform_params):
        """Return a list holding the cleaned version of each text in ``X``."""
        return [Preprocessing(text) for text in X]


import numpy as np

class WordEmbeddingTransformer(TransformerMixin, BaseEstimator):
    """Turn whitespace-tokenized texts into fixed-length sequences of word vectors.

    Parameters
    ----------
    embedding_model
        Object supporting ``word in model``, ``model[word]`` and a
        ``vector_size`` attribute.
        NOTE(review): presumably a gensim ``KeyedVectors`` -- confirm at the call site.
    max_sequence_length : int
        Number of token positions kept per text; longer texts are truncated,
        shorter ones are zero-padded at the end.
    """

    def __init__(self, embedding_model, max_sequence_length):
        self.embedding_model = embedding_model
        self.max_sequence_length = max_sequence_length

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned; return ``self`` to satisfy the estimator API."""
        return self

    def transform(self, X, **transform_params):
        """Embed every text in ``X``.

        Returns
        -------
        numpy.ndarray
            Array of shape ``(len(X), max_sequence_length, vector_size)``.
            Out-of-vocabulary words and padding positions are all-zero rows.
        """
        texts = list(X)
        seq_len = self.max_sequence_length
        dim = self.embedding_model.vector_size

        # Preallocate the zero-filled result and write only the known words.
        # This avoids building a nested Python list of every vector first,
        # and also makes an empty input yield an empty array instead of an
        # error from stacking zero sequences.
        embedded_X = np.zeros((len(texts), seq_len, dim), dtype=np.float64)
        for row, text in enumerate(texts):
            # Slicing performs the truncation; untouched positions stay zero,
            # which is exactly the padding / out-of-vocabulary value.
            for col, word in enumerate(text.split()[:seq_len]):
                if word in self.embedding_model:
                    embedded_X[row, col] = self.embedding_model[word]
        return embedded_X


class FlattenTransformer(TransformerMixin, BaseEstimator):
    """Stateless transformer that flattens each sample to a single feature row.

    Inherits ``BaseEstimator`` (listed last, after the mixin) so the step
    supports ``get_params``/``set_params``, cloning and a readable repr.
    """

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned; return ``self`` to satisfy the estimator API."""
        return self

    def transform(self, X, **transform_params):
        """Reshape ``X`` to ``(X.shape[0], -1)``, keeping the first axis as samples."""
        return X.reshape(X.shape[0], -1)


# Linear SVC trained on flattened word-embedding sequences instead of TF-IDF.
classifiers = {
    'LinearSVC': LinearSVC(class_weight=class_weights_dict)
}

# Keep this experiment's outcomes in their own containers. Re-using `results`
# and `pipeline_dict` here would replace the TF-IDF entries, and the later
# evaluation/saving cells read pipeline_dict['LinearSVC'] expecting the TF-IDF
# model that the notebook states was selected.
mazajak_results = {}
mazajak_pipeline_dict = {}

for name, clf in classifiers.items():
    # NOTE(review): `mazajak_model` is not defined in the visible notebook;
    # it must be loaded before this cell runs.
    pipeline = Pipeline([
        ('preprocessing', PreprocessingClass()),
        ('embedding', WordEmbeddingTransformer(mazajak_model, 20)),
        ('flatten', FlattenTransformer()),
        ('clf', clf)
    ])

    # Fit the pipeline on the training data
    pipeline.fit(X_train, y_train)

    # Predict on the test set
    y_pred_test = pipeline.predict(X_test)

    # Evaluate the performance on the test set (macro averages, as percentages)
    report_test = classification_report(y_test, y_pred_test, output_dict=True)
    accuracy_test = round(report_test['accuracy'] * 100, 2)
    precision_test = round(report_test['macro avg']['precision'] * 100, 2)
    recall_test = round(report_test['macro avg']['recall'] * 100, 2)
    f1_test = round(report_test['macro avg']['f1-score'] * 100, 2)
    mazajak_results[name] = {
        'accuracy_test': accuracy_test,
        'precision_test': precision_test,
        'recall_test': recall_test,
        'f1_test-score': f1_test
    }
    mazajak_pipeline_dict[name] = {
        'Report_test': report_test,
        'pipeline': pipeline,
        'y_pred_test': y_pred_test
    }

df_Result = pd.DataFrame(mazajak_results).transpose()
print(df_Result)




           accuracy_test  f1_test-score  precision_test  recall_test
LinearSVC          77.61          72.23           77.51        69.44


## `From Results we selected Linear SVC with TF-IDF vectorizer`

# Linear SVC Evaluation

In [None]:
# The targets were encoded as EG=1, LY=2, LB=3, SD=4, MA=5, and the report
# lists its rows in the order of `labels`. The display names must therefore
# follow the encoding order, not alphabetical order; otherwise the LY/LB and
# SD/MA rows are shown under each other's names.
label_ids = [1, 2, 3, 4, 5]
class_names = ['EG', 'LY', 'LB', 'SD', 'MA']
# Evaluate the pipeline on the test data
y_pred = pipeline_dict['LinearSVC']['pipeline'].predict(X_test)
# Dict form of the report, kept for programmatic access.
report = classification_report(
    y_test, y_pred, labels=label_ids, target_names=class_names, output_dict=True
)
# Text form of the same report, for display.
print(classification_report(y_test, y_pred, labels=label_ids, target_names=class_names))


              precision    recall  f1-score   support

          EG       0.78      0.94      0.85     11484
          LB       0.79      0.81      0.80      7268
          LY       0.86      0.81      0.84      5578
          MA       0.90      0.48      0.62      2932
          SD       0.94      0.57      0.71      2283

    accuracy                           0.81     29545
   macro avg       0.85      0.72      0.77     29545
weighted avg       0.82      0.81      0.80     29545



#### Test the model on random sentences (showing the original sentence)

In [None]:
import random
def evaluate_random_sentences(pipeline, X_test, y_test, num_sentences):
    """Print the true and predicted label for a random sample of test sentences.

    Parameters
    ----------
    pipeline
        Fitted object exposing ``predict``.
    X_test, y_test
        Positionally indexable (``.iloc``) texts and their labels.
    num_sentences : int
        How many distinct rows to sample; ``random.sample`` raises
        ``ValueError`` if this exceeds ``len(X_test)``.
    """
    # Sample distinct row positions so texts and labels stay aligned.
    chosen_positions = random.sample(range(len(X_test)), num_sentences)
    sampled_texts = X_test.iloc[chosen_positions]
    sampled_labels = y_test.iloc[chosen_positions]
    predictions = pipeline.predict(sampled_texts)

    separator = "------------------------"
    for sentence, truth, guess in zip(sampled_texts, sampled_labels, predictions):
        print("Random sentence:", sentence)
        print("True label:", truth)
        print("Predicted label:", guess)
        print(separator)

num_sentences = 5
# Use the stored LinearSVC pipeline -- the same object that is evaluated,
# visualized and saved elsewhere in this notebook -- rather than the leftover
# loop variable `pipeline`, which is simply whichever model was fitted last.
evaluate_random_sentences(
    pipeline_dict['LinearSVC']['pipeline'], X_test, y_test, num_sentences
)


Random sentence: @aliimortada @amer__h 😏😏😏😏😏 ليه كرمال نبطل رعاع
True label: 3
Predicted label: 3
------------------------
Random sentence: @beINSPORTS_news يريت تحولو الشريط الاحمر يلي يطلع كل شويه كرهتونا في الكوره جددنا بثمن غالي و المعلقين يحكو علي السرقه معش يعلقو عالمباراه .والسلام عليكم
True label: 2
Predicted label: 2
------------------------
Random sentence: @ahmedyassine30 @FatimahGhamlush 😅😅 بغيبتك عم ندير بالنا عالحبايب 😶
True label: 3
Predicted label: 3
------------------------
Random sentence: قولتولي الأولتراس كانوا بيخلطوا الكورة بالسياسة مش كده ... 

😂😂😂😂 https://t.co/pXxIrQkIns
True label: 1
Predicted label: 1
------------------------
Random sentence: هو ده 
 هو ده 
هو ده الحضن الحقيقى https://t.co/cnPbnzWbHa
True label: 2
Predicted label: 1
------------------------


# Pipeline Visualization

In [None]:
# Display the stored LinearSVC pipeline (the notebook renders its repr).
selected_entry = pipeline_dict['LinearSVC']
selected_entry['pipeline']


# Saving The Model

In [None]:
import joblib

# Persist the stored LinearSVC pipeline to disk.
# NOTE(review): the pipeline contains PreprocessingClass, which is defined in
# this notebook; whoever loads the file presumably needs that class (and
# Preprocessing) defined or importable too -- confirm before sharing the file.
model_path = 'model1.pkl'
joblib.dump(pipeline_dict['LinearSVC']['pipeline'], model_path)


['model1.pkl']

# Loading The Model

In [None]:
# Restore the pipeline written by the saving step above.
# NOTE: joblib files are pickle-based; only load files from a trusted source.
loaded_pipeline = joblib.load(filename='model1.pkl')