In [97]:
#import nessery libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn import metrics
import time
import tensorflow as tf
import spacy

In [98]:
#inspect our data 
pd.set_option('display.max_colwidth', 100000)
pd.set_option("display.min_rows", 20)
text_data = pd.read_csv('text_ArabDialect_dataset.csv')
text_data.head()

Unnamed: 0,id,dialect,text
0,1175358310087892992,IQ,لكن بالنهاية ينتفض يغير
1,1175416117793349632,IQ,يعني هذا محسوب على البشر حيونه ووحشيه وتطلبون من الغرب يحترمكم ويؤمن بدينكم ولاينعتكم بالإرهاب
2,1175450108898565888,IQ,مبين من كلامه خليجي
3,1175471073770573824,IQ,يسلملي مرورك وروحك الحلوه
4,1175496913145217024,IQ,وين هل الغيبه اخ محمد


In [99]:
text_data.isna().sum()

id         0
dialect    0
text       1
dtype: int64

In [100]:
text_data[text_data['text'].isna() == True]

Unnamed: 0,id,dialect,text
212439,1173616403557081088,SA,


In [101]:
#because the jupyter notebook not read any nan values so I will erase this nan here in colab in this file not in preprosessing file
text_data.dropna(axis=0,inplace=True)

In [102]:
text_data.isna().sum()

id         0
dialect    0
text       0
dtype: int64

In [103]:
x = text_data['text']
y = text_data['dialect']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.2, stratify=y, random_state=42)

In [104]:
x_train.head()

316364                                                          طيب إقص ريال مدريد بس شو ذنب هالكتكوت
132713     الي بدوس على قلبها وبتحكم عقلها وبتفوز اخر شي بتكون قويه محد بيقدر يهزمها  اقويانواعالنساء
106402                                                                           مش بلوزة هاي يا هضبة
190381                                                                  شيلات شنو معنى شيلات بالخليجي
251861    شايفين نفسهم علينا وفاكرينها شطارة بكرة يندموا ويبتدوها من الصفر بس ساعتها اوعى تعملهم فولو
Name: text, dtype: object

## Classification Models
- SVM classifier when using word n-grams
- SGD classifier when using a Tf-idf Vectorizer
- RNN with contextualized embeddings

In [None]:
#intiate the function that apply different accurcy metric on prdicted data 
def print_report(pipe, x_test, y_test):
    y_pred = pipe.predict(x_test)
    report = metrics.classification_report(y_test, y_pred)
    print(report)
    print("accuracy: {:0.3f}".format(metrics.accuracy_score(y_test, y_pred)))

### SVM classifier when using a combination of character and word n-grams
**The logic behined this approche is that there is uniqe words and expersion that can identify delict easily so using word n-gram will help model to focus in this important featuer**

In [None]:
#intiate word-ngram BOW for embedding and svm for training model and fit them in pipeline 
# also calculate the time that model need to fit for comparison porpose 

# starting time
start = time.time()
vec = CountVectorizer(analyzer='word', ngram_range=(1, 3))
clf = LinearSVC(max_iter=1500)
svm_wordNgram_pipe = make_pipeline(vec, clf)
svm_wordNgram_pipe.fit(x_train, y_train)

# end time
end = time.time()

# total time taken
print(f"Runtime of the Model to fit {end - start}")

Runtime of the Model to fit 1595.9231142997742




In [None]:
print_report(svm_wordNgram_pipe, x_test, y_test)

              precision    recall  f1-score   support

          AE       0.43      0.41      0.42      5259
          BH       0.38      0.33      0.36      5258
          DZ       0.63      0.53      0.57      3237
          EG       0.69      0.86      0.76     11527
          IQ       0.63      0.55      0.58      3099
          JO       0.42      0.35      0.38      5584
          KW       0.48      0.57      0.52      8422
          LB       0.61      0.68      0.64      5524
          LY       0.63      0.71      0.67      7300
          MA       0.76      0.60      0.67      2308
          OM       0.41      0.36      0.38      3823
          PL       0.47      0.55      0.51      8749
          QA       0.49      0.45      0.47      6214
          SA       0.42      0.42      0.42      5367
          SD       0.71      0.55      0.62      2887
          SY       0.47      0.32      0.38      3248
          TN       0.70      0.43      0.53      1849
          YE       0.40    

In [None]:
import joblib
joblib.dump(svm_wordNgram_pipe, 'svm_wordNgram_pipe.pkl')

['svm_wordNgram_pipe.pkl']

### SGD classifier when using a Tf-idf Vectorizer
**The logic behined this approche is making th unique words that identify delict easily and have less repetation so we need to make this words or expersion have more weight than others so tf-idf will be very good for it also for data training model sgd is the stokastic gradiant desent it's an optomization function used to reach to optimal is stokastic way but here in the implementation in sklearn it can implement loss function of linear calssification model like linearSVM and also this model is very good in working with large-scale and sparse problem like our problem so it solve the converge problem in pervious linearSVM model...also applaied it in grid search to tune the hyperparameters like max_df that specifiy the word considration until specific frequancy and max featuers that you can take**

In [None]:
#just for getting the number of combination in our gridsearch to estimate if it will take very long time or not
from sklearn.model_selection import ParameterGrid
parameters = {'tfidf__ngram_range': [(1, 2),(1, 3)],
              'tfidf__max_features': [500, 1000, None],
              'tfidf__norm': ('l1', 'l2'),
              }
gd = ParameterGrid(parameters)
len(gd)

12

In [None]:
# starting time
start = time.time()

sgd_wordNgram_pipe = Pipeline([
  ('tfidf', TfidfVectorizer()),
  ('sgd', SGDClassifier())
  ])

# sgd_wordNgram_pipe = make_pipeline(tfidf, sgd)
parameters = {'tfidf__ngram_range': [(1, 2),(1, 3)],
              'tfidf__max_features': [500, 1000, None],
              'tfidf__norm': ('l1', 'l2'),
              }

grid = GridSearchCV(sgd_wordNgram_pipe, parameters, cv=2, verbose=1)
grid.fit(x_train, y_train)
# end time
end = time.time()

# total time taken
print(f"Runtime of the Model to fit {end - start}")

Fitting 2 folds for each of 12 candidates, totalling 24 fits
Runtime of the Model to fit 604.7531547546387


In [None]:
grid.best_estimator_

Pipeline(steps=[('tfidf', TfidfVectorizer(ngram_range=(1, 2))),
                ('sgd', SGDClassifier())])

In [None]:
print_report(grid.best_estimator_, x_test, y_test)

              precision    recall  f1-score   support

          AE       0.47      0.39      0.43      5259
          BH       0.46      0.27      0.34      5258
          DZ       0.57      0.51      0.54      3237
          EG       0.51      0.93      0.66     11527
          IQ       0.60      0.56      0.58      3099
          JO       0.48      0.26      0.34      5584
          KW       0.48      0.60      0.53      8422
          LB       0.52      0.73      0.61      5524
          LY       0.60      0.67      0.63      7300
          MA       0.64      0.61      0.62      2308
          OM       0.51      0.31      0.38      3823
          PL       0.52      0.46      0.49      8749
          QA       0.47      0.49      0.48      6214
          SA       0.47      0.39      0.42      5367
          SD       0.57      0.41      0.47      2887
          SY       0.51      0.21      0.30      3248
          TN       0.68      0.39      0.50      1849
          YE       0.55    

In [None]:
import joblib
joblib.dump(grid.best_estimator_, 'sgd_wordNgram_pipe.pkl')

['sgd_wordNgram_pipe.pkl']

### RNN with contextualized embeddings

In [120]:
from tensorflow.keras.layers import GRU, Embedding, Dense, Input, Dropout, Bidirectional
from tensorflow.keras.models import Sequential
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from sklearn.utils import shuffle

In [121]:
tknzr = Tokenizer(lower=True, split=" ")
tknzr.fit_on_texts(x)

#making sequences:
X_rnn = tknzr.texts_to_sequences(x)
X_rnn = pad_sequences(X_rnn, padding='post', value=0)

In [122]:
y_rnn = pd.get_dummies(y)
y_rnn.shape

(458196, 18)

In [126]:
model = tf.keras.models.Sequential([    
    tf.keras.layers.Embedding(len(tknzr.word_index), 64),
    tf.keras.layers.SimpleRNN(64),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(18, activation='sigmoid')
])

In [127]:
model.compile(loss='binary_crossentropy',optimizer=tf.keras.optimizers.Adam(),
              metrics=['accuracy'])

In [129]:
model.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_7 (Embedding)     (None, None, 64)          32931648  
                                                                 
 simple_rnn_5 (SimpleRNN)    (None, 64)                8256      
                                                                 
 dense_14 (Dense)            (None, 64)                4160      
                                                                 
 dense_15 (Dense)            (None, 18)                1170      
                                                                 
Total params: 32,945,234
Trainable params: 32,945,234
Non-trainable params: 0
_________________________________________________________________


In [None]:
model.fit(X_rnn, y_rnn, validation_split = 0.1, epochs = 3, batch_size= 128, shuffle = True)

Epoch 1/3
 602/3222 [====>.........................] - ETA: 15:49 - loss: 0.2123 - accuracy: 0.1283

In [None]:
test_loss, test_acc = model.evaluate(x_test_padded, y_test)

print('Test Loss: {}'.format(test_loss))
print('Test Accuracy: {}'.format(test_acc))