In [None]:
!nvidia-smi

In [1]:
!pip install Arabic-Stopwords
!pip install arabic_reshaper
!pip install python-bidi
!pip install tensorflow-addons #to use f1 score in complie's metrics

Collecting Arabic-Stopwords
  Downloading Arabic_Stopwords-0.3-py3-none-any.whl (353 kB)
[K     |████████████████████████████████| 353 kB 4.8 MB/s 
[?25hCollecting pyarabic>=0.6.2
  Downloading PyArabic-0.6.14-py3-none-any.whl (126 kB)
[K     |████████████████████████████████| 126 kB 38.0 MB/s 
Installing collected packages: pyarabic, Arabic-Stopwords
Successfully installed Arabic-Stopwords-0.3 pyarabic-0.6.14
Collecting arabic_reshaper
  Downloading arabic_reshaper-2.1.3-py3-none-any.whl (20 kB)
Installing collected packages: arabic-reshaper
Successfully installed arabic-reshaper-2.1.3
Collecting python-bidi
  Downloading python_bidi-0.4.2-py2.py3-none-any.whl (30 kB)
Installing collected packages: python-bidi
Successfully installed python-bidi-0.4.2
Collecting tensorflow-addons
  Downloading tensorflow_addons-0.15.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 4.3 MB/s 
Installing collected packages: tensorflo

In [2]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM
import tensorflow_addons as tfa
from numpy import array
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score

In [3]:
#read files of first dataset (MSA)
res_df=pd.read_csv('https://raw.githubusercontent.com/hadyelsahar/large-arabic-sentiment-analysis-resouces/master/datasets/RES.csv')
prod_df=pd.read_csv('https://raw.githubusercontent.com/hadyelsahar/large-arabic-sentiment-analysis-resouces/master/datasets/PROD.csv')
htl_df=pd.read_csv('https://raw.githubusercontent.com/hadyelsahar/large-arabic-sentiment-analysis-resouces/master/datasets/HTL.csv')
mov_df=pd.read_csv('https://raw.githubusercontent.com/hadyelsahar/large-arabic-sentiment-analysis-resouces/master/datasets/MOV.csv')

In [4]:
#keep only binary classes (pos & neg)
res_df=res_df[res_df['polarity']!=0].reset_index(drop=True)
prod_df=prod_df[prod_df['polarity']!=0].reset_index(drop=True)
htl_df=htl_df[htl_df['polarity']!=0].reset_index(drop=True)
mov_df=mov_df[mov_df['polarity']!=0].reset_index(drop=True)

In [5]:
datasetDict = {"resturants": res_df, "products": prod_df, "hotels": htl_df, "movies": mov_df}

In [6]:
# convert neg label from -1 to 0
for k, v in datasetDict.items():
  v['polarity'].replace({-1: 0}, inplace=True)

In [7]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
import re
import string
import arabicstopwords.arabicstopwords as ar_words

ar_sw=['إذ', 'إذا', 'إذما', 'إذن', 'أف', 'أقل', 'أكثر', 'ألا', 'إلا', 'التي', 'الذي', 'الذين', 'اللاتي', 'اللائي', 'اللتان', 'اللتيا', 'اللتين', 'اللذان', 'اللذين', 'اللواتي', 'إلى', 'إليك', 'إليكم', 'إليكما', 'إليكن', 'أم', 'أما', 'أما', 'إما', 'أن', 'إن', 'إنا', 'أنا', 'أنت', 'أنتم', 'أنتما', 'أنتن', 'إنما', 'إنه', 'إنها', 'أنى', 'أنى', 'آه', 'آها', 'أو', 'أولاء', 'أولئك', 'أوه', 'آي', 'أي', 'أيها', 'إي', 'أين', 'أين', 'أينما', 'إيه', 'بخ', 'بس', 'بعد', 'بعض', 'بك', 'بكم', 'بكم', 'بكما', 'بكن', 'بل', 'بلى', 'بما', 'بماذا', 'بمن', 'بنا', 'به', 'بها', 'بهم', 'بهما', 'بهن', 'بي', 'بين', 'بيد', 'تلك', 'تلكم', 'تلكما', 'ته', 'تي', 'تين', 'تينك', 'ثم', 'ثمة', 'حاشا', 'حبذا', 'حتى', 'حيث', 'حيثما', 'حين', 'خلا', 'دون', 'ذا', 'ذات', 'ذاك', 'ذان', 'ذانك', 'ذلك', 'ذلكم', 'ذلكما','كان','كانت', 'ذلكن', 'ذه', 'ذو', 'ذوا', 'ذواتا', 'ذواتي', 'ذي', 'ذين', 'ذينك', 'ريث', 'سوف', 'سوى', 'شتان', 'عدا', 'عسى', 'عل', 'على', 'عليك', 'عليه', 'عما', 'عن', 'عند', 'غير', 'فإذا', 'فإن', 'فلا', 'فمن', 'في', 'فيم', 'فيما', 'فيه', 'فيها', 'قد', 'كأن', 'كأنما', 'كأي', 'كأين', 'كذا', 'كذلك', 'كل', 'كلا', 'كلاهما', 'كلتا', 'كلما', 'كليكما', 'كليهما', 'كم', 'كم', 'كما', 'كي', 'كيت', 'كيف', 'كيفما', 'لا', 'لاسيما', 'لدى', 'لست', 'لستم', 'لستما', 'لستن', 'لسن', 'لسنا', 'لعل', 'لك', 'لكم', 'لكما', 'لكن', 'لكنما', 'لكي', 'لكيلا', 'لم', 'لما', 'لن', 'لنا', 'له', 'لها', 'لهم', 'لهما', 'لهن', 'لو', 'لولا', 'لوما', 'لي', 'لئن', 'ليت', 'ليس', 'ليسا', 'ليست', 'ليستا', 'ليسوا', 'ما', 'ماذا', 'متى', 'مذ', 'مع', 'مما', 'ممن', 'من', 'منه', 'منها', 'منذ', 'مه', 'مهما', 'نحن', 'نحو', 'نعم', 'ها', 'هاتان', 'هاته', 'هاتي', 'هاتين', 'هاك', 'هاهنا', 'هذا', 'هذان', 'هذه', 'هذي', 'هذين', 'هكذا', 'هل', 'هلا', 'هم', 'هما', 'هن', 'هنا', 'هناك', 'هنالك', 'هو', 'هؤلاء', 'هي', 'هيا', 'هيت', 'هيهات', 'والذي', 'والذين', 'وإذ', 'وإذا', 'وإن', 'ولا', 'ولكن', 'ولو', 'وما', 'ومن', 'وهو', 'يا']

def normalizeArabic(t):
    t = re.sub("[إأٱآا]", "ا", t)
    t = re.sub("ى", "ي", t)
    t = re.sub("ة", 'ه', t)
    t = re.sub("ؤ", "ء", t)
    t = re.sub("ئ", "ء", t)
    return (t)

ar_stop= []
for w in ar_sw:
  ar_stop.append(normalizeArabic(w))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [8]:
#COLLECT Ar stopwords from 2 sources

ar_stopwords = stopwords.words('arabic') + list(ar_words.stopwords_list()) + ar_stop
print('nlkt arabic stopwords =',len(stopwords.words('arabic')))
print('Arabic-Stopwords =', len(ar_words.stopwords_list()))
print('My list =', len(ar_stop))
print('sum =',len(ar_stopwords), 'unique=', len(set(ar_stopwords)) )

nlkt arabic stopwords = 754
Arabic-Stopwords = 13629
My list = 251
sum = 14634 unique= 13997


In [9]:
# define 'clean_tweet' function to clean the text and remove unwanted text parts
def clean_text(text):
    # define regular expression patterns
    p_english = "[a-zA-Z0-9]+"
    p_url = "https?://[A-Za-z0-9./]+"
    p_mention = "\@[\_0-9a-zA-Z]+\:?"    
    p_retweet = "RT \@[\_\-0-9a-zA-Z]+\:?"
    p_punctuations = "[" + string.punctuation + "]"
    
    # remove unwanted parts
    text = re.sub(p_english, ' ', text)
    text = re.sub(p_retweet, ' ', text)
    text = re.sub(p_mention, ' ', text)
    text = re.sub(p_url, ' ', text)
    text = re.sub(p_punctuations, ' ', text)
    ## remove extra whitespace
    text = re.sub('\s+', ' ', text)
    
    # remove الهمزة
    text = re.sub("[أإآ]", 'ا', text)
    text = re.sub("ة", 'ه', text)
    text = re.sub("ى", 'ي', text)
    
    # removing tashkeel
    tashkel = re.compile(""" ّ    | # Tashdid
                             َ    | # Fatha
                             ً    | # Tanwin Fath
                             ُ    | # Damma
                             ٌ    | # Tanwin Damm
                             ِ    | # Kasra
                             ٍ    | # Tanwin Kasr
                             ْ    | # Sukun
                             ـ     # Tatwil/Kashida
                         """, re.VERBOSE)
    text = re.sub(tashkel, '', text)
    
    # remove repeated letters more than two letters
    text = re.sub(r'(.)\1+', r'\1\1', text)
    
    #trim    
    text = text.strip()
    
    # remove stopwords
    words = [word for word in text.split() if word not in ar_stopwords]
    words = [word for word in words if len(word)>=2]
    
    # merge and return final text
    return ' '.join(words)

In [10]:
# apply clean function on all text in the dataframes
for k, v in datasetDict.items():
    v['clean_text'] = v['text'].apply(clean_text)

In [16]:

#------------------------------------------------------ Choose a dataset df
df= res_df
#df= prod_df
#df= htl_df
#df= mov_df

seq_len = 128 # standardized length of each word sequence 
#max_vocab = 1500 # max number of words to consider when tokenizing (based on freq)

# fit tokenizer vocab (note that it lowercases and strips punct)
#tokenizer = Tokenizer(num_words=max_vocab)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df.text)
max_vocab = len(tokenizer.word_index) +1
# standard train/val split
train_text, val_text, y_train, y_val = train_test_split(df.clean_text, df.polarity, 
                                                        test_size=0.2, random_state=123, stratify=df.polarity)

# convert train and val texts to token sequences of standardized length 128,
# padding fills leading 0s in or cuts off sequence at 128th word
train_text = tokenizer.texts_to_sequences(train_text) 
train_text = pad_sequences(train_text, maxlen=seq_len)

val_text = tokenizer.texts_to_sequences(val_text)
val_text = pad_sequences(val_text, maxlen=seq_len)


embedding_dim = 20 # hyper-parameter 

inp = Input(shape=(seq_len,)) # must specify format of input layer
x = Embedding(max_vocab, embedding_dim)(inp) # model learns its own word embeddings
x = Bidirectional(LSTM(8, recurrent_dropout=.3))(x) # bi-LSTM with regularization
y = Dense(1, activation='sigmoid')(x)

NN = Model(inp, y)
NN.summary()

threshold= 0.5
NN.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', tfa.metrics.F1Score(num_classes=2, average='micro', threshold=threshold)])
history = NN.fit(train_text, y_train, 
                 validation_data=(val_text, y_val),
                 epochs=25, batch_size=512, verbose=1)


print('RESULT: model f1 score is :', f1_score(y_val, (NN.predict(val_text)[:,0] > .5).astype(int)))



Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, 128)]             0         
                                                                 
 embedding_3 (Embedding)     (None, 128, 20)           1151340   
                                                                 
 bidirectional_3 (Bidirectio  (None, 16)               1856      
 nal)                                                            
                                                                 
 dense_3 (Dense)             (None, 1)                 17        
                                                                 
Total params: 1,153,213
Trainable params: 1,153,213
Non-trainable params: 0
_________________________________________________________________
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoc

In [17]:

#------------------------------------------------------ Choose a dataset df
#df= res_df
df= prod_df
#df= htl_df
#df= mov_df

seq_len = 128 # standardized length of each word sequence 
#max_vocab = 1500 # max number of words to consider when tokenizing (based on freq)

# fit tokenizer vocab (note that it lowercases and strips punct)
#tokenizer = Tokenizer(num_words=max_vocab)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df.text)
max_vocab = len(tokenizer.word_index) +1
# standard train/val split
train_text, val_text, y_train, y_val = train_test_split(df.clean_text, df.polarity, 
                                                        test_size=0.2, random_state=123, stratify=df.polarity)

# convert train and val texts to token sequences of standardized length 128,
# padding fills leading 0s in or cuts off sequence at 128th word
train_text = tokenizer.texts_to_sequences(train_text) 
train_text = pad_sequences(train_text, maxlen=seq_len)

val_text = tokenizer.texts_to_sequences(val_text)
val_text = pad_sequences(val_text, maxlen=seq_len)


embedding_dim = 20 # hyper-parameter 

inp = Input(shape=(seq_len,)) # must specify format of input layer
x = Embedding(max_vocab, embedding_dim)(inp) # model learns its own word embeddings
x = Bidirectional(LSTM(8, recurrent_dropout=.3))(x) # bi-LSTM with regularization
y = Dense(1, activation='sigmoid')(x)

NN = Model(inp, y)
NN.summary()

threshold= 0.5
NN.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', tfa.metrics.F1Score(num_classes=2, average='micro', threshold=threshold)])
history = NN.fit(train_text, y_train, 
                 validation_data=(val_text, y_val),
                 epochs=25, batch_size=512, verbose=1)


print('RESULT: model f1 score is :', f1_score(y_val, (NN.predict(val_text)[:,0] > .5).astype(int)))



Model: "model_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_6 (InputLayer)        [(None, 128)]             0         
                                                                 
 embedding_4 (Embedding)     (None, 128, 20)           257680    
                                                                 
 bidirectional_4 (Bidirectio  (None, 16)               1856      
 nal)                                                            
                                                                 
 dense_4 (Dense)             (None, 1)                 17        
                                                                 
Total params: 259,553
Trainable params: 259,553
Non-trainable params: 0
_________________________________________________________________
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11

In [18]:

#------------------------------------------------------ Choose a dataset df
#df= res_df
#df= prod_df
df= htl_df
#df= mov_df

seq_len = 128 # standardized length of each word sequence 
#max_vocab = 1500 # max number of words to consider when tokenizing (based on freq)

# fit tokenizer vocab 
#tokenizer = Tokenizer(num_words=max_vocab)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df.text)
max_vocab = len(tokenizer.word_index) +1
# standard train/val split
train_text, val_text, y_train, y_val = train_test_split(df.clean_text, df.polarity, 
                                                        test_size=0.2, random_state=123, stratify=df.polarity)

# convert train and val texts to token sequences of standardized length 128,
# padding fills leading 0s in or cuts off sequence at 128th word
train_text = tokenizer.texts_to_sequences(train_text) 
train_text = pad_sequences(train_text, maxlen=seq_len)

val_text = tokenizer.texts_to_sequences(val_text)
val_text = pad_sequences(val_text, maxlen=seq_len)


embedding_dim = 20 # hyper-parameter 

inp = Input(shape=(seq_len,)) # must specify format of input layer
x = Embedding(max_vocab, embedding_dim)(inp) # model learns its own word embeddings
x = Bidirectional(LSTM(8, recurrent_dropout=.3))(x) # bi-LSTM with regularization
y = Dense(1, activation='sigmoid')(x)

NN = Model(inp, y)
NN.summary()

threshold= 0.5
NN.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', tfa.metrics.F1Score(num_classes=2, average='micro', threshold=threshold)])
history = NN.fit(train_text, y_train, 
                 validation_data=(val_text, y_val),
                 epochs=25, batch_size=512, verbose=1)


print('RESULT: model f1 score is :', f1_score(y_val, (NN.predict(val_text)[:,0] > .5).astype(int)))



Model: "model_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_7 (InputLayer)        [(None, 128)]             0         
                                                                 
 embedding_5 (Embedding)     (None, 128, 20)           2171700   
                                                                 
 bidirectional_5 (Bidirectio  (None, 16)               1856      
 nal)                                                            
                                                                 
 dense_5 (Dense)             (None, 1)                 17        
                                                                 
Total params: 2,173,573
Trainable params: 2,173,573
Non-trainable params: 0
_________________________________________________________________
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoc

In [19]:

#------------------------------------------------------ Choose a dataset df
#df= res_df
#df= prod_df
#df= htl_df
df= mov_df

seq_len = 128 # standardized length of each word sequence 
#max_vocab = 1500 # max number of words to consider when tokenizing (based on freq)

# fit tokenizer vocab 
#tokenizer = Tokenizer(num_words=max_vocab)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df.text)
max_vocab = len(tokenizer.word_index) +1
# standard train/val split
train_text, val_text, y_train, y_val = train_test_split(df.clean_text, df.polarity, 
                                                        test_size=0.2, random_state=123, stratify=df.polarity)

# convert train and val texts to token sequences of standardized length 128,
# padding fills leading 0s in or cuts off sequence at 128th word
train_text = tokenizer.texts_to_sequences(train_text) 
train_text = pad_sequences(train_text, maxlen=seq_len)

val_text = tokenizer.texts_to_sequences(val_text)
val_text = pad_sequences(val_text, maxlen=seq_len)


embedding_dim = 20 # hyper-parameter 

inp = Input(shape=(seq_len,)) # must specify format of input layer
x = Embedding(max_vocab, embedding_dim)(inp) # model learns its own word embeddings
x = Bidirectional(LSTM(8, recurrent_dropout=.3))(x) # bi-LSTM with regularization
y = Dense(1, activation='sigmoid')(x)

NN = Model(inp, y)
NN.summary()

threshold= 0.5
NN.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', tfa.metrics.F1Score(num_classes=2, average='micro', threshold=threshold)])
history = NN.fit(train_text, y_train, 
                 validation_data=(val_text, y_val),
                 epochs=25, batch_size=512, verbose=1)


print('RESULT: model f1 score is :', f1_score(y_val, (NN.predict(val_text)[:,0] > .5).astype(int)))



Model: "model_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_8 (InputLayer)        [(None, 128)]             0         
                                                                 
 embedding_6 (Embedding)     (None, 128, 20)           1489240   
                                                                 
 bidirectional_6 (Bidirectio  (None, 16)               1856      
 nal)                                                            
                                                                 
 dense_6 (Dense)             (None, 1)                 17        
                                                                 
Total params: 1,491,113
Trainable params: 1,491,113
Non-trainable params: 0
_________________________________________________________________
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoc

In [26]:

#------------------------------------------------------ Choose a dataset df
#df= res_df
#df= prod_df
#df= htl_df
#df= mov_df

df = pd.concat([res_df,prod_df,htl_df,mov_df])

seq_len = 128 # standardized length of each word sequence 
#max_vocab = 1500 # max number of words to consider when tokenizing (based on freq)

# fit tokenizer vocab 
#tokenizer = Tokenizer(num_words=max_vocab)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df.text)
max_vocab = len(tokenizer.word_index) +1
# standard train/val split
train_text, val_text, y_train, y_val = train_test_split(df.clean_text, df.polarity, 
                                                        test_size=0.2, random_state=123, stratify=df.polarity)

# convert train and val texts to token sequences of standardized length 128,
# padding fills leading 0s in or cuts off sequence at 128th word
train_text = tokenizer.texts_to_sequences(train_text) 
train_text = pad_sequences(train_text, maxlen=seq_len)

val_text = tokenizer.texts_to_sequences(val_text)
val_text = pad_sequences(val_text, maxlen=seq_len)


embedding_dim = 20 # hyper-parameter 

inp = Input(shape=(seq_len,)) # must specify format of input layer
x = Embedding(max_vocab, embedding_dim)(inp) # model learns its own word embeddings
x = Bidirectional(LSTM(8, recurrent_dropout=.3))(x) # bi-LSTM with regularization
y = Dense(1, activation='sigmoid')(x)

NN = Model(inp, y)
NN.summary()

threshold= 0.5
NN.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', tfa.metrics.F1Score(num_classes=2, average='micro', threshold=threshold)])
history = NN.fit(train_text, y_train, 
                 validation_data=(val_text, y_val),
                 epochs=25, batch_size=512, verbose=1)


print('RESULT: model f1 score is :', f1_score(y_val, (NN.predict(val_text)[:,0] > .5).astype(int)))



Model: "model_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_10 (InputLayer)       [(None, 128)]             0         
                                                                 
 embedding_8 (Embedding)     (None, 128, 20)           3765580   
                                                                 
 bidirectional_8 (Bidirectio  (None, 16)               1856      
 nal)                                                            
                                                                 
 dense_8 (Dense)             (None, 1)                 17        
                                                                 
Total params: 3,767,453
Trainable params: 3,767,453
Non-trainable params: 0
_________________________________________________________________
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoc

**ArSenTD-Lev (Arabic Sentiment Twitter Dataset for LEVantine dialect)**

In [21]:
#read files of first dataset (MSA)
cor_df=pd.read_csv('ArSenTD-LEV.tsv', sep='\t')
cor_df= cor_df.drop(columns=['Country', 'Sentiment_Expression','Sentiment_Target'])
#keep only binary classes (pos & neg)
cor_df=cor_df[cor_df['Sentiment']!='neutral'].reset_index(drop=True)
#unify format with other dataset
cor_df['Sentiment'].replace({'very_positive': '1', 'positive': '1', 'very_negative':'0', 'negative':'0'}, inplace=True)
cor_df= cor_df.reset_index(drop=True)
cor_df.rename(columns={'Tweet': 'text', 'Sentiment': 'polarity'}, inplace=True)
cor_df["polarity"] = pd.to_numeric(cor_df["polarity"])
cor_df.head()
cor_df

Unnamed: 0,text,Topic,polarity
0,"""أنا أؤمن بأن الانسان ينطفئ جماله عند ابتعاد م...",personal,0
1,من الذاكره... @3FInQe . عندما اعتقد كريستيانو ...,sports,1
2,#مصطلحات_لبنانيه_حيرت_البشريه بتوصل عالبيت ، ب...,personal,0
3,نصمت !! لتسير حياتنا على مً يرام فالناّس لم تع...,personal,0
4,@Yousef_MUFC اكثر ما يزعجنا بعد مستوانا خارج ا...,sports,0
...,...,...,...
3110,نهتم من خلال خدمة تنسيق الرسائل بإظهار رسالة ا...,education,1
3111,صلاح من لاعب في المقاولون العرب يحلم ان يلعب ل...,sports,1
3112,الملك سلمان بن عبد العزيز: تطبيق الأنظمة بحزم ...,politics,1
3113,@ZahraaIraq9 😂 كل ما ادخل حسابي الكه تغريداتج ...,personal,0


In [22]:
def remove_emoji(text):
    emoji_pattern = re.compile("["
                                   u"\U0001F600-\U0001F64F"  # emoticons
                                   u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                                   u"\U0001F680-\U0001F6FF"  # transport & map symbols
                                   u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                                   u"\U00002702-\U000027B0"
                                   u"\U000024C2-\U0001F251"
                                   "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)
    return text

In [23]:
# define 'clean_tweet' function to clean the text and remove unwanted text parts
def clean_tweet(text):
    # define regular expression patterns
    p_english = "[a-zA-Z0-9]+"
    p_url = "https?://[A-Za-z0-9./]+"
    p_mention = "\@[\_0-9a-zA-Z]+\:?"    
    p_retweet = "RT \@[\_\-0-9a-zA-Z]+\:?"
    p_punctuations = "[" + string.punctuation + "]"
    
    # remove unwanted parts
    text = re.sub(p_english, ' ', text)
    text = re.sub(p_retweet, ' ', text)
    text = re.sub(p_mention, ' ', text)
    text = re.sub(p_url, ' ', text)
    text = re.sub(p_punctuations, ' ', text)
    ## remove extra whitespace
    text = re.sub('\s+', ' ', text)
    
    # remove الهمزة
    text = re.sub("[أإآ]", 'ا', text)
    text = re.sub("ة", 'ه', text)
    text = re.sub("ى", 'ي', text)
    
    # removing tashkeel
    tashkel = re.compile(""" ّ    | # Tashdid
                             َ    | # Fatha
                             ً    | # Tanwin Fath
                             ُ    | # Damma
                             ٌ    | # Tanwin Damm
                             ِ    | # Kasra
                             ٍ    | # Tanwin Kasr
                             ْ    | # Sukun
                             ـ     # Tatwil/Kashida
                         """, re.VERBOSE)
    text = re.sub(tashkel, '', text)
    
    # remove repeated letters more than two letters
    text = re.sub(r'(.)\1+', r'\1\1', text)
    text = text.strip()

    text= remove_emoji(text)
    
    # remove stopwords
    words = [word for word in text.split() if word not in ar_stopwords]
    words = [word for word in words if len(word)>=2]
    
    # merge and return final text
    return ' '.join(words)

In [24]:
 cor_df['clean_text'] = cor_df['text'].apply(clean_tweet)

In [25]:

df= cor_df

seq_len = 128 # standardized length of each word sequence 
#max_vocab = 1500 # max number of words to consider when tokenizing (based on freq)

# fit tokenizer vocab 
#tokenizer = Tokenizer(num_words=max_vocab)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df.text)
max_vocab = len(tokenizer.word_index) +1
# standard train/val split
train_text, val_text, y_train, y_val = train_test_split(df.clean_text, df.polarity, 
                                                        test_size=0.2, random_state=123, stratify=df.polarity)

# convert train and val texts to token sequences of standardized length 128,
# padding fills leading 0s in or cuts off sequence at 128th word
train_text = tokenizer.texts_to_sequences(train_text) 
train_text = pad_sequences(train_text, maxlen=seq_len)

val_text = tokenizer.texts_to_sequences(val_text)
val_text = pad_sequences(val_text, maxlen=seq_len)


embedding_dim = 20 # hyper-parameter 

inp = Input(shape=(seq_len,)) # must specify format of input layer
x = Embedding(max_vocab, embedding_dim)(inp) # model learns its own word embeddings
x = Bidirectional(LSTM(8, recurrent_dropout=.3))(x) # bi-LSTM with regularization
y = Dense(1, activation='sigmoid')(x)

NN = Model(inp, y)
NN.summary()

threshold= 0.5
NN.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', tfa.metrics.F1Score(num_classes=2, average='micro', threshold=threshold)])
history = NN.fit(train_text, y_train, 
                 validation_data=(val_text, y_val),
                 epochs=25, batch_size=512, verbose=1)


print('RESULT: model f1 score is :', f1_score(y_val, (NN.predict(val_text)[:,0] > .5).astype(int)))



Model: "model_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_9 (InputLayer)        [(None, 128)]             0         
                                                                 
 embedding_7 (Embedding)     (None, 128, 20)           424260    
                                                                 
 bidirectional_7 (Bidirectio  (None, 16)               1856      
 nal)                                                            
                                                                 
 dense_7 (Dense)             (None, 1)                 17        
                                                                 
Total params: 426,133
Trainable params: 426,133
Non-trainable params: 0
_________________________________________________________________
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11