In [1]:
!nvidia-smi

Wed Dec 15 13:47:58 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   54C    P8    30W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
# Extra packages for this notebook. tensorflow-addons supplies the F1 metric passed to
# compile(). Versions are pinned to the ones recorded in this cell's output so that the
# install is reproducible; %pip targets the environment of the running kernel.
%pip install -q Arabic-Stopwords==0.3 arabic_reshaper==2.1.3 python-bidi==0.4.2 tensorflow-addons==0.15.0

Collecting Arabic-Stopwords
  Downloading Arabic_Stopwords-0.3-py3-none-any.whl (353 kB)
[K     |████████████████████████████████| 353 kB 5.3 MB/s 
[?25hCollecting pyarabic>=0.6.2
  Downloading PyArabic-0.6.14-py3-none-any.whl (126 kB)
[K     |████████████████████████████████| 126 kB 33.0 MB/s 
Installing collected packages: pyarabic, Arabic-Stopwords
Successfully installed Arabic-Stopwords-0.3 pyarabic-0.6.14
Collecting arabic_reshaper
  Downloading arabic_reshaper-2.1.3-py3-none-any.whl (20 kB)
Installing collected packages: arabic-reshaper
Successfully installed arabic-reshaper-2.1.3
Collecting python-bidi
  Downloading python_bidi-0.4.2-py2.py3-none-any.whl (30 kB)
Installing collected packages: python-bidi
Successfully installed python-bidi-0.4.2
Collecting tensorflow-addons
  Downloading tensorflow_addons-0.15.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 5.2 MB/s 
Installing collected packages: tensorflo

In [3]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM
import tensorflow_addons as tfa
from numpy import array
import pandas as pd
import gensim
from gensim.models import KeyedVectors
from gensim.models import word2vec

**Large Multi-Domain Resources for Arabic Sentiment Analysis**

This dataset is in Modern Standard Arabic (MSA), so I will use word2vec embeddings (AraVec) pretrained on Wikipedia.

In [4]:
# Read the four CSV files of the first dataset (MSA): RES, PROD, HTL and MOV.
_DATASET_URL = 'https://raw.githubusercontent.com/hadyelsahar/large-arabic-sentiment-analysis-resouces/master/datasets/{}.csv'
res_df, prod_df, htl_df, mov_df = (
    pd.read_csv(_DATASET_URL.format(name)) for name in ('RES', 'PROD', 'HTL', 'MOV')
)

In [5]:
# Keep only the binary classes (pos & neg): rows with polarity 0 are dropped
# and the index is rebuilt from 0.
def _binary_only(frame):
    """Return `frame` without its polarity == 0 rows, re-indexed from 0."""
    return frame.loc[frame['polarity'] != 0].reset_index(drop=True)

res_df, prod_df, htl_df, mov_df = map(_binary_only, (res_df, prod_df, htl_df, mov_df))

In [6]:
# Name -> dataframe lookup, so later cells can loop over all four datasets.
datasetDict = dict(resturants=res_df, products=prod_df, hotels=htl_df, movies=mov_df)

In [12]:
# Convert the negative label from -1 to 0 in every dataframe.
for frame in datasetDict.values():
    # Assign the result back to the column instead of calling
    # replace(..., inplace=True) on the column selection: the chained in-place form
    # works on an intermediate object, is not guaranteed to update the dataframe,
    # and does nothing at all when pandas copy-on-write is enabled.
    frame['polarity'] = frame['polarity'].replace({-1: 0})

In [7]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
import re
import string
import arabicstopwords.arabicstopwords as ar_words

ar_sw=['إذ', 'إذا', 'إذما', 'إذن', 'أف', 'أقل', 'أكثر', 'ألا', 'إلا', 'التي', 'الذي', 'الذين', 'اللاتي', 'اللائي', 'اللتان', 'اللتيا', 'اللتين', 'اللذان', 'اللذين', 'اللواتي', 'إلى', 'إليك', 'إليكم', 'إليكما', 'إليكن', 'أم', 'أما', 'أما', 'إما', 'أن', 'إن', 'إنا', 'أنا', 'أنت', 'أنتم', 'أنتما', 'أنتن', 'إنما', 'إنه', 'إنها', 'أنى', 'أنى', 'آه', 'آها', 'أو', 'أولاء', 'أولئك', 'أوه', 'آي', 'أي', 'أيها', 'إي', 'أين', 'أين', 'أينما', 'إيه', 'بخ', 'بس', 'بعد', 'بعض', 'بك', 'بكم', 'بكم', 'بكما', 'بكن', 'بل', 'بلى', 'بما', 'بماذا', 'بمن', 'بنا', 'به', 'بها', 'بهم', 'بهما', 'بهن', 'بي', 'بين', 'بيد', 'تلك', 'تلكم', 'تلكما', 'ته', 'تي', 'تين', 'تينك', 'ثم', 'ثمة', 'حاشا', 'حبذا', 'حتى', 'حيث', 'حيثما', 'حين', 'خلا', 'دون', 'ذا', 'ذات', 'ذاك', 'ذان', 'ذانك', 'ذلك', 'ذلكم', 'ذلكما','كان','كانت', 'ذلكن', 'ذه', 'ذو', 'ذوا', 'ذواتا', 'ذواتي', 'ذي', 'ذين', 'ذينك', 'ريث', 'سوف', 'سوى', 'شتان', 'عدا', 'عسى', 'عل', 'على', 'عليك', 'عليه', 'عما', 'عن', 'عند', 'غير', 'فإذا', 'فإن', 'فلا', 'فمن', 'في', 'فيم', 'فيما', 'فيه', 'فيها', 'قد', 'كأن', 'كأنما', 'كأي', 'كأين', 'كذا', 'كذلك', 'كل', 'كلا', 'كلاهما', 'كلتا', 'كلما', 'كليكما', 'كليهما', 'كم', 'كم', 'كما', 'كي', 'كيت', 'كيف', 'كيفما', 'لا', 'لاسيما', 'لدى', 'لست', 'لستم', 'لستما', 'لستن', 'لسن', 'لسنا', 'لعل', 'لك', 'لكم', 'لكما', 'لكن', 'لكنما', 'لكي', 'لكيلا', 'لم', 'لما', 'لن', 'لنا', 'له', 'لها', 'لهم', 'لهما', 'لهن', 'لو', 'لولا', 'لوما', 'لي', 'لئن', 'ليت', 'ليس', 'ليسا', 'ليست', 'ليستا', 'ليسوا', 'ما', 'ماذا', 'متى', 'مذ', 'مع', 'مما', 'ممن', 'من', 'منه', 'منها', 'منذ', 'مه', 'مهما', 'نحن', 'نحو', 'نعم', 'ها', 'هاتان', 'هاته', 'هاتي', 'هاتين', 'هاك', 'هاهنا', 'هذا', 'هذان', 'هذه', 'هذي', 'هذين', 'هكذا', 'هل', 'هلا', 'هم', 'هما', 'هن', 'هنا', 'هناك', 'هنالك', 'هو', 'هؤلاء', 'هي', 'هيا', 'هيت', 'هيهات', 'والذي', 'والذين', 'وإذ', 'وإذا', 'وإن', 'ولا', 'ولكن', 'ولو', 'وما', 'ومن', 'وهو', 'يا']

def normalizeArabic(t):
    """Return `t` with Arabic letter variants folded to one canonical letter.

    The substitutions run in the order listed: the alef variants become a bare
    alef, alef maqsura becomes ya, ta marbuta becomes ha, and hamza written on
    waw or on ya becomes a standalone hamza.
    """
    substitutions = (
        ("[إأٱآا]", "ا"),
        ("ى", "ي"),
        ("ة", 'ه'),
        ("ؤ", "ء"),
        ("ئ", "ء"),
    )
    for pattern, canonical in substitutions:
        t = re.sub(pattern, canonical, t)
    return t

# Hand-written stopword list with every entry passed through normalizeArabic.
ar_stop = [normalizeArabic(word) for word in ar_sw]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [8]:
# Collect Arabic stopwords from three sources: NLTK, the Arabic-Stopwords package,
# and the hand-written list defined above (ar_stop).
# NOTE(review): only ar_stop went through normalizeArabic; the two library lists are
# used exactly as returned -- confirm this matches the normalisation applied to the text.

nltk_stopwords = stopwords.words('arabic')
package_stopwords = ar_words.stopwords_list()
ar_stopwords = nltk_stopwords + list(package_stopwords) + ar_stop
print('nlkt arabic stopwords =', len(nltk_stopwords))
print('Arabic-Stopwords =', len(package_stopwords))
print('My list =', len(ar_stop))
print('sum =', len(ar_stopwords), 'unique=', len(set(ar_stopwords)))

nlkt arabic stopwords = 754
Arabic-Stopwords = 13629
My list = 251
sum = 14634 unique= 13997


In [9]:
# define 'clean_text' function to clean the text and remove unwanted text parts

# The patterns are compiled once here instead of being rebuilt on every call.
# Raw strings are used so that sequences such as "\s" are not treated as (invalid)
# string escapes by Python.
_P_RETWEET = re.compile(r"RT @[_\-0-9a-zA-Z]+:?")
_P_MENTION = re.compile(r"@[_0-9a-zA-Z]+:?")
_P_URL = re.compile(r"https?://[A-Za-z0-9./]+")
_P_ENGLISH = re.compile(r"[a-zA-Z0-9]+")
# re.escape is required: string.punctuation contains "\" and "]", and without
# escaping the pair "\]" is read as an escaped bracket, so backslashes were never removed.
_P_PUNCTUATION = re.compile("[" + re.escape(string.punctuation) + "]")
_P_WHITESPACE = re.compile(r"\s+")
# U+064B..U+0652: tanwin fath/damm/kasr, fatha, damma, kasra, shadda, sukun;
# U+0640: tatwil (kashida).
_P_TASHKEEL = re.compile("[\u064B-\u0652\u0640]")
_P_REPEATS = re.compile(r"(.)\1+")

_STOPWORD_CACHE = {}


def _stopword_lookup():
    """Return the module-level `ar_stopwords` as a frozenset (rebuilt only if the list changes)."""
    signature = (id(ar_stopwords), len(ar_stopwords))
    if _STOPWORD_CACHE.get('signature') != signature:
        _STOPWORD_CACHE['signature'] = signature
        _STOPWORD_CACHE['words'] = frozenset(ar_stopwords)
    return _STOPWORD_CACHE['words']


def clean_text(text, stop_words=None):
    """Clean one raw text and return the remaining words joined by single spaces.

    Steps, in order: remove retweet markers, mentions and URLs; remove Latin
    letters/digits and ASCII punctuation; collapse whitespace; fold hamza-alef
    forms to alef, ta marbuta to ha and alef maqsura to ya; remove tashkeel and
    tatwil; shorten any run of a repeated character to two; drop stopwords and
    words shorter than two characters.

    Args:
        text: the raw text. Anything that is not a `str` (for example None or a
            NaN cell) yields an empty string.
        stop_words: optional container of words to drop. When omitted, the
            module-level `ar_stopwords` list is used through a cached set.

    Returns:
        The cleaned text as a `str` (possibly empty).
    """
    if not isinstance(text, str):
        return ''
    if stop_words is None:
        stop_words = _stopword_lookup()

    # Retweets, mentions and URLs are removed BEFORE Latin letters are stripped;
    # in the opposite order their patterns have nothing left to match.
    text = _P_RETWEET.sub(' ', text)
    text = _P_MENTION.sub(' ', text)
    text = _P_URL.sub(' ', text)
    text = _P_ENGLISH.sub(' ', text)
    text = _P_PUNCTUATION.sub(' ', text)
    # collapse extra whitespace
    text = _P_WHITESPACE.sub(' ', text)

    # normalise hamza-alef forms, ta marbuta and alef maqsura
    text = re.sub("[أإآ]", 'ا', text)
    text = re.sub("ة", 'ه', text)
    text = re.sub("ى", 'ي', text)

    # remove tashkeel
    text = _P_TASHKEEL.sub('', text)

    # keep at most two consecutive copies of any repeated character
    text = _P_REPEATS.sub(r'\1\1', text)

    # split() discards leading/trailing/multiple whitespace, so no strip() is needed
    words = [word for word in text.split()
             if len(word) >= 2 and word not in stop_words]

    # merge and return final text
    return ' '.join(words)

In [10]:
import re
import numpy as np
from nltk import ngrams

# Clean/Normalize Arabic Text
def clean_str(text):
    """Lightly normalise Arabic text and return it stripped of outer whitespace.

    Tashkeel is removed, runs of a repeated character are shortened, doubled
    waw/ya/alef are folded to one letter, and then a fixed list of literal
    replacements is applied in order.
    """
    # (needle, substitute) pairs, applied strictly in this order
    replacements = (
        ("أ", "ا"), ("إ", "ا"), ("آ", "ا"), ("ة", "ه"),
        ("_", " "), ("-", " "), ("/", ""), (".", ""), ("،", ""),
        (" و ", " و"), (" يا ", " يا"),
        ('"', ""), ("ـ", ""), ("'", ""), ("ى", "ي"), ("\\", ""),
        ('\n', ' '), ('\t', ' '), ('&quot;', ' '),
        ('?', ' ? '), ('؟', ' ؟ '), ('!', ' ! '),
    )

    # remove tashkeel
    text = re.sub(r'[\u0617-\u061A\u064B-\u0652]', "", text)

    # remove elongation: any run of one repeated character is cut down to two
    text = re.sub(r'(.)\1+', r"\1\1", text)

    # a doubled waw / ya / alef is folded further, down to a single letter
    for doubled, single in (('وو', 'و'), ('يي', 'ي'), ('اا', 'ا')):
        text = text.replace(doubled, single)

    for needle, substitute in replacements:
        text = text.replace(needle, substitute)

    # trim
    return text.strip()

In [11]:
# Add a 'clean_text' column to every dataframe by running clean_text over its 'text' column.
for frame in datasetDict.values():
    frame['clean_text'] = frame['text'].apply(clean_text)

In [13]:
# -nc skips the download when the archive already exists, and unzip -n never
# overwrites existing files, so re-running this cell neither downloads a second
# copy nor stops at unzip's interactive "replace?" prompt.
!wget -nc 'https://bakrianoo.ewr1.vultrobjects.com/aravec/full_uni_sg_300_wiki.zip'
!unzip -n 'full_uni_sg_300_wiki.zip'

--2021-12-15 14:19:10--  https://bakrianoo.ewr1.vultrobjects.com/aravec/full_uni_sg_300_wiki.zip
Resolving bakrianoo.ewr1.vultrobjects.com (bakrianoo.ewr1.vultrobjects.com)... 108.61.0.122, 2001:19f0:0:22::100
Connecting to bakrianoo.ewr1.vultrobjects.com (bakrianoo.ewr1.vultrobjects.com)|108.61.0.122|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 720163266 (687M) [application/zip]
Saving to: ‘full_uni_sg_300_wiki.zip’


2021-12-15 14:19:18 (93.7 MB/s) - ‘full_uni_sg_300_wiki.zip’ saved [720163266/720163266]

Archive:  full_uni_sg_300_wiki.zip
  inflating: full_uni_sg_300_wiki.mdl  
  inflating: full_uni_sg_300_wiki.mdl.trainables.syn1neg.npy  
  inflating: full_uni_sg_300_wiki.mdl.wv.vectors.npy  


In [14]:
# load the AraVec model from the .mdl file extracted from the downloaded archive,
# then print how many words its vocabulary holds
# NOTE(review): `wv.index2word` looks like the gensim 3.x attribute name -- confirm
# against the installed gensim version.
w2v = gensim.models.Word2Vec.load("full_uni_sg_300_wiki.mdl")
print("We've",len(w2v.wv.index2word),"vocabularies") 

We've 320636 vocabularies


In [15]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score

seq_len = 128 # standardized length of each word sequence

#------------------------------------------------------ Choose a dataset df
df= res_df
#df= prod_df
#df= htl_df
#df= mov_df

#################################### vectorizing cleaned text
# fit tokenizer vocab
# NOTE(review): the tokenizer is fitted on the whole dataframe, i.e. before the
# train/val split below, so validation words are part of the vocabulary.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df.clean_text)

################################### standard train/val split
train_text, val_text, y_train, y_val = train_test_split(df.clean_text, df.polarity,
                                                        test_size=0.2, random_state=123, stratify=df.polarity)


################################## indexing + padding
# convert train and val texts to token sequences of standardized length seq_len:
# shorter sequences get trailing 0s (padding='post'); longer ones are cut by
# pad_sequences' default truncating='pre', which drops tokens from the START
train_text = tokenizer.texts_to_sequences(train_text)
train_text = pad_sequences(train_text, maxlen=seq_len, padding='post')

val_text = tokenizer.texts_to_sequences(val_text)
val_text = pad_sequences(val_text, maxlen=seq_len, padding='post')

print('one training text after padding: ')
print(train_text[10])

################################### preparing embedding matrix
embedding_dim = w2v.vector_size # w2v embedding dim
word_index = tokenizer.word_index # vocab
# Look vectors up on the KeyedVectors object: indexing the Word2Vec model itself
# (w2v[word]) is deprecated in gensim 3.x and removed in gensim 4.
word_vectors = w2v.wv

# use the gensim model to build a numpy array of embeddings,
# we'll feed this array to the keras embeddings layer.
# row i of the array holds the vector of the word whose token id is i
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    # An explicit membership test replaces the former bare `except: pass`, which
    # also hid unrelated errors and could silently leave the whole matrix at zero.
    if word in word_vectors:
        embedding_matrix[i] = word_vectors[word]
    # otherwise the word is missing in w2v and keeps the 0 vector
print('embedding matrix shape (vocab, embeddings dim) is ', embedding_matrix.shape)


##################################### defining model

inp = Input(shape=(seq_len,))
x = Embedding(len(word_index) + 1,
              embedding_dim,
              weights=[embedding_matrix], # where we feed the pretrained vecs
              trainable=False)(inp) # freeze these parameters in the model
#recurrent_dropout=.1
# NOTE(review): activation='relu' replaces the LSTM default (tanh), which also rules
# out the fused cuDNN kernel -- confirm this choice is deliberate.
x = Bidirectional(LSTM(64, activation='relu'))(x)
# NOTE(review): no activation is passed, so this fully connected layer is linear.
x = Dense(32)(x) # fully connected layer on top of the output of the bi-LSTM
#x = Dropout(.3)(x)
#x = Dense(20)(x) #added
y = Dense(1, activation='sigmoid')(x)

NN = Model(inp, y)

# summary() prints the table itself and returns None, so it is not wrapped in print()
NN.summary()

######################################## compiling model
threshold = 0.5
# The model has ONE sigmoid output, so the F1 metric is declared with num_classes=1;
# `threshold` turns the predicted probability into a 0/1 decision.
NN.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc',tfa.metrics.F1Score(num_classes=1, average='micro', threshold=threshold)])


######################################## fitting model = training
NN.fit(train_text, y_train, validation_data=(val_text, y_val), epochs=10, batch_size=50, verbose=1)


######################################### testing

# This can be used if we have train, val, and test sets,
# but since the data size is not too large, I chose two splits only,
# so the result is the validation score of the last epoch

#prob= NN.predict(val_text)
#predections= [1 if p >= 0.5 else 0 for p in prob]
#f1_score(y_val, predections)


one training text after padding: 
[27250   150 10991    30    12   191   894   189   499   302  4058 27251
   409   183  2088   732   183  2892   145   326  1907 11482     9     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0]




#embedding matrix shape (vocab, embeddings dim) is  (46498, 300)
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 128)]             0         
                                                                 
 embedding (Embedding)       (None, 128, 300)          13949400  
                                                                 
 bidirectional (Bidirectiona  (None, 128)              186880    
 l)                                                              
                                                                 
 dense (Dense)               (None, 32)                4128      
                                                                 
 dense_1 (Dense)             (None, 1)                 33        
                                                                 
Total params: 14,140,441
Trainable params: 191,041
Non-trainab

<keras.callbacks.History at 0x7f8db369d6d0>

In [18]:
# Turn the predicted probabilities into hard 0/1 labels (cut-off 0.5) and report F1.
prob = NN.predict(val_text)
predections = (prob.ravel() >= 0.5).astype(int).tolist()
print('---------------------------RESULT------------------------')
print('The model f1 score on validation set is :', f1_score(y_val, predections))

---------------------------RESULT------------------------
The model f1 score on validation set is : 0.8707482993197277


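Fitting again with the same settings as above gives a slightly better result. Note that if the model is not rebuilt in between, calling `fit` again continues training from the current weights rather than starting from scratch.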

In [None]:
# without dropout + clean_text preprocessing ------------ restaurant dataset
# NOTE(review): if NN was not rebuilt before this cell ran, fit() resumes from the
# already trained weights -- confirm when comparing against the run above.
NN.fit(train_text, y_train, validation_data=(val_text, y_val), epochs=10, batch_size=50, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fee93bf6910>

.

Below are some past results obtained with different choices (dropout, extra dense layer, epochs, batch size, preprocessing), kept for my own learning and comparison.

In [None]:
# with dropout + clean_text preprocessing ------------ restaurant dataset
NN.fit(train_text, y_train, validation_data=(val_text, y_val), epochs=5, batch_size=50, verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fee99f0c8d0>

In [None]:
# without dropout + clean_text preprocessing ------------ restaurant dataset
NN.fit(train_text, y_train, validation_data=(val_text, y_val), epochs=5, batch_size=50, verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fee81881550>

In [None]:
# without dropout + clean_text preprocessing, more epochs and a larger batch ------------ restaurant dataset
NN.fit(train_text, y_train, validation_data=(val_text, y_val), epochs=20, batch_size=100, verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fee9236ea50>

In [None]:
# without dropout + clean_text preprocessing + added dense layer ------------ restaurant dataset
NN.fit(train_text, y_train, validation_data=(val_text, y_val), epochs=5, batch_size=50, verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fee80e20810>

In [None]:
# without dropout + clean_str (less preprocessing than clean_text) ------------ restaurant dataset
NN.fit(train_text, y_train, validation_data=(val_text, y_val), epochs=10, batch_size=50, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f7e6e944450>

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score

seq_len = 128 # standardized length of each word sequence

#------------------------------------------------------ Choose a dataset df
#df= res_df
df= prod_df
#df= htl_df
#df= mov_df

#################################### vectorizing cleaned text
# fit tokenizer vocab
# NOTE(review): the tokenizer is fitted on the whole dataframe, i.e. before the
# train/val split below, so validation words are part of the vocabulary.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df.clean_text)

################################### standard train/val split
train_text, val_text, y_train, y_val = train_test_split(df.clean_text, df.polarity,
                                                        test_size=0.2, random_state=123, stratify=df.polarity)


################################## indexing + padding
# convert train and val texts to token sequences of standardized length seq_len:
# shorter sequences get trailing 0s (padding='post'); longer ones are cut by
# pad_sequences' default truncating='pre', which drops tokens from the START
train_text = tokenizer.texts_to_sequences(train_text)
train_text = pad_sequences(train_text, maxlen=seq_len, padding='post')

val_text = tokenizer.texts_to_sequences(val_text)
val_text = pad_sequences(val_text, maxlen=seq_len, padding='post')

print('one training text after padding: ')
print(train_text[10])

################################### preparing embedding matrix
embedding_dim = w2v.vector_size # w2v embedding dim
word_index = tokenizer.word_index # vocab
# Look vectors up on the KeyedVectors object: indexing the Word2Vec model itself
# (w2v[word]) is deprecated in gensim 3.x and removed in gensim 4.
word_vectors = w2v.wv

# use the gensim model to build a numpy array of embeddings,
# we'll feed this array to the keras embeddings layer.
# row i of the array holds the vector of the word whose token id is i
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    # An explicit membership test replaces the former bare `except: pass`, which
    # also hid unrelated errors and could silently leave the whole matrix at zero.
    if word in word_vectors:
        embedding_matrix[i] = word_vectors[word]
    # otherwise the word is missing in w2v and keeps the 0 vector
print('embedding matrix shape (vocab, embeddings dim) is ', embedding_matrix.shape)


##################################### defining model

inp = Input(shape=(seq_len,))
x = Embedding(len(word_index) + 1,
              embedding_dim,
              weights=[embedding_matrix], # where we feed the pretrained vecs
              trainable=False)(inp) # freeze these parameters in the model
#recurrent_dropout=.1
# NOTE(review): activation='relu' replaces the LSTM default (tanh), which also rules
# out the fused cuDNN kernel -- confirm this choice is deliberate.
x = Bidirectional(LSTM(64, activation='relu'))(x)
# NOTE(review): no activation is passed, so this fully connected layer is linear.
x = Dense(32)(x) # fully connected layer on top of the output of the bi-LSTM
#x = Dropout(.3)(x)
#x = Dense(20)(x) #added
y = Dense(1, activation='sigmoid')(x)

NN = Model(inp, y)

# summary() prints the table itself and returns None, so it is not wrapped in print()
NN.summary()

######################################## compiling model
threshold = 0.5
# The model has ONE sigmoid output, so the F1 metric is declared with num_classes=1;
# `threshold` turns the predicted probability into a 0/1 decision.
NN.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc',tfa.metrics.F1Score(num_classes=1, average='micro', threshold=threshold)])


######################################## fitting model = training
NN.fit(train_text, y_train, validation_data=(val_text, y_val), epochs=10, batch_size=50, verbose=1)


######################################### testing

# A separate test set could be used if we had train, val, and test sets,
# but since the data size is not too large, only two splits are used,
# so the reported result is the score on the validation set

prob= NN.predict(val_text)
# use the same cut-off as the F1 metric given to compile()
predections= [1 if p >= threshold else 0 for p in prob]
print('---------------------------RESULT------------------------')
print('The model f1 score on validation set is :', f1_score(y_val, predections))


one training text after padding: 
[   5 4911 4912    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0]
embedding matrix shape (vocab, embeddings dim) is  (9908, 300)




Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 128)]             0         
                                                                 
 embedding_1 (Embedding)     (None, 128, 300)          2972400   
                                                                 
 bidirectional_1 (Bidirectio  (None, 128)              186880    
 nal)                                                            
                                                                 
 dense_2 (Dense)             (None, 32)                4128      
                                                                 
 dense_3 (Dense)             (None, 1)                 33        
                                                                 
Total params: 3,163,441
Trainable params: 191,041
Non-trainable params: 2,972,400
___________________________________________

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score

seq_len = 128 # standardized length of each word sequence

#------------------------------------------------------ Choose a dataset df
#df= res_df
#df= prod_df
df= htl_df
#df= mov_df

#################################### vectorizing cleaned text
# fit tokenizer vocab
# NOTE(review): the tokenizer is fitted on the whole dataframe, i.e. before the
# train/val split below, so validation words are part of the vocabulary.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df.clean_text)

################################### standard train/val split
train_text, val_text, y_train, y_val = train_test_split(df.clean_text, df.polarity,
                                                        test_size=0.2, random_state=123, stratify=df.polarity)


################################## indexing + padding
# convert train and val texts to token sequences of standardized length seq_len:
# shorter sequences get trailing 0s (padding='post'); longer ones are cut by
# pad_sequences' default truncating='pre', which drops tokens from the START
train_text = tokenizer.texts_to_sequences(train_text)
train_text = pad_sequences(train_text, maxlen=seq_len, padding='post')

val_text = tokenizer.texts_to_sequences(val_text)
val_text = pad_sequences(val_text, maxlen=seq_len, padding='post')

print('one training text after padding: ')
print(train_text[10])

################################### preparing embedding matrix
embedding_dim = w2v.vector_size # w2v embedding dim
word_index = tokenizer.word_index # vocab
# Look vectors up on the KeyedVectors object: indexing the Word2Vec model itself
# (w2v[word]) is deprecated in gensim 3.x and removed in gensim 4.
word_vectors = w2v.wv

# use the gensim model to build a numpy array of embeddings,
# we'll feed this array to the keras embeddings layer.
# row i of the array holds the vector of the word whose token id is i
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    # An explicit membership test replaces the former bare `except: pass`, which
    # also hid unrelated errors and could silently leave the whole matrix at zero.
    if word in word_vectors:
        embedding_matrix[i] = word_vectors[word]
    # otherwise the word is missing in w2v and keeps the 0 vector
print('embedding matrix shape (vocab, embeddings dim) is ', embedding_matrix.shape)


##################################### defining model

inp = Input(shape=(seq_len,))
x = Embedding(len(word_index) + 1,
              embedding_dim,
              weights=[embedding_matrix], # where we feed the pretrained vecs
              trainable=False)(inp) # freeze these parameters in the model
#recurrent_dropout=.1
# NOTE(review): activation='relu' replaces the LSTM default (tanh), which also rules
# out the fused cuDNN kernel -- confirm this choice is deliberate.
x = Bidirectional(LSTM(64, activation='relu'))(x)
# NOTE(review): no activation is passed, so this fully connected layer is linear.
x = Dense(32)(x) # fully connected layer on top of the output of the bi-LSTM
#x = Dropout(.3)(x)
#x = Dense(20)(x) #added
y = Dense(1, activation='sigmoid')(x)

NN = Model(inp, y)

# summary() prints the table itself and returns None, so it is not wrapped in print()
NN.summary()

######################################## compiling model
threshold = 0.5
# The model has ONE sigmoid output, so the F1 metric is declared with num_classes=1;
# `threshold` turns the predicted probability into a 0/1 decision.
NN.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc',tfa.metrics.F1Score(num_classes=1, average='micro', threshold=threshold)])


######################################## fitting model = training
NN.fit(train_text, y_train, validation_data=(val_text, y_val), epochs=10, batch_size=50, verbose=1)


######################################### testing

# A separate test set could be used if we had train, val, and test sets,
# but since the data size is not too large, only two splits are used,
# so the reported result is the score on the validation set

prob= NN.predict(val_text)
# use the same cut-off as the F1 metric given to compile()
predections= [1 if p >= threshold else 0 for p in prob]
print('---------------------------RESULT------------------------')
print('The model f1 score on validation set is :', f1_score(y_val, predections))


one training text after padding: 
[73645   669    65 16068 24665   354  1963 25444  9015   681 37482 37483
 14973  2873  2968    11    47  2264  1432  5626   438   782     7 17589
 12493  5977 15135  2909 12146   920   258    78   979 20174   776   125
    12   696   942  4167    54     1 13347    82   235  1445   391  1574
   415   301     1   208   802   965    45 73646    60  3204   871   728
 34468 19787   432     8   924  1264 73647     1  2172  7351 21030 16417
 22121  7327  3070  6356  5940    10  2035   691  5166   162    29 26612
   180  3493  4637   348    24   582  2714  3006 73648 16353  1101 73649
   188   166  1459   162   288  2536   620     3  1442   976    34   385
  5595     7  5626  2264  6539 26347    13   813 31132   317     1  1631
 13556   586   592 36829 12184  3822    19    23]




embedding matrix shape (vocab, embeddings dim) is  (85463, 300)
Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 128)]             0         
                                                                 
 embedding_2 (Embedding)     (None, 128, 300)          25638900  
                                                                 
 bidirectional_2 (Bidirectio  (None, 128)              186880    
 nal)                                                            
                                                                 
 dense_4 (Dense)             (None, 32)                4128      
                                                                 
 dense_5 (Dense)             (None, 1)                 33        
                                                                 
Total params: 25,829,941
Trainable params: 191,041
Non-traina

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score

seq_len = 128 # standardized length of each word sequence

#------------------------------------------------------ Choose a dataset df
#df= res_df
#df= prod_df
#df= htl_df
df= mov_df

#################################### vectorizing cleaned text
# fit tokenizer vocab
# NOTE(review): the tokenizer is fitted on the whole dataframe, i.e. before the
# train/val split below, so validation words are part of the vocabulary.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df.clean_text)

################################### standard train/val split
train_text, val_text, y_train, y_val = train_test_split(df.clean_text, df.polarity,
                                                        test_size=0.2, random_state=123, stratify=df.polarity)


################################## indexing + padding
# convert train and val texts to token sequences of standardized length seq_len:
# shorter sequences get trailing 0s (padding='post'); longer ones are cut by
# pad_sequences' default truncating='pre', which drops tokens from the START
train_text = tokenizer.texts_to_sequences(train_text)
train_text = pad_sequences(train_text, maxlen=seq_len, padding='post')

val_text = tokenizer.texts_to_sequences(val_text)
val_text = pad_sequences(val_text, maxlen=seq_len, padding='post')

print('one training text after padding: ')
print(train_text[10])

################################### preparing embedding matrix
embedding_dim = w2v.vector_size # w2v embedding dim
word_index = tokenizer.word_index # vocab
# Look vectors up on the KeyedVectors object: indexing the Word2Vec model itself
# (w2v[word]) is deprecated in gensim 3.x and removed in gensim 4.
word_vectors = w2v.wv

# use the gensim model to build a numpy array of embeddings,
# we'll feed this array to the keras embeddings layer.
# row i of the array holds the vector of the word whose token id is i
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    # An explicit membership test replaces the former bare `except: pass`, which
    # also hid unrelated errors and could silently leave the whole matrix at zero.
    if word in word_vectors:
        embedding_matrix[i] = word_vectors[word]
    # otherwise the word is missing in w2v and keeps the 0 vector
print('embedding matrix shape (vocab, embeddings dim) is ', embedding_matrix.shape)


##################################### defining model

inp = Input(shape=(seq_len,))
x = Embedding(len(word_index) + 1,
              embedding_dim,
              weights=[embedding_matrix], # where we feed the pretrained vecs
              trainable=False)(inp) # freeze these parameters in the model
#recurrent_dropout=.1
# NOTE(review): activation='relu' replaces the LSTM default (tanh), which also rules
# out the fused cuDNN kernel -- confirm this choice is deliberate.
x = Bidirectional(LSTM(64, activation='relu'))(x)
# NOTE(review): no activation is passed, so this fully connected layer is linear.
x = Dense(32)(x) # fully connected layer on top of the output of the bi-LSTM
#x = Dropout(.3)(x)
#x = Dense(20)(x) #added
y = Dense(1, activation='sigmoid')(x)

NN = Model(inp, y)

# summary() prints the table itself and returns None, so it is not wrapped in print()
NN.summary()

######################################## compiling model
threshold = 0.5
# The model has ONE sigmoid output, so the F1 metric is declared with num_classes=1;
# `threshold` turns the predicted probability into a 0/1 decision.
NN.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc',tfa.metrics.F1Score(num_classes=1, average='micro', threshold=threshold)])


######################################## fitting model = training
NN.fit(train_text, y_train, validation_data=(val_text, y_val), epochs=10, batch_size=50, verbose=1)


######################################### testing

# A separate test set could be used if we had train, val, and test sets,
# but since the data size is not too large, only two splits are used,
# so the reported result is the score on the validation set

prob= NN.predict(val_text)
# use the same cut-off as the F1 metric given to compile()
predections= [1 if p >= threshold else 0 for p in prob]
print('---------------------------RESULT------------------------')
print('The model f1 score on validation set is :', f1_score(y_val, predections))


one training text after padding: 
[ 5578  7055   263  6755   125   890    71  6265    22  6799  1160 29361
   603    79    49  1314    53   327  2713   403  2690  2415    15  1230
 29362    89   673 14053  4009  2796  2090 13427 14054  2090  3698 14055
 18734   398   731   689  1668   164  2351    58 18735     7  3719    32
   100  6806  1990  2791 29363    18    32  1277 29364 29365 18736  1045
    58  5629 29366  1912     7    42   204   284 11267 18737 14056 14057
 11268  6266    22  2284   329 18643  5130  7971   982    40    22  9028
   303   505    49  4674    81 29367  3475 29368   372     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0]




embedding matrix shape (vocab, embeddings dim) is  (62508, 300)
Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 128)]             0         
                                                                 
 embedding_3 (Embedding)     (None, 128, 300)          18752400  
                                                                 
 bidirectional_3 (Bidirectio  (None, 128)              186880    
 nal)                                                            
                                                                 
 dense_6 (Dense)             (None, 32)                4128      
                                                                 
 dense_7 (Dense)             (None, 1)                 33        
                                                                 
Total params: 18,943,441
Trainable params: 191,041
Non-traina

In [22]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score

seq_len = 128  # every text is padded / truncated to exactly this many tokens

#------------------------------------------------------ Choose a dataset df
# Single-domain alternatives: res_df, prod_df, htl_df, mov_df.
# Here the four domains are concatenated and trained on together.
df = pd.concat([res_df, prod_df, htl_df, mov_df])

#################################### vectorizing cleaned text
# Build the word -> integer id vocabulary from the cleaned texts.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df.clean_text)

################################### standard train/val split
# Stratified 80/20 split so both parts keep the same polarity balance.
train_text, val_text, y_train, y_val = train_test_split(
    df.clean_text, df.polarity,
    test_size=0.2, random_state=123, stratify=df.polarity)

################################## indexing + padding
# Convert texts to sequences of token ids of fixed length `seq_len`.
# padding='post' appends trailing 0s to short texts; longer texts are cut down
# to seq_len tokens (Keras' default truncating='pre' drops tokens from the start).
train_text = tokenizer.texts_to_sequences(train_text)
train_text = pad_sequences(train_text, maxlen=seq_len, padding='post')

val_text = tokenizer.texts_to_sequences(val_text)
val_text = pad_sequences(val_text, maxlen=seq_len, padding='post')

print('one training text after padding: ')
print(train_text[10])

################################### preparing embedding matrix
embedding_dim = w2v.vector_size  # w2v embedding dim
word_index = tokenizer.word_index  # vocab: word -> token id (ids start at 1)

# Look words up on the keyed vectors. getattr() keeps this working whether `w2v`
# is a full Word2Vec model (vectors live under `.wv`) or already a KeyedVectors.
word_vectors = getattr(w2v, 'wv', w2v)

# Row i of the matrix holds the pretrained vector of the word whose token id is i.
# Row 0 (the padding id) and words missing from w2v keep an all-zero vector.
# An explicit membership test replaces the former bare `except: pass`, which would
# also have hidden unrelated errors and silently produced an all-zero matrix.
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    if word in word_vectors:
        embedding_matrix[i] = word_vectors[word]
print('embedding matrix shape (vocab, embeddings dim) is ', embedding_matrix.shape)


##################################### defining model

inp = Input(shape=(seq_len,))
x = Embedding(len(word_index) + 1,
              embedding_dim,
              weights=[embedding_matrix],  # where we feed the pretrained vecs
              trainable=False)(inp)  # freeze these parameters in the model
# NOTE(review): activation='relu' is not the LSTM default ('tanh'); kept as-is so
# results stay comparable, but confirm it is intentional.
x = Bidirectional(LSTM(64, activation='relu'))(x)
x = Dense(32)(x)  # fully connected (linear) layer on top of the bi-LSTM output
y = Dense(1, activation='sigmoid')(x)  # probability of the positive class

NN = Model(inp, y)

# summary() prints the table itself and returns None, so it is not wrapped in print().
NN.summary()

######################################## compiling model
threshold = 0.5  # decision threshold shared by the training metric and the final evaluation
NN.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc',tfa.metrics.F1Score(num_classes=2, average='micro', threshold=threshold)])


######################################## fitting model = training
NN.fit(train_text, y_train, validation_data=(val_text, y_val), epochs=10, batch_size=50, verbose=1)


######################################### testing

# Only two splits (train / validation) are used because the data is not large,
# so the reported score is the validation score of the model after the last epoch.

prob = NN.predict(val_text)
# Same threshold as the training-time F1 metric; ravel() flattens the (n, 1) output.
predections = (prob.ravel() >= threshold).astype(int).tolist()
print('---------------------------RESULT------------------------')
print('The model f1 score on validation set is :', f1_score(y_val, predections))


one training text after padding: 
[    91   4921    961   2039    227    224    353  17643  44581     91
    401   2813     85      6  60782    220    131   6108     11    241
    371    467     51    614   3501   2989     55     78    182   1699
     36      6  35705 114340   2042   9754   9779  20073      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0]




embedding matrix shape (vocab, embeddings dim) is  (147742, 300)
Model: "model_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, 128)]             0         
                                                                 
 embedding_4 (Embedding)     (None, 128, 300)          44322600  
                                                                 
 bidirectional_4 (Bidirectio  (None, 128)              186880    
 nal)                                                            
                                                                 
 dense_8 (Dense)             (None, 32)                4128      
                                                                 
 dense_9 (Dense)             (None, 1)                 33        
                                                                 
Total params: 44,513,641
Trainable params: 191,041
Non-train

**ArSenTD-Lev (Arabic Sentiment Twitter Dataset for LEVantine dialect)**

For this dataset, I will use the AraVec word2vec model pretrained on Twitter data.
Since the dataset is small, I will train on all topics together rather than on each single domain separately.

In [23]:
!wget 'https://bakrianoo.ewr1.vultrobjects.com/aravec/full_uni_sg_300_twitter.zip'
!unzip 'full_uni_sg_300_twitter.zip'

--2021-12-15 17:00:51--  https://bakrianoo.ewr1.vultrobjects.com/aravec/full_uni_sg_300_twitter.zip
Resolving bakrianoo.ewr1.vultrobjects.com (bakrianoo.ewr1.vultrobjects.com)... 108.61.0.122, 2001:19f0:0:22::100
Connecting to bakrianoo.ewr1.vultrobjects.com (bakrianoo.ewr1.vultrobjects.com)|108.61.0.122|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2832094149 (2.6G) [application/zip]
Saving to: ‘full_uni_sg_300_twitter.zip’


2021-12-15 17:01:29 (71.2 MB/s) - ‘full_uni_sg_300_twitter.zip’ saved [2832094149/2832094149]

Archive:  full_uni_sg_300_twitter.zip
  inflating: full_uni_sg_300_twitter.mdl  
  inflating: full_uni_sg_300_twitter.mdl.trainables.syn1neg.npy  
  inflating: full_uni_sg_300_twitter.mdl.wv.vectors.npy  


In [24]:
# Load the pretrained AraVec Twitter word2vec model from the extracted .mdl file.
tw2v = gensim.models.Word2Vec.load("full_uni_sg_300_twitter.mdl")
# Report how many distinct tokens the pretrained vocabulary contains.
print(
    "We've",
    len(tw2v.wv.index2word),
    "vocabularies",
)

We've 1259756 vocabularies


In [25]:
# Compiled once when the cell runs rather than on every call.
# The previous pattern contained the range U+24C2..U+1F251, which spans most of
# the Basic Multilingual Plane and therefore also deleted ordinary text such as
# Arabic presentation-form letters and CJK characters. It is replaced here by
# explicit emoji / symbol blocks only.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs (not covered before)
    "\U0001F000-\U0001F251"  # game tiles, enclosed characters, flags (iOS)
    "\u25A0-\u27BF"          # geometric shapes, miscellaneous symbols, dingbats
    "\u2B00-\u2BFF"          # miscellaneous symbols and arrows
    "\u24C2"                 # circled latin capital M
    "\uFE0F"                 # emoji variation selector
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(text):
    """Return `text` with emoji and pictographic symbols removed.

    Matched characters are deleted (not replaced by a space), so the text on
    both sides of an emoji is joined. Non-emoji characters, including Arabic
    presentation forms and other scripts, are left untouched.

    Parameters
    ----------
    text : str
        The text to strip emoji from.

    Returns
    -------
    str
        The text without the matched emoji / symbol characters.
    """
    return _EMOJI_PATTERN.sub('', text)

In [26]:
# define 'clean_tweet' function to clean the text and remove unwanted text parts
# Patterns are compiled once when the cell runs. Raw strings avoid the invalid
# escape sequences (`\@`, `\_`, `\:`, `\s`) of the former non-raw literals.
_URL_RE = re.compile(r"https?://[A-Za-z0-9./]+")
_RETWEET_RE = re.compile(r"RT @[_\-0-9a-zA-Z]+:?")
_MENTION_RE = re.compile(r"@[_0-9a-zA-Z]+:?")
_LATIN_DIGITS_RE = re.compile(r"[a-zA-Z0-9]+")
# re.escape() is required: unescaped, the `\]` inside string.punctuation is read as
# an escaped `]`, so the backslash itself was never removed. The Arabic comma,
# semicolon and question mark are added because string.punctuation is ASCII-only.
_PUNCTUATION_RE = re.compile("[" + re.escape(string.punctuation) + "\u060C\u061B\u061F" + "]")
_WHITESPACE_RE = re.compile(r"\s+")
_ALEF_VARIANTS_RE = re.compile("[أإآ]")
# Tatweel (U+0640) plus the diacritics U+064B..U+0652: tanwin fath/damm/kasr,
# fatha, damma, kasra, shadda, sukun.
_TASHKEEL_RE = re.compile("[\u0640\u064B-\u0652]")
_REPEATED_CHAR_RE = re.compile(r"(.)\1+")


def clean_tweet(text):
    """Clean one tweet and return its normalized Arabic words joined by spaces.

    Steps, in order:
      1. Remove URLs, retweet markers ("RT @user:"), mentions, Latin letters /
         digits and punctuation (each replaced by a space), then collapse
         whitespace. URLs / mentions are removed before Latin characters so those
         patterns can still match; the final result is the same because every
         character they consume is stripped by the later steps anyway.
      2. Normalize letters: alef-with-hamza/madda variants -> bare alef,
         teh marbuta -> heh, alef maksura -> yeh.
      3. Remove tashkeel (diacritics) and tatweel.
      4. Collapse any run of a repeated character down to two occurrences.
      5. Remove emoji via `remove_emoji`.
      6. Drop words found in `ar_stopwords` and words shorter than 2 characters.

    Parameters
    ----------
    text : str
        Raw tweet text. Non-string input (e.g. NaN from a missing cell) yields ''.

    Returns
    -------
    str
        The cleaned words joined by single spaces (possibly empty).
    """
    # Missing values reach here as float NaN / None when applied over a DataFrame column.
    if not isinstance(text, str):
        return ''

    # remove unwanted parts
    text = _URL_RE.sub(' ', text)
    text = _RETWEET_RE.sub(' ', text)
    text = _MENTION_RE.sub(' ', text)
    text = _LATIN_DIGITS_RE.sub(' ', text)
    text = _PUNCTUATION_RE.sub(' ', text)
    # collapse extra whitespace
    text = _WHITESPACE_RE.sub(' ', text)

    # normalize hamza-carrying alef forms, teh marbuta and alef maksura
    text = _ALEF_VARIANTS_RE.sub('ا', text)
    text = text.replace('ة', 'ه').replace('ى', 'ي')

    # remove tashkeel (diacritics) and tatweel
    text = _TASHKEEL_RE.sub('', text)

    # shrink runs of the same character to exactly two
    text = _REPEATED_CHAR_RE.sub(r'\1\1', text).strip()

    text = remove_emoji(text)

    # NOTE(review): stopwords are matched AFTER letter normalization, so this
    # presumably relies on `ar_stopwords` holding normalized spellings — confirm.
    words = [
        word for word in text.split()
        if len(word) >= 2 and word not in ar_stopwords
    ]

    # merge and return final text
    return ' '.join(words)

In [28]:
# Load the ArSenTD-LEV tweets (tab-separated file) and bring them into the same
# text / polarity / clean_text format as the other datasets.
cor_df = pd.read_csv('ArSenTD-LEV.tsv', sep='\t')
cor_df = cor_df.drop(columns=['Country', 'Sentiment_Expression', 'Sentiment_Target'])
# Binary task: neutral tweets are discarded.
cor_df = cor_df[cor_df['Sentiment'] != 'neutral'].reset_index(drop=True)
cor_df = cor_df.rename(columns={'Tweet': 'text', 'Sentiment': 'polarity'})
# Map the four remaining labels to 1 (positive) / 0 (negative).
# The result is assigned back to the column: calling `.replace(..., inplace=True)`
# on `cor_df['Sentiment']` acts on an intermediate Series and does not update the
# frame when pandas Copy-on-Write is active.
cor_df['polarity'] = cor_df['polarity'].replace(
    {'very_positive': '1', 'positive': '1', 'very_negative': '0', 'negative': '0'})
# to_numeric raises on any label that was not mapped above.
cor_df['polarity'] = pd.to_numeric(cor_df['polarity'])
cor_df['clean_text'] = cor_df['text'].apply(clean_tweet)
cor_df

Unnamed: 0,text,Topic,polarity,clean_text
0,"""أنا أؤمن بأن الانسان ينطفئ جماله عند ابتعاد م...",personal,0,اؤمن بان الانسان ينطفئ جماله ابتعاد يحب بريق ا...
1,من الذاكره... @3FInQe . عندما اعتقد كريستيانو ...,sports,1,الذاكره اعتقد كريستيانو افضل لاعب العالم كاكا ...
2,#مصطلحات_لبنانيه_حيرت_البشريه بتوصل عالبيت ، ب...,personal,0,مصطلحات لبنانيه حيرت البشريه بتوصل عالبيت بنط ...
3,نصمت !! لتسير حياتنا على مً يرام فالناّس لم تع...,personal,0,نصمت لتسير حياتنا يرام فالناس تعد نقيه
4,@Yousef_MUFC اكثر ما يزعجنا بعد مستوانا خارج ا...,sports,0,يزعجنا مستوانا خارج ارضنا تمثيل كونتي فوزه
...,...,...,...,...
3110,نهتم من خلال خدمة تنسيق الرسائل بإظهار رسالة ا...,education,1,نهتم خدمه تنسيق الرسائل باظهار رساله الماجستير...
3111,صلاح من لاعب في المقاولون العرب يحلم ان يلعب ل...,sports,1,صلاح لاعب المقاولون العرب يحلم يلعب للاهلي وال...
3112,الملك سلمان بن عبد العزيز: تطبيق الأنظمة بحزم ...,politics,1,الملك سلمان عبد العزيز تطبيق الانظمه بحزم تطاو...
3113,@ZahraaIraq9 😂 كل ما ادخل حسابي الكه تغريداتج ...,personal,0,ادخل حسابي الكه تغريداتج حب العراق وانتي هسه ي...


In [29]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score

seq_len = 128  # every text is padded / truncated to exactly this many tokens


df = cor_df  # ArSenTD-LEV tweets, all topics together

#################################### vectorizing cleaned text
# Build the word -> integer id vocabulary from the cleaned texts.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df.clean_text)

################################### standard train/val split
# Stratified 80/20 split so both parts keep the same polarity balance.
train_text, val_text, y_train, y_val = train_test_split(
    df.clean_text, df.polarity,
    test_size=0.2, random_state=123, stratify=df.polarity)

################################## indexing + padding
# Convert texts to sequences of token ids of fixed length `seq_len`.
# padding='post' appends trailing 0s to short texts; longer texts are cut down
# to seq_len tokens (Keras' default truncating='pre' drops tokens from the start).
train_text = tokenizer.texts_to_sequences(train_text)
train_text = pad_sequences(train_text, maxlen=seq_len, padding='post')

val_text = tokenizer.texts_to_sequences(val_text)
val_text = pad_sequences(val_text, maxlen=seq_len, padding='post')

print('one training text after padding: ')
print(train_text[10])

################################### preparing embedding matrix
embedding_dim = tw2v.vector_size  # w2v embedding dim
word_index = tokenizer.word_index  # vocab: word -> token id (ids start at 1)

# Look words up on the model's keyed vectors (`tw2v.wv`) rather than by
# subscripting the Word2Vec model object itself.
word_vectors = tw2v.wv

# Row i of the matrix holds the pretrained vector of the word whose token id is i.
# Row 0 (the padding id) and words missing from tw2v keep an all-zero vector.
# An explicit membership test replaces the former bare `except: pass`, which would
# also have hidden unrelated errors and silently produced an all-zero matrix.
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    if word in word_vectors:
        embedding_matrix[i] = word_vectors[word]
print('embedding matrix shape (vocab, embeddings dim) is ', embedding_matrix.shape)


##################################### defining model

inp = Input(shape=(seq_len,))
x = Embedding(len(word_index) + 1,
              embedding_dim,
              weights=[embedding_matrix],  # where we feed the pretrained vecs
              trainable=False)(inp)  # freeze these parameters in the model
# NOTE(review): activation='relu' is not the LSTM default ('tanh'); kept as-is so
# results stay comparable, but confirm it is intentional.
x = Bidirectional(LSTM(64, activation='relu'))(x)
x = Dense(32)(x)  # fully connected (linear) layer on top of the bi-LSTM output
y = Dense(1, activation='sigmoid')(x)  # probability of the positive class

NN = Model(inp, y)

# summary() prints the table itself and returns None, so it is not wrapped in print().
NN.summary()

######################################## compiling model
threshold = 0.5  # decision threshold shared by the training metric and the final evaluation
NN.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc',tfa.metrics.F1Score(num_classes=2, average='micro', threshold=threshold)])


######################################## fitting model = training
NN.fit(train_text, y_train, validation_data=(val_text, y_val), epochs=10, batch_size=50, verbose=1)


######################################### testing

# Only two splits (train / validation) are used because the data is not large,
# so the reported score is the validation score of the model after the last epoch.

prob = NN.predict(val_text)
# Same threshold as the training-time F1 metric; ravel() flattens the (n, 1) output.
predections = (prob.ravel() >= threshold).astype(int).tolist()
print('---------------------------RESULT------------------------')
print('The model f1 score on validation set is :', f1_score(y_val, predections))


one training text after padding: 
[   43    43    43    76   412 16479  1436    38 16480 16481 16482 16483
 16484  4526 16485 16486   446     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0]




embedding matrix shape (vocab, embeddings dim) is  (17055, 300)
Model: "model_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_6 (InputLayer)        [(None, 128)]             0         
                                                                 
 embedding_5 (Embedding)     (None, 128, 300)          5116500   
                                                                 
 bidirectional_5 (Bidirectio  (None, 128)              186880    
 nal)                                                            
                                                                 
 dense_10 (Dense)            (None, 32)                4128      
                                                                 
 dense_11 (Dense)            (None, 1)                 33        
                                                                 
Total params: 5,307,541
Trainable params: 191,041
Non-trainab