# Part 3: Sentiment Analysis - 2

**Note: please download utilities.py **

In this task you are going to create a NN model that classify the status of the writer of some
Arabic text to “happy” and “sad” based on the text. In this task you will represent the text by
averaging the embeddings of the words comprising the text. For the word embeddings here, use
the already available AraVec 3.0 (https://github.com/bakrianoo/aravec).

# **Importing Modules:**

In [1]:
import gensim
import spacy
import gensim
import re
import numpy as np
import utilities
from nltk import ngrams
from utilities import *
import shutil 
import glob
import pandas as pd
import nltk
import string
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Flatten
from keras.layers.core import Activation, Dropout, Dense,Embedding
from sklearn.metrics import confusion_matrix, accuracy_score
from tensorflow.keras.layers import LSTM



# **importing Aravec:**

In [2]:
!pip install gensim spacy nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
!wget "https://bakrianoo.ewr1.vultrobjects.com/aravec/full_grams_cbow_100_twitter.zip"
!unzip "full_grams_cbow_100_twitter.zip"

--2023-01-12 18:39:29--  https://bakrianoo.ewr1.vultrobjects.com/aravec/full_grams_cbow_100_twitter.zip
Resolving bakrianoo.ewr1.vultrobjects.com (bakrianoo.ewr1.vultrobjects.com)... 108.61.0.122, 2001:19f0:0:22::100
Connecting to bakrianoo.ewr1.vultrobjects.com (bakrianoo.ewr1.vultrobjects.com)|108.61.0.122|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1131904811 (1.1G) [application/zip]
Saving to: ‘full_grams_cbow_100_twitter.zip’


2023-01-12 18:39:34 (202 MB/s) - ‘full_grams_cbow_100_twitter.zip’ saved [1131904811/1131904811]

Archive:  full_grams_cbow_100_twitter.zip
  inflating: full_grams_cbow_100_twitter.mdl  
  inflating: full_grams_cbow_100_twitter.mdl.trainables.syn1neg.npy  
  inflating: full_grams_cbow_100_twitter.mdl.wv.vectors.npy  


In [4]:
!wget "https://bakrianoo.ewr1.vultrobjects.com/aravec/full_uni_cbow_300_twitter.zip"
!unzip "full_uni_cbow_300_twitter.zip"

--2023-01-12 18:40:01--  https://bakrianoo.ewr1.vultrobjects.com/aravec/full_uni_cbow_300_twitter.zip
Resolving bakrianoo.ewr1.vultrobjects.com (bakrianoo.ewr1.vultrobjects.com)... 108.61.0.122, 2001:19f0:0:22::100
Connecting to bakrianoo.ewr1.vultrobjects.com (bakrianoo.ewr1.vultrobjects.com)|108.61.0.122|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2833686412 (2.6G) [application/zip]
Saving to: ‘full_uni_cbow_300_twitter.zip’


2023-01-12 18:40:26 (110 MB/s) - ‘full_uni_cbow_300_twitter.zip’ saved [2833686412/2833686412]

Archive:  full_uni_cbow_300_twitter.zip
  inflating: full_uni_cbow_300_twitter.mdl  
  inflating: full_uni_cbow_300_twitter.mdl.trainables.syn1neg.npy  
  inflating: full_uni_cbow_300_twitter.mdl.wv.vectors.npy  


In [5]:
model = gensim.models.Word2Vec.load("full_grams_cbow_100_twitter.mdl")
print("We've",len(model.wv.index2word),"vocabularies")

We've 1476715 vocabularies


In [None]:
%mkdir spacyModel

In [None]:
model.wv.save_word2vec_format("./spacyModel/aravec.txt")

In [None]:
!gzip ./spacyModel/aravec.txt

In [6]:
 #  module

# ============================   
# ====== N-Grams Models ======

t_model = gensim.models.Word2Vec.load('full_grams_cbow_100_twitter.mdl')

# python 3.X
token = clean_str(u'ابو تريكه').replace(" ", "_")
# python 2.7
# token = clean_str(u'ابو تريكه'.decode('utf8', errors='ignore')).replace(" ", "_")

if token in t_model.wv:
    most_similar = t_model.wv.most_similar( token, topn=10 )
    for term, score in most_similar:
        term = clean_str(term).replace(" ", "_")
        if term != token:
            print(term, score)

# تريكه 0.752911388874054
# حسام_غالي 0.7516342401504517
# وائل_جمعه 0.7244222164154053
# وليد_سليمان 0.7177559733390808
# ...

# =========================================
# == Get the most similar tokens to a compound query
# most similar to 
# عمرو دياب + الخليج - مصر

pos_tokens=[ clean_str(t.strip()).replace(" ", "_") for t in ['عمرو دياب', 'الخليج'] if t.strip() != ""]
neg_tokens=[ clean_str(t.strip()).replace(" ", "_") for t in ['مصر'] if t.strip() != ""]

vec = calc_vec(pos_tokens=pos_tokens, neg_tokens=neg_tokens, n_model=t_model, dim=t_model.vector_size)

most_sims = t_model.wv.similar_by_vector(vec, topn=10)
for term, score in most_sims:
    if term not in pos_tokens+neg_tokens:
        print(term, score)

# راشد_الماجد 0.7094649076461792
# ماجد_المهندس 0.6979793906211853
# عبدالله_رويشد 0.6942606568336487
# ...

# ====================
# ====================


# ============================== 
# ====== Uni-Grams Models ======

t_model = gensim.models.Word2Vec.load('full_uni_cbow_300_twitter.mdl')

# python 3.X
token = clean_str(u'تونس')
# python 2.7
# token = clean_str('تونس'.decode('utf8', errors='ignore'))

most_similar = t_model.wv.most_similar( token, topn=10 )
for term, score in most_similar:
    print(term, score)

# ليبيا 0.8864325284957886
# الجزائر 0.8783721327781677
# السودان 0.8573237061500549
# مصر 0.8277812600135803
# ...


# get a word vector
word_vector = t_model.wv[ token ]

ابوتريكه 0.9565805792808533
حازم_امام 0.864891767501831
وائل_جمعه 0.8543370366096497
تريكه 0.8521531820297241
حسام_غالي 0.846001148223877
عماد_متعب 0.8435681462287903
حسن_شحاته 0.8425122499465942
عمرو_زكي 0.8408412337303162
حسام_حسن 0.8271308541297913
رمضان_صبحي 0.8270741701126099
راشد_الماجد 0.7094648480415344
ماجد_المهندس 0.6979794502258301
عبدالله_رويشد 0.6942605376243591
عبدالله_الرويشد 0.6927955746650696
خالد_عبدالرحمن 0.6894348859786987
رابح_صقر 0.684174120426178
عبدالمجيد_عبدالله 0.684122622013092
محمد_عبده 0.6824554204940796
نبيل_شعيل 0.6798837184906006
زايد_الصالح 0.6735830903053284
ليبيا 0.7866284251213074
الجزائر 0.7823305726051331
مصر 0.7278609871864319
فرنسا 0.7052708864212036
موريتانيا 0.6982650756835938
طرابلس 0.6874827146530151
السودان 0.6861478090286255
تركيا 0.6820014119148254
لبنان 0.650722086429596
الاردن 0.6424295902252197


# Importing dataset

Dataset descreption: this dataset consists of happy tweets in .txt format and sad tweets in .txt format.

I uploaded the tweets dataset to google drive, and i'm going to import them to Extracting_folder dir,uznip them and put all the happy tweets in happyTweets.csv and the sad tweets in sadTweets.csv.

In [7]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [8]:
!mkdir Extracting_folder

In [9]:

shutil.unpack_archive("/content/gdrive/MyDrive/Proj_1 Dataset.zip", "Extracting_folder/")

In [10]:

with open('happyTweets.csv', 'a') as csv_file:
    for path in glob.glob('/content/Extracting_folder/Dataset/happy/./*.txt'):
        with open(path) as txt_file:
            txt = txt_file.read() + '\n'
            csv_file.write(txt)


In [11]:
with open('sadTweets.csv', 'a') as csv_file:
    for path in glob.glob('/content/Extracting_folder/Dataset/sad/./*.txt'):
        with open(path) as txt_file:
            txt = txt_file.read() + '\n'
            csv_file.write(txt)

In the code below I labeled the column with "text" and added a new column named "label" with value of 1 to indicate that the data is happy

In [12]:

dataset1 = pd.read_csv('/content/happyTweets.csv',on_bad_lines='skip')
dataset1.columns = ['text'  ] 
dataset1["label"] = "1"
dataset1.head(5)

Unnamed: 0,text,label
0,وأحب أسافر مع سحاب تعلى 💙,1
1,📎 أحبك كثر ما ينسى زمانه مستريح البال وأحبك كث...,1
2,توأم توهم طالعين من بطن أمهم 😍,1
3,#الاتحاد_النصر لاتحسبونا نسينا يالطواقي ولانبي...,1
4,عبير عطري ..يخترق النوافذ.. ليخضع قلبك برقصات ...,1


In the code below I labeled the column with "text" and added a new column named "label" with value of 0 to indicate that the data is sad

In [13]:
dataset2 = pd.read_csv('/content/sadTweets.csv',on_bad_lines='skip')
dataset2.columns = ['text'  ] 
dataset2["label"] = "0"
dataset2.head(5)

Unnamed: 0,text,label
0,ياريت كان موجود عشان نشوف رده الفعل دي 😩 #BEAU...,0
1,كأن يبغى لهم جلد 🌚,0
2,وضعي اليوم 💔,0
3,الاغانى دى بحبها جدا بحب المود بتاعها اووى 💔 ج...,0
4,رئيسين في يومين قالو الربيع العربي 😒 نحنا الكت...,0


In the code below i merged the happyTweets.csv and sadTweets.csv into one csv file called tweets.csv

In [14]:
pd.concat([dataset1, dataset2]).to_csv('tweets.csv', index=False)

In [15]:
df = pd.read_csv('/content/tweets.csv',on_bad_lines='skip')

df.head(5)

Unnamed: 0,text,label
0,وأحب أسافر مع سحاب تعلى 💙,1
1,📎 أحبك كثر ما ينسى زمانه مستريح البال وأحبك كث...,1
2,توأم توهم طالعين من بطن أمهم 😍,1
3,#الاتحاد_النصر لاتحسبونا نسينا يالطواقي ولانبي...,1
4,عبير عطري ..يخترق النوافذ.. ليخضع قلبك برقصات ...,1


# Text pre-processing:

In [16]:

nltk.download('punkt')
nltk.download("stopwords")
arb_stopwords = set(nltk.corpus.stopwords.words("arabic"))

# Clean/Normalize Arabic Text
def clean_str(text):
    search = ["أ","إ","آ","ة","_","-","/",".","،"," و "," يا ",'"',"ـ","'","ى","\\",'\n', '\t','"','?','؟','!']
    replace = ["ا","ا","ا","ه"," "," ","","",""," و"," يا","","","","ي","",' ', ' ',' ',' ? ',' ؟ ',' ! ']
    
    #remove tashkeel
    p_tashkeel = re.compile(r'[\u0617-\u061A\u064B-\u0652]')
    text = re.sub(p_tashkeel,"", text)
    
    #remove longation
    p_longation = re.compile(r'(.)\1+')
    subst = r"\1\1"
    text = re.sub(p_longation, subst, text)
    
    
    text = text.replace('وو', 'و')
    text = text.replace('يي', 'ي')
    text = text.replace('اا', 'ا')
    text = text.replace("[إأآا]", "ا")
    text = text.replace("ة", "ه")
    text = text.replace("گ", "ك")
    
    for i in range(0, len(search)):
        text = text.replace(search[i], replace[i])
    text = " ".join([w for w in text.split(" ") if not w in arb_stopwords])    
    
    #trim    
    text = text.strip()
    #removing numbers
    mapping = str.maketrans('', '', string.digits)
    text = text.translate(mapping)
    #remove english letters
    text = re.sub(r'[a-z,A-Z]', '', text)

    return text

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Emoji pre-processing: removing emojis

In [17]:

def remove_emoji(string):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F" # emoticons
                           u"\U0001F300-\U0001F5FF" # symbols & pictographs
                           u"\U0001F680-\U0001F6FF" # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF" # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)

applying text and emoji pre-processing to dataset

In [18]:
df['text']=df['text'].apply(clean_str)
df['text']=df['text'].apply(remove_emoji)
print(df.head(5))

                                                text  label
0                              واحب اسافر سحاب تعلي       1
1   احبك كثر ينسي زمانه مستريح البال واحبك كثر قل...      1
2                         توام توهم طالعين بطن امهم       1
3  #الاتحاد النصر لاتحسبونا نسينا يالطواقي ولانبي...      1
4  عبير عطري يخترق النوافذ ليخضع قلبك برقصات كالف...      1


I'll process the texts and store them in X and the label (happy = 1 , sad = 0) and store them in y

In [19]:
X = df.values[:, 0] 
y = df.values[:, 1]

# Splitting dataset into training set and testing set

In [20]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

using Tokenizer class will create a word-to-index dictionary, each word in the corpus is used as key and the index is used as value, after executing the code below we will notice that each sentence(tweet) in the dataset will be converted to an array of integers each sentence has a different length, each word in the sentence is converted to an integer that corresponds to that word.

In [21]:

tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [22]:
X_train

[[1790, 1592, 1790, 1592, 3361, 439, 3362],
 [31, 5],
 [759, 3974],
 [2270, 2271, 2272],
 [149, 241, 551, 475, 452],
 [363, 4245, 37, 4245, 461],
 [184, 338, 1, 2350, 1240, 6, 1122, 633],
 [388, 222, 332, 372, 220, 285, 410, 394],
 [885, 3363, 390, 212, 1146, 92, 680, 2, 5, 2351],
 [53, 1123],
 [176,
  176,
  176,
  957,
  940,
  1987,
  957,
  1124,
  50,
  2066,
  77,
  642,
  108,
  219,
  1049,
  1928,
  1420],
 [1384, 342],
 [1205, 706, 3544, 2436, 1050, 566, 7, 301],
 [707, 634, 3, 692, 526, 708, 650, 221],
 [6, 373, 165, 1352, 2, 3035, 1095, 817, 56, 373, 1, 165],
 [11],
 [2774, 24, 4246, 13, 567, 25, 227, 173, 1546, 275, 1],
 [1839, 2197, 440, 693, 1988, 2, 3975, 1743, 302, 427, 2, 4247, 706, 190, 68],
 [2437, 2438, 568, 568, 568, 568],
 [86, 4786],
 [33, 22, 37, 43, 163, 197, 65, 3, 76, 1],
 [9, 3364],
 [3738,
  3739,
  1421,
  1640,
  254,
  1641,
  3740,
  3741,
  1169,
  886,
  832,
  1692,
  3545,
  1840,
  1169,
  975,
  3742],
 [901, 124, 1422],
 [865, 1547, 3743, 1051, 

Since each list has a different length, i will do padding to set a fixed length value which in this case equals to 100, sentences that are longer than 100 will be truncated to 100 and sentences that are shorter than 100 will add 0's to the end of the sentence till it reaches length 100. 

In [23]:

# Adding 1 because of reserved 0 index
vocab_size = len(tokenizer.word_index) + 1

maxlen = 100

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)
X_train

array([[1790, 1592, 1790, ...,    0,    0,    0],
       [  31,    5,    0, ...,    0,    0,    0],
       [ 759, 3974,    0, ...,    0,    0,    0],
       ...,
       [  11, 3716,  248, ...,    0,    0,    0],
       [ 417,  825,    0, ...,    0,    0,    0],
       [   1,  750,   36, ...,    0,    0,    0]], dtype=int32)

converting X_train,X_test,y_train and y_test to np arrays

In [24]:

X_train = np.asarray(X_train).astype(np.float32)
X_test = np.asarray(X_test).astype(np.float32)
y_train = np.asarray(y_train).astype(np.float32)
y_test = np.asarray(y_test).astype(np.float32)

# Loading AraVec and creating an embedding matrix:

In [25]:
aravec = gensim.models.Word2Vec.load('/content/full_grams_cbow_100_twitter.mdl')

In [26]:
embedding_matrix = np.zeros((vocab_size, 100))
for word, i in tokenizer.word_index.items():
    try:
        embedding_vector = aravec.wv.get_vector(word)
        embedding_matrix[i] = embedding_vector
        
    except KeyError:
        continue

In [27]:
embedding_matrix

array([[ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [-2.14037389e-01,  1.90066540e+00, -6.36513948e-01, ...,
         5.95165300e+00, -3.18309927e+00, -4.45298576e+00],
       [ 2.36931220e-02,  1.74992907e+00, -1.65043747e+00, ...,
        -2.39443541e+00, -1.15075684e+00, -6.20111322e+00],
       ...,
       [-1.18079126e+00,  1.78729117e+00, -4.25312221e-02, ...,
        -3.04296941e-01,  1.21305931e+00, -1.17517459e+00],
       [-5.91417355e-03,  3.35818839e+00,  2.30283332e+00, ...,
        -4.99925852e-01, -5.65355659e-01, -1.73278558e+00],
       [-1.63245261e-01,  7.83558935e-02,  4.30102617e-01, ...,
        -4.70813096e-01, -2.75861233e-01,  4.33836654e-02]])

# Creating NN model

In [None]:
model=Sequential()
embedding_layer=Embedding(input_dim=vocab_size,output_dim=100,input_length=maxlen)
#input_dim : Size of the vocabulary, we will incode words from 0 - vocab_size -1 
#output_dim: Length of the vector for each word
#input_length : Maximum length of a sequence
model.add(embedding_layer)
model.add(Flatten())
model.add(tf.keras.layers.Dense(10, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='tanh'))
#compile model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# summarize the model
print(model.summary())

In [28]:

model = Sequential()
embedding_layer = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=maxlen , trainable=False)
model.add(embedding_layer)

model.add(Flatten())
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='tanh'))

In [29]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 100)          6422100   
                                                                 
 flatten (Flatten)           (None, 10000)             0         
                                                                 
 dense (Dense)               (None, 10)                100010    
                                                                 
 dense_1 (Dense)             (None, 1)                 11        
                                                                 
Total params: 6,522,121
Trainable params: 100,021
Non-trainable params: 6,422,100
_________________________________________________________________
None


training the model

In [31]:
history = model.fit(X_train, y_train, batch_size=128, epochs=30, verbose=1, validation_split=0.2)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


# Evaluating model performance

In [32]:
score = model.evaluate(X_test, y_test, verbose=1)



In [33]:
print("Test Score:", score[0])
print("Test Accuracy:", score[1])

Test Score: 0.6547591686248779
Test Accuracy: 0.7399468421936035


# Predicting the Test set results

In [34]:
y_pred = model.predict(X_test)
y_pred=(y_pred>0.5)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[0. 0.]
 [0. 1.]
 [0. 0.]
 ...
 [0. 0.]
 [0. 0.]
 [1. 0.]]


# Making the confusion matrix

In [35]:
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[4542 1159]
 [1777 3812]]


0.7399468556244464

# Part 5: Sentiment Analysis - 4

Re-implement the same task using a RNN employing LSTM and uses the embeddings
generated using AraVec 3.0. Note that here you will feed your model with the embedding for each
word in sequence.

# Creating RNN with LSTM model

In [38]:
model = Sequential()
embedding_layer = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=maxlen , trainable=False)
model.add(embedding_layer)
model.add(LSTM(128)) #LSTM layer with 128 neurons


model.add(Dense(1, activation='tanh'))
#model.add(Dense(10, activation='relu'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

Training the model

In [39]:
history = model.fit(X_train, y_train, batch_size=128, epochs=30, verbose=1, validation_split=0.2)

score = model.evaluate(X_test, y_test, verbose=1)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


I tried different activation functions and Tanh came out with the best results so I used it.


---


Tanh: loss: 0.6004 - acc: 0.7534


---


relu: loss: 7.5957 - acc: 0.5019


---


sigmoid: loss: 0.6933 - acc: 0.5001

In [40]:
print("Test Score:", score[0])
print("Test Accuracy:", score[1])

Test Score: 0.599871814250946
Test Accuracy: 0.766961932182312


# Predicting the Test set results

In [41]:
y_pred = model.predict(X_test)
y_pred=(y_pred>0.5)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[0. 0.]
 [0. 1.]
 [0. 0.]
 ...
 [0. 0.]
 [0. 0.]
 [0. 0.]]


# Making the confusion matrix

In [42]:
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[4806  895]
 [1736 3853]]


0.7669619131975199