# Part 3: Sentiment Analysis - 2

**Note: please download `utilities.py` before running this notebook.**

In this task you are going to create an NN model that classifies the mood of the writer of an
Arabic text as “happy” or “sad” based on the text. You will represent the text by
averaging the embeddings of the words comprising the text. For the word embeddings, use
the already available AraVec 3.0 (https://github.com/bakrianoo/aravec).

# **Importing Modules:**

In [None]:
import gensim
import spacy
import gensim
import re
import numpy as np
import utilities
from nltk import ngrams
from utilities import *
import shutil 
import glob
import pandas as pd
import nltk
import string
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Flatten
from keras.layers.core import Activation, Dropout, Dense,Embedding
from sklearn.metrics import confusion_matrix, accuracy_score
from tensorflow.keras.layers import LSTM

# **importing Aravec:**

In [None]:
!pip install gensim spacy nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!wget "https://bakrianoo.ewr1.vultrobjects.com/aravec/full_grams_cbow_100_twitter.zip"
!unzip "full_grams_cbow_100_twitter.zip"

--2023-01-02 13:13:05--  https://bakrianoo.ewr1.vultrobjects.com/aravec/full_grams_cbow_100_twitter.zip
Resolving bakrianoo.ewr1.vultrobjects.com (bakrianoo.ewr1.vultrobjects.com)... 108.61.0.122, 2001:19f0:0:22::100
Connecting to bakrianoo.ewr1.vultrobjects.com (bakrianoo.ewr1.vultrobjects.com)|108.61.0.122|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1131904811 (1.1G) [application/zip]
Saving to: ‘full_grams_cbow_100_twitter.zip’


2023-01-02 13:13:19 (81.4 MB/s) - ‘full_grams_cbow_100_twitter.zip’ saved [1131904811/1131904811]

Archive:  full_grams_cbow_100_twitter.zip
  inflating: full_grams_cbow_100_twitter.mdl  
  inflating: full_grams_cbow_100_twitter.mdl.trainables.syn1neg.npy  
  inflating: full_grams_cbow_100_twitter.mdl.wv.vectors.npy  


In [None]:
!wget "https://bakrianoo.ewr1.vultrobjects.com/aravec/full_uni_cbow_300_twitter.zip"
!unzip "full_uni_cbow_300_twitter.zip"

--2023-01-02 13:13:38--  https://bakrianoo.ewr1.vultrobjects.com/aravec/full_uni_cbow_300_twitter.zip
Resolving bakrianoo.ewr1.vultrobjects.com (bakrianoo.ewr1.vultrobjects.com)... 108.61.0.122, 2001:19f0:0:22::100
Connecting to bakrianoo.ewr1.vultrobjects.com (bakrianoo.ewr1.vultrobjects.com)|108.61.0.122|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2833686412 (2.6G) [application/zip]
Saving to: ‘full_uni_cbow_300_twitter.zip’


2023-01-02 13:14:14 (75.1 MB/s) - ‘full_uni_cbow_300_twitter.zip’ saved [2833686412/2833686412]

Archive:  full_uni_cbow_300_twitter.zip
  inflating: full_uni_cbow_300_twitter.mdl  
  inflating: full_uni_cbow_300_twitter.mdl.trainables.syn1neg.npy  
  inflating: full_uni_cbow_300_twitter.mdl.wv.vectors.npy  


In [None]:
# Load the pre-trained AraVec Word2Vec model that was unzipped above.
model = gensim.models.Word2Vec.load("full_grams_cbow_100_twitter.mdl")
# Report how many entries the model's vocabulary holds.
# NOTE(review): `index2word` looks like the gensim 3.x attribute name; newer gensim
# releases appear to expose `index_to_key` instead — confirm against the installed version.
print("We've",len(model.wv.index2word),"vocabularies")

We've 1476715 vocabularies


In [None]:
%mkdir spacyModel

In [None]:
# Export the loaded word vectors in word2vec format into the spacyModel/ directory.
model.wv.save_word2vec_format("./spacyModel/aravec.txt")

In [None]:
!gzip ./spacyModel/aravec.txt

In [None]:
 #  module

# ---------------------------------------------------------------------------
# N-grams model: multi-word expressions are stored as single tokens whose
# words are joined with "_".
# ---------------------------------------------------------------------------
t_model = gensim.models.Word2Vec.load('full_grams_cbow_100_twitter.mdl')

# Normalise the query, then join its words the way the n-grams vocabulary does.
token = clean_str(u'ابو تريكه').replace(" ", "_")

# Only query the model when the token is actually in its vocabulary.
if token in t_model.wv:
    most_similar = t_model.wv.most_similar(token, topn=10)
    for term, score in most_similar:
        term = clean_str(term).replace(" ", "_")
        if term == token:
            # A neighbour that normalises to the query itself is not informative.
            continue
        print(term, score)

# Sample output:
# تريكه 0.752911388874054
# حسام_غالي 0.7516342401504517
# وائل_جمعه 0.7244222164154053
# وليد_سليمان 0.7177559733390808
# ...

# ---------------------------------------------------------------------------
# Compound query: tokens most similar to
# عمرو دياب + الخليج - مصر
# ---------------------------------------------------------------------------
pos_tokens = [
    clean_str(phrase).replace(" ", "_")
    for phrase in (raw.strip() for raw in ['عمرو دياب', 'الخليج'])
    if phrase != ""
]
neg_tokens = [
    clean_str(phrase).replace(" ", "_")
    for phrase in (raw.strip() for raw in ['مصر'])
    if phrase != ""
]

# Combine the positive and negative tokens into a single query vector.
vec = calc_vec(
    pos_tokens=pos_tokens,
    neg_tokens=neg_tokens,
    n_model=t_model,
    dim=t_model.vector_size,
)

most_sims = t_model.wv.similar_by_vector(vec, topn=10)
for term, score in most_sims:
    if term in pos_tokens + neg_tokens:
        # Skip the query tokens themselves.
        continue
    print(term, score)

# Sample output:
# راشد_الماجد 0.7094649076461792
# ماجد_المهندس 0.6979793906211853
# عبدالله_رويشد 0.6942606568336487
# ...


# ---------------------------------------------------------------------------
# Uni-grams model: single-word tokens only.
# ---------------------------------------------------------------------------
t_model = gensim.models.Word2Vec.load('full_uni_cbow_300_twitter.mdl')

token = clean_str(u'تونس')

most_similar = t_model.wv.most_similar(token, topn=10)
for term, score in most_similar:
    print(term, score)

# Sample output:
# ليبيا 0.8864325284957886
# الجزائر 0.8783721327781677
# السودان 0.8573237061500549
# مصر 0.8277812600135803
# ...


# Look up the raw embedding vector of the query token.
word_vector = t_model.wv[token]

ابوتريكه 0.9565805792808533
حازم_امام 0.864891767501831
وائل_جمعه 0.8543370366096497
تريكه 0.8521531820297241
حسام_غالي 0.846001148223877
عماد_متعب 0.8435681462287903
حسن_شحاته 0.8425122499465942
عمرو_زكي 0.8408412337303162
حسام_حسن 0.8271308541297913
رمضان_صبحي 0.8270741701126099
راشد_الماجد 0.7094648480415344
ماجد_المهندس 0.6979794502258301
عبدالله_رويشد 0.6942605376243591
عبدالله_الرويشد 0.6927955746650696
خالد_عبدالرحمن 0.6894348859786987
رابح_صقر 0.684174120426178
عبدالمجيد_عبدالله 0.684122622013092
محمد_عبده 0.6824554204940796
نبيل_شعيل 0.6798837184906006
زايد_الصالح 0.6735830903053284
ليبيا 0.7866284251213074
الجزائر 0.7823305726051331
مصر 0.7278609871864319
فرنسا 0.7052708864212036
موريتانيا 0.6982650756835938
طرابلس 0.6874827146530151
السودان 0.6861478090286255
تركيا 0.6820014119148254
لبنان 0.650722086429596
الاردن 0.6424295902252197


# Importing dataset

Dataset description: this dataset consists of happy tweets and sad tweets, each stored in .txt format.

I uploaded the tweets dataset to Google Drive. I'm going to import it into the Extracting_folder directory, unzip it, and put all the happy tweets in happyTweets.csv and all the sad tweets in sadTweets.csv.

In [None]:
# Mount Google Drive at /content/gdrive so files stored there can be read from the notebook.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
!mkdir Extracting_folder

In [None]:

# Extract the dataset archive from the mounted Drive into the local Extracting_folder/ directory.
shutil.unpack_archive("/content/gdrive/MyDrive/Proj_1 Dataset.zip", "Extracting_folder/")

In [None]:

# Merge every happy-tweet .txt file into a single happyTweets.csv, one file after
# another, each followed by a newline.
# The output is opened with "w" rather than "a" so that re-running this cell
# rebuilds the file instead of appending a second copy of the whole dataset.
# UTF-8 is requested explicitly because the tweets are Arabic text and should not
# depend on the platform's default encoding.
with open('happyTweets.csv', 'w', encoding='utf-8') as csv_file:
    for path in glob.glob('/content/Extracting_folder/Dataset/happy/./*.txt'):
        with open(path, encoding='utf-8') as txt_file:
            csv_file.write(txt_file.read() + '\n')


In [None]:
# Merge every sad-tweet .txt file into a single sadTweets.csv, one file after
# another, each followed by a newline.
# The output is opened with "w" rather than "a" so that re-running this cell
# rebuilds the file instead of appending a second copy of the whole dataset.
# UTF-8 is requested explicitly because the tweets are Arabic text and should not
# depend on the platform's default encoding.
with open('sadTweets.csv', 'w', encoding='utf-8') as csv_file:
    for path in glob.glob('/content/Extracting_folder/Dataset/sad/./*.txt'):
        with open(path, encoding='utf-8') as txt_file:
            csv_file.write(txt_file.read() + '\n')

In the code below I labeled the column with "text" and added a new column named "label" with value of 1 to indicate that the data is happy

In [None]:

# Load the happy tweets into a one-column frame named "text".
# The file has no header row, so header=None / names=[...] is passed explicitly:
# letting pandas infer the header would consume the first tweet as column names,
# and renaming the column afterwards would silently drop that tweet.
# NOTE(review): the file is raw text, not real CSV — lines containing an ASCII comma
# parse as several fields and are dropped by on_bad_lines='skip'; confirm this loss
# is acceptable.
dataset1 = pd.read_csv(
    '/content/happyTweets.csv',
    header=None,
    names=['text'],
    on_bad_lines='skip',
)
# Label 1 marks a happy tweet.
dataset1["label"] = "1"
dataset1.head(5)

Unnamed: 0,text,label
0,| ❀ التهميش حل لڪل شخص يحاول إنو يصعد فوق السط...,1
1,بمناسبة فوز الهلال .. 💙 سحب على آيفون XR📱 رتوي...,1
2,كأن عمري كل ماشفتك .. يطول قولي لي منهو شاعري ...,1
3,الله يوفقه وين ماراح بس حيرجعلي 😍,1
4,لجنة النزاهة اعضاءها من لجان الاتحاد لا يغرك ا...,1


In the code below I labeled the column with "text" and added a new column named "label" with value of 0 to indicate that the data is sad

In [None]:
# Load the sad tweets into a one-column frame named "text".
# The file has no header row, so header=None / names=[...] is passed explicitly:
# letting pandas infer the header would consume the first tweet as column names,
# and renaming the column afterwards would silently drop that tweet.
# NOTE(review): the file is raw text, not real CSV — lines containing an ASCII comma
# parse as several fields and are dropped by on_bad_lines='skip'; confirm this loss
# is acceptable.
dataset2 = pd.read_csv(
    '/content/sadTweets.csv',
    header=None,
    names=['text'],
    on_bad_lines='skip',
)
# Label 0 marks a sad tweet.
dataset2["label"] = "0"
dataset2.head(5)

Unnamed: 0,text,label
0,بالعاافيه .. 🙈 ترى عطيتكم عين,0
1,الحمد لله فيه المعيوف ولا كان علووم 😭,0
2,┊┊⇣✧ ┊⇣✦ ⇣✧ ⠀⠀ لليالي نجد مامثلك لليالي غلاك أ...,0
3,انا بعرف ازي اتقل ع حد بحبه لحد مايضيع مني 😒,0
4,مرصد الأزهر ده تخصص هوليجنز بس ؟؟ ماعندوش حد. ...,0


In the code below I merge happyTweets.csv and sadTweets.csv into a single CSV file called tweets.csv.

In [None]:
# Stack the happy rows on top of the sad rows and write them to tweets.csv without the index column.
pd.concat([dataset1, dataset2]).to_csv('tweets.csv', index=False)

In [None]:
# Read the merged dataset back into a single frame.
# NOTE(review): the file was written to the relative path 'tweets.csv' but is read from
# '/content/tweets.csv' — this assumes the working directory is /content; confirm.
df = pd.read_csv('/content/tweets.csv',on_bad_lines='skip')

# Preview the first five rows.
df.head(5)

Unnamed: 0,text,label
0,| ❀ التهميش حل لڪل شخص يحاول إنو يصعد فوق السط...,1
1,بمناسبة فوز الهلال .. 💙 سحب على آيفون XR📱 رتوي...,1
2,كأن عمري كل ماشفتك .. يطول قولي لي منهو شاعري ...,1
3,الله يوفقه وين ماراح بس حيرجعلي 😍,1
4,لجنة النزاهة اعضاءها من لجان الاتحاد لا يغرك ا...,1


# Text pre-processing:

In [None]:

# Download the NLTK resources.
# NOTE(review): no call that needs the 'punkt' tokenizer is visible in this notebook —
# confirm whether that download is still required.
nltk.download('punkt')
nltk.download("stopwords")
# Arabic stop-word list, kept as a set; clean_str filters words against it.
arb_stopwords = set(nltk.corpus.stopwords.words("arabic"))

# Clean/Normalize Arabic Text
# Patterns and tables are built once instead of on every call.
_TASHKEEL_RE = re.compile(r'[\u0617-\u061A\u064B-\u0652]')  # Arabic diacritics
_LONGATION_RE = re.compile(r'(.)\1+')                        # any repeated character run
_ALEF_VARIANTS_RE = re.compile("[إأآا]")                     # hamza/madda alef forms
# The comma inside the class is kept from the original pattern, so ASCII commas
# are removed together with the Latin letters.
_LATIN_OR_COMMA_RE = re.compile(r'[a-z,A-Z]')
_ASCII_DIGITS_TABLE = str.maketrans('', '', string.digits)


# Clean/Normalize Arabic Text
def clean_str(text):
    """Normalise an Arabic string for tokenisation and embedding lookup.

    Steps, in order:
      1. remove tashkeel (diacritics);
      2. collapse any run of a repeated character to two, then reduce doubled
         waw / ya / alef to a single letter;
      3. unify alef variants to bare alef, ta marbuta to ha, and the Persian
         gaf to kaf;
      4. apply the search/replace table (punctuation, tatweel, alef maqsura,
         newlines/tabs, spacing around question and exclamation marks);
      5. drop words found in the module-level ``arb_stopwords`` set;
      6. delete ASCII digits, Latin letters and ASCII commas;
      7. trim leading/trailing whitespace.

    Args:
        text: the raw string to clean.

    Returns:
        The cleaned string.
    """
    search = ["أ","إ","آ","ة","_","-","/",".","،"," و "," يا ",'"',"ـ","'","ى","\\",'\n', '\t','"','?','؟','!']
    replace = ["ا","ا","ا","ه"," "," ","","",""," و"," يا","","","","ي","",' ', ' ',' ',' ? ',' ؟ ',' ! ']

    # remove tashkeel
    text = _TASHKEEL_RE.sub("", text)

    # remove longation: runs of 3+ identical characters become exactly two
    text = _LONGATION_RE.sub(r"\1\1", text)

    text = text.replace('وو', 'و')
    text = text.replace('يي', 'ي')
    text = text.replace('اا', 'ا')
    # This used to be str.replace("[إأآا]", ...), which looks for that literal
    # bracketed string and therefore never matched; a character class needs re.
    text = _ALEF_VARIANTS_RE.sub("ا", text)
    text = text.replace("ة", "ه")
    text = text.replace("گ", "ك")

    for old, new in zip(search, replace):
        text = text.replace(old, new)

    # NOTE(review): stop words are matched after the alef / ta-marbuta
    # normalisation above; entries of arb_stopwords that still contain the
    # un-normalised forms would never match — confirm against the list.
    text = " ".join(w for w in text.split(" ") if w not in arb_stopwords)

    # removing numbers (ASCII digits only)
    text = text.translate(_ASCII_DIGITS_TABLE)
    # remove english letters (and ASCII commas, see the pattern above)
    text = _LATIN_OR_COMMA_RE.sub('', text)

    # Trim last: the deletions above can expose new whitespace at either end,
    # which trimming before them would leave behind.
    return text.strip()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Emoji pre-processing: removing emojis

In [None]:

# Compiled once. The original single range U+24C2–U+1F251 also covered the
# Arabic Presentation Forms blocks (U+FB50–U+FDFF and U+FE70–U+FEFE), so Arabic
# ligatures and positional letter forms were deleted as if they were emoji.
# The range is split below so those two blocks are left untouched; everything
# else that was matched before is still matched.
_EMOJI_PATTERN = re.compile("["
                            "\U0001F600-\U0001F64F"  # emoticons
                            "\U0001F300-\U0001F5FF"  # symbols & pictographs
                            "\U0001F680-\U0001F6FF"  # transport & map symbols
                            "\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            "\U00002702-\U000027B0"  # dingbats
                            "\U000024C2-\U0000FB4F"  # up to just before Presentation Forms-A
                            "\U0000FE00-\U0000FE6F"  # between Presentation Forms-A and -B
                            "\U0000FEFF-\U0001F251"  # after Presentation Forms-B
                            "]+", flags=re.UNICODE)


def remove_emoji(string):
    """Return *string* with emoji, pictographs and similar symbols removed.

    Arabic letters, including the Arabic Presentation Forms blocks, are kept.

    Args:
        string: the text to filter. The parameter name shadows the ``string``
            module inside this function only; it is kept so existing keyword
            callers continue to work.

    Returns:
        The text with every matched run of characters deleted.
    """
    return _EMOJI_PATTERN.sub(r'', string)

Applying the text and emoji pre-processing to the dataset:

In [None]:
# Normalise every tweet first, then strip emoji/symbols from the normalised text.
df['text'] = df['text'].apply(clean_str).apply(remove_emoji)
print(df.head(5))

                                                text  label
0  |  التهميش حل لڪل شخص يحاول انو يصعد السطور ما...      1
1  بمناسبه فوز الهلال   سحب علي ايفون  رتويت وتاب...      1
2  عمري ماشفتك  يطول قولي منهو شاعري  ؟  واقول ان...      1
3                      الله يوفقه وين ماراح حيرجعلي       1
4  لجنه النزاهه اعضاءها لجان الاتحاد يغرك اسم الل...      1


I'll store the processed texts in X and the labels (happy = 1, sad = 0) in y.

In [None]:
# Column 0 holds the tweet text, column 1 holds the label.
X, y = df.values[:, 0], df.values[:, 1]

# Splitting dataset into training set and testing set

In [None]:

# Hold out 20% of the tweets for testing; random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

The Tokenizer class builds a word-to-index dictionary in which each word in the corpus is a key and its integer index is the value. After running the code below, each sentence (tweet) in the dataset is converted to a list of integers, where each integer corresponds to one word; because tweets differ in length, the lists have different lengths.

In [None]:

# Build the word index from the training texts only.
# num_words=5000 limits which word indices are emitted when texts are converted to sequences.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

# Replace each tweet with its list of word indices.
# X_train / X_test are rebound here: from this point on they hold sequences, not text.
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [None]:
X_train

[[362, 10],
 [4815, 3, 664, 32, 942, 2098, 53, 558, 2099, 419, 1744, 307],
 [3362, 598, 96],
 [1],
 [2179, 69, 4816, 644, 73],
 [4206, 2347, 549, 1044],
 [119,
  2649,
  66,
  633,
  178,
  3548,
  257,
  3746,
  3747,
  70,
  3549,
  1797,
  3550,
  3040,
  2,
  1132,
  5,
  113,
  3748,
  3363],
 [],
 [277,
  1,
  1519,
  1955,
  928,
  261,
  1627,
  889,
  1896,
  1956,
  1957,
  928,
  261,
  1431,
  1217,
  1628,
  82],
 [7, 1577, 144, 4817, 172, 1203, 4818],
 [685, 419, 622, 599, 289, 929, 1365, 179, 1238, 179, 71, 1396, 417, 2650],
 [3749, 470],
 [461,
  1,
  105,
  108,
  823,
  105,
  3,
  3964,
  4481,
  2897,
  4482,
  132,
  13,
  1,
  1798,
  4819,
  5,
  4820],
 [737, 269, 1109, 73, 4207, 66, 164, 66, 1578],
 [2011, 2, 203, 2, 3965, 2, 3041],
 [457, 383, 51],
 [254, 943, 3750, 176, 1745, 3205, 3042, 4483, 81, 944, 3043, 4821, 4208],
 [68, 114, 1153, 121, 2180, 1045, 1, 2100, 1, 176, 207],
 [930],
 [58, 44, 6, 37, 2, 64, 21, 84, 29, 30, 85, 88],
 [1547, 203, 1844, 1629, 1

Since each list has a different length, I will pad the sequences to a fixed length of 100: sentences longer than 100 tokens are truncated to 100, and shorter sentences have 0s appended at the end until they reach length 100.

In [None]:

# Adding 1 because of reserved 0 index
# NOTE(review): this counts every word the tokenizer saw, while the sequences only use
# the indices allowed by num_words=5000, so anything sized by vocab_size may be larger
# than needed — confirm.
vocab_size = len(tokenizer.word_index) + 1

# Fixed sequence length fed to the network.
maxlen = 100

# padding='post' appends the zeros after the real tokens.
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)
# Display the padded training matrix.
X_train

array([[ 362,   10,    0, ...,    0,    0,    0],
       [4815,    3,  664, ...,    0,    0,    0],
       [3362,  598,   96, ...,    0,    0,    0],
       ...,
       [4994, 4995, 4364, ...,    0,    0,    0],
       [ 858, 3439, 2049, ...,    0,    0,    0],
       [2869, 2336, 1516, ...,    0,    0,    0]], dtype=int32)

converting X_train,X_test,y_train and y_test to np arrays

In [None]:

# Cast all four splits to float32 NumPy arrays in one pass.
X_train, X_test, y_train, y_test = (
    np.asarray(split).astype(np.float32)
    for split in (X_train, X_test, y_train, y_test)
)

# Loading AraVec and creating an embedding matrix:

In [None]:
# Load the AraVec model whose vectors fill the embedding matrix below.
# NOTE(review): the same .mdl file is already loaded earlier in the notebook (as `model`
# and `t_model`); confirm whether a fresh load is needed here.
aravec = gensim.models.Word2Vec.load('/content/full_grams_cbow_100_twitter.mdl')

In [None]:
# One 100-dimensional row per tokenizer index; row 0 (padding) and rows of
# words that AraVec does not know stay all-zero.
embedding_matrix = np.zeros((vocab_size, 100))
for word, i in tokenizer.word_index.items():
    if word in aravec.wv:
        embedding_matrix[i] = aravec.wv.get_vector(word)

In [None]:
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.21403739,  1.9006654 , -0.63651395, ...,  5.951653  ,
        -3.18309927, -4.45298576],
       [ 0.02369312,  1.74992907, -1.65043747, ..., -2.39443541,
        -1.15075684, -6.20111322],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.4668681 , -0.54435545, -0.67405331, ..., -1.13150871,
        -0.05076632,  0.58615381],
       [ 0.17737707, -0.45625114,  0.44169402, ..., -0.16563699,
        -0.46141586,  0.14885765]])

# Creating NN model

In [None]:

# Frozen embedding layer initialised from the AraVec matrix: a 100-d vector per
# word index, for input sequences of length maxlen.
embedding_layer = Embedding(
    vocab_size,
    100,
    weights=[embedding_matrix],
    input_length=maxlen,
    trainable=False,
)

# Embedding -> Flatten -> single output unit.
# NOTE(review): the task statement asks for the *average* of the word embeddings,
# but Flatten concatenates all maxlen vectors instead — consider a pooling layer;
# confirm the intent.
# NOTE(review): tanh can emit negative values while the model is trained with
# binary cross-entropy and thresholded at 0.5 — a sigmoid output is the usual
# pairing; confirm.
model = Sequential([
    embedding_layer,
    Flatten(),
    Dense(1, activation='tanh'),
])

In [None]:
# Adam optimiser, binary cross-entropy loss, accuracy reported under the name "acc".
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

# summary() prints the table itself and returns None, so wrapping it in print()
# only added a stray "None" line after the table.
model.summary()

Model: "sequential_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_11 (Embedding)    (None, 100, 100)          6422000   
                                                                 
 flatten_1 (Flatten)         (None, 10000)             0         
                                                                 
 dense_10 (Dense)            (None, 1)                 10001     
                                                                 
Total params: 6,432,001
Trainable params: 10,001
Non-trainable params: 6,422,000
_________________________________________________________________
None


training the model

In [None]:
# Train for 6 epochs in batches of 128, holding out 20% of the training data for validation.
history = model.fit(X_train, y_train, batch_size=128, epochs=6, verbose=1, validation_split=0.2)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


# Evaluating model performance

In [None]:
# Evaluate on the held-out test set; score[0] is the loss and score[1] the "acc" metric.
score = model.evaluate(X_test, y_test, verbose=1)



In [None]:
# Report the test loss and the test accuracy.
print(f"Test Score: {score[0]}")
print(f"Test Accuracy: {score[1]}")

Test Score: 0.5472121238708496
Test Accuracy: 0.7155026793479919


# Predicting the Test set results

In [None]:
# Threshold the network output at 0.5 to obtain boolean class predictions.
y_pred = model.predict(X_test) > 0.5
# Show predictions (left column) next to the true labels (right column).
print(np.concatenate((y_pred.reshape(-1, 1), y_test.reshape(-1, 1)), axis=1))

[[0. 0.]
 [0. 0.]
 [0. 1.]
 ...
 [1. 1.]
 [1. 1.]
 [1. 1.]]


# Making the confusion matrix

In [None]:
# Confusion matrix of true labels (first argument) against predicted labels (second argument).
cm = confusion_matrix(y_test, y_pred)
print(cm)
# Overall fraction of correct predictions; displayed as the cell's output.
accuracy_score(y_test, y_pred)

[[4622 1010]
 [2196 3441]]


0.7155027065400656

# Part 5: Sentiment Analysis - 4

Re-implement the same task using a RNN employing LSTM and uses the embeddings
generated using AraVec 3.0. Note that here you will feed your model with the embedding for each
word in sequence.

# Creating RNN with LSTM model

In [None]:
# Frozen embedding layer initialised from the AraVec matrix: a 100-d vector per
# word index, for input sequences of length maxlen.
embedding_layer = Embedding(
    vocab_size,
    100,
    weights=[embedding_matrix],
    input_length=maxlen,
    trainable=False,
)

# Embedding -> LSTM (128 units) -> single output unit.
# NOTE(review): tanh can emit negative values while the model is trained with
# binary cross-entropy and thresholded at 0.5 — a sigmoid output is the usual
# pairing; confirm.
# NOTE(review): the inputs are zero-padded at the end and no masking is configured
# on the embedding layer, so the LSTM also steps over the padding — confirm this
# is intended.
model = Sequential([
    embedding_layer,
    LSTM(128),
    Dense(1, activation='tanh'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

Training the model

In [None]:
# Train for 6 epochs in batches of 128, holding out 20% of the training data for validation.
history = model.fit(X_train, y_train, batch_size=128, epochs=6, verbose=1, validation_split=0.2)

# Evaluate on the held-out test set; score[0] is the loss and score[1] the "acc" metric.
score = model.evaluate(X_test, y_test, verbose=1)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


I tried different activation functions and Tanh came out with the best results so I used it.


---


Tanh: loss: 0.6004 - acc: 0.7534


---


relu: loss: 7.5957 - acc: 0.5019


---


sigmoid: loss: 0.6933 - acc: 0.5001

In [None]:
# Report the test loss and the test accuracy.
print(f"Test Score: {score[0]}")
print(f"Test Accuracy: {score[1]}")

Test Score: 0.48427847027778625
Test Accuracy: 0.759251058101654


# Predicting the Test set results

In [None]:
# Threshold the network output at 0.5 to obtain boolean class predictions.
y_pred = model.predict(X_test) > 0.5
# Show predictions (left column) next to the true labels (right column).
print(np.concatenate((y_pred.reshape(-1, 1), y_test.reshape(-1, 1)), axis=1))

[[0. 0.]
 [0. 0.]
 [1. 1.]
 ...
 [1. 1.]
 [1. 1.]
 [1. 1.]]


# Making the confusion matrix

In [None]:
# Confusion matrix of true labels (first argument) against predicted labels (second argument).
cm = confusion_matrix(y_test, y_pred)
print(cm)
# Overall fraction of correct predictions; displayed as the cell's output.
accuracy_score(y_test, y_pred)

[[4600 1032]
 [1681 3956]]


0.7592510426834679