# **Part 1: Arabic Word Embeddings**

Perform the text preprocessing steps that you think are important, such as handling emojis
and numbers, prepare the dataset for training your model similar to that we have seen in the class,
and then create the NN model and train it. Once you are done, extract the embeddings and create
a simple API that takes an Arabic word and returns its embeddings.

# Importing Modules:

In [1]:
import gensim
import spacy
import gensim
import re
import numpy as np
from nltk import ngrams
import shutil 
import glob
import pandas as pd
import nltk
import string
import tensorflow as tf 
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Flatten
from keras.layers.core import Activation, Dropout, Dense,Embedding
from sklearn.metrics import confusion_matrix, accuracy_score
from tensorflow.keras.layers import LSTM





# Importing the dataset

The dataset combines reviews from hotels, books, movies, products and a few airlines. It has three classes (Mixed, Negative and Positive). Most were mapped from reviewers' ratings with 3 being mixed, above 3 positive and below 3 negative. Each row has a label and text separated by a tab (tsv). Text (reviews) were cleaned by removing Arabic diacritics and non-Arabic characters. The dataset has no duplicate reviews. 

Importing the dataset from Google Drive

In [28]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
import pandas as pd
path='/content/gdrive/MyDrive/arabic dataset/ar_reviews_100k.tsv.zip'
df=pd.read_csv(path, delimiter = '\t',on_bad_lines='skip')
df.head()

Unnamed: 0,label,text
0,Positive,ممتاز نوعا ما . النظافة والموقع والتجهيز والشا...
1,Positive,أحد أسباب نجاح الإمارات أن كل شخص في هذه الدول...
2,Positive,هادفة .. وقوية. تنقلك من صخب شوارع القاهرة الى...
3,Positive,خلصنا .. مبدئيا اللي مستني ابهار زي الفيل الاز...
4,Positive,ياسات جلوريا جزء لا يتجزأ من دبي . فندق متكامل...


since the label has string values, I'll convert Positive to 1, Mixed to 0 and Negative to -1

In [4]:
label_mapping = {"Positive": 1, "Negative":-1 , "Mixed": 0}

df["label"] = df["label"].map(label_mapping)

# Text pre-processing:

In [5]:
# To remove rows where the content of the row = null
df.dropna(inplace=True)

In [6]:
nltk.download('punkt')
nltk.download("stopwords")
arb_stopwords = set(nltk.corpus.stopwords.words("arabic"))

# Clean/Normalize Arabic Text
def clean_str(text):
    search = ["أ","إ","آ","ة","_","-","/",".","،"," و "," يا ",'"',"ـ","'","ى","\\",'\n', '\t','"','?','؟','!']
    replace = ["ا","ا","ا","ه"," "," ","","",""," و"," يا","","","","ي","",' ', ' ',' ',' ? ',' ؟ ',' ! ']
    
    #remove tashkeel
    p_tashkeel = re.compile(r'[\u0617-\u061A\u064B-\u0652]')
    text = re.sub(p_tashkeel,"", text)
    
    #remove longation
    p_longation = re.compile(r'(.)\1+')
    subst = r"\1\1"
    text = re.sub(p_longation, subst, text)
    
    
    text = text.replace('وو', 'و')
    text = text.replace('يي', 'ي')
    text = text.replace('اا', 'ا')
    text = text.replace("[إأآا]", "ا")
    text = text.replace("ة", "ه")
    text = text.replace("گ", "ك")
    
    for i in range(0, len(search)):
        text = text.replace(search[i], replace[i])
    text = " ".join([w for w in text.split(" ") if not w in arb_stopwords])    
    
    #trim    
    text = text.strip()
    #removing numbers
    mapping = str.maketrans('', '', string.digits)
    text = text.translate(mapping)
    #remove english letters
    text = re.sub(r'[a-z,A-Z]', '', text)

    return text

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Emoji pre-processing: removing emojis

In [7]:

def remove_emoji(string):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F" # emoticons
                           u"\U0001F300-\U0001F5FF" # symbols & pictographs
                           u"\U0001F680-\U0001F6FF" # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF" # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)

applying text and emoji pre-processing to dataset

In [8]:
df['text']=df['text'].apply(clean_str)
df['text']=df['text'].apply(remove_emoji)
print(df.head(5))

   label                                               text
0      1  ممتاز نوعا  النظافه والموقع والتجهيز والشاطيء ...
1      1  احد اسباب نجاح الامارات ان شخص الدوله يعشق ترا...
2      1  هادفه  وقويه تنقلك صخب شوارع القاهره الي هدوء ...
3      1  خلصنا  مبدئيا اللي مستني ابهار زي الفيل الازرق...
4      1  ياسات جلوريا جزء يتجزا دبي  فندق متكامل الخدما...


storing the text column in X and the label column in y

In [9]:
y = df.values[:, 0] 
X = df.values[:, 1]

In [10]:
X

array(['ممتاز نوعا  النظافه والموقع والتجهيز والشاطيء المطعم',
       'احد اسباب نجاح الامارات ان شخص الدوله يعشق ترابها نحب الامارات ومضات فكر نصائح لدوله تطمح بالصفوف الاولي وقائد يقبل الا براحه شعبه وتوفر سب العيش الكريم حكم ومواقف ونصائح لكل فرد فينا بمجرد كتاب سياسي كنت اعتقد يستحق القراءه مرات كثيره',
       'هادفه  وقويه تنقلك صخب شوارع القاهره الي هدوء جبال الشيشان  للتعرف علي حقيقه يجري البلاد حروب ضاربه بحق المسلمين وجزء كبير تاريخ المنطقه التضحيه  الرجوله  الوفاء والكثير القيم الاخري اثبتت وجودها الروايه البسيطه',
       ...,
       'كتاب ضعيف جدا ولم استمتع قصه سرد لحاله او مشهد بدون فكره للقصه',
       'ممله جدا محمد حسن علوان فنان بالكلمات والوصف عنده دقيق وزائد حد اللزوم كتاب اقراءه للكتاب علي امل اني احب كتابته للاسف سيء زي الاول',
       'ارجع اليه مره اخري  قربه البحر المكان قديم توجد خدمات اربع نجوم'],
      dtype=object)

In [11]:
y

array([1, 1, 1, ..., -1, -1, -1], dtype=object)

# Splitting the dataset into Training Set and Test Set

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

Tokenization is said to be dividing a large quantity of text into smaller fragments known as Tokens. The code below will create a word-to-index dictionary where each word in the row(sentence) will be mapped to an integer.

In [13]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)


In [14]:
X_train

[[12,
  416,
  18089,
  28236,
  13463,
  5140,
  3544,
  489,
  38516,
  312,
  1398,
  2508,
  1127,
  42739,
  3744,
  5792,
  3488,
  698,
  1319,
  26549,
  12,
  61,
  2,
  904,
  755,
  980,
  1895,
  10819,
  48036],
 [138, 35, 4923, 3805, 48037],
 [3,
  289,
  3042,
  1286,
  910,
  3,
  38,
  191,
  3214,
  28,
  135,
  4593,
  2985,
  5522,
  667,
  1430,
  2167],
 [38,
  836,
  247,
  1035,
  617,
  3689,
  115282,
  38,
  1021,
  2369,
  3422,
  1939,
  1995,
  263,
  496,
  82025,
  2688,
  445,
  2140,
  54,
  2231,
  186,
  5578],
 [56,
  3,
  36,
  525,
  148,
  2077,
  26550,
  424,
  1128,
  27,
  115283,
  115284,
  22,
  2689,
  2],
 [13826,
  471,
  310,
  3545,
  1698,
  195,
  4972,
  10614,
  3545,
  1260,
  4972,
  9579,
  16831,
  4199,
  1616,
  3690,
  1456],
 [89, 3745, 28237, 2412, 669, 1316],
 [7103,
  338,
  263,
  2841,
  350,
  45,
  1354,
  628,
  4027,
  1513,
  12434,
  8333,
  113,
  35217,
  597,
  7631,
  1337,
  4,
  24,
  115285,
  82026,
  15

In [15]:
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

253070


Since each list has a different length, i will do padding to set a fixed length value which in this case equals to 100, sentences that are longer than 100 will be truncated to 100 and sentences that are shorter than 100 will add 0's to the end of the sentence till it reaches length 100. 

In [16]:
maxlen = 100

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)
X_train

array([[   12,   416, 18089, ...,     0,     0,     0],
       [  138,    35,  4923, ...,     0,     0,     0],
       [    3,   289,  3042, ...,     0,     0,     0],
       ...,
       [ 1637,    69,  3151, ...,     0,     0,     0],
       [   97,    20,    76, ...,     0,     0,     0],
       [ 1950,   425,  1549, ...,     0,     0,     0]], dtype=int32)

# Creating a NN with embedding layer

Embedding layer: Is a lookup table that maps from integer indices (which stand for specific words) to dense vectors (their embeddings).

---
Dense layer: layer in ANN which is deeply connected to its preceding layer. Neurons of the dense layer perfrom matrix-vector multiplication with the neurons of the preceding layer. Dense layer is used for changing/reducing the dimension of the vectors by using every neuron.

---

I also added a flatten layer. Why do we need a flatten layer? to convert the data into a 1D array for inputting it into the next layer in this case the next layer is the dense layer.

In [20]:
model=Sequential()
embedding_layer=Embedding(input_dim=vocab_size,output_dim=100,input_length=maxlen)
#input_dim : Size of the vocabulary, we will incode words from 0 - vocab_size -1 
#output_dim: Length of the vector for each word
#input_length : Maximum length of a sequence
model.add(embedding_layer)
model.add(Flatten())
model.add(tf.keras.layers.Dense(10, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='tanh'))
#compile model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# summarize the model
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 100, 100)          25307000  
                                                                 
 flatten_1 (Flatten)         (None, 10000)             0         
                                                                 
 dense_2 (Dense)             (None, 10)                100010    
                                                                 
 dense_3 (Dense)             (None, 1)                 11        
                                                                 
Total params: 25,407,021
Trainable params: 25,407,021
Non-trainable params: 0
_________________________________________________________________
None


since the vocab_size is 253070, trainable params= vocab_size * 100 (each word is represented as 100-d vector).
In the flattening layer, we simply multiply rows and column.
In the dense layer, the number of params is 1000 and 1 for the bias param.



---



converting X_train,X_test,y_train and y_test to np arrays

In [21]:
X_train = np.asarray(X_train).astype(np.float32)
X_test = np.asarray(X_test).astype(np.float32)
y_train = np.asarray(y_train).astype(np.float32)
y_test = np.asarray(y_test).astype(np.float32)

# Training the model

In [22]:
history = model.fit(X_train, y_train, batch_size=128, epochs=10, verbose=1, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


Getting the weights of the Embedding layer

In [23]:
weights = model.layers[0].get_weights()[0]
print(weights)

[[-0.7601298   0.8570462  -0.69118005 ...  1.0517274   0.6928735
   0.7624987 ]
 [-1.991208    1.9169594  -1.9762536  ...  2.5530415   1.9534229
   1.9724464 ]
 [-2.1275437   2.0733821  -1.9947189  ...  2.0338314   2.0023675
   2.0489068 ]
 ...
 [ 0.03709206  0.01431115  0.04825715 ... -0.00948751  0.01576583
   0.01737461]
 [-0.00854262  0.04703185  0.03483893 ...  0.02980835  0.04633557
  -0.02506369]
 [ 0.00620263  0.02873882  0.02087995 ...  0.02553023 -0.033357
  -0.02456512]]


Saving the weights to a file

In [24]:
np.save('word_embeddings.npy', weights)

# Creating a simple API that takes a word as input and returns its embedding

In [25]:
def get_word_embedding(word, weights):
  # Encode the word as an integer
  word_index = tokenizer.word_index[word]
  word_index = tf.expand_dims([word_index], 0)

  # Use the embedding layer to map the word to its embedding
  embedding = embedding_layer(word_index)
  embedding = tf.squeeze(embedding, 0)
  return embedding
print(get_word_embedding("النظافه",weights = model.layers[0].get_weights()[0]))


tf.Tensor(
[[-0.9922143   1.1814101  -0.97152483  0.14547624 -0.5331973  -1.6971412
  -0.82043153 -0.53876996  0.7780593   0.01815255 -0.5316169  -0.39349267
  -0.30680868 -0.40158752 -0.66147715  1.0508604   0.8876518   1.0048691
   0.74629    -1.1315792   0.7643611  -1.008156    2.4647226   0.8883315
  -1.0197064  -0.6358612   1.1027583  -2.1635435  -0.8823438   0.00381925
   0.7230763  -3.3032553  -0.6933701   0.825052   -0.9408656  -0.788787
   0.61812305  0.41097564  0.8941448   0.92631966  0.930256   -0.9424762
   0.8076341  -0.9124677  -0.67086667 -0.33976287  0.16366878 -1.0262203
   0.5754759  -1.1632648   0.88864493 -3.045166   -0.21738052 -0.1724902
  -0.90607464  1.0016558  -1.0675061  -1.017763    0.849075    0.09227903
   2.4875886  -0.25538683  0.7079304   0.5881984   0.43186814  0.8103003
  -2.4053786  -0.66412675  0.5077886  -0.26462802  0.910804    0.744773
  -1.0276185  -0.87984693  0.84242576  0.7500944  -0.6811668   1.2161808
  -1.0139138   1.2856175  -0.49904862  

# Part 2: Sentiment Analysis - 1

In this task you are going to create a NN model that classify the status of the writer of some
Arabic text to “happy” and “sad” based on the text. In this task you will represent the text by
averaging the embeddings of the words comprising the text. For the word embeddings here, use
the API you have created in Part 1.

Importing the tweets

I uploaded the tweets dataset to google drive, and i'm going to import them to Extracting_folder dir,uznip them and put all the happy tweets in happyTweets.csv and the sad tweets in sadTweets.csv.

In [26]:
!mkdir Extracting_folder

In [29]:
shutil.unpack_archive("/content/gdrive/MyDrive/Proj_1 Dataset.zip", "Extracting_folder/")

In [30]:
with open('happyTweets.csv', 'a') as csv_file:
    for path in glob.glob('/content/Extracting_folder/Dataset/happy/./*.txt'):
        with open(path) as txt_file:
            txt = txt_file.read() + '\n'
            csv_file.write(txt)


In [31]:
with open('sadTweets.csv', 'a') as csv_file:
    for path in glob.glob('/content/Extracting_folder/Dataset/sad/./*.txt'):
        with open(path) as txt_file:
            txt = txt_file.read() + '\n'
            csv_file.write(txt)

In the code below I labeled the column with "text" and added a new column named "label" with value of 1 to indicate that the data is happy

In [32]:
dataset1 = pd.read_csv('/content/happyTweets.csv',on_bad_lines='skip')
dataset1.columns = ['text'  ] 
dataset1["label"] = "1"
dataset1.head(5)

Unnamed: 0,text,label
0,#نطالب_بفتح_مكبرات_الصوت 🔹 رفع نداء الحق في مك...,1
1,تدري متى يلعب بك الهم والشوق لاصرت تعشق واحد م...,1
2,🌸 خذ معک بين اللھم و آمين كل الذين تحبھم سلاما...,1
3,الشيء الوحيد الذي وصلوا فيه للعالمية هو : المس...,1
4,أنا أشبه الأغاني القديمه لا ينصت لها إلا صاحب ...,1


In the code below I labeled the column with "text" and added a new column named "label" with value of 0 to indicate that the data is sad

In [33]:
dataset2 = pd.read_csv('/content/sadTweets.csv',on_bad_lines='skip')
dataset2.columns = ['text'  ] 
dataset2["label"] = "0"
dataset2.head(5)

Unnamed: 0,text,label
0,وش هو السئ للغايه 🤔,0
1,ممكن كل شيء جايز 🤔 العلم عند الله ☝️,0
2,البقاء لله 😢,0
3,ياجمرة الشوق الخفي نسيت أنا و جرحك وفي 💔 #࿐➳❤︎...,0
4,والأم من بعد الله ملاذ !♡ لكن رقه لا ملاذ لها 💔,0


In the code below i merged the happyTweets.csv and sadTweets.csv into one csv file called tweets.csv

In [34]:
pd.concat([dataset1, dataset2]).to_csv('tweets.csv', index=False)

In [35]:
dfT = pd.read_csv('/content/tweets.csv',on_bad_lines='skip')

dfT.head(5)

Unnamed: 0,text,label
0,#نطالب_بفتح_مكبرات_الصوت 🔹 رفع نداء الحق في مك...,1
1,تدري متى يلعب بك الهم والشوق لاصرت تعشق واحد م...,1
2,🌸 خذ معک بين اللھم و آمين كل الذين تحبھم سلاما...,1
3,الشيء الوحيد الذي وصلوا فيه للعالمية هو : المس...,1
4,أنا أشبه الأغاني القديمه لا ينصت لها إلا صاحب ...,1


applying text and emoji pre-processing to dataset

In [36]:
dfT['text']=df['text'].apply(clean_str)
dfT['text']=df['text'].apply(remove_emoji)
print(dfT.head(5))

                                                text  label
0  ممتاز نوعا  النظافه والموقع والتجهيز والشاطيء ...      1
1  احد اسباب نجاح الامارات ان شخص الدوله يعشق ترا...      1
2  هادفه  وقويه تنقلك صخب شوارع القاهره الي هدوء ...      1
3  خلصنا  مبدئيا اللي مستني ابهار زي الفيل الازرق...      1
4  ياسات جلوريا جزء يتجزا دبي  فندق متكامل الخدما...      1


I'll process the texts and store them in Xt and the label (happy = 1 , sad = 0) and store them in yt

In [37]:
Xt = dfT.values[:, 0] 
yt= dfT.values[:, 1]

In [38]:
Xt

array(['ممتاز نوعا  النظافه والموقع والتجهيز والشاطيء المطعم',
       'احد اسباب نجاح الامارات ان شخص الدوله يعشق ترابها نحب الامارات ومضات فكر نصائح لدوله تطمح بالصفوف الاولي وقائد يقبل الا براحه شعبه وتوفر سب العيش الكريم حكم ومواقف ونصائح لكل فرد فينا بمجرد كتاب سياسي كنت اعتقد يستحق القراءه مرات كثيره',
       'هادفه  وقويه تنقلك صخب شوارع القاهره الي هدوء جبال الشيشان  للتعرف علي حقيقه يجري البلاد حروب ضاربه بحق المسلمين وجزء كبير تاريخ المنطقه التضحيه  الرجوله  الوفاء والكثير القيم الاخري اثبتت وجودها الروايه البسيطه',
       ...,
       'رغم الاسلوب الضعيف نسبيا بسبب قله الخبره المقدمه دي اول روايه للكاتبه الروايه كليا حاجه محترمه جدا والخيال عبقري',
       'حسنا الجيد انها انتهت بالطريقه اياها انني كنت اود ان تنتهي زمن البدايه ظننتها ستكون حديث الاشياء بخصوص الشاب الراحل وماننت ان احد يصل بمستوي التطفل الي انتحال شخصيه احدهم ويعتدي علي خصوصياته الامور رسائل الطرفين ترقني احسب يوسف قراره متاخرا المهم انه فعل',
       'روايه رمزيه جيده تسلط الضوء علي حياه الشباب الفتره تاريخ مصر 

In [39]:
yt

array([1, 1, 1, ..., 0, 0, 0], dtype=object)

# Splitting dataset into training set and testing set

In [40]:
Xt_train, Xt_test, yt_train, yt_test = train_test_split(Xt, yt, test_size=0.20, random_state=42)

In [41]:
tokenizer2 = Tokenizer(num_words=5000)
tokenizer2.fit_on_texts(Xt_train)

Xt_train = tokenizer2.texts_to_sequences(Xt_train)
Xt_test = tokenizer2.texts_to_sequences(Xt_test)

In [42]:
Xt_train

[[47, 34, 45, 894, 23, 4087, 198, 978, 825, 582, 51, 2626, 28],
 [47,
  315,
  3953,
  2,
  80,
  19,
  153,
  6,
  296,
  3056,
  4207,
  4743,
  611,
  546,
  37,
  4743,
  13,
  3859,
  6,
  3022,
  2,
  59,
  3610,
  748],
 [1125, 111, 20, 1399, 356, 299],
 [1,
  1070,
  2895,
  81,
  116,
  269,
  38,
  852,
  177,
  147,
  351,
  2677,
  147,
  351,
  758,
  24,
  83,
  36,
  263],
 [964,
  4208,
  3057,
  1,
  3908,
  440,
  699,
  61,
  31,
  242,
  2650,
  1953,
  1096,
  224,
  1154,
  623,
  206,
  2097,
  2,
  600,
  352,
  3909,
  132,
  416,
  129,
  2980,
  579,
  151,
  4443,
  4,
  25,
  687,
  3860,
  8,
  3057,
  208,
  305,
  223,
  43,
  7,
  223,
  40,
  3,
  2779,
  1,
  72,
  422,
  678,
  198,
  616,
  422,
  289,
  1,
  1502,
  2301,
  8,
  271,
  4,
  102,
  1400,
  959,
  496,
  1401,
  3307,
  869,
  46,
  44],
 [3023, 1240, 430, 3519, 10, 1026, 3090, 62, 107, 4088],
 [6, 28, 7, 1061, 1722, 1604, 4295, 154, 1722, 384, 46, 542, 1637, 681, 21],
 [44,
  367,
 

In [43]:
# Adding 1 because of reserved 0 index
vocab_size2 = len(tokenizer2.word_index) + 1

maxlen = 100

Xt_train = pad_sequences(Xt_train, padding='post', maxlen=maxlen)
Xt_test = pad_sequences(Xt_test, padding='post', maxlen=maxlen)
Xt_train

array([[  47,   34,   45, ...,    0,    0,    0],
       [  47,  315, 3953, ...,    0,    0,    0],
       [1125,  111,   20, ...,    0,    0,    0],
       ...,
       [ 127,   25,   72, ...,    0,    0,    0],
       [1492,  402, 2207, ...,    0,    0,    0],
       [ 127,  132,   32, ...,    0,    0,    0]], dtype=int32)

In [44]:
vocab_size2

186403

converting Xt_train,Xt_test,yt_train and yt_test to np arrays

In [45]:
Xt_train = np.asarray(Xt_train).astype(np.float32)
Xt_test = np.asarray(Xt_test).astype(np.float32)
yt_train = np.asarray(yt_train).astype(np.float32)
yt_test = np.asarray(yt_test).astype(np.float32)

# Creating a NN model

In [46]:
model2 = Sequential()
#vocab_size is the size of the weights matrix
embedding_layer2 = Embedding(vocab_size, 100, weights=[weights], input_length=maxlen , trainable=False)
model2.add(embedding_layer2)

model2.add(Flatten())
model2.add(Dense(1, activation='tanh'))

In [47]:
model2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

print(model2.summary())

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 100, 100)          25307000  
                                                                 
 flatten_2 (Flatten)         (None, 10000)             0         
                                                                 
 dense_4 (Dense)             (None, 1)                 10001     
                                                                 
Total params: 25,317,001
Trainable params: 10,001
Non-trainable params: 25,307,000
_________________________________________________________________
None


In [48]:
history2 = model2.fit(Xt_train, yt_train, batch_size=128, epochs=10, verbose=1, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Evaluating model performance

In [49]:
score = model2.evaluate(Xt_test, yt_test, verbose=1)



# Predicting the Test set results

In [50]:
y_pred = model2.predict(Xt_test)
y_pred=(y_pred>0.5)
print(np.concatenate((y_pred.reshape(len(y_pred),1), yt_test.reshape(len(yt_test),1)),1))

[[0. 1.]
 [0. 0.]
 [0. 1.]
 ...
 [0. 0.]
 [0. 0.]
 [0. 1.]]


In [51]:
y_pred

array([[False],
       [False],
       [False],
       ...,
       [False],
       [False],
       [False]])

# Making the confusion matrix

In [52]:
cm = confusion_matrix(yt_test, y_pred)
print(cm)
accuracy_score(yt_test, y_pred)

[[5723    0]
 [5584    0]]


0.5061466348279826

# Part 4: Sentiment Analysis - 3

Re-implement the same task using a RNN employing LSTM and uses the embeddings
generated

# Creating RNN with LSTM model

In [53]:

model3 = Sequential()
embedding_layer3 = Embedding(vocab_size, 100, weights=[weights], input_length=maxlen , trainable=False)
model3.add(embedding_layer3)
model3.add(LSTM(128)) #LSTM layer with 128 neurons


model3.add(Dense(1, activation='relu'))
model3.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

Training the model

In [54]:
history3 = model3.fit(Xt_train, yt_train, batch_size=128, epochs=10, verbose=1, validation_split=0.2)

score = model3.evaluate(Xt_test, yt_test, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Predicting the Test set results

In [55]:
y_pred2 = model3.predict(Xt_test)
y_pred2=(y_pred2>0.5)
print(np.concatenate((y_pred2.reshape(len(y_pred2),1), yt_test.reshape(len(yt_test),1)),1))

[[1. 1.]
 [1. 0.]
 [1. 1.]
 ...
 [1. 0.]
 [1. 0.]
 [1. 1.]]


# Making the confusion matrix

In [56]:
cm = confusion_matrix(yt_test, y_pred)
print(cm)
accuracy_score(yt_test, y_pred)

[[5723    0]
 [5584    0]]


0.5061466348279826