<a href="https://colab.research.google.com/github/annguyenhuynh/Anhuynh/blob/main/Word_Embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Word Embeddings Technique

*   Like Word2Vec, this technique is used to represent words and semantic relationship between them as vectors for natural language processing (NLP)

*   Unlike Word2Vec, we don't use a pre-trained model, but we manually train our data






In [2]:
from tensorflow.keras.preprocessing.text import one_hot

In [3]:
import tensorflow as tf
print(tf.__version__)

2.17.0


In [4]:
sent=[  'the glass of milk',
     'the glass of juice',
     'the cup of tea',
    'I am a good boy',
     'I am a good developer',
     'understand the meaning of words',
     'your videos are good']

In [5]:
# Initialize vocabulary size
vocab_size =10000

# One Hot Representation

In [6]:
oh_representaion = [one_hot(words,vocab_size) for words in sent]
oh_representaion

[[5026, 4328, 2235, 5321],
 [5026, 4328, 2235, 2318],
 [5026, 2594, 2235, 5225],
 [3908, 3480, 4384, 8600, 9083],
 [3908, 3480, 4384, 8600, 7646],
 [3394, 5026, 3427, 2235, 2606],
 [7287, 3113, 6740, 8600]]

In [7]:
"""
In a vocabulary dictionary size of 10000 words, the word 'the', for e.g, have the index of 3123.
If comparing the first 2 sents, one will see that the first 3 vectors are exactly the same,
  the difference is the one vector at the end that represents milk and juice, respectively
"""

"\nIn a vocabulary dictionary size of 10000 words, the word 'the', for e.g, have the index of 3123.\nIf comparing the first 2 sents, one will see that the first 3 vectors are exactly the same,\n  the difference is the one vector at the end that represents milk and juice, respectively\n"

# Word Embedding Representation

In [8]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential

In [9]:
import numpy as np

In [10]:
"""
To train the model in DL, we need to ensure sentences have same length (input size is fixed).
To do so, we use paddings (pre or post). Pre-padding are better when multiple types of neural networks are combined to perform a task.
"""
sent_length=8
embedded_docs=pad_sequences(oh_representaion,padding='pre',maxlen=sent_length)
embedded_docs

array([[   0,    0,    0,    0, 5026, 4328, 2235, 5321],
       [   0,    0,    0,    0, 5026, 4328, 2235, 2318],
       [   0,    0,    0,    0, 5026, 2594, 2235, 5225],
       [   0,    0,    0, 3908, 3480, 4384, 8600, 9083],
       [   0,    0,    0, 3908, 3480, 4384, 8600, 7646],
       [   0,    0,    0, 3394, 5026, 3427, 2235, 2606],
       [   0,    0,    0,    0, 7287, 3113, 6740, 8600]], dtype=int32)

# Feature Representation
**Convert the arrays of indexes into vectors**

In [11]:
# Create 10-dimension vector
dim=10

In [12]:
# Create an embedding layer that train many neural networds (work like Word2Vec)
model = Sequential()
model.add(Embedding(vocab_size,10))
model.compile('adam','mse') #Add an optimizer and a loss function to our model

In [13]:
model.summary()

In [14]:
model.build(input_shape=(None, sent_length))

In [15]:
model.summary()

In [16]:
print(model.predict(embedded_docs))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 618ms/step
[[[ 0.01892597 -0.00026926 -0.0439918   0.01021737 -0.0360389
   -0.01194047 -0.00475793 -0.03725079  0.01545986  0.04769624]
  [ 0.01892597 -0.00026926 -0.0439918   0.01021737 -0.0360389
   -0.01194047 -0.00475793 -0.03725079  0.01545986  0.04769624]
  [ 0.01892597 -0.00026926 -0.0439918   0.01021737 -0.0360389
   -0.01194047 -0.00475793 -0.03725079  0.01545986  0.04769624]
  [ 0.01892597 -0.00026926 -0.0439918   0.01021737 -0.0360389
   -0.01194047 -0.00475793 -0.03725079  0.01545986  0.04769624]
  [ 0.03406492  0.02938137  0.03456593  0.03603406 -0.01365336
    0.02485354 -0.00922742 -0.0262849   0.01083253  0.0056682 ]
  [ 0.00444746  0.00414716 -0.00800307  0.02608799 -0.00108822
    0.02869275  0.04440302  0.00421248  0.01433188 -0.03720325]
  [ 0.01173595 -0.04847815  0.03907535 -0.04786451  0.04051057
   -0.0244301   0.02214963  0.00365313  0.01654848 -0.0032614 ]
  [ 0.00730488 -0.02168752  0.04745029  0.

#Bidirectional LSTM RNN

*   Recurrent Neural Network(RNN) is a deep learning model used to handle sequential input data and return sequential output data. We use this techniquce in translation, chatbots, Q/A, and so on.

*   LSTM RNN: Long Short Term Memory RNN solves 2 problems with RNN models:
    *   Vanishing gradient problems (dead neurons)
    *   Loss context information (gaps between words with same context are too big)

*   Bidirectional LSTM RNN: Many-to-many RNN models that process sequential data in both directions (forward and backward) (application: we need context for word predictions)

*   Math intuition is similar to a deep learning models with input layers, activation functions in the hidden layers (RELU variants) and activation function specific for output layer depending on the purpose of the model (binary classification or multiclass classification, etc.).





In [18]:
import pandas as pd
import numpy as np

**Case study: Classfifying fake news using bidirectional LSTM RNN**

In [27]:
train_data = pd.read_csv('/content/drive/MyDrive/train.csv')
train_data.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [28]:
train_data.isnull().sum()

Unnamed: 0,0
id,0
title,558
author,1957
text,39
label,0


In [29]:
train_data.shape

(20800, 5)

In [30]:
train_data=train_data.dropna()

In [32]:
train_data.shape

(18285, 5)

In [65]:
# Get independent features
X=train_data.drop('label', axis=1)
y=train_data['label']

In [66]:
# Check if dataset is balanced or not
y.value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,10361
1,7924


In [67]:
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Bidirectional

In [68]:
vocabulary_size = 5000

In [73]:
data = X.copy()

In [74]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [71]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [75]:
ps = PorterStemmer()
corpus = []
for index, row in data.iterrows():
  print(index)
  review = re.sub('[^a-zA-Z]', ' ',row['title'])
  review = review.lower()
  review = review.split()

  review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
  review = ' '.join(review)
  corpus.append(review)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
15115
15116
15117
15118
15119
15120
15123
15124
15125
15126
15127
15128
15129
15130
15131
15132
15133
15134
15136
15137
15138
15139
15140
15141
15142
15143
15144
15145
15146
15147
15149
15150
15151
15153
15154
15155
15156
15157
15158
15159
15161
15162
15163
15164
15165
15166
15167
15168
15169
15170
15171
15172
15174
15175
15176
15177
15178
15179
15181
15182
15183
15184
15185
15186
15187
15188
15189
15191
15192
15193
15194
15195
15196
15197
15199
15200
15201
15202
15203
15204
15205
15206
15207
15208
15209
15211
15212
15213
15214
15215
15216
15217
15218
15219
15220
15221
15222
15223
15224
15225
15226
15227
15228
15229
15230
15231
15232
15233
15234
15235
15236
15237
15239
15240
15241
15242
15243
15244
15245
15246
15247
15248
15249
15250
15251
15252
15253
15254
15255
15256
15257
15258
15259
15260
15262
15265
15266
15267
15268
15269
15270
15271
15272
15273
15274
15275
15276
15277
15278
15279
15280
15281
15282
15284
15285
15286

In [76]:
corpus

['hous dem aid even see comey letter jason chaffetz tweet',
 'flynn hillari clinton big woman campu breitbart',
 'truth might get fire',
 'civilian kill singl us airstrik identifi',
 'iranian woman jail fiction unpublish stori woman stone death adulteri',
 'jacki mason hollywood would love trump bomb north korea lack tran bathroom exclus video breitbart',
 'beno hamon win french socialist parti presidenti nomin new york time',
 'back channel plan ukrain russia courtesi trump associ new york time',
 'obama organ action partner soro link indivis disrupt trump agenda',
 'bbc comedi sketch real housew isi caus outrag',
 'russian research discov secret nazi militari base treasur hunter arctic photo',
 'us offici see link trump russia',
 'ye paid govern troll social media blog forum websit',
 'major leagu soccer argentin find home success new york time',
 'well fargo chief abruptli step new york time',
 'anonym donor pay million releas everyon arrest dakota access pipelin',
 'fbi close hilla

In [77]:
oh_repr = [one_hot(word, vocabulary_size) for word in corpus]
oh_repr

[[4100, 4366, 2184, 4058, 4638, 1506, 1486, 4920, 4887, 3389],
 [2340, 4254, 2285, 981, 2118, 294, 1789],
 [1907, 3736, 2859, 3915],
 [2643, 4705, 4679, 3476, 4831, 4445],
 [2230, 2118, 4752, 505, 577, 420, 2118, 742, 3977, 913],
 [1965,
  1536,
  2256,
  3548,
  2562,
  1324,
  1340,
  4859,
  4576,
  408,
  1744,
  4642,
  2728,
  2690,
  1789],
 [4239, 543, 3353, 1509, 4005, 1438, 89, 4948, 66, 3296, 427],
 [4234, 3052, 3772, 4589, 641, 3502, 1324, 228, 66, 3296, 427],
 [2728, 3970, 1426, 1990, 2060, 3797, 2351, 2340, 1324, 1914],
 [3519, 528, 2328, 1847, 1721, 2699, 3798, 4969],
 [2800, 2438, 1597, 2937, 3616, 665, 2376, 2687, 2130, 1308, 3188],
 [3476, 4619, 4638, 3797, 1324, 641],
 [3881, 3101, 4796, 4593, 547, 2956, 362, 1490, 4697],
 [3328, 4545, 2231, 4184, 3165, 4559, 56, 66, 3296, 427],
 [327, 155, 2702, 3619, 384, 66, 3296, 427],
 [4931, 1338, 2370, 1175, 1577, 4922, 4266, 891, 4071, 3904],
 [2154, 3938, 4254],
 [2014, 3058, 4744, 4947, 1324, 124, 2636, 1789],
 [1665, 3350,

In [91]:
# Embedding Representation
st_length = 25
em_docs=pad_sequences(oh_repr,padding='pre',maxlen=st_length)
em_docs

array([[   0,    0,    0, ..., 4920, 4887, 3389],
       [   0,    0,    0, ..., 2118,  294, 1789],
       [   0,    0,    0, ..., 3736, 2859, 3915],
       ...,
       [   0,    0,    0, ...,   66, 3296,  427],
       [   0,    0,    0, ..., 1699, 3395,  574],
       [   0,    0,    0, ..., 4130, 1643, 2156]], dtype=int32)

In [92]:
# Creating model
embedding_vector_features = 40
model=Sequential()
model.add(Embedding(vocabulary_size, embedding_vector_features, input_length=st_length))
model.add(Bidirectional(LSTM(100)))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.build(input_shape=(None, st_length))
print(model.summary())



None


In [93]:
X_final = np.array(em_docs)
y_final=np.array(y)

In [94]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_final,y_final,test_size=0.2,random_state=42)

In [95]:
model.fit(X_train, y_train, validation_data=(X_test,y_test),epochs=10, batch_size=32)

Epoch 1/10
[1m458/458[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 9ms/step - accuracy: 0.8265 - loss: 0.3434 - val_accuracy: 0.9133 - val_loss: 0.2033
Epoch 2/10
[1m458/458[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 11ms/step - accuracy: 0.9535 - loss: 0.1263 - val_accuracy: 0.9141 - val_loss: 0.2074
Epoch 3/10
[1m458/458[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - accuracy: 0.9700 - loss: 0.0806 - val_accuracy: 0.9070 - val_loss: 0.2268
Epoch 4/10
[1m458/458[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - accuracy: 0.9849 - loss: 0.0490 - val_accuracy: 0.9114 - val_loss: 0.2733
Epoch 5/10
[1m458/458[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - accuracy: 0.9906 - loss: 0.0298 - val_accuracy: 0.9092 - val_loss: 0.3455
Epoch 6/10
[1m458/458[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - accuracy: 0.9960 - loss: 0.0134 - val_accuracy: 0.9106 - val_loss: 0.3747
Epoch 7/10
[1m458/458[0m 

<keras.src.callbacks.history.History at 0x7e6e6c637670>

In [96]:
y_pred = model.predict(X_test)

[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step


In [102]:
y_pred=np.where(y_pred>=0.5,1,0)

In [103]:
from sklearn.metrics import confusion_matrix

In [104]:
confusion_matrix(y_test,y_pred)

array([[1895,  187],
       [ 134, 1441]])

In [106]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test,y_pred))

0.9122231337161608
