### Word Embedding Techniques using Embedding Layer in Keras

In [3]:
import tensorflow as tf
print(tf.__version__)

2.15.0


In [4]:
from tensorflow.keras.preprocessing.text import one_hot

In [6]:
# sentences
sent=[  'the glass of milk',
     'the glass of juice',
     'the cup of tea',
    'I am a good boy',
     'I am a good developer',
     'understand the meaning of words',
     'your videos are good']

In [7]:
sent

['the glass of milk',
 'the glass of juice',
 'the cup of tea',
 'I am a good boy',
 'I am a good developer',
 'understand the meaning of words',
 'your videos are good']

In [10]:
# Vocabulary size
voc_size=5000

#### One Hot Representation

In [11]:
onehot_repr=[one_hot(words,voc_size)for words in sent]
print(onehot_repr) #out of the 5000 words, 4876 has the word 'the'

[[4876, 372, 2471, 4584], [4876, 372, 2471, 2058], [4876, 998, 2471, 1421], [4160, 643, 467, 3541, 1436], [4160, 643, 467, 3541, 1523], [3564, 4876, 2734, 2471, 415], [420, 2319, 3403, 3541]]


### Word Embedding Represntation

In [12]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential

In [13]:
import numpy as np

In [16]:
# pre padding
sent_length=8
embedded_docs=pad_sequences(onehot_repr,padding='pre',maxlen=sent_length)
print(embedded_docs)

[[   0    0    0    0 4876  372 2471 4584]
 [   0    0    0    0 4876  372 2471 2058]
 [   0    0    0    0 4876  998 2471 1421]
 [   0    0    0 4160  643  467 3541 1436]
 [   0    0    0 4160  643  467 3541 1523]
 [   0    0    0 3564 4876 2734 2471  415]
 [   0    0    0    0  420 2319 3403 3541]]


In [17]:
# 10 feature dimesnions
dim=10

In [18]:
model=Sequential()
model.add(Embedding(voc_size,10,input_length=sent_length))
model.compile('adam','mse')

In [19]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 8, 10)             50000     
                                                                 
Total params: 50000 (195.31 KB)
Trainable params: 50000 (195.31 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [20]:
#'the glass of milk',
embedded_docs[0]

array([   0,    0,    0,    0, 4876,  372, 2471, 4584], dtype=int32)

In [21]:
model.predict(embedded_docs[0])



array([[ 0.00039358, -0.01695055, -0.00477405,  0.02774824, -0.01830947,
        -0.01789609, -0.03986027,  0.04345098, -0.01983104, -0.04693346],
       [ 0.00039358, -0.01695055, -0.00477405,  0.02774824, -0.01830947,
        -0.01789609, -0.03986027,  0.04345098, -0.01983104, -0.04693346],
       [ 0.00039358, -0.01695055, -0.00477405,  0.02774824, -0.01830947,
        -0.01789609, -0.03986027,  0.04345098, -0.01983104, -0.04693346],
       [ 0.00039358, -0.01695055, -0.00477405,  0.02774824, -0.01830947,
        -0.01789609, -0.03986027,  0.04345098, -0.01983104, -0.04693346],
       [-0.01630967, -0.01673692,  0.00818444,  0.01049082,  0.01968263,
         0.02242651,  0.04468682,  0.04870704, -0.03541225, -0.03197278],
       [ 0.04395847,  0.02063057, -0.02672843,  0.00315542,  0.01962094,
         0.02064759, -0.02648944, -0.02024477, -0.01260625, -0.02126298],
       [ 0.02750119, -0.00514066,  0.00512937, -0.03874106,  0.0288753 ,
         0.01534803,  0.01167025,  0.02320747

In [22]:
print(model.predict(embedded_docs))

[[[ 0.00039358 -0.01695055 -0.00477405  0.02774824 -0.01830947
   -0.01789609 -0.03986027  0.04345098 -0.01983104 -0.04693346]
  [ 0.00039358 -0.01695055 -0.00477405  0.02774824 -0.01830947
   -0.01789609 -0.03986027  0.04345098 -0.01983104 -0.04693346]
  [ 0.00039358 -0.01695055 -0.00477405  0.02774824 -0.01830947
   -0.01789609 -0.03986027  0.04345098 -0.01983104 -0.04693346]
  [ 0.00039358 -0.01695055 -0.00477405  0.02774824 -0.01830947
   -0.01789609 -0.03986027  0.04345098 -0.01983104 -0.04693346]
  [-0.01630967 -0.01673692  0.00818444  0.01049082  0.01968263
    0.02242651  0.04468682  0.04870704 -0.03541225 -0.03197278]
  [ 0.04395847  0.02063057 -0.02672843  0.00315542  0.01962094
    0.02064759 -0.02648944 -0.02024477 -0.01260625 -0.02126298]
  [ 0.02750119 -0.00514066  0.00512937 -0.03874106  0.0288753
    0.01534803  0.01167025  0.02320747  0.04855933 -0.02117498]
  [-0.01679984  0.04504225  0.04291202  0.01165206  0.00649063
    0.0333701   0.03250017 -0.02420166 -0.0055962

In [23]:
embedded_docs[0]

array([   0,    0,    0,    0, 4876,  372, 2471, 4584], dtype=int32)

In [24]:
print(model.predict(embedded_docs)[0])

[[ 0.00039358 -0.01695055 -0.00477405  0.02774824 -0.01830947 -0.01789609
  -0.03986027  0.04345098 -0.01983104 -0.04693346]
 [ 0.00039358 -0.01695055 -0.00477405  0.02774824 -0.01830947 -0.01789609
  -0.03986027  0.04345098 -0.01983104 -0.04693346]
 [ 0.00039358 -0.01695055 -0.00477405  0.02774824 -0.01830947 -0.01789609
  -0.03986027  0.04345098 -0.01983104 -0.04693346]
 [ 0.00039358 -0.01695055 -0.00477405  0.02774824 -0.01830947 -0.01789609
  -0.03986027  0.04345098 -0.01983104 -0.04693346]
 [-0.01630967 -0.01673692  0.00818444  0.01049082  0.01968263  0.02242651
   0.04468682  0.04870704 -0.03541225 -0.03197278]
 [ 0.04395847  0.02063057 -0.02672843  0.00315542  0.01962094  0.02064759
  -0.02648944 -0.02024477 -0.01260625 -0.02126298]
 [ 0.02750119 -0.00514066  0.00512937 -0.03874106  0.0288753   0.01534803
   0.01167025  0.02320747  0.04855933 -0.02117498]
 [-0.01679984  0.04504225  0.04291202  0.01165206  0.00649063  0.0333701
   0.03250017 -0.02420166 -0.00559623  0.01469716]]


In [26]:
#Other sentences
new =["The world is a better place",
      "Marvel series is my favourite movie",
      "I like DC movies",
      "the cat is eating the food",
      "Tom and Jerry is my favourite movie",
      "Python is my favourite programming language"
      ]

In [27]:
one_hot_repr=[one_hot(words,voc_size)for words in new]
print(one_hot_repr) #out of the 5000 words, 4876 has the word 'the'

[[4876, 1107, 171, 467, 2043, 2948], [2151, 3121, 171, 4837, 496, 2375], [4160, 578, 3676, 2338], [4876, 324, 171, 2952, 4876, 1253], [324, 4272, 4354, 171, 4837, 496, 2375], [1639, 171, 4837, 496, 4650, 3658]]


In [29]:
# post padding
new_length=10
embedded_docs=pad_sequences(one_hot_repr,padding='post',maxlen=new_length)
print(embedded_docs)

[[4876 1107  171  467 2043 2948    0    0    0    0]
 [2151 3121  171 4837  496 2375    0    0    0    0]
 [4160  578 3676 2338    0    0    0    0    0    0]
 [4876  324  171 2952 4876 1253    0    0    0    0]
 [ 324 4272 4354  171 4837  496 2375    0    0    0]
 [1639  171 4837  496 4650 3658    0    0    0    0]]


In [30]:
model=Sequential()
model.add(Embedding(voc_size,10,input_length=new_length))
model.compile('adam','mse')

In [32]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 10, 10)            50000     
                                                                 
Total params: 50000 (195.31 KB)
Trainable params: 50000 (195.31 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [33]:
embedded_docs[0]

array([4876, 1107,  171,  467, 2043, 2948,    0,    0,    0,    0],
      dtype=int32)

In [34]:
model.predict(embedded_docs[0])



array([[ 0.01829154,  0.04260584,  0.01281222, -0.03263942,  0.04790422,
         0.03539631, -0.04659817,  0.00308064,  0.03586851,  0.01801587],
       [ 0.03881271, -0.01296195, -0.03096943, -0.0168111 , -0.0049369 ,
         0.02860687, -0.00998471,  0.00699721,  0.03833641, -0.01140261],
       [ 0.01851145, -0.04461743,  0.04765738, -0.03855868,  0.01514151,
         0.02905396,  0.04738616,  0.0301504 , -0.00159618, -0.02448009],
       [ 0.0249458 ,  0.00573247,  0.0241282 ,  0.03271906,  0.0445062 ,
        -0.04656712, -0.0493541 , -0.01016663,  0.0096672 , -0.01746993],
       [-0.03817396,  0.03151212,  0.03846145,  0.04021387, -0.01619778,
        -0.0444242 ,  0.01952061,  0.04830753,  0.01973318, -0.02316817],
       [ 0.01318345,  0.032358  , -0.03204743, -0.00779158, -0.00447237,
         0.00030994,  0.04628514,  0.00173362,  0.04238769, -0.02413818],
       [ 0.02031283,  0.02099978, -0.00468849,  0.01622145,  0.0357714 ,
        -0.01131367, -0.01385726,  0.0118652 

In [35]:
print(model.predict(embedded_docs))

[[[ 1.82915442e-02  4.26058434e-02  1.28122233e-02 -3.26394215e-02
    4.79042195e-02  3.53963114e-02 -4.65981737e-02  3.08064371e-03
    3.58685143e-02  1.80158652e-02]
  [ 3.88127081e-02 -1.29619464e-02 -3.09694298e-02 -1.68110952e-02
   -4.93689626e-03  2.86068656e-02 -9.98470932e-03  6.99720532e-03
    3.83364074e-02 -1.14026070e-02]
  [ 1.85114481e-02 -4.46174257e-02  4.76573817e-02 -3.85586843e-02
    1.51415132e-02  2.90539600e-02  4.73861583e-02  3.01504023e-02
   -1.59617513e-03 -2.44800933e-02]
  [ 2.49457993e-02  5.73246554e-03  2.41282023e-02  3.27190645e-02
    4.45062034e-02 -4.65671197e-02 -4.93541025e-02 -1.01666339e-02
    9.66719538e-03 -1.74699314e-02]
  [-3.81739624e-02  3.15121152e-02  3.84614505e-02  4.02138717e-02
   -1.61977783e-02 -4.44242023e-02  1.95206068e-02  4.83075269e-02
    1.97331794e-02 -2.31681708e-02]
  [ 1.31834485e-02  3.23580019e-02 -3.20474282e-02 -7.79157877e-03
   -4.47237492e-03  3.09944153e-04  4.62851413e-02  1.73362345e-03
    4.23876904e-