## Practical Implementation: Word embedding layer with Keras
- We will take some example sentences and convert them to vectors.

In [23]:
from tensorflow.keras.preprocessing.text import one_hot

In [24]:
## Example sentences: our small corpus, which is one-hot encoded first.
sent = [
    'the glass of milk',
    'the glass of juice',
    'the cup of tea ',
    'I am a good boy',
    'I am a good developer',
    'understand the meaning of words',
    'your videos are good',
]

In [25]:
# Display the list of example sentences
sent

['the glass of milk',
 'the glass of juice',
 'the cup of tea ',
 'I am a good boy',
 'I am a good developer',
 'understand the meaning of words',
 'your videos are good']

In [26]:
## Vocabulary size: the size of the index space handed to one_hot below (10000 here).
voc_size = 10_000

In [27]:
# Peek at the first sentence only; slicing keeps this a no-op when the list is empty.
for first_sentence in sent[:1]:
    print(first_sentence)

the glass of milk


In [28]:
## One-hot representation
# Encode one sentence at a time: one_hot receives the sentence together with the
# vocabulary size and returns a list with one integer index per word.
one_hot_repr = list(map(lambda sentence: one_hot(sentence, voc_size), sent))
one_hot_repr

[[4411, 5492, 9052, 2396],
 [4411, 5492, 9052, 6064],
 [4411, 8477, 9052, 6011],
 [5873, 2067, 208, 7073, 2734],
 [5873, 2067, 208, 7073, 397],
 [6587, 4411, 2788, 9052, 9531],
 [5614, 825, 3461, 7073]]

In [29]:
# The representation above replaces each word of a sentence with the index assigned to it
# within our vocabulary of size 10000: "the" is at index 4411, "glass" is at index 5492, and so on.
# So the one-hot vector for "glass" would be a (10000,1) vector with 1 at index 5492 and 0 everywhere else,
# and similarly for the other words.
# The first two sentences are almost identical, which is why their indices are the same
# and only the last index is different.

# Now consider our 300-dimensional feature example, where we have vocabulary words like
#  boy, king, girl, queen, apple, banana.
# If we reduce these 300 dimensions to 2 using PCA and plot the result, boy and king will be close, girl and
# queen will be close, and apple and banana will be close.
# This is where our important algorithm, "cosine similarity", gets applied; it has an important use
# case in recommendation systems.

In [30]:
# We could use the one_hot representation directly, but as we discussed it has problems, so we create an embedding layer instead
## Word embedding representation

from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences # talk later
from tensorflow.keras.models import Sequential # since we are creating a sequential model


In [31]:
import numpy as np

In [32]:
# Show the sentences again for reference before padding their encodings
sent

['the glass of milk',
 'the glass of juice',
 'the cup of tea ',
 'I am a good boy',
 'I am a good developer',
 'understand the meaning of words',
 'your videos are good']

In [33]:
## Pad the sequences
# The sentences have different numbers of words, but every sentence is fed to the
# RNN with a fixed number of time steps, so all index lists are padded to one length.
# The longest sentence in `sent` has 5 words; 8 is chosen to leave some headroom.
set_length_max = 8
# padding='pre' places the zeros at the start of each row; maxlen sets the row length.
embedded_docs = pad_sequences(
    one_hot_repr,
    maxlen=set_length_max,
    padding='pre',
)
print(embedded_docs)  # the padded index matrix, one row per sentence

[[   0    0    0    0 4411 5492 9052 2396]
 [   0    0    0    0 4411 5492 9052 6064]
 [   0    0    0    0 4411 8477 9052 6011]
 [   0    0    0 5873 2067  208 7073 2734]
 [   0    0    0 5873 2067  208 7073  397]
 [   0    0    0 6587 4411 2788 9052 9531]
 [   0    0    0    0 5614  825 3461 7073]]


In [44]:
# Embedding dimension: every word is represented by a vector of 10 features.
dim = 10

In [48]:
# Create a Sequential model whose only layer is an Embedding lookup table.
model = Sequential()
# Embedding(voc_size, dim) maps each integer index (< voc_size) to a dense vector
# of `dim` floats. The vectors start from random values and are ordinary trainable
# weights; no pretrained word2vec model is involved.
# `input_length` is deprecated (and ignored) in Keras 3, so it is no longer passed.
model.add(Embedding(voc_size, dim))
# Build explicitly with (batch, sequence length) so that model.summary() can report
# output shapes and parameter counts before any data has been passed through.
model.build(input_shape=(None, set_length_max))
# compile() only configures the optimizer (adam) and the loss (mse); it does not train.
model.compile(optimizer='adam', loss='mse')



In [49]:
# Print the model summary for the model built above
model.summary()

In [54]:
# Pass the padded index matrix through the model: every index is looked up in the
# Embedding layer and replaced by its vector of `dim` values.
# NOTE(review): no training step appears in the cells shown, so these are presumably
# the layer's initial (untrained) weights — confirm.
model.predict(embedded_docs) # the indices are now converted to vectors by the embedding layer
# the input is the padded one_hot_repr (embedded_docs)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step


array([[[-1.06703117e-03, -4.79221605e-02,  1.25568174e-02,
          3.36039774e-02,  3.40031274e-02, -1.32556334e-02,
         -4.81351614e-02, -1.11858919e-03, -1.41711012e-02,
          1.82498619e-03],
        [-1.06703117e-03, -4.79221605e-02,  1.25568174e-02,
          3.36039774e-02,  3.40031274e-02, -1.32556334e-02,
         -4.81351614e-02, -1.11858919e-03, -1.41711012e-02,
          1.82498619e-03],
        [-1.06703117e-03, -4.79221605e-02,  1.25568174e-02,
          3.36039774e-02,  3.40031274e-02, -1.32556334e-02,
         -4.81351614e-02, -1.11858919e-03, -1.41711012e-02,
          1.82498619e-03],
        [-1.06703117e-03, -4.79221605e-02,  1.25568174e-02,
          3.36039774e-02,  3.40031274e-02, -1.32556334e-02,
         -4.81351614e-02, -1.11858919e-03, -1.41711012e-02,
          1.82498619e-03],
        [ 3.06217708e-02,  2.10681222e-02,  1.44317262e-02,
         -4.20122035e-02,  1.52206458e-02,  1.22934580e-02,
         -2.64960770e-02,  1.37637742e-02,  4.701257