In [1]:
from tensorflow.keras.preprocessing.text import one_hot

In [2]:
# Toy corpus: seven short example sentences used by every later cell.
sentences = [
    'the glass of milk',
    'the glass of juice',
    'the cup of tea',
    'I am a good boy',
    'I am a good developer',
    'understand the meaning of words',
    'your videos are good',
]
sentences

['the glass of milk',
 'the glass of juice',
 'the cup of tea',
 'I am a good boy',
 'I am a good developer',
 'understand the meaning of words',
 'your videos are good']

In [3]:
# Vocabulary size: passed to one_hot() as the index range and to the
# Embedding layer as the number of rows in its lookup table.
voc_size = 10_000

# One Hot Representation

In [None]:
# Encode each sentence as a list of integer word indices, one index per word.
# An index is the position that would hold the 1 if the word were expanded
# into a full one-hot vector of dimension voc_size (10000).
# In the run shown below, "the" maps to index 6258 and "good" to index 2035.
# NOTE(review): one_hot derives indices by hashing the words, so the exact
# numbers may differ between kernel sessions -- confirm before relying on them.
one_hot_repr = [
    one_hot(text, voc_size)
    for text in sentences
]
one_hot_repr

[[6258, 7235, 6888, 8948],
 [6258, 7235, 6888, 4278],
 [6258, 6121, 6888, 9990],
 [3642, 5135, 2555, 2035, 762],
 [3642, 5135, 2555, 2035, 5248],
 [8345, 6258, 2501, 6888, 7495],
 [5207, 2906, 3251, 2035]]

# Word Embedding Representation

In [5]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.models import Sequential
import numpy as np

In [6]:
# Bring every index list to the same length by left-padding with zeros
# (padding='pre'), so the corpus becomes one rectangular integer array.
max_sent_length = 8
embedded_docs = pad_sequences(
    one_hot_repr,
    maxlen=max_sent_length,
    padding='pre',
)
print(embedded_docs)

[[   0    0    0    0 6258 7235 6888 8948]
 [   0    0    0    0 6258 7235 6888 4278]
 [   0    0    0    0 6258 6121 6888 9990]
 [   0    0    0 3642 5135 2555 2035  762]
 [   0    0    0 3642 5135 2555 2035 5248]
 [   0    0    0 8345 6258 2501 6888 7495]
 [   0    0    0    0 5207 2906 3251 2035]]


# Feature Representation

In [7]:
# Minimal model: a single Embedding layer that maps every word index in
# [0, voc_size) to a trainable dense vector of length `dim`.
dim = 10  # embedding size per word (Google's pre-trained vectors use 300)
model = Sequential()
# `input_length` is deprecated in Keras 3 (it only emits a warning and is
# ignored), so it is no longer passed to the layer. The sequence length is
# supplied through build() instead, which also works on older Keras versions.
model.add(Embedding(voc_size, dim))  # Embedding layer
# Build with shape (batch, tokens) so the weights exist and model.summary()
# can report output shapes and parameter counts before the first predict().
model.build(input_shape=(None, max_sent_length))
model.compile('adam', 'mse')



In [8]:
# Print the model's layer table (layer names, output shapes, parameter counts).
# Once the model is built, the Embedding layer holds voc_size * dim weights.
model.summary()

In [9]:
# Look up the embedding vector of every token position in every sentence.
model.predict(embedded_docs)
# Result shape is (7, 8, 10):
#   7  sentences in the corpus,
#   8  token positions per sentence (after padding to max_sent_length),
#   10 embedding values per token (dim).
# Padding positions (index 0) all map to the same vector, which is why the
# leading rows of each sentence in the output are identical.

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 406ms/step


array([[[ 4.45753969e-02, -4.62390184e-02,  4.01903279e-02,
          9.56351683e-03,  2.02530064e-02,  1.81475542e-02,
         -2.02062726e-02, -1.72649845e-02, -3.67956981e-02,
          3.47147137e-03],
        [ 4.45753969e-02, -4.62390184e-02,  4.01903279e-02,
          9.56351683e-03,  2.02530064e-02,  1.81475542e-02,
         -2.02062726e-02, -1.72649845e-02, -3.67956981e-02,
          3.47147137e-03],
        [ 4.45753969e-02, -4.62390184e-02,  4.01903279e-02,
          9.56351683e-03,  2.02530064e-02,  1.81475542e-02,
         -2.02062726e-02, -1.72649845e-02, -3.67956981e-02,
          3.47147137e-03],
        [ 4.45753969e-02, -4.62390184e-02,  4.01903279e-02,
          9.56351683e-03,  2.02530064e-02,  1.81475542e-02,
         -2.02062726e-02, -1.72649845e-02, -3.67956981e-02,
          3.47147137e-03],
        [-1.93661097e-02, -3.08101308e-02, -2.80199777e-02,
         -2.40601599e-04, -3.05439159e-03, -1.42577663e-02,
          4.02289666e-02,  1.11238845e-02,  1.428001

In [10]:
embedded_docs[0] # 1st sentence ('the glass of milk'): four pad zeros, then its four word indices

array([   0,    0,    0,    0, 6258, 7235, 6888, 8948])

In [11]:
# Embed only the 1st sentence. predict() is given a batch here, so the single
# row of shape (8,) gets a leading batch axis and becomes shape (1, 8).
first_sentence_batch = embedded_docs[0][np.newaxis, :]
model.predict(first_sentence_batch)  # one embedding vector per token position of the 1st sentence

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 147ms/step


array([[[ 4.45753969e-02, -4.62390184e-02,  4.01903279e-02,
          9.56351683e-03,  2.02530064e-02,  1.81475542e-02,
         -2.02062726e-02, -1.72649845e-02, -3.67956981e-02,
          3.47147137e-03],
        [ 4.45753969e-02, -4.62390184e-02,  4.01903279e-02,
          9.56351683e-03,  2.02530064e-02,  1.81475542e-02,
         -2.02062726e-02, -1.72649845e-02, -3.67956981e-02,
          3.47147137e-03],
        [ 4.45753969e-02, -4.62390184e-02,  4.01903279e-02,
          9.56351683e-03,  2.02530064e-02,  1.81475542e-02,
         -2.02062726e-02, -1.72649845e-02, -3.67956981e-02,
          3.47147137e-03],
        [ 4.45753969e-02, -4.62390184e-02,  4.01903279e-02,
          9.56351683e-03,  2.02530064e-02,  1.81475542e-02,
         -2.02062726e-02, -1.72649845e-02, -3.67956981e-02,
          3.47147137e-03],
        [-1.93661097e-02, -3.08101308e-02, -2.80199777e-02,
         -2.40601599e-04, -3.05439159e-03, -1.42577663e-02,
          4.02289666e-02,  1.11238845e-02,  1.428001