In [3]:
from tensorflow.keras.preprocessing.text import one_hot




In [4]:
### Toy corpus: each entry is one short sentence used throughout the demo
sent = [
    'the glass of milk',
    'the glass of juice',
    'the cup of tea',
    'I am a good boy',
    'I am a good developer',
    'understand the meaning of words',
    'your videos are good',
]

In [None]:
# Vocabulary size: passed to `one_hot` as the index range and to the
# Embedding layer as its number of rows.
voc_size = 10_000

In [6]:
## Encode every sentence as a list of integer word indices (one index per word).
## Despite its name, `one_hot` returns hashed indices bounded by voc_size,
## not one-hot vectors; different words may collide on the same index.
one_hot_repr = [one_hot(sentence, voc_size) for sentence in sent]
one_hot_repr
    

[[5513, 5804, 8703, 1849],
 [5513, 5804, 8703, 8258],
 [5513, 5943, 8703, 4548],
 [3331, 4287, 3461, 846, 6917],
 [3331, 4287, 3461, 846, 3884],
 [9495, 5513, 9038, 8703, 8054],
 [7522, 9440, 5258, 846]]

In [13]:
# Take all the words from sentences and then use in embedding layer to get the word embeddings representation
# This is the first step in NLP
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential

In [None]:
import numpy as np


## Each sentence in the original documents has a different length, so we cannot feed them to an RNN as-is: the RNN consumes one word per time step, and the number of time steps depends on the number of words in the sentence.
## To make every sentence the same length we use `pad_sequences`, which adds 0s at either the beginning or the end of each sentence, depending on how many words that sentence is missing.
## The idea is to represent each word in a sentence by its index in the vocabulary (the one_hot step above). Instead of passing one-hot encoded vectors directly into the RNN, we use each word's index and convert it into a word-embedding representation.

In [16]:
# Pad every index sequence on the left with zeros up to a fixed length,
# so all sentences end up with the same number of positions.
sent_length = 8
embedded_docs = pad_sequences(
    one_hot_repr,
    maxlen=sent_length,
    padding='pre',
)
print(embedded_docs)

[[   0    0    0    0 5513 5804 8703 1849]
 [   0    0    0    0 5513 5804 8703 8258]
 [   0    0    0    0 5513 5943 8703 4548]
 [   0    0    0 3331 4287 3461  846 6917]
 [   0    0    0 3331 4287 3461  846 3884]
 [   0    0    0 9495 5513 9038 8703 8054]
 [   0    0    0    0 7522 9440 5258  846]]


## Here I will go ahead and create the feature representation, which helps show the relation of each word to the features, and which words are close to each other or have similar meanings.

In [17]:
## Number of features (embedding dimensions) used to represent each word
dim = 10

In [18]:
# Build a model whose only layer is an Embedding (voc_size rows, dim columns)
# and compile it. No training happens here: the embedding weights stay at
# their initial values.
model = Sequential()
model.add(
    Embedding(input_dim=voc_size, output_dim=dim, input_length=sent_length)
)
model.compile(optimizer='adam', loss='mse')





## This is my embedding model

In [None]:
## The embedding matrix is the model's only layer, so the parameter count is
## voc_size * dim = 10000 * 10 = 100000
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 8, 10)             100000    
                                                                 
Total params: 100000 (390.62 KB)
Trainable params: 100000 (390.62 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


## The model above is not actually a trained model. It simply converts the words you pass in into vectors.

In [20]:
## Use the embedding layer to get the vectorised version of each word: one 10-dimensional vector per position
model.predict(embedded_docs) #  embedded_docs (the padded index sequences) is the input to the model




array([[[-1.01207718e-02,  3.85929979e-02, -1.82886347e-02,
          4.77795340e-02, -3.00612934e-02, -3.89476307e-02,
         -2.88615469e-02, -1.23027340e-02, -2.94774305e-02,
         -1.58009902e-02],
        [-1.01207718e-02,  3.85929979e-02, -1.82886347e-02,
          4.77795340e-02, -3.00612934e-02, -3.89476307e-02,
         -2.88615469e-02, -1.23027340e-02, -2.94774305e-02,
         -1.58009902e-02],
        [-1.01207718e-02,  3.85929979e-02, -1.82886347e-02,
          4.77795340e-02, -3.00612934e-02, -3.89476307e-02,
         -2.88615469e-02, -1.23027340e-02, -2.94774305e-02,
         -1.58009902e-02],
        [-1.01207718e-02,  3.85929979e-02, -1.82886347e-02,
          4.77795340e-02, -3.00612934e-02, -3.89476307e-02,
         -2.88615469e-02, -1.23027340e-02, -2.94774305e-02,
         -1.58009902e-02],
        [-3.26917544e-02, -1.05291381e-02, -3.61989252e-02,
          1.28377937e-02, -3.53763476e-02,  1.93755887e-02,
         -2.30960734e-02,  3.06560062e-02, -3.807581

In [21]:
## Padded index sequence of the first sentence ('the glass of milk'); the leading zeros are the 'pre' padding
embedded_docs[0]

array([   0,    0,    0,    0, 5513, 5804, 8703, 1849])

In [22]:
## Feature representation of the first sentence: one 10-dimensional vector per padded position.
## The leading rows are identical because those positions all hold the padding index 0.
model.predict(embedded_docs)[0]



array([[-1.0120772e-02,  3.8592998e-02, -1.8288635e-02,  4.7779534e-02,
        -3.0061293e-02, -3.8947631e-02, -2.8861547e-02, -1.2302734e-02,
        -2.9477431e-02, -1.5800990e-02],
       [-1.0120772e-02,  3.8592998e-02, -1.8288635e-02,  4.7779534e-02,
        -3.0061293e-02, -3.8947631e-02, -2.8861547e-02, -1.2302734e-02,
        -2.9477431e-02, -1.5800990e-02],
       [-1.0120772e-02,  3.8592998e-02, -1.8288635e-02,  4.7779534e-02,
        -3.0061293e-02, -3.8947631e-02, -2.8861547e-02, -1.2302734e-02,
        -2.9477431e-02, -1.5800990e-02],
       [-1.0120772e-02,  3.8592998e-02, -1.8288635e-02,  4.7779534e-02,
        -3.0061293e-02, -3.8947631e-02, -2.8861547e-02, -1.2302734e-02,
        -2.9477431e-02, -1.5800990e-02],
       [-3.2691754e-02, -1.0529138e-02, -3.6198925e-02,  1.2837794e-02,
        -3.5376348e-02,  1.9375589e-02, -2.3096073e-02,  3.0656006e-02,
        -3.8075816e-02, -4.4294357e-02],
       [-5.3178519e-05, -9.2239603e-03,  4.7280479e-02,  3.5688732e-02,
   

This is how we build the embedding layer: it converts each text we pass in into vectors, which are then fed to the RNN model.