In [2]:
from tensorflow.keras.utils import to_categorical

In [3]:
# Toy corpus: seven short sentences used throughout the notebook to
# demonstrate one-hot indexing, padding and the embedding layer.
sent = [
    "the glass of milk",
    "the glass of juice",
    "the cup of tea",
    "I am a good boy",
    "I am a good developer",
    "understand the meaning of words",
    "your videos are good",
]

In [4]:
# Display the corpus to confirm the seven example sentences were stored as written.
sent

['the glass of milk',
 'the glass of juice',
 'the cup of tea',
 'I am a good boy',
 'I am a good developer',
 'understand the meaning of words',
 'your videos are good']

In [5]:
# Vocabulary size: the upper bound of the integer index space that words are
# mapped into. Kept at 10000 so it agrees with the value used by the one-hot
# encoding cell further down (the previous 1000 here was stale and was
# silently overwritten by that cell).
voc_size = 10000

##### One hot representation

The output is a list of lists: each sentence is replaced by a list of integer indices, one per word, where every index falls within a vocabulary of size 10000.

In [7]:
from tensorflow.keras.preprocessing.text import one_hot

# Example corpus (same seven sentences as defined at the top of the notebook;
# repeated here so this cell can be run on its own).
sent = [
    "the glass of milk",
    "the glass of juice",
    "the cup of tea",
    "I am a good boy",
    "I am a good developer",
    "understand the meaning of words",
    "your videos are good",
]

# Size of the index space that words are mapped into.
voc_size = 10000

# Turn every sentence into a list of integer word indices, one index per word.
# NOTE(review): per the Keras documentation, one_hot hashes each word rather
# than building a dictionary, so two different words may share an index and
# the indices may differ between kernel sessions -- confirm before relying on
# the exact numbers shown in the recorded output.
onehot_repr = [one_hot(sentence, voc_size) for sentence in sent]

print(onehot_repr)

[[7037, 6789, 6410, 7666], [7037, 6789, 6410, 6392], [7037, 1261, 6410, 9712], [1509, 3193, 7995, 8112, 2548], [1509, 3193, 7995, 8112, 6479], [2527, 7037, 9785, 6410, 3152], [5446, 3773, 4189, 8112]]


#### Word embedding 

In [8]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences #WE need to make sure all sentences have same number of words which are being passed into embedding layers
from tensorflow.keras.models import Sequential

In [9]:
import numpy as np

In [12]:
# Every sentence must have the same number of indices before it can be fed to
# the embedding layer. Shorter sentences are left-padded ('pre') with zeros up
# to a fixed length of 8.
sent_length = 8
embedded_docs = pad_sequences(
    onehot_repr,
    maxlen=sent_length,
    padding='pre',
)
print(embedded_docs)

[[   0    0    0    0 7037 6789 6410 7666]
 [   0    0    0    0 7037 6789 6410 6392]
 [   0    0    0    0 7037 1261 6410 9712]
 [   0    0    0 1509 3193 7995 8112 2548]
 [   0    0    0 1509 3193 7995 8112 6479]
 [   0    0    0 2527 7037 9785 6410 3152]
 [   0    0    0    0 5446 3773 4189 8112]]


In [13]:
# Each word index is next converted into a dense feature vector. All vectors
# must share one fixed length; `dim` is that length (the embedding dimension).
# Set to 10 to agree with the Embedding layer built in the following cell,
# which produces 10-dimensional vectors (the previous 15 matched nothing else
# in the notebook).
dim = 10

In [17]:
# Minimal model consisting of a single Embedding layer: it looks up a
# 10-dimensional vector for each of the `voc_size` possible word indices, for
# input sequences of `sent_length` indices.
# NOTE(review): the embedding width is hard-coded as 10 here rather than taken
# from the `dim` variable defined in an earlier cell -- consider using `dim`
# once the two values are confirmed to agree.
model = Sequential([
    Embedding(voc_size, 10, input_length=sent_length),
])
model.compile(optimizer='adam', loss='mse')

In [18]:
# Print the layer-by-layer summary (output shapes and parameter counts) of the model.
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 8, 10)             100000    
                                                                 
Total params: 100000 (390.62 KB)
Trainable params: 100000 (390.62 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [22]:
# Pass the padded index sequences through the embedding layer and print the
# resulting vectors for every sentence.
print(model.predict(embedded_docs))

# Because the embedding dimension is 10, every index in a padded sentence is replaced by a 10-dimensional vector. This includes the padding index 0, so the leading zeros of a sentence all map to the same vector.

[[[ 0.02183932 -0.0356861   0.01647815  0.04364965  0.04049188
   -0.01847304  0.01271603  0.04679966  0.03345631  0.01633394]
  [ 0.02183932 -0.0356861   0.01647815  0.04364965  0.04049188
   -0.01847304  0.01271603  0.04679966  0.03345631  0.01633394]
  [ 0.02183932 -0.0356861   0.01647815  0.04364965  0.04049188
   -0.01847304  0.01271603  0.04679966  0.03345631  0.01633394]
  [ 0.02183932 -0.0356861   0.01647815  0.04364965  0.04049188
   -0.01847304  0.01271603  0.04679966  0.03345631  0.01633394]
  [-0.00043765 -0.00943429 -0.00961263 -0.00290214 -0.02811103
   -0.0423254  -0.02847641  0.02321613 -0.01152484  0.02565719]
  [-0.00983726 -0.02429731 -0.04165945  0.02177365  0.0222931
    0.03084009 -0.04993043 -0.00104544  0.00163128 -0.03236501]
  [ 0.04732058 -0.04271327  0.00682048 -0.00543018  0.00566672
    0.03952878 -0.02864032 -0.02767669 -0.04997131 -0.0459477 ]
  [-0.01522727  0.03817106 -0.025467    0.00990182  0.01407505
    0.03439838  0.02700258 -0.03527286  0.0374218

In [20]:
# Padded form of the first sentence: leading zeros followed by its word indices.
embedded_docs[0]

array([   0,    0,    0,    0, 7037, 6789, 6410, 7666])

In [21]:
# Unpadded word indices of the first sentence, for comparison with its padded row.
onehot_repr[0]

[7037, 6789, 6410, 7666]