In [4]:
from tensorflow.keras.preprocessing.text import one_hot

In [5]:
# Toy corpus for the demo: ten short, positive movie-review sentences.
# Each string is treated as one document by the encoding steps that follow.
sentences = [
    "I loved the movie",
    "The actors were amazing",
    "Great plot and direction",
    "It made me feel happy",
    "I enjoyed every moment",
    "The music was beautiful",
    "Brilliant acting and story",
    "I would watch it again",
    "Highly recommend this film",
    "The ending was satisfying",
]


In [6]:
# Echo the corpus so the raw text can be compared with the encoded forms below.
sentences

['I loved the movie',
 'The actors were amazing',
 'Great plot and direction',
 'It made me feel happy',
 'I enjoyed every moment',
 'The music was beautiful',
 'Brilliant acting and story',
 'I would watch it again',
 'Highly recommend this film',
 'The ending was satisfying']

## Define the Vocabulary size

In [7]:
# Upper bound on the integer ids handed out to words. It is passed to one_hot()
# as the size of the hashing space and to the Embedding layer as its number of rows.
vocabulary_size = 10_000

## One-Hot Representation

In [8]:
# Encode every sentence as a list of integer word ids (one id per word).
# NOTE: per the Keras docs, one_hot() hashes each word with Python's built-in
# hash by default, which is not stable across interpreter runs and can map two
# different words to the same id, so the exact numbers may change after a
# kernel restart.
one_hot_repr = [
    one_hot(sentence, vocabulary_size)
    for sentence in sentences
]
one_hot_repr

[[7797, 3057, 8036, 7838],
 [8036, 5464, 2125, 9661],
 [9295, 9027, 5318, 2627],
 [752, 8314, 4486, 2560, 8362],
 [7797, 6905, 2340, 1579],
 [8036, 9471, 8238, 9157],
 [1351, 2671, 5318, 4383],
 [7797, 4070, 951, 752, 4269],
 [4973, 8367, 5098, 9684],
 [8036, 2174, 8238, 24]]

## Word Embedding Representation


In [11]:
from tensorflow.keras.layers import Embedding
# from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.models import Sequential
import numpy as np

In [19]:
# Bring every encoded sentence to the same length so they stack into one 2-D array.
# padding='pre' puts the filler zeros in front of the word ids; padding='post'
# would put them after. The longest sentence above has 5 ids, so a length of 8
# pads every row and truncates none.
sent_length = 8
embeded_docs = pad_sequences(
    one_hot_repr,
    padding='pre',
    maxlen=sent_length,
)
print(embeded_docs)

[[   0    0    0    0 7797 3057 8036 7838]
 [   0    0    0    0 8036 5464 2125 9661]
 [   0    0    0    0 9295 9027 5318 2627]
 [   0    0    0  752 8314 4486 2560 8362]
 [   0    0    0    0 7797 6905 2340 1579]
 [   0    0    0    0 8036 9471 8238 9157]
 [   0    0    0    0 1351 2671 5318 4383]
 [   0    0    0 7797 4070  951  752 4269]
 [   0    0    0    0 4973 8367 5098 9684]
 [   0    0    0    0 8036 2174 8238   24]]


## Feature Representation

In [35]:
# Build a single-layer model that maps each word id to a dense vector of length `dim`.
dim = 10  # number of features in each embedding vector
model = Sequential()
# `input_length` is intentionally not passed: Keras 3 deprecates that argument
# (it only emits a warning and is otherwise ignored). The sequence length is
# supplied through build() below instead.
model.add(Embedding(vocabulary_size, dim))
model.compile('adam', 'mse')
# Build explicitly with (batch, sent_length) so that summary() can report the
# output shape and parameter count; without a known input shape the model is
# still unbuilt at this point.
model.build(input_shape=(None, sent_length))
model.summary()

In [36]:
# Look up the embedding vector for every word id in every padded sentence.
# The result is 3-D: one block per sentence, one row per position, `dim` values per row.
# NOTE(review): no training step is visible in this notebook excerpt, so these
# appear to be the layer's initial weights — confirm before interpreting them.
model.predict(embeded_docs)  # embeddings for all sentences at once

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 754ms/step


array([[[ 2.91721858e-02, -3.56248617e-02,  1.62231065e-02,
         -1.58551820e-02, -3.03043257e-02, -1.25476122e-02,
          2.63944156e-02, -1.43061392e-02, -2.39399076e-02,
         -1.12369433e-02],
        [ 2.91721858e-02, -3.56248617e-02,  1.62231065e-02,
         -1.58551820e-02, -3.03043257e-02, -1.25476122e-02,
          2.63944156e-02, -1.43061392e-02, -2.39399076e-02,
         -1.12369433e-02],
        [ 2.91721858e-02, -3.56248617e-02,  1.62231065e-02,
         -1.58551820e-02, -3.03043257e-02, -1.25476122e-02,
          2.63944156e-02, -1.43061392e-02, -2.39399076e-02,
         -1.12369433e-02],
        [ 2.91721858e-02, -3.56248617e-02,  1.62231065e-02,
         -1.58551820e-02, -3.03043257e-02, -1.25476122e-02,
          2.63944156e-02, -1.43061392e-02, -2.39399076e-02,
         -1.12369433e-02],
        [ 5.38449362e-03, -2.51741167e-02, -2.77022719e-02,
          1.15469471e-02, -4.23818827e-03,  3.33327167e-02,
          2.92244889e-02, -2.81302333e-02,  1.482021

In [37]:
# Padded id sequence of the first sentence, shown for comparison with its embeddings below.
embeded_docs[0]

array([   0,    0,    0,    0, 7797, 3057, 8036, 7838])

In [43]:
# Embeddings for the first sentence only: one row per position in embeded_docs[0].
# The leading rows are identical because those positions all hold the padding id 0.
model.predict(embeded_docs)[0]  # embedding vectors of the first padded sentence

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step


array([[ 2.9172186e-02, -3.5624862e-02,  1.6223107e-02, -1.5855182e-02,
        -3.0304326e-02, -1.2547612e-02,  2.6394416e-02, -1.4306139e-02,
        -2.3939908e-02, -1.1236943e-02],
       [ 2.9172186e-02, -3.5624862e-02,  1.6223107e-02, -1.5855182e-02,
        -3.0304326e-02, -1.2547612e-02,  2.6394416e-02, -1.4306139e-02,
        -2.3939908e-02, -1.1236943e-02],
       [ 2.9172186e-02, -3.5624862e-02,  1.6223107e-02, -1.5855182e-02,
        -3.0304326e-02, -1.2547612e-02,  2.6394416e-02, -1.4306139e-02,
        -2.3939908e-02, -1.1236943e-02],
       [ 2.9172186e-02, -3.5624862e-02,  1.6223107e-02, -1.5855182e-02,
        -3.0304326e-02, -1.2547612e-02,  2.6394416e-02, -1.4306139e-02,
        -2.3939908e-02, -1.1236943e-02],
       [ 5.3844936e-03, -2.5174117e-02, -2.7702272e-02,  1.1546947e-02,
        -4.2381883e-03,  3.3332717e-02,  2.9224489e-02, -2.8130233e-02,
         1.4820211e-03, -1.1761427e-02],
       [-4.5297518e-03,  1.6960714e-02,  4.1078776e-05,  3.1554881e-02,
   