# Embedding (convert text into vectors)

In [1]:
from tensorflow.keras.preprocessing.text import one_hot




In [2]:
# Toy corpus: ten short, positive movie-review sentences that the rest of
# the notebook encodes, pads and embeds.
sentences = [
    "I loved the movie",
    "The actors were amazing",
    "Great plot and direction",
    "It made me feel happy",
    "I enjoyed every moment",
    "The music was beautiful",
    "Brilliant acting and story",
    "I would watch it again",
    "Highly recommend this film",
    "The ending was satisfying",
]

In [3]:
sentences  # echo the corpus so the reader can see exactly what will be encoded

['I loved the movie',
 'The actors were amazing',
 'Great plot and direction',
 'It made me feel happy',
 'I enjoyed every moment',
 'The music was beautiful',
 'Brilliant acting and story',
 'I would watch it again',
 'Highly recommend this film',
 'The ending was satisfying']

In [4]:
# Vocabulary size: passed to one_hot() as the size of the index space and
# to the Embedding layer as its number of rows.
vocabulary_size = 10_000

## One-Hot Representation

In [5]:
# Encode each sentence as a list of integer word indices (one index per word,
# not an actual one-hot vector). The same word always gets the same index
# within a run, regardless of case ("the" / "The").
# NOTE(review): per the Keras docs, one_hot() is a hashing trick, so two
# different words may share an index and indices may differ between runs.
one_hot_repr = [
    one_hot(text, n=vocabulary_size)
    for text in sentences
]
one_hot_repr

[[2768, 3335, 4675, 8830],
 [4675, 3577, 6205, 7183],
 [8691, 1892, 8083, 1825],
 [6494, 9915, 7671, 1913, 1044],
 [2768, 8331, 1991, 7390],
 [4675, 3997, 4985, 2680],
 [552, 3465, 8083, 1731],
 [2768, 8111, 1157, 6494, 4341],
 [1379, 2487, 7249, 2191],
 [4675, 653, 4985, 2017]]

## Word Embedding Representation

In [6]:
import numpy as np
from tensorflow.keras.layers import Embedding, Input
from tensorflow.keras.models import Sequential
# Alternate import path for pad_sequences (kept for reference):
# from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import pad_sequences

In [7]:
# Pad every encoded sentence to the same length so the batch forms a
# rectangular array that the Embedding layer can consume.
# padding='pre' inserts the zeros before the words; padding='post' would
# append them after the words instead.
sent_length = 8
embeded_docs = pad_sequences(
    one_hot_repr,
    maxlen=sent_length,
    padding='pre',
)
print(embeded_docs)

[[   0    0    0    0 2768 3335 4675 8830]
 [   0    0    0    0 4675 3577 6205 7183]
 [   0    0    0    0 8691 1892 8083 1825]
 [   0    0    0 6494 9915 7671 1913 1044]
 [   0    0    0    0 2768 8331 1991 7390]
 [   0    0    0    0 4675 3997 4985 2680]
 [   0    0    0    0  552 3465 8083 1731]
 [   0    0    0 2768 8111 1157 6494 4341]
 [   0    0    0    0 1379 2487 7249 2191]
 [   0    0    0    0 4675  653 4985 2017]]


## Feature Representation

In [8]:
# Build a single-layer model that looks up a dense vector for every word
# index in a padded sentence.
dim = 10  # length of the dense vector stored for each word index

model = Sequential()
# Declare the input shape with an explicit Input instead of Embedding's
# `input_length` argument: `input_length` is deprecated in Keras 3 (the model
# is then left unbuilt and summary() cannot report shapes), whereas an Input
# works on both older and newer Keras releases.
model.add(Input(shape=(sent_length,), dtype="int32"))
# One row of `dim` trainable weights per possible word index.
model.add(Embedding(input_dim=vocabulary_size, output_dim=dim))
# The model is not fit in the cells shown here, so predict() returns the
# randomly initialised embedding weights.
model.compile(optimizer='adam', loss='mse')
model.summary()



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 8, 10)             100000    
                                                                 
Total params: 100000 (390.62 KB)
Trainable params: 100000 (390.62 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [9]:
model.predict(embeded_docs)  # embeddings for the whole batch: one dim-length vector per token position (padding included) of every padded sentence



array([[[-2.06714757e-02,  2.59392895e-02,  3.94410230e-02,
          8.40539858e-03, -3.51630338e-02,  2.02051550e-03,
          6.30535185e-04,  1.51980855e-02, -3.13802138e-02,
          1.28199719e-02],
        [-2.06714757e-02,  2.59392895e-02,  3.94410230e-02,
          8.40539858e-03, -3.51630338e-02,  2.02051550e-03,
          6.30535185e-04,  1.51980855e-02, -3.13802138e-02,
          1.28199719e-02],
        [-2.06714757e-02,  2.59392895e-02,  3.94410230e-02,
          8.40539858e-03, -3.51630338e-02,  2.02051550e-03,
          6.30535185e-04,  1.51980855e-02, -3.13802138e-02,
          1.28199719e-02],
        [-2.06714757e-02,  2.59392895e-02,  3.94410230e-02,
          8.40539858e-03, -3.51630338e-02,  2.02051550e-03,
          6.30535185e-04,  1.51980855e-02, -3.13802138e-02,
          1.28199719e-02],
        [-6.50228187e-03,  3.12867798e-02,  9.85586643e-03,
         -2.61471402e-02,  2.96783447e-03,  1.50724091e-02,
          2.07717158e-02,  2.38185860e-02,  2.874554

In [10]:
embeded_docs[0]  # padded index sequence of the first sentence ("I loved the movie")

array([   0,    0,    0,    0, 2768, 3335, 4675, 8830])

In [11]:
model.predict(embeded_docs)[0]  # embeddings of the first sentence only: one vector per token position; the leading padding zeros all map to the same vector



array([[-0.02067148,  0.02593929,  0.03944102,  0.0084054 , -0.03516303,
         0.00202052,  0.00063054,  0.01519809, -0.03138021,  0.01281997],
       [-0.02067148,  0.02593929,  0.03944102,  0.0084054 , -0.03516303,
         0.00202052,  0.00063054,  0.01519809, -0.03138021,  0.01281997],
       [-0.02067148,  0.02593929,  0.03944102,  0.0084054 , -0.03516303,
         0.00202052,  0.00063054,  0.01519809, -0.03138021,  0.01281997],
       [-0.02067148,  0.02593929,  0.03944102,  0.0084054 , -0.03516303,
         0.00202052,  0.00063054,  0.01519809, -0.03138021,  0.01281997],
       [-0.00650228,  0.03128678,  0.00985587, -0.02614714,  0.00296783,
         0.01507241,  0.02077172,  0.02381859,  0.02874554,  0.00990304],
       [ 0.02032503,  0.01528737,  0.00546852,  0.03943329, -0.02041569,
         0.02039399,  0.04729117, -0.02142294, -0.02906103,  0.00432533],
       [-0.03041561, -0.00842433,  0.03948328,  0.00810151, -0.01348889,
         0.02052206, -0.04547421,  0.0446414 