# Word Embedding:

- 👉 Word embedding means converting words into numeric vectors in such a way that words with similar meanings get similar vectors.

In [4]:
from tensorflow.keras.preprocessing.text import hashing_trick, one_hot


In [1]:
### Toy corpus: seven short sentences used as input for the rest of the notebook
sent = [
    "the glass of milk",
    "the glass of juice",
    "the cup of tea",
    "I am a good boy",
    "I am a good developer",
    "understand the meaning of words",
    "your videos are good",
]

In [3]:
# Display the corpus to verify its contents.
sent

['the glass of milk',
 'the glass of juice',
 'the cup of tea',
 'I am a good boy',
 'I am a good developer',
 'understand the meaning of words',
 'your videos are good']

In [5]:
## Vocabulary size: used below as the range for the integer word indices
## and as the input dimension of the Embedding layer
voc_size = 10_000

In [None]:
## Integer (hashed) representation for every word of every sentence
# Despite its name, Keras `one_hot` does not build one-hot vectors: it hashes
# each word to an integer index in [1, voc_size - 1]. It uses Python's built-in
# `hash`, which is salted per process, so the indices change on every kernel
# restart. `hashing_trick(..., hash_function='md5')` is the same encoding with
# a stable hash, which keeps the notebook reproducible across runs.
# Different words can still collide on the same index; a larger voc_size makes
# that less likely.
one_hot_repr = [
    hashing_trick(sentence, voc_size, hash_function='md5')
    for sentence in sent
]
one_hot_repr

[[6853, 4270, 3409, 700],
 [6853, 4270, 3409, 7331],
 [6853, 8941, 3409, 216],
 [8258, 8394, 1235, 3889, 7772],
 [8258, 8394, 1235, 3889, 809],
 [7956, 6853, 4997, 3409, 4239],
 [4561, 1067, 5763, 3889]]

In [9]:
## Word Embedding representation
from tensorflow.keras.layers import Embedding
#from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import pad_sequences

In [10]:
import numpy as np

## pad_sequences:
👉 In simple words:
`pad_sequences()` makes all input sequences the same length by adding zeros to the shorter ones (and truncating any that are longer than `maxlen`).

In [13]:
# Re-display the integer-encoded sentences before padding them.
one_hot_repr

[[6853, 4270, 3409, 700],
 [6853, 4270, 3409, 7331],
 [6853, 8941, 3409, 216],
 [8258, 8394, 1235, 3889, 7772],
 [8258, 8394, 1235, 3889, 809],
 [7956, 6853, 4997, 3409, 4239],
 [4561, 1067, 5763, 3889]]

In [None]:
## pad_sequences
# Bring every encoded sentence to a fixed length of `sent_len` tokens;
# shorter sentences get zeros added at the front ('pre' padding).
sent_len = 8
embedded_docs = pad_sequences(
    one_hot_repr,
    maxlen=sent_len,
    padding='pre',
)
embedded_docs

array([[   0,    0,    0,    0, 6853, 4270, 3409,  700],
       [   0,    0,    0,    0, 6853, 4270, 3409, 7331],
       [   0,    0,    0,    0, 6853, 8941, 3409,  216],
       [   0,    0,    0, 8258, 8394, 1235, 3889, 7772],
       [   0,    0,    0, 8258, 8394, 1235, 3889,  809],
       [   0,    0,    0, 7956, 6853, 4997, 3409, 4239],
       [   0,    0,    0,    0, 4561, 1067, 5763, 3889]])

In [14]:
## Feature Representation
# Length of the vector learned for each word; passed to the Embedding layer
# as its output dimension.
dim = 10


In [15]:
# Single-layer model: the Embedding layer maps each of the `voc_size` integer
# word indices to a trainable `dim`-dimensional vector.
model = Sequential()
model.add(Embedding(voc_size, dim))
# Build with an explicit (batch, sent_len) input shape instead of passing
# `input_length` to Embedding: that argument is deprecated in Keras 3, where
# it only triggers a warning and leaves the model unbuilt. `build()` works on
# both Keras 2 and Keras 3 and lets `summary()` report the output shape and
# parameter count.
model.build(input_shape=(None, sent_len))
# Optimizer 'adam' with mean-squared-error loss.
model.compile('adam', 'mse')







In [16]:
# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 8, 10)             100000    
                                                                 
Total params: 100000 (390.62 KB)
Trainable params: 100000 (390.62 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


# Summary :

| Step | Purpose                         | Function Used             |
| ---- | ------------------------------- | ------------------------- |
| 1️⃣  | Collect sentences               | —                         |
| 2️⃣  | Define vocabulary size          | `voc_size`                |
| 3️⃣  | Convert words → integers        | `one_hot()`               |
| 4️⃣  | Equalize sentence length        | `pad_sequences()`         |
| 5️⃣  | Create embedding representation | `Embedding()`             |
| 6️⃣  | Feed padded sentences           | `model(padded_sentences)` |
| 7️⃣  | (Optional) View learned vectors | `.get_weights()`          |
