# Create Your Own Embedding Using Keras
## This notebook outlines the concepts behind creating your own embeddings using Keras

### Import the necessary libraries

In [1]:
import numpy as np
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
# from tensorflow.keras.layers.embeddings import Embedding
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import one_hot, Tokenizer

### Define the corpus

In [2]:
# Toy corpus of ten short feedback phrases: five complimentary ones
# followed by five critical ones.
docs = [
    "Well done!",
    "Good work",
    "Great effort",
    "nice work",
    "Excellent!",
    "Weak",
    "Poor effort!",
    "not good",
    "poor work",
    "Could have done better.",
]

### Define the labels

In [3]:
# One binary label per document, in corpus order: five 1s followed by five 0s.
labels = np.array([1] * 5 + [0] * 5)

### Tokenizer

In [4]:
# Create a tokenizer with default settings; it holds no vocabulary until it is fitted.
t = Tokenizer()


In [5]:
# Display the tokenizer object; its default repr only shows the class and address.
t

<keras.src.legacy.preprocessing.text.Tokenizer at 0x1c940a2ceb0>

In [6]:
# Fit the tokenizer on the corpus so that every distinct word gets an integer index.
t.fit_on_texts(docs)

In [7]:
# Display the tokenizer again: the repr does not change after fitting, so inspect
# attributes such as t.word_index to see the learned vocabulary.
t

<keras.src.legacy.preprocessing.text.Tokenizer at 0x1c940a2ceb0>

### Integer-encode the documents (using either of the following)
- texts_to_sequences
- one_hot

In [8]:
# Show the raw corpus for side-by-side comparison with its integer encoding.
docs

['Well done!',
 'Good work',
 'Great effort',
 'nice work',
 'Excellent!',
 'Weak',
 'Poor effort!',
 'not good',
 'poor work',
 'Could have done better.']

In [9]:
# Replace every word with the integer index assigned by the fitted tokenizer;
# each document becomes a list of ints whose length equals its word count.
encoded_docs = t.texts_to_sequences(docs)
encoded_docs

[[6, 2],
 [3, 1],
 [7, 4],
 [8, 1],
 [9],
 [10],
 [5, 4],
 [11, 3],
 [5, 1],
 [12, 13, 2, 14]]

#### One-hot

In [10]:
# Alternative encoding with the hashing-based `one_hot` helper.
# Each word is hashed into the range [1, oh_vocab_size), so distinct words can
# collide on the same index, and the default hash function is not stable across
# Python processes -- the integers produced can differ after a kernel restart.
# A dedicated name is used for the hashing space so this demo does not clobber
# `vocab_size`, which holds the Embedding input dimension later in the notebook.
oh_vocab_size = 50
encoded_docs_oh = [one_hot(d, oh_vocab_size) for d in docs]
encoded_docs_oh

[[28, 28],
 [27, 14],
 [1, 21],
 [47, 14],
 [34],
 [16],
 [22, 21],
 [16, 27],
 [22, 14],
 [28, 3, 28, 1]]

### Padding documents to a desired max_length

In [11]:
# Bring every sequence to a common length of four by appending zeros to the
# shorter ones ('post' padding).
max_length = 4
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
padded_docs

array([[ 6,  2,  0,  0],
       [ 3,  1,  0,  0],
       [ 7,  4,  0,  0],
       [ 8,  1,  0,  0],
       [ 9,  0,  0,  0],
       [10,  0,  0,  0],
       [ 5,  4,  0,  0],
       [11,  3,  0,  0],
       [ 5,  1,  0,  0],
       [12, 13,  2, 14]])

In [12]:
# Same target length, but zeros are inserted in front of the shorter sequences
# ('pre' padding). This reassigns `padded_docs`, so cells run after this one
# see the pre-padded array.
max_length = 4
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='pre')
padded_docs

array([[ 0,  0,  6,  2],
       [ 0,  0,  3,  1],
       [ 0,  0,  7,  4],
       [ 0,  0,  8,  1],
       [ 0,  0,  0,  9],
       [ 0,  0,  0, 10],
       [ 0,  0,  5,  4],
       [ 0,  0, 11,  3],
       [ 0,  0,  5,  1],
       [12, 13,  2, 14]])

### Define the Embedding layer

In [13]:
# Embedding input size: one slot per word index plus one for index 0, which the
# padded sequences use as the padding value (tokenizer word indices start at 1).
vocab_size = len(t.word_index) + 1
vocab_size

15

In [24]:
# The Embedding layer maps each integer token index to a trainable dense vector
# of fixed size, so the network works with continuous features instead of raw ids.
#
# input_dim (vocab_size here)
#   Size of the index space: the largest token index + 1. With 10,000 distinct
#   words indexed from 1 (0 being reserved for padding) this is 10001.
#
# output_dim
#   Length of each embedding vector, typically between 50 and 300.
#   For 50-dimensional embedding vectors, set output_dim=50.
#
# Sequence length
#   Older Keras versions took an `input_length` argument for this. Keras 3
#   deprecates it (the value is ignored and a warning is emitted), so it is not
#   passed; the sequence length comes from the input the layer is called on.
embedding_layer = Embedding(input_dim=vocab_size, output_dim=50)



### Define the model

In [25]:
# Assemble the classifier. An explicit Input declares the sequence length so the
# model is built immediately and model.summary() can report output shapes and
# parameter counts before any training has happened.
# NOTE: `embedding_layer` is created in a separate cell; re-running only this
# cell reuses that layer object together with any weights it has already learned.
model = Sequential([
    Input(shape=(max_length,), dtype="int32"),  # one row of max_length token indices
    embedding_layer,                            # one embedding vector per token
    Flatten(),                                  # concatenate them into one feature vector
    Dense(20, activation="relu"),
    Dense(10, activation="relu"),
    Dense(1, activation='sigmoid'),             # single probability for label 1
])
# Alternative output head kept for reference (needs a matching loss and labels):
# model.add(Dense(2, activation='softmax'))

### Compiling the model

In [21]:
# Configure training: Adam optimizer, binary cross-entropy loss, and accuracy
# tracked under the metric name 'acc'.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

### Summarize the model

In [22]:
# Print the layer-by-layer summary of the model.
model.summary()

### Train the model

In [23]:
# Train for 50 epochs on the padded sequences and their labels. The call returns
# a History object, which is what the cell displays after the epoch log.
model.fit(padded_docs, labels, epochs=50)

Epoch 1/50


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 306ms/step - acc: 1.0000 - loss: 0.4574
Epoch 2/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - acc: 1.0000 - loss: 0.4525
Epoch 3/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - acc: 1.0000 - loss: 0.4476
Epoch 4/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - acc: 1.0000 - loss: 0.4427
Epoch 5/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - acc: 1.0000 - loss: 0.4378
Epoch 6/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - acc: 1.0000 - loss: 0.4328
Epoch 7/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - acc: 1.0000 - loss: 0.4279
Epoch 8/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - acc: 1.0000 - loss: 0.4231
Epoch 9/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - acc: 1.0000 - loss: 0.4182
Ep

<keras.src.callbacks.history.History at 0x1c91e78efe0>

### Evaluating the model

In [19]:
# Evaluate on the same documents the model was fitted on, so the reported
# accuracy is training accuracy rather than a measure of generalization.
loss, accuracy = model.evaluate(padded_docs, labels)
loss, accuracy

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step - acc: 1.0000 - loss: 0.4156


(0.4156327247619629, 1.0)

In [20]:
from tensorflow import keras

# Self-contained recap of the whole pipeline with a smaller model
# (embedding -> flatten -> sigmoid). It relies only on `docs` and `labels`.

# Seed Python, NumPy and the backend so weight initialization -- and therefore
# the training log -- is repeatable from run to run.
keras.utils.set_random_seed(42)

# Tokenize and integer-encode the corpus.
t = Tokenizer()
t.fit_on_texts(docs)
encoded_docs = t.texts_to_sequences(docs)

# Derive the sizes from the data instead of hard-coding 15 and 4.
# +1 on the vocabulary because index 0 is the padding value.
vocab_size = len(t.word_index) + 1
max_length = max(len(seq) for seq in encoded_docs)
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

# `input_length` is deprecated in Keras 3 (ignored with a warning); the sequence
# length is declared through an explicit Input instead, which also builds the
# model up front.
e = Embedding(vocab_size, 50)

model = Sequential([
    keras.Input(shape=(max_length,), dtype="int32"),
    e,
    Flatten(),
    Dense(1, activation='sigmoid'),
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

model.fit(padded_docs, labels, epochs=50)

# Evaluated on the training documents themselves: this is training accuracy,
# not an estimate of performance on unseen text.
loss, accuracy = model.evaluate(padded_docs, labels)
loss, accuracy

Epoch 1/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 356ms/step - acc: 0.5000 - loss: 0.6884
Epoch 2/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - acc: 0.6000 - loss: 0.6844
Epoch 3/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - acc: 0.6000 - loss: 0.6804
Epoch 4/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - acc: 0.7000 - loss: 0.6765
Epoch 5/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - acc: 0.7000 - loss: 0.6725
Epoch 6/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - acc: 0.8000 - loss: 0.6686
Epoch 7/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - acc: 0.8000 - loss: 0.6646
Epoch 8/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - acc: 0.8000 - loss: 0.6606
Epoch 9/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - acc: 0.9000 - loss

(0.4574446678161621, 1.0)