In [1]:
import tensorflow as tf

In [2]:
from tensorflow.keras.preprocessing.text import one_hot

In [23]:
sent=['a glass of beer',
     'the glass of juice',
     'welcome to my place',
     'I am learning python',
     'understand the meaning of words']

In [24]:
sent

['a glass of beer',
 'the glass of juice',
 'welcome to my place',
 'I am learning python',
 'understand the meaning of words']

In [25]:
# Defining a vocabulary size- dictionary of size 10000
voc_size = 10000

## One Hot Representation

In [26]:
# Applying one hot function where only 1 index(where the word is present) is 1 and rest is 0
# onehot() function will convert each sentence to 's and 1's considering some vocabulary size and it will give index 
# position of each word in the sentence
onehot_words = []
for sentence in sent:
    onehot_words.append(one_hot(sentence, voc_size))
    # Arguments should be each sentence and the vocabulary size
    
onehot_words
# These numbers are the index(index from the dictionary) of each word in a sentence based on given vocabulary size

[[8754, 1665, 2895, 4662],
 [8611, 1665, 2895, 411],
 [7478, 6788, 3614, 1942],
 [4702, 3420, 8294, 8630],
 [1485, 8611, 1972, 2895, 6449]]

## Word Embedding Representation

In [8]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential

In [9]:
import numpy as np

- Whenever passing any sentences to the embedding layer all the sentences must have the same number of words. This will help to create a good embedding matrix.
- To make this happen we are using pad_sequences, ie to make words equal in all the sentences
- 0(zero) is added after or before all the words, ie the word at 0th index will be added

In [27]:
sent_length = 8
# We want to have maximum of 8 words in each sentence
embedded_docs = pad_sequences(onehot_words, padding = 'pre', maxlen = sent_length)
# first argument: passing one hot encoded values
# second argument: padding = 'pre' - padding is added before all the words
    # padding = 'post' - padding is added after all the words
# third argument: maxlen = max length of the sentence
embedded_docs
# Now we will pass this matrix to the embedding layer

array([[   0,    0,    0,    0, 8754, 1665, 2895, 4662],
       [   0,    0,    0,    0, 8611, 1665, 2895,  411],
       [   0,    0,    0,    0, 7478, 6788, 3614, 1942],
       [   0,    0,    0,    0, 4702, 3420, 8294, 8630],
       [   0,    0,    0, 1485, 8611, 1972, 2895, 6449]])

In [28]:
# Defining the number of dimensions to be used
dim = 10
# This is basically the number of features which we take for FEATURE REPRESENTATION

In [29]:
# Creating a model
model = Sequential()
# Adding the embedding layer
model.add(Embedding(voc_size, dim, input_length = sent_length))
# First argument = dictionary size
# Second argument = dimensions which are to be taken for feature representation
# Each word in every sentence will be converted to the given dimension
# So for first sentence, word present at 705 index(the) will be converted to vectors of 10 dimension
# Third argument = Maximum length of sentence

# This embedding layer will create a Feature Representation for each word in every sentence

model.compile(optimizer = 'adam', metrics = 'mse')

In [30]:
model.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 8, 10)             100000    
Total params: 100,000
Trainable params: 100,000
Non-trainable params: 0
_________________________________________________________________


- The Embedding layer has weights that are learned. If you save your model to file, this will include weights for the Embedding layer.

- The output of the Embedding layer is a 2D vector with one embedding for each word in the input sequence of words (input document).

- If you wish to connect a Dense layer directly to an Embedding layer, you must first flatten the 2D output matrix to a 1D vector using the Flatten layer.

In [31]:
# To see the embedded vector values
print(model.predict(embedded_docs))
# For each word 10 dimension vector is created

[[[-0.00185369 -0.02906373 -0.03831832  0.00635027 -0.02974092
   -0.00651143  0.0255412   0.0060045  -0.00444395 -0.02586404]
  [-0.00185369 -0.02906373 -0.03831832  0.00635027 -0.02974092
   -0.00651143  0.0255412   0.0060045  -0.00444395 -0.02586404]
  [-0.00185369 -0.02906373 -0.03831832  0.00635027 -0.02974092
   -0.00651143  0.0255412   0.0060045  -0.00444395 -0.02586404]
  [-0.00185369 -0.02906373 -0.03831832  0.00635027 -0.02974092
   -0.00651143  0.0255412   0.0060045  -0.00444395 -0.02586404]
  [ 0.03038326 -0.04392822  0.02806565 -0.02126027 -0.00138902
    0.01621561 -0.04730344 -0.03245162 -0.02034541  0.01248621]
  [ 0.01415369  0.02820823 -0.00216037  0.0408794   0.043222
   -0.04473099  0.04390018  0.03240642 -0.01373659  0.0081668 ]
  [ 0.02760471  0.01612211  0.00382591  0.0421035   0.01919608
   -0.02776971 -0.02248905 -0.03643564  0.00377132  0.01732625]
  [-0.04415665 -0.04150409 -0.03169983 -0.02090961  0.03045875
   -0.01201976 -0.04392027  0.03880182  0.04730037

In [32]:
embedded_docs[0]
# first sentence with added padding
#  0 0 0 0 the(705) glass(2708) of(9328) milk(8434)

array([   0,    0,    0,    0, 8754, 1665, 2895, 4662])

In [33]:
print(model.predict(embedded_docs[0]))
# Here we can see that each word in a sentence is converted to 10 vectors as we have given the dimension as 10.

[[[-0.00185369 -0.02906373 -0.03831832  0.00635027 -0.02974092
   -0.00651143  0.0255412   0.0060045  -0.00444395 -0.02586404]]

 [[-0.00185369 -0.02906373 -0.03831832  0.00635027 -0.02974092
   -0.00651143  0.0255412   0.0060045  -0.00444395 -0.02586404]]

 [[-0.00185369 -0.02906373 -0.03831832  0.00635027 -0.02974092
   -0.00651143  0.0255412   0.0060045  -0.00444395 -0.02586404]]

 [[-0.00185369 -0.02906373 -0.03831832  0.00635027 -0.02974092
   -0.00651143  0.0255412   0.0060045  -0.00444395 -0.02586404]]

 [[ 0.03038326 -0.04392822  0.02806565 -0.02126027 -0.00138902
    0.01621561 -0.04730344 -0.03245162 -0.02034541  0.01248621]]

 [[ 0.01415369  0.02820823 -0.00216037  0.0408794   0.043222
   -0.04473099  0.04390018  0.03240642 -0.01373659  0.0081668 ]]

 [[ 0.02760471  0.01612211  0.00382591  0.0421035   0.01919608
   -0.02776971 -0.02248905 -0.03643564  0.00377132  0.01732625]]

 [[-0.04415665 -0.04150409 -0.03169983 -0.02090961  0.03045875
   -0.01201976 -0.04392027  0.038801

## Example of Word Embedding

In [34]:
from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
# define documents
docs = ['Well done!',
        'Good work',
        'Great effort',
        'nice work',
        'Excellent!',
        'Weak',
        'Poor effort!',
        'not good',
        'poor work',
        'Could have done better.']
# define class labels
labels = array([1,1,1,1,1,0,0,0,0,0])
# integer encode the documents
vocab_size = 50
encoded_docs = [one_hot(d, vocab_size) for d in docs]
print(encoded_docs)
# pad documents to a max length of 4 words
max_length = 4
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(padded_docs)
# define the model
model = Sequential()
model.add(Embedding(vocab_size, 8, input_length=max_length))
model.add(Flatten())
# Giving shape also for first hidden layer
model.add(Dense(16, activation = 'relu', input_shape = (32,)))
model.add(Dense(1, activation='sigmoid'))
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# summarize the model
print(model.summary())
# fit the model
model.fit(padded_docs, labels, epochs=50, verbose=0)
# evaluate the model
loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))

[[18, 19], [19, 12], [40, 30], [8, 12], [37], [14], [25, 30], [19, 19], [25, 12], [19, 49, 19, 27]]
[[18 19  0  0]
 [19 12  0  0]
 [40 30  0  0]
 [ 8 12  0  0]
 [37  0  0  0]
 [14  0  0  0]
 [25 30  0  0]
 [19 19  0  0]
 [25 12  0  0]
 [19 49 19 27]]
Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 4, 8)              400       
_________________________________________________________________
flatten_5 (Flatten)          (None, 32)                0         
_________________________________________________________________
dense_9 (Dense)              (None, 16)                528       
_________________________________________________________________
dense_10 (Dense)             (None, 1)                 17        
Total params: 945
Trainable params: 945
Non-trainable params: 0
_________________________________________________________________
Non