In [1]:
# Source : https://github.com/jeffheaton/t81_558_deep_learning/blob/master/t81_558_class_11_05_embedding.ipynb

In [2]:
from numpy import array
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Embedding, Dense

In [3]:
# Define 10 restaurant reviews.
reviews = [
    'Never coming back!',
    'Horrible service',
    'Rude waitress',
    'Cold food.',
    'Horrible food!',
    'Awesome',
    'Awesome service!',
    'Rocks!',
    'poor work',
    'Couldn\'t have done better']

# Define labels (1=negative, 0=positive), one per review and in the same order.
# 'poor work' (index 8) is a negative review, so it carries label 1.
labels = array([1, 1, 1, 1, 1, 0, 0, 0, 1, 0])

# Guard against the two parallel sequences drifting apart when reviews are edited.
assert len(reviews) == len(labels), "every review needs exactly one label"

In [4]:
# Size of the integer range that words are mapped into.
VOCAB_SIZE = 50

# An Embedding layer consumes integers, not raw text, so every word of every
# review is first converted to a number. Text is only the example used here:
# an Embedding layer works with any data that can be expressed as integer ids.
encoded_reviews = list(map(lambda text: one_hot(text, VOCAB_SIZE), reviews))
print(f"Encoded reviews: {encoded_reviews}")

# In the recorded run shown in this cell's output, "Never" was mapped to 33 and
# "service" to 10. Unrelated words can share a number: in that same output
# "Cold", "Awesome" and "Couldn't" were all mapped to 4.
# NOTE(review): Keras documents one_hot as a hashing-based encoder whose default
# hash is not stable across Python sessions, so these numbers may differ on a
# fresh kernel -- confirm against the Keras docs before relying on them.
# The embedding layer takes these numbers as input and learns a vector for each.

Encoded reviews: [[33, 20, 46], [19, 10], [28, 18], [4, 8], [19, 8], [4], [4, 10], [34], [5, 35], [4, 21, 11, 19]]


In [5]:
# Number of word slots per review once padded (the longest review has 4 words).
MAX_LENGTH = 4

# Reviews contain different numbers of words, but the network needs fixed-size
# input rows, so shorter reviews are filled up with zeros. padding='post' puts
# the zeros after the word indices rather than in front of them.
padded_reviews = pad_sequences(
    encoded_reviews,
    maxlen=MAX_LENGTH,
    padding='post',
)
print(padded_reviews)

[[33 20 46  0]
 [19 10  0  0]
 [28 18  0  0]
 [ 4  8  0  0]
 [19  8  0  0]
 [ 4  0  0  0]
 [ 4 10  0  0]
 [34  0  0  0]
 [ 5 35  0  0]
 [ 4 21 11 19]]


In [6]:
model = Sequential()

# The embedding layer takes inputs of length MAX_LENGTH and maps every integer
# index to a trainable vector of size 8.
# VOCAB_SIZE is the number of different integer values an input element can take.
# For each distinct index the layer learns one vector, so with VOCAB_SIZE = 50 and
# vectors of length 8 the number of parameters to learn is 50 * 8 = 400.
#
# Note that unlike the usual hidden layers of an ANN, the Embedding layer has no
# activation function: it is a pure table lookup.
embedding_layer = Embedding(VOCAB_SIZE, 8, input_length=MAX_LENGTH)
model.add(embedding_layer)

# Flatten turns the 2D (MAX_LENGTH x 8) output of the Embedding layer into a 1D
# vector. With 4 word slots and 8 values per word that is a vector of size 32.
model.add(Flatten())

# Single sigmoid unit: outputs the probability of label 1 for the review.
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

# summary() prints the table itself and returns None, so it must not be wrapped
# in print() -- doing so appends a stray "None" line to the cell output.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 4, 8)              400       
                                                                 
 flatten (Flatten)           (None, 32)                0         
                                                                 
 dense (Dense)               (None, 1)                 33        
                                                                 
Total params: 433
Trainable params: 433
Non-trainable params: 0
_________________________________________________________________
None


In [7]:
# Fit the model on the padded reviews for 100 epochs with progress logging off.
# The returned History object is kept in a variable: this stops the cell from
# echoing its repr (which embeds a memory address that changes on every run)
# and leaves the per-epoch metrics available through `history.history`.
history = model.fit(padded_reviews, labels, epochs=100, verbose=0)

<keras.callbacks.History at 0x7f552bf2ba90>

In [17]:
# Learned embedding table: one row per index, one column per embedding dimension.
embedding_matrix = embedding_layer.get_weights()[0]
print(embedding_matrix.shape)

# Look the word indices up with the same encoder that produced the training
# input instead of hardcoding them: the index assigned to a word comes from a
# hash and is only guaranteed to be consistent within the current session.
# one_hot returns a list with one index per word, hence the [0].
never_index = one_hot("Never", VOCAB_SIZE)[0]
service_index = one_hot("service", VOCAB_SIZE)[0]

print("\n\n")
print("Embedding vector for the word Never is:\n", embedding_matrix[never_index])
print("\n\n")
print("Embedding vector for the word service is:\n", embedding_matrix[service_index])


(50, 8)



Embedding vector for the word Never is:
 [-0.08492238  0.14234544  0.06741757  0.08559477  0.05124443 -0.10405801
 -0.1454268  -0.06898103]



Embedding vector for the word service is:
 [-0.06222677 -0.03977245  0.01772755 -0.01145729 -0.05897132  0.09337803
  0.01182112 -0.07200011]


In [8]:
# Score the model on the very same padded reviews it was trained on.
loss, accuracy = model.evaluate(
    x=padded_reviews,
    y=labels,
    verbose=0,
)
print(f'Accuracy: {accuracy}')

# A perfect score here says little about generalisation: the model is evaluated
# on its own training data, which consists of only ten very short reviews, so it
# can simply memorise them (overfitting).
# Some words already occur in more than one review ("Horrible", "food",
# "service", "Awesome"). The accuracy will drop once the same words appear in
# both positive and negative reviews often enough that the reviews can no longer
# be told apart. Try it out and see what happens.

Accuracy: 1.0


In [9]:
# `loss` is the first value returned by model.evaluate in the previous cell, i.e.
# the binary cross-entropy the model was compiled with, measured on the training data.
print(f'Log-loss: {loss}')


Log-loss: 0.46238845586776733
