In [14]:
import numpy as np
import tensorflow as tf
import tensorflow.keras as k
import os

from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers as tf_layer

from tensorboard.plugins import projector

In [2]:
# Toy sentiment corpus: each review is paired with its label
# (1 = positive, 0 = negative) so text and label cannot drift apart.
labelled_reviews = [
    ("nice food", 1),
    ("amazing restaurant", 1),
    ("too good", 1),
    ("horrible sevice", 0),
    ("highly disgusting", 0),
    ("never recommending this to anyone", 0),
]

# Split the pairs back into the parallel containers used by the rest of the notebook.
reviews = [text for text, _ in labelled_reviews]
sentiment = np.array([label for _, label in labelled_reviews])

In [3]:
# Hyper-parameters for the text pipeline and the embedding layer:
#   vocab_size  - size of the integer index space words are hashed into
#   pad_len     - fixed length every encoded review is padded / truncated to
#   vector_size - dimensionality of each learned word vector
vocab_size = 500
pad_len = 5
vector_size = 5

# Despite its name, `one_hot` turns each review into a list of integer word
# indices (one per word), not into one-hot vectors.
encoded_reviews = []
for review in reviews:
    encoded_reviews.append(one_hot(review, vocab_size))

# Pad at the end of each review so every row has exactly `pad_len` entries.
padded_reviews = pad_sequences(encoded_reviews, padding='post', maxlen=pad_len)

In [4]:
# Inspect the padded index matrix: one row per review, `pad_len` columns,
# with trailing zeros where a review was shorter than `pad_len`.
padded_reviews

array([[184, 263,   0,   0,   0],
       [407, 327,   0,   0,   0],
       [251, 471,   0,   0,   0],
       [403,  34,   0,   0,   0],
       [212, 491,   0,   0,   0],
       [ 54, 152, 184, 174,  65]], dtype=int32)

In [5]:
# Sentiment classifier: embedding lookup -> flatten -> single sigmoid unit.
model = Sequential([
    # Learns one `vector_size`-dimensional vector per word index.
    tf_layer.Embedding(
        input_dim=vocab_size,
        output_dim=vector_size,
        input_length=pad_len,
        name="embedding",
    ),
    # Concatenate the per-word vectors of a review into one flat feature vector.
    tf_layer.Flatten(),
    # Probability that the review is positive.
    tf_layer.Dense(units=1, activation='sigmoid'),
])


In [6]:
# Features are the padded word-index rows; targets are the 0/1 sentiment labels.
x, y = padded_reviews, sentiment

In [7]:
# Binary classification set-up: cross-entropy loss, Adam optimiser, accuracy metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 5, 5)              2500      
_________________________________________________________________
flatten (Flatten)            (None, 25)                0         
_________________________________________________________________
dense (Dense)                (None, 1)                 26        
Total params: 2,526
Trainable params: 2,526
Non-trainable params: 0
_________________________________________________________________


In [8]:
# Train on the full toy corpus for 50 epochs, logging progress per epoch.
model.fit(x=x, y=y, epochs=50, verbose=1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7f1de5845b10>

In [9]:
# Score the model on the same data it was trained on (there is no held-out set
# in this notebook) and display the resulting accuracy.
evaluation = model.evaluate(x, y)
loss, accuracy = evaluation
accuracy



1.0

In [10]:
# The embedding layer's single weight matrix holds the learned word vectors:
# row i is the vector for word index i. These weights can be saved and loaded
# back into an `Embedding()` layer later.
embedding_layer = model.get_layer('embedding')
weights = embedding_layer.get_weights()[0]

# Number of rows, i.e. one vector per index in the vocabulary.
weights.shape[0]

500

In [11]:
# Show the learned vectors for 'nice' and 'amazing'.
#
# The row indices are read from `encoded_reviews` rather than hard-coded:
# `one_hot` hashes words with Python's built-in `hash`, which is not stable
# across interpreter runs, so literal indices copied from one session point
# at different words after a kernel restart.
nice_index = encoded_reviews[0][0]     # first word of "nice food"
amazing_index = encoded_reviews[1][0]  # first word of "amazing restaurant"

print(weights[nice_index])
print("\n\n")
print(weights[amazing_index])

# NOTE(review): original author's remark, not verified in this notebook:
# cosine similarity increases with vocabulary size

[ 0.036169   -0.02938987  0.02602111  0.00319386 -0.03161726]



[-0.01313028  0.02663325  0.01602728  0.04039905  0.01098454]


## *Saving Embeddings*

In [31]:
# Set up a logs directory, so TensorBoard knows where to look for files.
log_dir = './Embeddings/'
os.makedirs(log_dir, exist_ok=True)

# Build a lookup from embedding row index -> word(s) hashed onto that index.
# Row i of the embedding matrix belongs to word index i, so labels must be
# written in index order, not in the order the words appear in the reviews.
# Several words can share an index (hash collisions), hence a list per index.
index_to_words = {}
for review in reviews:
    for word in review.split():
        print(word)  # echo the vocabulary as a quick visual check
        for index in one_hot(word, vocab_size):
            labels = index_to_words.setdefault(index, [])
            if word not in labels:
                labels.append(word)

# Save labels line by line: exactly one line per saved vector. Index 0 is
# skipped because its vector is dropped below, so line n of the file labels
# word index n + 1. Indices no review word maps to are labelled "unknown".
with open(os.path.join(log_dir, 'metadata.tsv'), "w") as f:
    for index in range(1, vocab_size):
        if index in index_to_words:
            f.write("{}\n".format("|".join(index_to_words[index])))
        else:
            f.write("unknown #{}\n".format(index))

# Save the weights we want to analyse as a variable. Row 0 is removed: it is
# the value `pad_sequences` pads with, not a word, and has no metadata line.
# A dedicated name is used so the full `weights` matrix from the earlier cell
# is not overwritten by this shifted copy.
embedding_var = tf.Variable(model.get_layer('embedding').get_weights()[0][1:])

# Create a checkpoint from the embedding; the keyword used here ("embedding")
# becomes the tensor's name inside the checkpoint.
checkpoint = tf.train.Checkpoint(embedding=embedding_var)
checkpoint.save(os.path.join(log_dir, "embedding.ckpt"))

# Set up the projector config that ties the saved tensor to its labels.
config = projector.ProjectorConfig()
embedding = config.embeddings.add()

# The name of the tensor will be suffixed by `/.ATTRIBUTES/VARIABLE_VALUE`.
embedding.tensor_name = "embedding/.ATTRIBUTES/VARIABLE_VALUE"
# Path is relative to `log_dir`.
embedding.metadata_path = 'metadata.tsv'
projector.visualize_embeddings(log_dir, config)

nice
food
amazing
restaurant
too
good
horrible
sevice
highly
disgusting
never
recommending
this
to
anyone


In [25]:
# Register the TensorBoard notebook extension so the %tensorboard magic is available.
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [33]:
# Reload the TensorBoard extension (use this when it has already been loaded in this kernel).
%reload_ext tensorboard

In [37]:
%tensorboard --logdir /Embeddings/

Reusing TensorBoard on port 6006 (pid 311), started 17:12:25 ago. (Use '!kill 311' to kill it.)

In [38]:
# Stop the running TensorBoard server. 311 is the pid reported by the
# "Reusing TensorBoard on port 6006 (pid 311)" message from the previous cell;
# it is specific to that session, so replace it with the pid shown in yours.
!kill 311