In [1]:
# Example of learning an embedding

In [2]:
docs = ['Well done!', 
        'Good work', 
        'Great effort',
       'nice work',
       'Excellent!',
       'Weak',
       'Poor effort!',
       'not good',
       'poor work',
       'Could have done better.']

In [3]:
labels = [1,1,1,1,1,0,0,0,0,0]

In [5]:
from keras.preprocessing.text import one_hot
vocab_size = 50
encoded_docs = [one_hot(d,vocab_size) for d in docs]
print(encoded_docs)

Using TensorFlow backend.


[[37, 25], [2, 6], [33, 15], [25, 6], [24], [4], [14, 15], [23, 2], [14, 6], [37, 38, 25, 14]]


In [6]:
from keras.preprocessing.sequence import pad_sequences

In [7]:
max_length=4
padded_docs = pad_sequences(encoded_docs,maxlen=max_length,padding='post')
print(padded_docs)

[[37 25  0  0]
 [ 2  6  0  0]
 [33 15  0  0]
 [25  6  0  0]
 [24  0  0  0]
 [ 4  0  0  0]
 [14 15  0  0]
 [23  2  0  0]
 [14  6  0  0]
 [37 38 25 14]]


In [8]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding

In [9]:
model = Sequential()
model.add(Embedding(vocab_size,8,input_length=max_length))
model.add(Flatten())
model.add(Dense(1,activation="sigmoid"))
model.compile(optimizer='adam',loss = "binary_crossentropy",metrics=["accuracy"])
model.summary()

W0503 11:52:24.381505 140236123137856 deprecation.py:506] From /home/arvind/.local/lib/python3.6/site-packages/tensorflow_core/python/ops/resource_variable_ops.py:1633: calling BaseResourceVariable.__init__ (from tensorflow.python.ops.resource_variable_ops) with constraint is deprecated and will be removed in a future version.
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
W0503 11:52:24.484562 140236123137856 deprecation.py:323] From /home/arvind/.local/lib/python3.6/site-packages/tensorflow_core/python/ops/nn_impl.py:183: where (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 4, 8)              400       
_________________________________________________________________
flatten_1 (Flatten)          (None, 32)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 433
Trainable params: 433
Non-trainable params: 0
_________________________________________________________________


In [10]:
model.fit(padded_docs,labels,epochs=50,verbose=0)

W0503 12:02:02.119668 140236123137856 deprecation.py:323] From /home/arvind/.local/lib/python3.6/site-packages/keras/optimizers.py:550: BaseResourceVariable.constraint (from tensorflow.python.ops.resource_variable_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Apply a constraint manually following the optimizer update step.
W0503 12:02:02.758693 140236123137856 module_wrapper.py:136] From /home/arvind/.local/lib/python3.6/site-packages/tensorflow_core/python/util/module_wrapper.py:163: The name tf.global_variables is deprecated. Please use tf.compat.v1.global_variables instead.



<keras.callbacks.callbacks.History at 0x7f8af1303320>

In [11]:
loss,accuracy = model.evaluate(padded_docs,labels,verbose=0)

In [12]:
print("Accuracy: %f"%(accuracy*100))

Accuracy: 89.999998


In [13]:
# Example of using pre-trained Glove embedding 

In [14]:
docs = ['Well done!',
       'Good work',
        'Great effort',
       'nice work',
       'Excellent',
       'Weak',
       'Poor effort',
       'not good',
       'poor work',
       'Could have done better.']

In [15]:
labels = [1,1,1,1,1,0,0,0,0,0]

In [17]:
from keras.preprocessing.text import Tokenizer
t = Tokenizer()
t.fit_on_texts(docs)
vocab_size = len(t.word_index) + 1

In [19]:
len(t.word_index)

14

In [20]:
encoded_docs = t.texts_to_sequences(docs)

In [21]:
print(encoded_docs)

[[6, 2], [3, 1], [7, 4], [8, 1], [9], [10], [5, 4], [11, 3], [5, 1], [12, 13, 2, 14]]


In [22]:
max_length = 4
padded_docs = pad_sequences(encoded_docs, maxlen=max_length,padding='post')

In [23]:
print(padded_docs)

[[ 6  2  0  0]
 [ 3  1  0  0]
 [ 7  4  0  0]
 [ 8  1  0  0]
 [ 9  0  0  0]
 [10  0  0  0]
 [ 5  4  0  0]
 [11  3  0  0]
 [ 5  1  0  0]
 [12 13  2 14]]


In [25]:
from numpy import asarray
embeddings_index = dict()
f = open('glove.6B.100d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = asarray(values[1:],dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.'%len(embeddings_index))

Loaded 400000 word vectors.


In [27]:
from numpy import zeros
embedding_matrix = zeros((vocab_size,100))

In [28]:
for word, i in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [29]:
from keras.layers import Embedding

In [30]:
model = Sequential()
e = Embedding(vocab_size,100,weights=[embedding_matrix],input_length=4,trainable=False)
model.add(e)
model.add(Flatten())
model.add(Dense(1,activation="sigmoid"))
model.compile(optimizer = "adam",loss = "binary_crossentropy",metrics = ["accuracy"])
model.summary()
model.fit(padded_docs, labels, epochs = 50, verbose=0)
loss,accuracy = model.evaluate(padded_docs,labels,verbose=0)
print("Accuracy: %f"%(accuracy*100))

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 4, 100)            1500      
_________________________________________________________________
flatten_2 (Flatten)          (None, 400)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 401       
Total params: 1,901
Trainable params: 401
Non-trainable params: 1,500
_________________________________________________________________
Accuracy: 100.000000
