In [12]:
import os
import math
import numpy as np
import tensorflow as tf
from tensorboard.plugins import projector

tf.compat.v1.disable_eager_execution()


batch_size=64
embedding_dimension = 5
negative_samples =8
LOG_DIR = "../data/logs/word2vec_intro"


digit_to_word_map = {1:"One",2:"Two", 3:"Three", 4:"Four", 5:"Five",
                     6:"Six",7:"Seven",8:"Eight",9:"Nine"}
sentences = []

# Create two kinds of sentences - sequences of odd and even digits
for i in range(10000):
    rand_odd_ints = np.random.choice(range(1,10,2),3)
    sentences.append(" ".join([digit_to_word_map[r] for r in rand_odd_ints]))
    rand_even_ints = np.random.choice(range(2,10,2),3)
    sentences.append(" ".join([digit_to_word_map[r] for r in rand_even_ints]))

In [13]:
sentences[0:10]

['Five Three Nine',
 'Two Two Six',
 'Five Nine Seven',
 'Six Two Four',
 'One Seven Nine',
 'Six Eight Two',
 'One Five Three',
 'Eight Six Two',
 'Nine Seven Nine',
 'Four Four Four']

In [14]:
# Map words to indices
word2index_map ={}
index=0
for sent in sentences:
    for word in sent.lower().split():
        if word not in word2index_map:
            word2index_map[word] = index
            index+=1
index2word_map = {index: word for word, index in word2index_map.items()}
vocabulary_size = len(index2word_map)

In [15]:
# Generate skip-gram pairs
skip_gram_pairs = []
for sent in sentences:
    tokenized_sent = sent.lower().split()
    for i in range(1, len(tokenized_sent)-1) :
        word_context_pair = [[word2index_map[tokenized_sent[i-1]],
                              word2index_map[tokenized_sent[i+1]]],
                              word2index_map[tokenized_sent[i]]]
        skip_gram_pairs.append([word_context_pair[1],
                                word_context_pair[0][0]])
        skip_gram_pairs.append([word_context_pair[1],
                                word_context_pair[0][1]])


def get_skipgram_batch(batch_size):
    instance_indices = list(range(len(skip_gram_pairs)))
    np.random.shuffle(instance_indices)
    batch = instance_indices[:batch_size]
    x = [skip_gram_pairs[i][0] for i in batch]
    y = [[skip_gram_pairs[i][1]] for i in batch]
    return x,y

In [16]:
skip_gram_pairs[0:10]

[[1, 0],
 [1, 2],
 [3, 3],
 [3, 4],
 [2, 0],
 [2, 5],
 [3, 4],
 [3, 6],
 [5, 7],
 [5, 2]]

In [17]:
# Batch example
x_batch,y_batch = get_skipgram_batch(8)
x_batch
y_batch
[index2word_map[word] for word in x_batch]
[index2word_map[word[0]] for word in y_batch]

x_batch

[4, 7, 2, 1, 5, 6, 6, 3]

In [18]:
y_batch

[[4], [5], [0], [5], [2], [8], [8], [6]]

In [19]:
[index2word_map[word] for word in x_batch]

['six', 'one', 'nine', 'three', 'seven', 'four', 'four', 'two']

In [20]:
[index2word_map[word[0]] for word in y_batch]

['six', 'seven', 'five', 'seven', 'nine', 'eight', 'eight', 'four']

In [21]:
# Input data, labels
train_inputs = tf.compat.v1.placeholder(tf.int32, shape=[batch_size])
train_labels = tf.compat.v1.placeholder(tf.int32, shape=[batch_size, 1])

In [22]:
with tf.name_scope("embeddings"):
    embeddings = tf.Variable(
        tf.compat.v1.random_uniform([vocabulary_size, embedding_dimension],
                          -1.0, 1.0),name='embedding')
    # This is essentially a lookup table
    embed = tf.nn.embedding_lookup(embeddings, train_inputs)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [23]:
# Create variables for the NCE loss
nce_weights = tf.Variable(
        tf.compat.v1.truncated_normal([vocabulary_size, embedding_dimension],
                            stddev=1.0 / math.sqrt(embedding_dimension)))
nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

loss = tf.reduce_mean(
  tf.nn.nce_loss(weights = nce_weights, biases = nce_biases, inputs = embed,
       labels = train_labels, num_sampled = negative_samples, num_classes =
                vocabulary_size))

In [24]:
# Learning rate decay
global_step = tf.Variable(0, trainable=False)
learningRate = tf.compat.v1.train.exponential_decay(learning_rate=0.1,
                                          global_step= global_step,
                                          decay_steps=1000,
                                          decay_rate= 0.95,
                                          staircase=True)
train_step = tf.compat.v1.train.GradientDescentOptimizer(learningRate).minimize(loss)

In [25]:
# Merge all summary ops
merged = tf.compat.v1.summary.merge_all()

with tf.compat.v1.Session() as sess:
    train_writer = tf.compat.v1.summary.FileWriter(LOG_DIR,
                                         graph=tf.compat.v1.get_default_graph())
    saver = tf.compat.v1.train.Saver()

    with open(os.path.join(LOG_DIR,'metadata.tsv'), "w") as metadata:
        metadata.write('Name\tClass\n')
        for k,v in index2word_map.items():
            metadata.write('%s\t%d\n' % (v, k))

    config = projector.ProjectorConfig()
    embedding = config.embeddings.add()
    embedding.tensor_name = embeddings.name
    # Link embedding to its metadata file
    embedding.metadata_path = os.path.join(LOG_DIR,'metadata.tsv')
    projector.visualize_embeddings(train_writer, config)

    tf.compat.v1.global_variables_initializer().run()

    for step in range(1000):
        x_batch, y_batch = get_skipgram_batch(batch_size)
        summary,_ = sess.run([merged,train_step],
                             feed_dict={train_inputs:x_batch,
                                        train_labels:y_batch})
        train_writer.add_summary(summary, step)

        if step % 100 == 0:
            saver.save(sess, os.path.join(LOG_DIR, "w2v_model.ckpt"), step)
            loss_value = sess.run(loss,
                                  feed_dict={train_inputs:x_batch,
                                             train_labels:y_batch})
            print("Loss at %d: %.5f" % (step, loss_value))

    # Normalize embeddings before using
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    normalized_embeddings_matrix = sess.run(normalized_embeddings)

TypeError: Fetch argument None has invalid type <class 'NoneType'>

In [26]:
ref_word = normalized_embeddings_matrix[word2index_map["one"]]

cosine_dists = np.dot(normalized_embeddings_matrix,ref_word)
ff = np.argsort(cosine_dists)[::-1][1:10]
for f in ff:
    print(index2word_map[f])
    print(cosine_dists[f])

NameError: name 'normalized_embeddings_matrix' is not defined

In [27]:
import zipfile
import numpy as np
import tensorflow as tf

path_to_glove = "../data/"
PRE_TRAINED = True
GLOVE_SIZE = 300
batch_size = 128
embedding_dimension = 64
num_classes = 2
hidden_layer_size = 32
times_steps = 6

In [28]:
digit_to_word_map = {1: "One", 2: "Two", 3: "Three", 4: "Four", 5: "Five", 6: "Six", 7: "Seven", 8: "Eight", 9: "Nine", 0: "PAD_TOKEN"}
even_sentences = []
odd_sentences = []
seqlens = []
for i in range(10000):
    rand_seq_len = np.random.choice(range(3,7))
    seqlens.append(rand_seq_len)
    rand_odd_ints = np.random.choice(range(1,10,2),
                                     rand_seq_len)
    rand_even_ints = np.random.choice(range(2,10,2),
                                      rand_seq_len)
    if rand_seq_len<6:
        rand_odd_ints = np.append(rand_odd_ints,
                                  [0]*(6-rand_seq_len))
        rand_even_ints = np.append(rand_even_ints,
                                   [0]*(6-rand_seq_len))

    even_sentences.append(" ".join([digit_to_word_map[r] for
                               r in rand_odd_ints]))
    odd_sentences.append(" ".join([digit_to_word_map[r] for
                              r in rand_even_ints]))
data = even_sentences+odd_sentences
# Same seq lengths for even, odd sentences
seqlens*=2
labels = [1]*10000 + [0]*10000
for i in range(len(labels)):
    label = labels[i]
    one_hot_encoding = [0]*2
    one_hot_encoding[label] = 1
    labels[i] = one_hot_encoding

In [29]:
word2index_map ={}
index=0
for sent in data:
    for word in sent.split():
        if word not in word2index_map:
            word2index_map[word] = index
            index+=1

index2word_map = {index: word for word, index in word2index_map.items()}

vocabulary_size = len(index2word_map)

In [30]:
word2index_map

{'Seven': 0,
 'Nine': 1,
 'PAD_TOKEN': 2,
 'Three': 3,
 'One': 4,
 'Five': 5,
 'Six': 6,
 'Two': 7,
 'Eight': 8,
 'Four': 9}

In [31]:
def get_glove(path_to_glove,word2index_map):

    embedding_weights = {}
    count_all_words = 0
    with zipfile.ZipFile(path_to_glove) as z:
        with z.open("glove.840B.300d.txt") as f:
            for line in f:
                vals = line.split()
                word = str(vals[0].decode("utf-8"))
                if word in word2index_map:
                    print(word)
                    count_all_words+=1
                    coefs = np.asarray(vals[1:], dtype='float32')
                    coefs/=np.linalg.norm(coefs)
                    embedding_weights[word] = coefs
                if count_all_words==vocabulary_size -1:
                    break
    return embedding_weights
word2embedding_dict = get_glove(path_to_glove,word2index_map)

IsADirectoryError: [Errno 21] Is a directory: '../data/'

In [32]:
embedding_matrix = np.zeros((vocabulary_size ,GLOVE_SIZE))

for word,index in word2index_map.items():
    if not word == "PAD_TOKEN":
        word_embedding = word2embedding_dict[word]
        embedding_matrix[index,:] = word_embedding

NameError: name 'word2embedding_dict' is not defined

In [33]:
data_indices = list(range(len(data)))
np.random.shuffle(data_indices)
data = np.array(data)[data_indices]
labels = np.array(labels)[data_indices]
seqlens = np.array(seqlens)[data_indices]
train_x = data[:10000]
train_y = labels[:10000]
train_seqlens = seqlens[:10000]

test_x = data[10000:]
test_y = labels[10000:]
test_seqlens = seqlens[10000:]

def get_sentence_batch(batch_size,data_x,
                       data_y,data_seqlens):
    instance_indices = list(range(len(data_x)))
    np.random.shuffle(instance_indices)
    batch = instance_indices[:batch_size]
    x = [[word2index_map[word] for word in data_x[i].split()]
         for i in batch]
    y = [data_y[i] for i in batch]
    seqlens = [data_seqlens[i] for i in batch]
    return x,y,seqlens

In [34]:
_inputs = tf.compat.v1.placeholder(tf.int32, shape=[batch_size,times_steps])
embedding_placeholder = tf.compat.v1.placeholder(tf.float32, [vocabulary_size,
                                                    GLOVE_SIZE])

_labels = tf.compat.v1.placeholder(tf.float32, shape=[batch_size, num_classes])
_seqlens = tf.compat.v1.placeholder(tf.int32, shape=[batch_size])

In [35]:
if PRE_TRAINED:

        embeddings = tf.Variable(tf.constant(0.0, shape=[vocabulary_size,
                                 GLOVE_SIZE]),
                                 trainable=True)
        # If using pretrained embeddings, assign them to the embeddings variable
        embedding_init = embeddings.assign(embedding_placeholder)
        embed = tf.nn.embedding_lookup(embeddings, _inputs)

else:
        embeddings = tf.Variable(
            tf.compat.v1.random_uniform([vocabulary_size,
                               embedding_dimension],
                              -1.0, 1.0))
        embed = tf.nn.embedding_lookup(embeddings, _inputs)

In [36]:
with tf.name_scope("biGRU"):
    with tf.compat.v1.variable_scope('forward'):
        gru_fw_cell = tf.compat.v1.nn.rnn_cell.GRUCell(hidden_layer_size)
        gru_fw_cell = tf.compat.v1.nn.rnn_cell.DropoutWrapper(gru_fw_cell)

    with tf.compat.v1.variable_scope('backward'):
        gru_bw_cell = tf.compat.v1.nn.rnn_cell.GRUCell(hidden_layer_size)
        gru_bw_cell = tf.compat.v1.nn.rnn_cell.DropoutWrapper(gru_bw_cell)

    outputs, states = tf.compat.v1.nn.bidirectional_dynamic_rnn(cell_fw=gru_fw_cell,
                                                      cell_bw=gru_bw_cell,
                                                      inputs=embed,
                                                      sequence_length=
                                                      _seqlens,
                                                      dtype=tf.float32,
                                                      scope="BiGRU")
states = tf.concat(values=states, axis=1)

Instructions for updating:
This class is equivalent as tf.keras.layers.GRUCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.Bidirectional(keras.layers.RNN(cell))`, which is equivalent to this API
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Please use `layer.add_weight` method instead.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [37]:
weights = {
    'linear_layer': tf.Variable(tf.compat.v1.truncated_normal([2*hidden_layer_size,
                                                     num_classes],
                                                     mean=0,stddev=.01))
}
biases = {
    'linear_layer':tf.Variable(tf.compat.v1.truncated_normal([num_classes],
                                                   mean=0,stddev=.01))
}

# extract the final state and use in a linear layer
final_output = tf.matmul(states,
                         weights["linear_layer"]) + biases["linear_layer"]

softmax = tf.nn.softmax_cross_entropy_with_logits(logits=final_output,
                                                  labels=_labels)
cross_entropy = tf.reduce_mean(softmax)

train_step = tf.compat.v1.train.RMSPropOptimizer(0.001, 0.9).minimize(cross_entropy)
correct_prediction = tf.equal(tf.argmax(_labels,1),
                              tf.argmax(final_output,1))
accuracy = (tf.reduce_mean(tf.cast(correct_prediction,
                                   tf.float32)))*100

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [38]:
with tf.compat.v1.Session() as sess:
    sess.run(tf.compat.v1.global_variables_initializer())
    sess.run(embedding_init, feed_dict=
             {embedding_placeholder: embedding_matrix})
    for step in range(1000):
        x_batch, y_batch,seqlen_batch = get_sentence_batch(batch_size,
                                                           train_x,train_y,
                                                           train_seqlens)
        sess.run(train_step,feed_dict={_inputs:x_batch, _labels:y_batch,
                                       _seqlens:seqlen_batch})

        if step % 100 == 0:
            acc = sess.run(accuracy,feed_dict={_inputs:x_batch,
                                               _labels:y_batch,
                                               _seqlens:seqlen_batch})
            print("Accuracy at %d: %.5f" % (step, acc))

    for test_batch in range(5):
        x_test, y_test,seqlen_test = get_sentence_batch(batch_size,
                                                        test_x,test_y,
                                                        test_seqlens)
        batch_pred,batch_acc = sess.run([tf.argmax(final_output,1),
                                         accuracy],
                                        feed_dict={_inputs:x_test,
                                                   _labels:y_test,
                                                   _seqlens:seqlen_test})
        print("Test batch accuracy %d: %.5f" % (test_batch, batch_acc))
        print("Test batch accuracy %d: %.5f" % (test_batch, batch_acc))

Accuracy at 0: 45.31250
Accuracy at 100: 47.65625
Accuracy at 200: 100.00000
Accuracy at 300: 100.00000
Accuracy at 400: 100.00000
Accuracy at 500: 100.00000
Accuracy at 600: 100.00000
Accuracy at 700: 100.00000
Accuracy at 800: 100.00000
Accuracy at 900: 100.00000
Test batch accuracy 0: 100.00000
Test batch accuracy 0: 100.00000
Test batch accuracy 1: 100.00000
Test batch accuracy 1: 100.00000
Test batch accuracy 2: 100.00000
Test batch accuracy 2: 100.00000
Test batch accuracy 3: 100.00000
Test batch accuracy 3: 100.00000
Test batch accuracy 4: 100.00000
Test batch accuracy 4: 100.00000
