In [1]:
import tensorflow as tf

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Show which TensorFlow release this notebook runs against (the recorded run
# printed 1.5.0); the cells below use the TF 1.x graph/session API.
print(tf.__version__)

1.5.0


In [3]:
import utils
import word2vec_utils

In [4]:
import numpy as np

In [5]:
# --- Corpus download -------------------------------------------------------
DOWNLOAD_URL = 'http://mattmahoney.net/dc/text8.zip'
EXPECTED_BYTES = 31344016  # presumably the expected size of the archive - confirm in word2vec_utils

# --- Batching --------------------------------------------------------------
BATCH_SIZE = 128    # leading dimension of every batch drawn from the dataset
VOCAB_SIZE = 50000  # number of rows of the embedding matrix and NCE weights
SKIP_WINDOW = 1     # forwarded to word2vec_utils.batch_gen

# --- Model -----------------------------------------------------------------
EMBED_SIZE = 128    # number of columns of the embedding matrix

# --- Visualization ---------------------------------------------------------
VISUAL_FLD = 'visualization'
NUM_VISUALIZE = 3000  # number of tokens to visualize

### Note: the dataset is built from a generator

In [6]:
def gen():
    """Yield every batch produced by ``word2vec_utils.batch_gen``.

    Zero-argument wrapper that binds the notebook's configuration constants,
    so the function itself can be handed to
    ``tf.data.Dataset.from_generator``.
    """
    batches = word2vec_utils.batch_gen(
        DOWNLOAD_URL,
        EXPECTED_BYTES,
        VOCAB_SIZE,
        BATCH_SIZE,
        SKIP_WINDOW,
        VISUAL_FLD,
    )
    yield from batches

In [7]:
# Wrap the Python generator as a tf.data pipeline. Every element is declared
# as a pair of int32 tensors with static shapes [BATCH_SIZE] and
# [BATCH_SIZE, 1].
dataset = tf.data.Dataset.from_generator(
    gen,
    output_types=(tf.int32, tf.int32),
    output_shapes=(
        tf.TensorShape([BATCH_SIZE]),
        tf.TensorShape([BATCH_SIZE, 1]),
    ),
)

In [8]:
# Step 1: get input, output from the dataset
with tf.name_scope('data'):
    # Initializable iterator: its `initializer` op must be run in a session
    # before the tensors returned by `get_next()` can be evaluated.
    iterator = dataset.make_initializable_iterator()
    # The two components follow the (types, shapes) pairs declared for the
    # dataset: int32 [BATCH_SIZE] and int32 [BATCH_SIZE, 1].
    center_words, target_words = iterator.get_next()

In [9]:
# Steps 2 + 3: define the embedding weights and the embedding lookup.
# In word2vec, it is the learned weights themselves that we care about.
with tf.name_scope('embed'):
    embed_matrix = tf.get_variable(
        'embed_matrix',
        shape=[VOCAB_SIZE, EMBED_SIZE],
        initializer=tf.random_uniform_initializer(),
    )
    # Gather the rows of the matrix indexed by the batch's center words
    embed = tf.nn.embedding_lookup(embed_matrix, center_words, name='embedding')

In [10]:
# Step 4: construct variables for NCE loss and define loss function
NUM_SAMPLED = 64  # number of negative examples to sample
with tf.name_scope('loss'):
    nce_weight = tf.get_variable(
        'nce_weight',
        shape=[VOCAB_SIZE, EMBED_SIZE],
        initializer=tf.truncated_normal_initializer(stddev=1.0 / (EMBED_SIZE ** 0.5)),
    )
    nce_bias = tf.get_variable('nce_bias', initializer=tf.zeros([VOCAB_SIZE]))

    # NCE loss of the batch, reduced to a single mean value named 'loss'
    loss = tf.reduce_mean(
        tf.nn.nce_loss(
            weights=nce_weight,
            biases=nce_bias,
            labels=target_words,
            inputs=embed,
            num_sampled=NUM_SAMPLED,
            num_classes=VOCAB_SIZE,
        ),
        name='loss',
    )

In [13]:
# Step 5: define optimizer
LEARNING_RATE = 1.0
with tf.name_scope('optimizer'):
    # Note: despite its name, `optimizer` holds the value returned by
    # `.minimize(loss)` -- the op that the training loop runs each step --
    # not the GradientDescentOptimizer object itself.
    optimizer = tf.train.GradientDescentOptimizer(LEARNING_RATE).minimize(loss)

In [14]:
# Prepare the 'checkpoints' directory that the training cell saves into.
# NOTE(review): utils.safe_mkdir is defined outside this notebook; presumably
# it creates the directory only when it is missing -- confirm.
utils.safe_mkdir('checkpoints')

In [18]:
NUM_TRAIN_STEPS = 100000  # number of iterations of the training loop
SKIP_STEP = 5000          # report the average loss and save a checkpoint every SKIP_STEP steps

In [24]:
import os
# Set the environment variable that controls how verbose TensorFlow's native
# (C++) logging is.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

In [25]:
# Train the skip-gram model. Weights are restored from the latest checkpoint
# under 'checkpoints/' when one exists; the graph is written to
# 'graphs/word2vec_simple' for TensorBoard.
saver = tf.train.Saver()  # defaults to saving all variables - in this case embed_matrix, nce_weight, nce_bias
# NOTE(review): initial_step is assigned but never read in this cell, so the
# step counter restarts at 0 even when weights come from a checkpoint.
initial_step = 0
with tf.Session() as sess:
    sess.run(iterator.initializer)
    sess.run(tf.global_variables_initializer())
    ckpt = tf.train.get_checkpoint_state(os.path.dirname('checkpoints/checkpoint'))

    # if that checkpoint exists, restore from checkpoint
    if ckpt and ckpt.model_checkpoint_path:
        saver.restore(sess, ckpt.model_checkpoint_path)

    total_loss = 0.0
    writer = tf.summary.FileWriter('graphs/word2vec_simple', sess.graph)
    try:
        for index in range(NUM_TRAIN_STEPS):
            try:
                loss_batch, _ = sess.run([loss, optimizer])
            except tf.errors.OutOfRangeError:
                # The dataset is exhausted: rewind it and redo this step, so
                # that every step trains on one batch. Skipping the step
                # instead would drop the report/checkpoint whenever it lands
                # on a SKIP_STEP boundary and skew the printed average.
                sess.run(iterator.initializer)
                loss_batch, _ = sess.run([loss, optimizer])

            total_loss += loss_batch
            if (index + 1) % SKIP_STEP == 0:
                print('Average loss at step {}: {:5.1f}'.format(index, total_loss / SKIP_STEP))
                total_loss = 0.0
                saver.save(sess, 'checkpoints/skip-gram', index)
    finally:
        # Close the event file even when training raises or is interrupted
        # from the notebook.
        writer.close()

data/text8.zip already exists
<class 'list'>
50000
('UNK', -1)
('the', 1061396)
('of', 593677)
Average loss at step 4999:  65.4
Average loss at step 9999:  18.2
Average loss at step 14999:   9.5
Average loss at step 19999:   6.7
Average loss at step 24999:   5.7
Average loss at step 29999:   5.2
Average loss at step 34999:   5.0
Average loss at step 39999:   4.8
Average loss at step 44999:   4.8
Average loss at step 49999:   4.8
Average loss at step 54999:   4.8
Average loss at step 59999:   4.7
Average loss at step 64999:   4.6
Average loss at step 69999:   4.7
Average loss at step 74999:   4.6
Average loss at step 79999:   4.7
Average loss at step 84999:   4.6
Average loss at step 89999:   4.7
Average loss at step 94999:   4.6
Average loss at step 99999:   4.6


In [20]:
# NOTE(review): most_common_words is defined in the external word2vec_utils
# module; presumably it prepares the NUM_VISUALIZE most frequent words under
# the VISUAL_FLD folder for visualization -- confirm against the helper.
word2vec_utils.most_common_words(VISUAL_FLD, NUM_VISUALIZE)

In [34]:
# Read the embedding matrix back as NumPy arrays, restoring it from the latest
# checkpoint under 'checkpoints/' when one exists.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    ckpt = tf.train.get_checkpoint_state(os.path.dirname('checkpoints/checkpoint'))
    
    # if that checkpoint exists, restore from checkpoint
    # NOTE(review): when no checkpoint is found, the freshly initialized values
    # are used below without any message.
    if ckpt and ckpt.model_checkpoint_path:
        print("yes... ckpt")
        saver.restore(sess, ckpt.model_checkpoint_path)
    
    final_embed_matrix = sess.run(embed_matrix)
    # Copy the first NUM_VISUALIZE rows into a new variable.
    # NOTE(review): this creates another variable in the default graph every
    # time the cell is executed.
    embedding_var = tf.Variable(final_embed_matrix[:NUM_VISUALIZE], name='embedding')
    sess.run(embedding_var.initializer)
    ff_embed=sess.run(embedding_var)

yes... ckpt
INFO:tensorflow:Restoring parameters from checkpoints/skip-gram-99999


In [36]:
# Sanity check: the truncated embedding keeps NUM_VISUALIZE rows
# (recorded output: (3000, 128)).
ff_embed.shape

(3000, 128)

In [37]:
# Sanity check: the full matrix is [VOCAB_SIZE, EMBED_SIZE]
# (recorded output: (50000, 128)).
final_embed_matrix.shape

(50000, 128)