In [28]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import sys
import threading
import time

import tensorflow as tf
import numpy as np

from embedding import gen_word2vec as word2vec

In [2]:
sess = tf.InteractiveSession()

In [4]:
# practice tf.reduce_sum, tf.matmul

tmp = [[1,2,1],[2,3,1]]

print tmp
print tf.reduce_sum(tmp).eval()
print tf.reduce_sum(tmp, 0).eval()
print tf.reduce_sum(tmp, 1).eval()
print "="*30

a = np.ones([2,4])
b = np.ones([2,4])
print tf.matmul(a, b, transpose_b=True).eval()

[[1, 2, 1], [2, 3, 1]]
10
[3 5 2]
[4 6]
[[ 4.  4.]
 [ 4.  4.]]


In [None]:
embed_dim = 200
num_neg_samples = 100

epochs_to_train = 15
learning_rate = 0.025
batch_size = 16
window_size = 5
min_count = 5
subsample = 1e-3

# words : list of words including 'UNK'
# counts : list of counts of words
# words_per_epoch
# current_epoch : start from 0
(words, counts, words_per_epoch, current_epoch, total_words_processed, x, y) = \
    word2vec.skipgram(
        filename = 'text8',
        batch_size = batch_size,
        window_size = window_size,
        min_count = min_count,
        subsample = subsample
    )

(vocab_words, vocab_counts, words_per_epoch) = sess.run([words, counts, words_per_epoch])

In [36]:
vocab_size = len(vocab_words)

id2word = vocab_words
word2id = {}

for i, w in enumerate(id2word):
    word2id[w] = i

print("Vocab size: ", vocab_size - 1, " + UNK")
print("Words per epoch: ", words_per_epoch)

global_step = tf.Variable(0, name="global_step")

Vocab size:  71290  + UNK
Words per epoch:  17005207


In [39]:
print(x.eval())
print(y.eval())

[10770 10770 10770 10770 10770 10770 10770 10770   215   215   215   215
   215   215   215   215]
[    5     6 10770   215     7   105   455    20    59  2733   215     7
  1325   455    20    59]


In [40]:
# w_in : vocab_size x embed_dim
w_embed = tf.Variable(
    tf.random_uniform(
        [vocab_size, embed_dim],
        -0.5 / embed_dim,
        +0.5 / embed_dim
    ),
    name = "w_in"
)

# w_out : vocab_size x embed_dim
w = tf.Variable(tf.zeros([vocab_size, embed_dim]), name="w")
b = tf.Variable(tf.zeros([vocab_size]), name="b")

# Cast [batch_size] -> [batch_size x 1]
labels_matrix = tf.reshape(tf.cast(y, dtype=tf.int64), [batch_size, 1])

# Samples a set of classes using the provided (fixed) base distribution.
neg_sampled_ids, _, _ = (tf.nn.fixed_unigram_candidate_sampler(
    true_classes = labels_matrix,
    num_true = 1,
    num_sampled = num_neg_samples,
    unique = True,
    range_max = vocab_size,
    distortion = 0.75,
    unigrams = vocab_counts.tolist()))

# batch_size x embed_dim
x_emb = tf.nn.embedding_lookup(w_embed, x)

# batch_size x embed_dim
true_w = tf.nn.embedding_lookup(w, y)

# batch_size x 1
true_b = tf.nn.embedding_lookup(b, y)

# element-wise multiplication
# tf.reduced_sum([batch_size x embed_dim] * [batch_size x embed_dim]) + [batch_size x 1]
# tf.reduce_sum(x, 1) : row끼리 더함
true_y_ = tf.reduce_sum(tf.mul(x_emb, true_w), 1) + true_b


########################
# For negative samples
########################

# num_neg_samples x embed_dim
neg_w = tf.nn.embedding_lookup(w, neg_sampled_ids)

# num_neg_samples x 1
neg_b = tf.nn.embedding_lookup(b, neg_sampled_ids)
neg_b = tf.reshape(neg_b, [num_neg_samples])

# matrix multiplication
# [batch_size x embed_dim] x [embed_dim x num_neg_samples] + [num_neg_samples] <- 각 row 마다 neg_b 더함
# neg_y_ = tf.reduce_sum(tf.mul(x_emb, ))
neg_y_ = tf.matmul(x_emb, neg_w, transpose_b=True) + neg_b

# neg_y_ 계산이 이상해 보이지만 true_y_ 구할 때 처럼 element-wise multiplication해서 neg_b 더하는것

#####################################
# Negative Cross Entropy (NCE) Loss
#####################################

# Measures the probability error in discrete classification tasks in which each
# class is independent and not mutually exclusive. For instance, one could
# perform multilabel classification where a picture can contain both an elephant
# and a dog at the same time.
true_loss = tf.nn.sigmoid_cross_entropy_with_logits(true_y_, tf.ones_like(true_y_))
neg_loss = tf.nn.sigmoid_cross_entropy_with_logits(neg_y_, tf.zeros_like(neg_y_))

loss = (tf.reduce_sum(true_loss) + tf.reduce_sum(neg_loss)) / batch_size


#####################################
# Optimization
#####################################

global_step = tf.Variable(0, name="global_step")

words_to_train = float(words_per_epoch * epochs_to_train)

lr = learning_rate * tf.maximum(
    0.001,
    1.0 - tf.cast(total_words_processed, tf.float32) / words_to_train
)

optimizer = tf.train.GradientDescentOptimizer(lr)
train = optimizer.minimize(loss,
                           global_step = global_step,
                           gate_gradients=optimizer.GATE_NONE)

In [None]:
concurrent_steps = 12
statistics_interval = 5
checkpoint_interval = 600

init = tf.initialize_all_variables()
sess.run(init)
saver = tf.train.Saver()

with open(os.path.join("./", "vocab.txt"), "w") as f:
    for i in xrange(vocab_size):
        f.write(vocab_words[i] + " " + str(vocab_counts[i]) + "\n")

def train_thread_body():
    global sess, train, current_epoch
    initial_epoch = sess.run([current_epoch])
    
    while True:
        _, epoch = sess.run([train, current_epoch])
        if epoch != initial_epoch:
            break

###############
# Train
###############

for _ in xrange(epochs_to_train):
    initial_epoch, initial_words = sess.run([current_epoch, words])
    
    summary_op = tf.merge_all_summaries()
    summary_writer = tf.train.SummaryWriter("./",
                                            graph_def=sess.graph_def)
    
    workers = []
    for _ in xrange(concurrent_steps):
        t = threading.Thread(target=train_thread_body())
        t.start()
        workers.append(t)
        
    last_words, last_tme, last_summary_time = initial_words, time.time(), 0
    last_checkpoint_time = 0
    
    while True:
        # Print statistics every n seconds.
        time.sleep(statistics_interval)
        (epoch, step, loss, words, lr) = sess.run(
              [current_epoch, global_step, loss, words, lr])
        
        now = time.time()
        last_words, last_time, rate = words,now, (words - last_words) / (now - last_time)
        print("Epoch %4d Step %8d: lr = %5.3f loss = %6.2f words/sec = %8.0f\r" %
            (epoch, step, lr, loss, rate), end="")
        sys.stdout.flush()
        
        if now - last_checkpoint_time > checkpoint_interval:
            saver.save(sess, "./" + "model", global_step = step.astype(int))
            last_checkpoint_time = now
            
        if epoch != initial_epoch:
            break
            
    for t in workers:
        t.join()

# global_step : to distinguish step from other saved models
saver.save(sess, os.path.join("./", "model.ckpt"), global_step=model.global_step)

In [None]:
########################
# Graph for evaluation
########################

analogy_a = tf.placeholder(dtype=tf.int32) # [N]
analogy_b = tf.placeholder(dtype=tf.int32) # [N]
analogy_c = tf.placeholder(dtype=tf.int32) # [N]

normalized_w_embed = tf.nn.l2_normalize(w_embed, 1)

# [N x embed_dim]
a_w_embed = tf.gather(normalized_w_embed, analogy_a)
b_w_embed = tf.gather(normalized_w_embed, analogy_b)
c_w_embed = tf.gather(normalized_w_embed, analogy_c)

target = c_w_embed + (b_w_embed - a_w_embed)

# Compute cosine distance between each pair of target and vocab.
# [N x embed_dim] * [embed_dim x vocab_size]  = [N x vocab_size]
dist = tf.matmul(target, normalized_embed, transpose_b = True)

_, pred_idx = tf.nn.top_k(dist, 4)

nearby_word = tf.placeholder(dtype=tf.int32)

# [N x embed_dim]
nearby_w_embed = tf.gather(normalized_w_embed, nearby_word)

nearby_dist = tf.matmul(nearby_emb, nemb, transpose_b=True)

In [None]:
tf.nn.l2_normalize?

In [None]:
tf.gather?