### word2vec

* 编写 data 读取
* 编写 w2v 的图构建代码
* 训练

通过 w2v 代码学习 ... 的使用方法

In [None]:
# tf.name_scope: groups the enclosed nodes together in TensorBoard.
with tf.name_scope('data'):
    iterator = dataset.make_initializable_iterator()

# tf.variable_scope
# Within the same scope, a variable can be reused.
# For example, with the code below the duplicate-definition error is not raised:
# ValueError: Variable h1_weights already exists, disallowed. Did you mean to set reuse=True in VarScope?
# NOTE: illustrative fragment -- the `return` implies this body belongs to a
# layer-building function that is not shown here.
with tf.variable_scope(scope, reuse=tf.AUTO_REUSE) as scope:
        w = tf.get_variable("weights", [x.shape[1], output_dim], initializer=tf.random_normal_initializer())
        b = tf.get_variable("biases", [output_dim], initializer=tf.constant_initializer(0.0))
        return tf.matmul(x, w) + b
# Alternatively, use the approach below: build once, then switch the scope
# to reuse mode before building the second copy.
with tf.variable_scope('two_layers') as scope:
    logits1 = two_hidden_layers(x1)
    scope.reuse_variables()
    logits2 = two_hidden_layers(x2)
    

In [None]:
# tf.train.Saver checkpoints the variables' values during training; restoring
# with saver.restore() still requires the graph to be built in code first.
saver = tf.train.Saver()
# global_step=step tags the checkpoint with the given training step number
saver.save(sess, 'ckpt_path/model_name', global_step=step)

# Save only selected variables (three alternative ways to build the Saver)
v1 = tf.Variable(..., name='v1') 
v2 = tf.Variable(..., name='v2') 
saver = tf.train.Saver({'v1': v1, 'v2': v2})
saver = tf.train.Saver([v1, v2])
saver = tf.train.Saver({v.op.name: v for v in [v1, v2]}) # similar to a dict

# Restore from a checkpoint; the graph must likewise be built beforehand
ckpt = tf.train.get_checkpoint_state(os.path.dirname('checkpoints/checkpoint'))
# check if there is a checkpoint and valid checkpoint path
if ckpt and ckpt.model_checkpoint_path:
     saver.restore(sess, ckpt.model_checkpoint_path)

In [None]:
# summary -> lets you inspect how training evolves in TensorBoard
# NOTE: illustrative fragment -- `self.loss` / `self.accuracy` imply this
# lives inside a model class that is not shown here.
with tf.name_scope("summaries"):
    tf.summary.scalar("loss", self.loss)
    tf.summary.scalar("accuracy", self.accuracy)            
    # NOTE(review): the summary name below contains a space; TensorFlow may
    # rewrite such names -- confirm how it appears in TensorBoard.
    tf.summary.histogram("histogram loss", self.loss)
    # merge every summary defined so far into a single op
    summary_op = tf.summary.merge_all()

# evaluate the merged summary together with the training step ...
loss_batch, _, summary = sess.run([loss, optimizer, summary_op])
# ... and write it to the event file under ./graphs, tagged with the step
writer = tf.summary.FileWriter('./graphs', sess.graph)
writer.add_summary(summary, global_step=step)

In [None]:
# Setting random seeds
# op level: two ops created with the same seed produce the same first value
c = tf.random_uniform([], -10, 10, seed=2)
d = tf.random_uniform([], -10, 10, seed=2)

with tf.Session() as sess:
    print(sess.run(c)) # >> 3.57493
    print(sess.run(d)) # >> 3.57493

# session level: within one session successive runs advance the sequence,
# while every new session starts the sequence over again
c = tf.random_uniform([], -10, 10, seed=2)

with tf.Session() as sess:
    print(sess.run(c)) # >> 3.57493
    print(sess.run(c)) # >> -5.97319

with tf.Session() as sess:
    print(sess.run(c)) # >> 3.57493

with tf.Session() as sess:
    print(sess.run(c)) # >> 3.57493

# graph level: one seed for the whole graph; the individual ops still
# produce different values from each other
tf.set_random_seed(2)
c = tf.random_uniform([], -10, 10)
d = tf.random_uniform([], -10, 10)

with tf.Session() as sess:
    print(sess.run(c)) # >> -4.00752
    print(sess.run(d)) # >> -2.98339

In [None]:
# Global counter of training steps
# Note: trainable is set to False so the counter itself is not optimized
global_step = tf.Variable(0, dtype=tf.int32, trainable=False, name='global_step')
# Passed to minimize() so the counter is incremented automatically on every
# update; it can then drive learning-rate decay schedules
optimizer = tf.train.AdamOptimizer(lr).minimize(loss, global_step=global_step)

In [None]:
# auto differentiation
# From the graph definition, tf derives gradients automatically by following
# the dependencies backwards.
# With the chain rule, computing gradients in a computation graph is easy.
# NOTE: the lines below are API signatures for reference, not runnable code.
tf.gradients(ys, xs, grad_ys=None, ...) # gradients of ys with respect to xs
tf.stop_gradient(input, name=None) # keeps the input from being differentiated as well
tf.clip_by_value(t, clip_value_min, clip_value_max, name=None) # gradient clipping
tf.clip_by_norm(t, clip_norm, axes=None, name=None) # gradient clipping

In [110]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL']='2'
import tensorflow as tf
import zipfile
import random
import numpy as np

In [111]:
def read_data(file_path='data/text8.zip'):
    """Read the first member of a zip archive and split it into words.

    Args:
        file_path: Path of the zip archive to read. Defaults to the text8
            corpus location used throughout this notebook.

    Returns:
        list of str: the whitespace-separated tokens of the archive's first
        member.
    """
    with zipfile.ZipFile(file_path) as archive:
        first_member = archive.namelist()[0]
        raw = archive.read(first_member)
    # ZipFile.read returns bytes; decode as UTF-8 before tokenizing. This is
    # what tf.compat.as_str does for bytes on Python 3, without needing
    # TensorFlow for plain file loading.
    return raw.decode('utf-8').split()

In [112]:
from collections import Counter

def build_vocab(words, vocab_size, visual_fld):
    """Build the vocabulary of the most frequent words and write it to disk.

    Index 0 is reserved for the 'UNK' placeholder; the remaining
    ``vocab_size - 1`` indices go to the most common words, most frequent
    first. The words are written one per line, in index order, to
    ``<visual_fld>/vocab.tsv``.

    Args:
        words: Iterable of tokens to count.
        vocab_size: Total vocabulary size, including the 'UNK' entry.
        visual_fld: Directory that receives vocab.tsv; created if missing.

    Returns:
        tuple: ``(dictionary, index_dictionary)`` where ``dictionary`` maps
        word -> index and ``index_dictionary`` maps index -> word.
    """
    # The output folder may not exist yet on a fresh checkout.
    os.makedirs(visual_fld, exist_ok=True)

    vocab = ['UNK']
    vocab.extend(word for word, _ in Counter(words).most_common(vocab_size - 1))

    dictionary = {}
    vocab_path = os.path.join(visual_fld, 'vocab.tsv')
    # The context manager closes the file even if a write fails; the explicit
    # encoding keeps non-ASCII tokens writable regardless of the locale.
    with open(vocab_path, 'w', encoding='utf-8') as vocab_file:
        for index, word in enumerate(vocab):
            dictionary[word] = index
            vocab_file.write(word + '\n')

    index_dictionary = {index: word for word, index in dictionary.items()}
    return dictionary, index_dictionary

In [113]:
# Vocabulary size (including the 'UNK' slot) and the folder that receives
# vocab.tsv and the projector files.
vocab_size = 50000
visual_fld = 'visualization'
# dictionary, index_dict = build_vocab(words, vocab_size, visual_fld)

In [114]:
def convert_words_to_index(words, dictionary):
    """Map every word to its vocabulary index; unknown words map to 0."""
    lookup = dictionary.get
    return [lookup(token, 0) for token in words]

# index_words = convert_words_to_index(words, dictionary)

In [115]:
skip_window = 5


def generate_sample(index_words, context_window_size):
    """Yield (center, target) training pairs according to the skip-gram model.

    For every position a window radius is drawn uniformly from
    ``[1, context_window_size]``; every word within that radius before and
    after the center word is yielded as a target for it.

    Args:
        index_words: Sequence of word indices (must support slicing).
        context_window_size: Maximum window radius, at least 1.

    Yields:
        tuple: ``(center, target)`` index pairs.
    """
    for index, center in enumerate(index_words):
        context = random.randint(1, context_window_size)
        # targets before the center word (window clamped at the sequence start)
        for target in index_words[max(0, index - context): index]:
            yield center, target
        # targets after the center word
        for target in index_words[index + 1: index + context + 1]:
            yield center, target


# Example usage. Kept commented out: index_words is only built inside
# batch_gen(), so evaluating it here raises NameError on a fresh kernel.
# single_gen = generate_sample(index_words, skip_window)

In [116]:
batch_size = 128


def batch_gen():
    """Yield (center_batch, target_batch) arrays of skip-gram training pairs.

    Loads the corpus, builds the vocabulary (which also writes vocab.tsv),
    converts the words to indices and groups the generated (center, target)
    pairs into fixed-size batches.

    Yields:
        tuple: ``center_batch`` of shape ``[batch_size]`` and ``target_batch``
        of shape ``[batch_size, 1]``, both int32 to match the dtypes declared
        for the dataset built from this generator. A trailing partial batch
        is dropped.
    """
    words = read_data()
    dictionary, _ = build_vocab(words, vocab_size, visual_fld)
    index_words = convert_words_to_index(words, dictionary)
    del words           # to save memory
    single_gen = generate_sample(index_words, skip_window)

    while True:
        center_batch = np.zeros(batch_size, dtype=np.int32)
        target_batch = np.zeros([batch_size, 1], dtype=np.int32)
        try:
            for index in range(batch_size):
                center_batch[index], target_batch[index] = next(single_gen)
        except StopIteration:
            # End the generator cleanly once the pairs run out. Letting
            # StopIteration escape a generator body is turned into a
            # RuntimeError on Python 3.7+ (PEP 479).
            return
        yield center_batch, target_batch

In [117]:
# Start from an empty default graph so re-running the cell does not stack
# duplicate nodes, then wrap the Python batch generator as a tf.data source.
tf.reset_default_graph()
dataset = tf.data.Dataset.from_generator(
    batch_gen,
    output_types=(tf.int32, tf.int32),
    output_shapes=(tf.TensorShape([batch_size]),
                   tf.TensorShape([batch_size, 1])))

In [118]:
embed_size = 128
num_sampled = 64
learning_rate = 1.0
# Build the graph
# Read the training data

with tf.name_scope('data'):
    iterator = dataset.make_initializable_iterator()
    center_words, target_words = iterator.get_next()

# Define the embedding matrix and look up the rows of the center words
with tf.name_scope('embed'):
    embed_matrix = tf.get_variable('embed_matrix',
                                    shape=[vocab_size, embed_size],
                                    initializer=tf.random_uniform_initializer())
    embed = tf.nn.embedding_lookup(embed_matrix, center_words, name='embedding')

# Define the loss (noise-contrastive estimation)
with tf.name_scope('loss'):
    nce_weight = tf.get_variable('nce_weight', shape=[vocab_size, embed_size],
                    initializer=tf.truncated_normal_initializer(stddev=1.0/(embed_size ** 0.5)))
    nce_bias = tf.get_variable('nce_bias', initializer=tf.zeros([vocab_size]))

    # Use the num_sampled hyperparameter defined above instead of repeating
    # the literal, so changing it in one place takes effect.
    loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weight,
                                        biases=nce_bias,
                                        labels=target_words,
                                        inputs=embed,
                                        num_sampled=num_sampled,
                                        num_classes=vocab_size), name='loss')

# Define the optimizer
with tf.name_scope('optimizer'):
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)

# Dump the graph definition so it can be inspected in TensorBoard
writer = tf.summary.FileWriter('graphs/word2vec_simple', tf.get_default_graph())
writer.close()

In [120]:
# Run training
total_loss = 0.0
num_train_steps = 100
skip_step = 5  # report the average loss every skip_step steps

# The context manager closes the session even if a step raises.
with tf.Session() as sess:
    sess.run(iterator.initializer)
    sess.run(tf.global_variables_initializer())
    for index in range(num_train_steps):
        try:
            loss_batch, _ = sess.run([loss, optimizer])
        except tf.errors.OutOfRangeError:
            # The dataset is exhausted: restart the iterator; this step
            # contributes no loss.
            sess.run(iterator.initializer)
            continue
        total_loss += loss_batch
        # Use skip_step for the reporting period as well as the divisor so
        # the two cannot drift apart.
        if (index + 1) % skip_step == 0:
            print('Average loss at step {}: {:5.1f}'.format(index, total_loss / skip_step))
            total_loss = 0.0

Average loss at step 4: 271.0
Average loss at step 9: 247.7
Average loss at step 14: 237.9
Average loss at step 19: 229.6
Average loss at step 24: 232.7
Average loss at step 29: 230.0
Average loss at step 34: 224.5
Average loss at step 39: 207.1
Average loss at step 44: 208.5
Average loss at step 49: 220.3
Average loss at step 54: 216.3
Average loss at step 59: 192.6
Average loss at step 64: 200.2
Average loss at step 69: 214.4
Average loss at step 74: 210.7
Average loss at step 79: 190.5
Average loss at step 84: 165.5
Average loss at step 89: 188.5
Average loss at step 94: 200.5
Average loss at step 99: 198.8


In [121]:
from tensorflow.contrib.tensorboard.plugins import projector

In [122]:
# Visualize the word vectors with the TensorBoard projector.
#
# NOTE(review): this opens a fresh session and runs
# tf.global_variables_initializer(), so embed_matrix holds newly initialized
# values here, not the weights learned by the training loop (whose session is
# already closed). To export trained vectors, fetch embed_matrix inside the
# training session or restore a checkpoint before this point.

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    final_embed_matrix = sess.run(embed_matrix)
    # Only the first 1000 rows are exported
    embedding_var = tf.Variable(final_embed_matrix[:1000], name='embedding')
    sess.run(embedding_var.initializer)
    config = projector.ProjectorConfig()
    embedding = config.embeddings.add()
    embedding.tensor_name = embedding_var.name
    summary_writer = tf.summary.FileWriter('visualization')
    try:
        projector.visualize_embeddings(summary_writer, config)
    finally:
        # Release the event file handle even if writing the config fails.
        summary_writer.close()
    saver_embed = tf.train.Saver([embedding_var])
    saver_embed.save(sess, os.path.join('visualization', 'model.ckpt'), global_step=1)