## tensorflow 实现 word2vec

In [1]:
import numpy as np
import tensorflow as tf
import os
import random
import collections

### 参数

In [68]:
# Training Parameters
learning_rate = 0.01 #学习率
batch_size = 128 # batch size
num_steps = 300000 # 迭代次数
display_step = 2000 # 迭代多少次显示信息
eval_step = 3000 # 迭代多少轮进行验证

# Evaluation Parameters
eval_words = ['five','of','going','hardware','american','britain']

# Word2Vec Parameters
embedding_size = 200  # word2vec 向量维度
max_vocabulary_size = 50000 # 词表大小
min_count = 10 # 最少出现次数
window_size = 3 # 窗口大小


num_skips = 2 # 
num_sampled = 64  # 

### 构建词典
### 读取文件

In [3]:
import zipfile

url = 'http://mattmahoney.net/dc/text8.zip'

data_path = './data/text8.zip'
if not os.path.exists(data_path):
    print("Downloading the dataset... (It may take some time)")
    filename, _ = urllib.request.urlretrieve(url, data_path)
    print("Done!")
    
with zipfile.ZipFile(data_path) as f:
    text_words = f.read(f.namelist()[0]).lower().split()

In [4]:
len(text_words),text_words[:10]

(17005207,
 [b'anarchism',
  b'originated',
  b'as',
  b'a',
  b'term',
  b'of',
  b'abuse',
  b'first',
  b'used',
  b'against'])

In [20]:
text_words = [text.decode('utf-8') for text in text_words]

In [21]:
text_words[0:10]

['anarchism',
 'originated',
 'as',
 'a',
 'term',
 'of',
 'abuse',
 'first',
 'used',
 'against']

### 构建词典
将稀少的词用 UNK 替换

In [32]:
def create_dict(text_words):
    count = [('UNK', -1)]
    # 获取 max_vocabulary_size 最高频的词
    count.extend(collections.Counter(text_words).most_common(max_vocabulary_size))
    #移除出现次数少于min_count10的
    count = [item for item in count if item[1]>=min_count]
    #计算vocabulary_size
    vocabulary_size = len(count)
    #word2id
    word2id = dict()
    for i,(word,_) in enumerate(count):
        word2id[word] = i
    #存储word id
    data = list()
    unk_count = 0
    for word in text_words:
        index = word2id.get(word,0)
        #检索一个单词id，如果不在字典中，则为它分配索引0 ('UNK')
        if index == 0:
            unk_count += 1
        data.append(index)
    count[0]= ('UNK', unk_count)
    #id2word
    id2word = dict((k,v) for v,k in word2id.items())
    return word2id,id2word,vocabulary_size,count,text_words,data

In [34]:
word2id,id2word,vocabulary_size,count,text_words,data = create_dict(text_words)
print("Words count:", len(text_words))
print("Unique words:", len(set(text_words)))
print("Vocabulary size:", vocabulary_size)
print("Most common words:", count[:10])

Words count: 17005207
Unique words: 253854
Vocabulary size: 47134
Most common words: [('UNK', 1505572), ('of', 593677), ('and', 416629), ('one', 411764), ('in', 372201), ('a', 325873), ('to', 316376), ('zero', 264975), ('nine', 250430), ('two', 192644)]


In [38]:
len(data),data[0:10]

(17005207, [5233, 3080, 11, 5, 194, 1, 3133, 45, 58, 155])

In [25]:
word2id['five'],id2word[15]

(15, 'five')

### generate training batch for skip gram

In [42]:
data_index = 0
def next_batch(batch_size, num_skips, window_size):
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2*window_size
    
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size,1), dtype=np.int32)
    
    #get window size (words left and right + current one)
    span = 2*window_size + 1
    buffer = collections.deque(maxlen=span)
    if data_index + span > len(data):
        data_index = 0
    buffer.extend(data[data_index:data_index+span])
    data_index+=span
    
    for i in range(batch_size // num_skips):
        context_words = [w for w in range(span) if w != window_size]
        words_to_use = random.sample(context_words, num_skips)
        for j, context_word in enumerate(words_to_use):
            batch[i * num_skips+j] = buffer[window_size]
            labels[i * num_skips+j, 0] = buffer[context_word]
        if data_index == len(data):
            buffer.extend(data[0:span])
            data_index = span
        else:
            buffer.append(data[data_index])
            data_index+=1
    
    # data
    data_index = (data_index + len(data) - span) % len(data)
    return batch, labels

### 构建word2vec

In [49]:
with tf.name_scope('input'):
    X = tf.placeholder(tf.int32, shape=[None], name='input_x')
    y = tf.placeholder(tf.int32, shape=[None, 1], name='input_y')
    
with tf.device('/cpu:0'):
    with tf.name_scope('embedding'):
        embedding = tf.Variable(tf.random_normal([vocabulary_size, embedding_size]), name='embedding')
        X_embed = tf.nn.embedding_lookup(embedding, X)
        tf.summary.scalar('embedding',X_embed)
    
    with tf.name_scope('weights'):
        w = tf.Variable(tf.random_normal([vocabulary_size, embedding_size]),name='weight')
        tf.summary.scalar('weights',w)
        b = tf.Variable(tf.zeros([vocabulary_size]), name='bias')
        tf.summary.scalar('bias',b)

with tf.name_scope('loss'):
    loss_op = tf.reduce_mean(tf.nn.nce_loss(weights=w,
                                        biases=b,
                                        labels=y,
                                        inputs=X_embed,
                                        num_sampled=num_sampled,
                                        num_classes=vocabulary_size))
    tf.summary.scalar('loss',loss_op)
    
with tf.name_scope('optimizer'):
    train_op = tf.train.AdamOptimizer(learning_rate).minimize(loss_op)

with tf.name_scope('evaluation'):
    #Compute the cosine similarity between input data embedding and every embedding vectors
    X_embed_norm = X_embed / tf.sqrt(tf.reduce_sum(tf.square(X_embed)))
    embedding_norm = embedding / tf.sqrt(tf.reduce_sum(tf.square(embedding), 1, keep_dims=True))
    cosin_sim = tf.matmul(X_embed_norm, embedding_norm, transpose_b=True)
    tf.summary.scalar('cosin_sim',cosin_sim)

In [52]:
tensorboard_dir = 'runs/word2vec'
if not os.path.exists(tensorboard_dir):
    os.makedirs(tensorboard_dir)

merged_summary = tf.summary.merge_all()

In [75]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    # testing data
    x_test = np.array([word2id[w] for w in eval_words])
    
#     writer = tf.summary.FileWriter(tensorboard_dir,sess.graph)
    
    average_loss = 0
    for step in range(1,num_steps + 1):
        batch_x, batch_y = next_batch(batch_size, num_skips, window_size)

        _,loss = sess.run([train_op,loss_op], feed_dict={X:batch_x, y:batch_y})
        average_loss += loss
        
        if step % display_step == 0 or step == 1:
            # train
#             writer.add_summary(summaries, step)
            if step > 1:
                average_loss /= display_step
            print("Step " + str(step) + ", Average Loss= " + "{:.4f}".format(average_loss))
            average_loss=0
        
        # evaluation
        if step % eval_step == 0 or step == 1:
            print('Evaluation...')
            sim = sess.run(cosin_sim, feed_dict={X:x_test})
            for i in range(len(eval_words)):
                top_k = 8 # number of nearest neighbors
                nearest = (-sim[i,:]).argsort()[1:top_k+1]
                log_str = '"%s" nearest neighbors:' %eval_words[i]
                for k in range(top_k):
                    log_str = '%s %s,' % (log_str, id2word[nearest[k]])
                print(log_str)
                
# saver = tf.train.Saver()
# saver.save(sess, 'runs/word2vec', global_step=num_steps)

Step 1, Average Loss= 517.9537
Evaluation...
"five" nearest neighbors: dispelled, negotiation, signifier, depicted, mascot, racist, supplement, rez,
"of" nearest neighbors: restraints, obliquely, sw, confusingly, approximates, comedies, purging, dante,
"going" nearest neighbors: mules, buzz, claim, uncertainty, bound, cesare, sending, ramadan,
"hardware" nearest neighbors: affluence, libretto, pendulum, sets, iaea, tempo, ahenobarbus, hairstyle,
"american" nearest neighbors: burdened, eugenicists, aloft, azteca, pains, bonzos, airborne, mpaa,
"britain" nearest neighbors: esperantists, vs, ignatius, methodus, shortage, bumper, soon, blizzards,


KeyboardInterrupt: 