# Word2vec

- 褚则伟 zeweichu@gmail.com
- 稀牛学院NLP课程资料



### 定义一个Skip Gram的Model class

In [1]:
import tensorflow as tf
import os

def make_dir(path):
    try:
        os.mkdir(path)
    except OSError:
        pass

In [2]:
import zipfile
from collections import Counter
import sys
import zipfile

import numpy as np
from six.moves import urllib
import random

DOWNLOAD_URL = "http://mattmahoney.net/dc/"
EXPECTED_BYTES = 31344016
DATA_FOLDER = "./data/"
FILE_NAME = "text8.zip"


### 下载数据

In [3]:
def download(file_name, expected_bytes):
    if not os.path.exists(DATA_FOLDER):
        make_dir(DATA_FOLDER)
    file_path = DATA_FOLDER + file_name
    if os.path.exists(file_path):
        print("Dataset ready")
        return file_path
    file_name, _ = urllib.request.urlretrieve(DOWNLOAD_URL + 
                                              file_name, file_path)
    file_stat = os.stat(file_path)
    if file_stat.st_size == expected_bytes:
        print("Successfully downloaded the file", file_name)
    else:
        raise Exception("File " + file_name + 
                        ' might be corrupted. You should try \
                        downloading it with a browser. ')
    return file_path
file_path = download(FILE_NAME, EXPECTED_BYTES)
file_path

Dataset ready


'./data/text8.zip'

### 读取数据

In [4]:
def read_data(file_path):
    with zipfile.ZipFile(file_path) as f:
        words = tf.compat.as_str(f.read(f.namelist()[0])).split()
    return words

words = read_data(file_path)
print(len(words))
print(words[:5])

17005207
['anarchism', 'originated', 'as', 'a', 'term']


### 构建dataset
把word都转换成index

In [5]:
def build_dataset(words, vocab_size):
    # 构建一个 word -> index 的 dictionary，
    # 以及一个 index -> word 的 reverse_dictionary
    dictionary = {}
    count  = [["UNK", -1]]
    count.extend(Counter(words).most_common(vocab_size - 1))
    index = 0
    make_dir("processed")
    with open("processed/vocab_1000.tsv", "w") as f:
        for word, _ in count:
            dictionary[word] = index
            if index < 1000:
                f.write(word + "\n")
            index += 1
    word_index = [dictionary[word] if word in dictionary else 0 for word in words]
    count[0][1] = word_index.count(0)
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return word_index, count, dictionary, reverse_dictionary

VOCAB_SIZE = 50000
word_index, count, dictionary, reverse_dictionary = build_dataset(words, VOCAB_SIZE)
del words # 可以节省内存
print("Most common words: {}", count[:5])
print("Sample data: {}, {}".format(word_index[:10], [reverse_dictionary[i] for i in word_index[:10]]))

Most common words: {} [['UNK', 418391], ('the', 1061396), ('of', 593677), ('and', 416629), ('one', 411764)]
Sample data: [5234, 3081, 12, 6, 195, 2, 3134, 46, 59, 156], ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against']


### 生成训练的batch

In [6]:
import collections
data_index = 0
def generate_batch(batch_size, num_skips, skip_window):
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window # 我们希望拿到的skip数量比window单词的数量少，这样不会产生重复的sample
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2*skip_window + 1
    buffer = collections.deque(maxlen=span)
    if data_index + span > len(word_index):
        data_index = 0
    buffer.extend(word_index[data_index:data_index+span])
    data_index += span
    for i in range(batch_size // num_skips):
        target = skip_window
        targets_to_avoid = [skip_window]
        for j in range(num_skips):
            while target in targets_to_avoid: 
                target = random.randint(0, span-1) # 随机从window里面选一个target_word
            batch[i*num_skips + j] = buffer[skip_window]
            labels[i*num_skips + j, 0] = buffer[target]
        if data_index == len(word_index): # 如果data_index已经到这个文档的末尾了，我们就从头开始
            buffer[:] = word_index[:span]
            data_index = span
        else: # 把整个window往后面移一个位置
            buffer.append(word_index[data_index])
            data_index += 1
    data_index = (data_index + len(word_index) - span) % len(word_index)
    return batch, labels

batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
for i in range(8):
    print(batch[i], reverse_dictionary[batch[i]],
        '->', labels[i, 0], reverse_dictionary[labels[i, 0]])

3081 originated -> 12 as
3081 originated -> 12 as
12 as -> 6 a
12 as -> 6 a
6 a -> 195 term
6 a -> 195 term
195 term -> 2 of
195 term -> 2 of


### 定义model

In [None]:
EMBED_SIZE = 128
NUM_SAMPLED = 64
LEARNING_RATE = 1.0
BATCH_SIZE = 128

valid_size = 16
valid_window = 100
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
num_sampled = 64

# valid_size = 16
# valid_window = 100
# valid_examples = np.random.choice(valid_window, valid_size, replace=False)

graph = tf.Graph()

with graph.as_default():
    target_words = tf.placeholder(tf.int32, shape=[BATCH_SIZE])
    context_words = tf.placeholder(tf.int32, shape=[BATCH_SIZE, 1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
            
    with tf.device("/cpu:0"):
        global_step = tf.Variable(0, dtype=tf.int32, trainable = False)
        embeddings = tf.Variable(tf.random_uniform([VOCAB_SIZE, EMBED_SIZE], -1.0, 1.0))
        embed = tf.nn.embedding_lookup(embeddings, target_words)
        nce_weight = tf.Variable(tf.truncated_normal([VOCAB_SIZE, EMBED_SIZE],
                                                            stddev=1.0/(EMBED_SIZE ** 0.5)))
        nce_bias = tf.Variable(tf.zeros([VOCAB_SIZE]))
        loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weight, biases=nce_bias, 
                                             labels=context_words,
                                             inputs=embed, num_sampled=NUM_SAMPLED,
                                             num_classes=VOCAB_SIZE))
                
        optimizer = tf.train.GradientDescentOptimizer(LEARNING_RATE).minimize(loss)
        
        
        # 计算cosine similarity 
        norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
        normalized_embeddings = embeddings/norm
        valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
        similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)
        
        
        init = tf.global_variables_initializer() 
        

### 训练model

In [None]:
SKIP_STEP = 2000
WEIGHTS_FOLDER = 'processed'
NUM_TRAIN_STEPS = 100000
SKIP_WINDOW = 1      
NUM_SKIPS = 2  

# 构造一个 Saver object，会默认保存所有variables

initial_step = 0
make_dir("checkpoints")

with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())

    total_loss = 0.0
    initial_step = global_step.eval()
    
    for index in range(initial_step, initial_step + NUM_TRAIN_STEPS):
        target_words_batch, context_words_batch = generate_batch(BATCH_SIZE, NUM_SKIPS, SKIP_WINDOW)#next(batch_gen)

        loss_batch, _ = sess.run([loss, optimizer],
                                 feed_dict = {target_words: target_words_batch, 
                                              context_words: context_words_batch})
        total_loss += loss_batch
        
        if (index + 1) % SKIP_STEP == 0:
            print("Average loss at step {}: {:5.1f}".format(index, total_loss / SKIP_STEP))
            total_loss = 0.0
    
        if (index + 1) % 10000 == 0:
            sim = similarity.eval()
            for i in range(valid_size):
                valid_word = reverse_dictionary[valid_examples[i]]
                top_k = 8
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                log_str = "Nearest to {}:".format(valid_word)
                for k in range(top_k):
                    close_word = reverse_dictionary[nearest[k]]
                    log_str = "{} {},".format(log_str, close_word)
                print(log_str)
        final_embeddings = normalized_embeddings.eval()



Average loss at step 1999: 114.4
Average loss at step 3999:  52.9
