# Word2vec

- 褚则伟 zeweichu@gmail.com
- 稀牛学院NLP课程资料



In [1]:
import tensorflow as tf
import os

### 定义一个Skip Gram的Model class

In [2]:
def make_dir(path):
    try:
        os.mkdir(path)
    except OSError:
        pass
    
def huber_loss(labels, predictions, delta=1.0):
    residual = tf.abs(predictions - labels)
    def f1(): return 0.5*tf.square(residual)
    def f2(): return delta*residual - 0.5*tf.square(delta)
    return tf.cond(residual<delta, f1, f2)

In [3]:
import zipfile
from collections import Counter
import os
import sys
import zipfile

import numpy as np
from six.moves import urllib
import tensorflow as tf
import random

DOWNLOAD_URL = "http://mattmahoney.net/dc/"
EXPECTED_BYTES = 31344016
DATA_FOLDER = "./data/"
FILE_NAME = "text8.zip"


### 下载数据

In [4]:
def download(file_name, expected_bytes):
    if not os.path.exists(DATA_FOLDER):
        make_dir(DATA_FOLDER)
    file_path = DATA_FOLDER + file_name
    if os.path.exists(file_path):
        print("Dataset ready")
        return file_path
    file_name, _ = urllib.request.urlretrieve(DOWNLOAD_URL + 
                                              file_name, file_path)
    file_stat = os.stat(file_path)
    if file_stat.st_size == expected_bytes:
        print("Successfully downloaded the file", file_name)
    else:
        raise Exception("File " + file_name + 
                        ' might be corrupted. You should try \
                        downloading it with a browser. ')
    return file_path
file_path = download(FILE_NAME, EXPECTED_BYTES)
file_path

Dataset ready


'./data/text8.zip'

### 读取数据

In [5]:
def read_data(file_path):
    with zipfile.ZipFile(file_path) as f:
        words = tf.compat.as_str(f.read(f.namelist()[0])).split()
    return words

words = read_data(file_path)
print(len(words))
print(words[:5])

17005207
['anarchism', 'originated', 'as', 'a', 'term']


### 构建dataset

In [6]:
def build_dataset(words, vocab_size):
    # 构建一个 word -> index 的 dictionary，
    # 以及一个 index -> word 的 reverse_dictionary
    dictionary = {}
    count  = [["UNK", -1]]
    count.extend(Counter(words).most_common(vocab_size - 1))
    index = 0
    make_dir("processed")
    with open("processed/vocab_1000.tsv", "w") as f:
        for word, _ in count:
            dictionary[word] = index
            if index < 1000:
                f.write(word + "\n")
            index += 1
    word_index = [dictionary[word] if word in dictionary else 0 for word in words]
    count[0][1] = word_index.count(0)
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return word_index, count, dictionary, reverse_dictionary

VOCAB_SIZE = 50000
word_index, count, dictionary, reverse_dictionary = build_dataset(words, VOCAB_SIZE)
del words # 可以节省内存
print("Most common words: {}", count[:5])
print("Sample data: {}, {}".format(word_index[:10], [reverse_dictionary[i] for i in word_index[:10]]))

Most common words: {} [['UNK', 418391], ('the', 1061396), ('of', 593677), ('and', 416629), ('one', 411764)]
Sample data: [5234, 3081, 12, 6, 195, 2, 3134, 46, 59, 156], ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against']


### 把word都转换成index

### 生成一些训练用的sample data

In [7]:
BATCH_SIZE = 128
def generate_sample(index_words, context_window_size):
    for index, center in enumerate(index_words):
        # 选出一个随机数当做window_size
        context = random.randint(1, context_window_size)
        for target in index_words[max(0, index-context):index]:
            yield center, target
        for target in index_words[index + 1: index + context + 1]:
            yield center, target
            
            
single_gen = generate_sample(word_index, BATCH_SIZE)
single_gen

<generator object generate_sample at 0x11099fc50>

### 构造一个iterator可以生成batch

In [8]:
def get_batch(iterator, batch_size):
    while True:
        center_batch = np.zeros(batch_size, dtype=np.int32)
        target_batch = np.zeros([batch_size, 1], dtype=np.int32)
        for index in range(batch_size):
            center_batch[index], target_batch[index] = next(iterator)
        yield center_batch, target_batch
        
batch_gen = get_batch(single_gen, BATCH_SIZE)
batch_gen

<generator object get_batch at 0x123b37570>

### 定义model

In [9]:
EMBED_SIZE = 128
NUM_SAMPLED = 64
LEARNING_RATE = 0.1

# valid_size = 16
# valid_window = 100
# valid_examples = np.random.choice(valid_window, valid_size, replace=False)

graph = tf.Graph()

with graph.as_default():
    target_words = tf.placeholder(tf.int32, shape=[BATCH_SIZE])
    context_words = tf.placeholder(tf.int32, shape=[BATCH_SIZE, 1])
            
    with tf.device("/cpu:0"):
        global_step = tf.Variable(0, dtype=tf.int32, trainable = False)
        embeddings = tf.Variable(tf.random_uniform([VOCAB_SIZE, EMBED_SIZE], -1.0, 1.0))
        embed = tf.nn.embedding_lookup(embeddings, target_words)
        nce_weight = tf.Variable(tf.truncated_normal([VOCAB_SIZE, EMBED_SIZE],
                                                            stddev=1.0/(EMBED_SIZE ** 0.5)))
        nce_bias = tf.Variable(tf.zeros([VOCAB_SIZE]))
        loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weight, biases=nce_bias, 
                                             labels=context_words,
                                             inputs=embed, num_sampled=NUM_SAMPLED,
                                             num_classes=VOCAB_SIZE))
                
        optimizer = tf.train.GradientDescentOptimizer(LEARNING_RATE).minimize(loss)
            
#         norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
#         normalized_embeddings = embeddings/norm
#         valid_embeddings = tf.nn.embedding_loop
        init = tf.global_variables_initializer() 
        

### 训练model

In [12]:
SKIP_STEP = 2000
WEIGHTS_FOLDER = 'processed'
NUM_TRAIN_STEPS = 100000

# 构造一个 Saver object，会默认保存所有variables

initial_step = 0
make_dir("checkpoints")

with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())

    total_loss = 0.0
    initial_step = global_step.eval()
    
    for index in range(initial_step, initial_step + NUM_TRAIN_STEPS):
        target_words_batch, context_words_batch = next(batch_gen)

        loss_batch, _ = sess.run([loss, optimizer],
                                 feed_dict = {target_words: target_words_batch, 
                                              context_words: context_words_batch})
        total_loss += loss_batch
        
        if (index + 1) % SKIP_STEP == 0:
            print("Average loss at step {}: {:5.1f}".format(index, total_loss / SKIP_STEP))
            total_loss = 0.0
    



Average loss at step 1999: 167.7
Average loss at step 3999:  98.4
Average loss at step 5999:  74.8
Average loss at step 7999:  53.0
Average loss at step 9999:  41.0
Average loss at step 11999:  39.7
Average loss at step 13999:  36.3
Average loss at step 15999:  35.4
Average loss at step 17999:  36.6
Average loss at step 19999:  31.3
Average loss at step 21999:  26.3
Average loss at step 23999:  20.0
Average loss at step 25999:  20.9
Average loss at step 27999:  23.2
Average loss at step 29999:  18.9
Average loss at step 31999:  19.8
Average loss at step 33999:  18.5
Average loss at step 35999:  16.8
Average loss at step 37999:  16.3
Average loss at step 39999:  18.0
Average loss at step 41999:  15.2
Average loss at step 43999:  14.4
Average loss at step 45999:  16.0
Average loss at step 47999:  14.5
Average loss at step 49999:  18.7
Average loss at step 51999:  17.0
Average loss at step 53999:  14.8
Average loss at step 55999:  13.3
Average loss at step 57999:  11.8
Average loss at ste