In [1]:
import collections
import math
import os
import random
import zipfile
import urllib
import numpy as np
import tensorflow as tf

### 1. 检查是否有数据集,如果没有在线下载

In [2]:
url = 'http://mattmahoney.net/dc/'

def maybe_download(filename, expected_bytes):
    """Fetch `filename` from `url` unless it already exists locally, then check its size.

    Args:
        filename: Local path of the file; it is also appended to the
            module-level `url` to form the download address.
        expected_bytes: Exact size in bytes that the local file must have.

    Returns:
        The local path of the verified file.

    Raises:
        Exception: If the size on disk differs from `expected_bytes`.
    """
    # A bare `import urllib` does not load the `urllib.request` submodule, so
    # `urllib.request.urlretrieve` only works if some other package happened to
    # import it first. Import it explicitly so the download branch always works.
    import urllib.request

    if not os.path.exists(filename):
        filename, _ = urllib.request.urlretrieve(url + filename, filename)
    statinfo = os.stat(filename)
    if statinfo.st_size == expected_bytes:
        print('Found and verified', filename)
    else:
        # Show the actual size to make a truncated download easy to spot.
        print(statinfo.st_size)
        raise Exception('Failed to verify ' + filename + '.Can you get to it with a brower?')
    return filename

In [3]:
# Make sure text8.zip is available locally and has the expected size in bytes.
filename = maybe_download('text8.zip',31344016)

Found and verified text8.zip


### 2. 解压下载的文件,并用tf.compat.as_str转换为单词的列表

In [4]:
def read_data(filename):
    """Return the first member of the zip archive `filename` as a list of words.

    The member's raw bytes are converted to a text string with
    `tf.compat.as_str` and then split on whitespace.
    """
    with zipfile.ZipFile(filename) as archive:
        # namelist() lists every member of the archive; only the first is used.
        first_member = archive.namelist()[0]
        # read() returns the member's content as bytes.
        raw_bytes = archive.read(first_member)
        text = tf.compat.as_str(raw_bytes)
        tokens = text.split()
    return tokens

In [5]:
# Load the corpus as a list of word strings and report how many words it holds.
words = read_data(filename)
print('Data size', len(words))

Data size 17005207


In [6]:
# Sanity check: show the container type and the first 30 words of the corpus.
print(type(words))
print(words[:30])

<class 'list'>
['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against', 'early', 'working', 'class', 'radicals', 'including', 'the', 'diggers', 'of', 'the', 'english', 'revolution', 'and', 'the', 'sans', 'culottes', 'of', 'the', 'french', 'revolution', 'whilst']


In [7]:
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell at the top.
import pandas as pd
# Summary of the corpus: total word count, number of distinct words, and the most frequent word with its frequency.
pd.Series(words).describe()

count     17005207
unique      253854
top            the
freq       1061396
dtype: object

### 3. 接下来创建vocabulary词汇表
+ 使用collections.Counter统计单词列表中单词的频数,使用most_common获取top50000的单词作为vocabulary
+ 创建一个字典,把top50000的词汇的vocabulary放入,以便快速查询,dic查询复杂度为O(1)
+ 将全部单词转换为一系列的编号,top50000之外的单词编号为0
------------
一些数据类型
* count(长度50000)[['UNK', 418391],('the', 1061396),('of', 593677),('and', 416629),('one', 411764),........]
* dictionary(长度50000){'UNK':0, 'the':1, 'of':2,.......}
* reverse_dictionary(长度50000){0:'UNK', 1:'the', 2:'of',.......}
* data(长度17005207)[5238, 3084, 12, 6, 195, 2, 3136, 46, 59, 156,......] 按照words的词顺序,每个词的编号

In [8]:
vocabulary_size = 50000
def build_dataset(words):
    """Encode `words` as integer ids over a frequency-ranked vocabulary.

    The vocabulary holds the placeholder 'UNK' (id 0) followed by the
    `vocabulary_size - 1` most frequent words, in descending frequency.

    Returns:
        data: list with the id of each word in `words`, in order; words
            outside the vocabulary get id 0.
        count: list whose first entry is ['UNK', <number of out-of-vocabulary
            words>] followed by (word, frequency) tuples.
        dictionary: mapping word -> id.
        reverse_dictionary: mapping id -> word.
    """
    # The first entry is a list (not a tuple) so its counter can be patched in later.
    count = [['UNK', -1]] + collections.Counter(words).most_common(vocabulary_size - 1)

    # Ids are handed out in `count` order: the next free id is the current dict size.
    dictionary = {}
    for entry in count:
        dictionary[entry[0]] = len(dictionary)

    data = []
    unk_count = 0
    for token in words:
        try:
            data.append(dictionary[token])
        except KeyError:
            # Out-of-vocabulary word: map it to the 'UNK' id and tally it.
            data.append(0)
            unk_count += 1
    count[0][1] = unk_count

    # Inverse lookup table (id -> word).
    reverse_dictionary = {idx: token for token, idx in dictionary.items()}
    return data, count, dictionary, reverse_dictionary

# Encode the whole corpus: id sequence, frequency table, and both lookup dictionaries.
data, count, dictionary, reverse_dictionary = build_dataset(words)

In [9]:
del words # drop the raw word list to save memory

In [10]:
# Show the five most frequent vocabulary entries (including UNK) ...
print('Most common words (+UNK)', count[:5])
# ... and the first ten encoded ids next to the words they map back to.
print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])

Most common words (+UNK) [['UNK', 418391], ('the', 1061396), ('of', 593677), ('and', 416629), ('one', 411764)]
Sample data [5238, 3084, 12, 6, 195, 2, 3136, 46, 59, 156] ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against']


### 4. 生成Word2Vec的训练样本
用Skip-Gram模式(从目标单词反推语境),如果窗口大小为1，则将原始数据"the quick brown fox jumped over the lazy dog"可转为(quick,the),(quick,brown),(brown,quick),(brown,fox)等样本
- batch_size: batch大小。
- num_skips: 对每个单词生成的样本数。它不能大于skip_window的两倍,并且batch_size必须是它的整数倍,确保每个batch包含一个单词的所有样本。
- skip_window: 单词最远可以联系的距离。
- span：对某个单词创建相关样本时会使用到的单词数量,包括目标单词本身和它前后的单词,因此,span=2*skip_window+1

In [11]:
data_index = 0
def generate_batch(batch_size, num_skips, skip_window):
    """Produce one skip-gram training batch from the module-level `data` sequence.

    Consecutive calls walk through `data` (wrapping around at the end) using
    the module-level cursor `data_index`.

    Args:
        batch_size: Number of (centre word, context word) pairs to emit.
            Must be a multiple of `num_skips`.
        num_skips: Number of pairs generated per centre word. Must not exceed
            `2 * skip_window`.
        skip_window: Maximum distance between a centre word and a context word.

    Returns:
        batch: int32 array of shape (batch_size,) holding centre-word ids.
        labels: int32 array of shape (batch_size, 1) holding context-word ids.
    """
    # The cursor is global so that repeated calls keep advancing through the data.
    global data_index
    # Preconditions on the arguments.
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    # Window size: the centre word plus `skip_window` words on each side.
    span = 2 * skip_window + 1
    # Bounded deque: appending to a full deque drops the oldest element, so it
    # always holds the most recent `span` words.
    buffer = collections.deque(maxlen=span)

    # Fill the window with the next `span` words.
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    # One iteration per centre word.
    for i in range(batch_size // num_skips):
        # The centre word sits at position `skip_window` of the window.
        target = skip_window
        # Window positions that may not be drawn: the centre itself, then
        # every context position already used for this centre word.
        targets_to_avoid = [skip_window]
        for j in range(num_skips):
            # Draw random window positions until an unused one comes up.
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)
            targets_to_avoid.append(target)
            # Feature is the centre word, label is the sampled context word.
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[target]
        # Slide the window one word forward for the next centre word.
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    # Backtrack by one window: the next call refills the window starting at
    # `data_index`, so without this step `span` centre words would be skipped
    # between consecutive batches.
    data_index = (data_index + len(data) - span) % len(data)
    return batch, labels

测试generate_batch的功能:
- 以第一个样本为例,3084为originated的编号,这个单词对应的语境是anarchism,anarchism的编号是5238

In [12]:
# Smoke test: draw a small batch and print each (centre word -> context word) pair as id and text.
batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
for i in range(8):
    print(batch[i], reverse_dictionary[batch[i]], '->', labels[i,0], reverse_dictionary[labels[i,0]])

3084 originated -> 5238 anarchism
3084 originated -> 12 as
12 as -> 3084 originated
12 as -> 6 a
6 a -> 195 term
6 a -> 12 as
195 term -> 2 of
195 term -> 6 a


设置训练时的参数:

In [13]:
# Training hyper-parameters.
batch_size = 128
embedding_size = 128 # dimensionality of each word vector (original note: values of 50-100 are typical)
skip_window = 1
num_skips = 2

valid_size = 16 # number of words sampled for the nearest-neighbour check
valid_window = 100 # validation ids are drawn only from the range [0, valid_window), i.e. the most frequent words
valid_examples = np.random.choice(valid_window, valid_size, replace=False) # distinct random ids; no seed is set, so the selection differs between runs
num_sampled = 64 # number of noise words used as negative samples during training

### 5. 定义Skip-Gram Word2Vec模型的网络结构
- NCE Loss(Noise-Contrastive Estimation)

In [14]:
# Build the skip-gram model in its own graph (TensorFlow 1.x graph-mode API).
graph = tf.Graph()        # create a fresh tf.Graph
with graph.as_default():  # make it the default graph for the ops below
    # Input placeholders: centre-word ids and their context-word ids.
    train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    
    # Turn the previously sampled valid_examples into a TensorFlow constant.
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

    # Pin the embedding variable and lookup to the CPU (original author's note:
    # tf.nn.embedding_lookup had no GPU implementation at the time — TODO confirm
    # for the TensorFlow version in use).
    with tf.device('/cpu:0'):
        # Embedding matrix of shape [vocabulary_size, embedding_size],
        # initialised uniformly at random in [-1, 1).
        embeddings = tf.Variable(
            tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
        # Look up the embedding vectors of the words in train_inputs.
        embed = tf.nn.embedding_lookup(embeddings, train_inputs)

    # Variables for the NCE loss.
    # The weights are initialised from a truncated normal distribution.
    nce_weights = tf.Variable(
        tf.truncated_normal([vocabulary_size, embedding_size], 
                            stddev=1.0 / math.sqrt(embedding_size)))
    # The biases are initialised to zero.
    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))   

    # Mean NCE loss over the batch.
    # tf.nn.nce_loss draws a new sample of negative labels each time the loss is evaluated.
    loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weights,
                                         biases=nce_biases,
                                         labels=train_labels,
                                         inputs=embed,
                                         num_sampled=num_sampled,
                                         num_classes=vocabulary_size))

    # Plain gradient-descent optimizer with a learning rate of 1.0.
    optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

    # L2 norm of every embedding row.
    # NOTE(review): `keep_dims` is the older spelling of `keepdims` — confirm it is
    # still accepted by the TensorFlow version in use.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    # Dividing by the norm yields unit-length embeddings.
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)  # embeddings of the validation words
    # Dot products of unit vectors: similarity between each validation word and
    # every word in the vocabulary.
    similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)

    init = tf.global_variables_initializer() # op that initialises all model variables

### 6. 开始训练模型

In [15]:
num_steps = 100001  # steps 0..100000 inclusive, so the report at step 100000 is also produced

with tf.Session(graph=graph) as session:      # create the session and make it the default one
    init.run()   # initialise all model variables before anything else runs
    print("Initialized")

    average_loss = 0
    for step in range(num_steps):
        # Generate one batch of inputs and labels for this step ...
        batch_inputs, batch_labels = generate_batch(batch_size, num_skips, skip_window)    
        feed_dict = {train_inputs : batch_inputs, train_labels : batch_labels} # ... and feed them to the placeholders

        # Run one optimizer update and fetch the loss of this batch.
        _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += loss_val # accumulate the loss for the periodic report
        
        # Every 2000 steps, print the mean loss since the last report
        # (at step 0 the raw loss of the single first batch is printed).
        if step % 2000 == 0:
            if step > 0:
                average_loss /= 2000
            print("Average loss at step ", step, ": ", average_loss)
            average_loss = 0

        # Every 10000 steps, compute the similarity between the validation words and
        # the whole vocabulary and print the 8 nearest words for each validation word.
        # Note that this is expensive (~20% slowdown if computed every 500 steps)
        if step % 10000 == 0:
            sim = similarity.eval()
            for i in range(valid_size):
                valid_word = reverse_dictionary[valid_examples[i]]
                top_k = 8 # number of nearest neighbors
                # Sort by descending similarity and skip the first hit, which is
                # expected to be the validation word itself.
                nearest = (-sim[i, :]).argsort()[1:top_k+1]
                log_str = "Nearest to %s:" % valid_word
                for k in range(top_k):
                    close_word = reverse_dictionary[nearest[k]]
                    log_str = "%s %s," % (log_str, close_word)
                print(log_str)
    # Keep the unit-normalised embeddings as a NumPy array for use after the session closes.
    final_embeddings = normalized_embeddings.eval()

Initialized
Average loss at step  0 :  262.2593688964844
Nearest to which: shortage, lemy, catiline, dots, twist, obstacles, poincar, mothra,
Nearest to not: optimally, oxidizes, kinetics, libation, carvings, spin, humid, ikea,
Nearest to system: deep, ludvig, intuitionistic, gandhi, delusional, anglian, reel, raptor,
Nearest to zero: thespis, immobile, omission, hillary, childless, goeldi, cheap, scientific,
Nearest to for: electrostatics, episcopalians, truman, mysterious, jag, adnan, rcc, presenter,
Nearest to UNK: consort, tld, zen, ide, buttocks, praises, minimalism, faxes,
Nearest to be: bestiary, vos, nacl, guns, bloomsbury, leads, nieces, exiled,
Nearest to would: monitors, heliocentric, balts, repaired, thrower, accommodates, transmitted, appreciable,
Nearest to called: guevara, orchestrated, essentially, timeline, computability, eagerly, waldorf, bermuda,
Nearest to some: jezebel, pardoned, pmid, stationed, naismith, aichi, fermented, glu,
Nearest to at: acadian, barroso, hal

### 7. 可视化Word2Vec词向量

In [16]:
def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
    """Draw a labelled 2-D scatter plot of embeddings and save it to `filename`.

    Args:
        low_dim_embs: 2-D array-like with one (x, y) row per point; it must
            have at least as many rows as there are labels.
        labels: Sequence of strings; label i annotates row i.
        filename: Path of the image file to write.

    Raises:
        AssertionError: If there are more labels than embedding rows.
        ImportError: If matplotlib is not installed.
    """
    # Import locally so the function does not depend on a later cell having
    # bound the global name `plt` before it is called.
    import matplotlib.pyplot as plt

    assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
    plt.figure(figsize=(18, 18))  # in inches
    for i, label in enumerate(labels):
        x, y = low_dim_embs[i,:]
        # Draw the point ...
        plt.scatter(x, y)
        # ... and write the word next to it.
        plt.annotate(label,
                    xy=(x, y),
                    xytext=(5, 2),
                    textcoords='offset points',
                    ha='right',
                    va='bottom')

    plt.savefig(filename)

In [17]:
# Project the embeddings of the most frequent words to 2-D with t-SNE and plot them.
# The plotting libraries are optional: if they are missing, only a hint is printed.
try:
    from sklearn.manifold import TSNE
    import matplotlib.pyplot as plt

    # Reduce the embeddings to 2 dimensions.
    # NOTE(review): confirm that `n_iter` is still accepted by the installed scikit-learn version.
    tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
    plot_only = 200 # plot only the first 200 ids, i.e. the most frequent words
    low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only,:])
    # NOTE(review): this rebinds the global `labels` set earlier by the generate_batch demo cell.
    labels = [reverse_dictionary[i] for i in range(plot_only)]
    plot_with_labels(low_dim_embs, labels)

except ImportError:
    print("Please install sklearn, matplotlib, and scipy to visualize embeddings.")