# 實現 Word2Vec

In [1]:
from __future__ import division
import collections
import math
import os
import random
import zipfile
import numpy as np
import tensorflow as tf

使用 urllib.request.urlretrieve 下載資料的壓縮檔案，並核對檔案大小；若檔案已經存在則跳過下載步驟。

In [2]:
from urllib.request import urlretrieve
def maybe_download(filename, expected_bytes):
    """Ensure `filename` exists locally and has the expected size.

    If the file is not present it is fetched from
    "http://mattmahoney.net/dc/" + filename and stored under the same name.

    Args:
        filename: Name of the file, used both as local path and as URL suffix.
        expected_bytes: Size in bytes the file must have.

    Returns:
        The local path of the verified file.

    Raises:
        Exception: If the size on disk differs from `expected_bytes`.
    """
    already_present = os.path.exists(filename)
    if not already_present:
        base_url = "http://mattmahoney.net/dc/"
        filename, _headers = urlretrieve(base_url + filename, filename)

    actual_bytes = os.stat(filename).st_size
    if actual_bytes != expected_bytes:
        # Show the size that was actually found before failing.
        print(actual_bytes)
        raise Exception('Failed to verify ' + filename + '. Can you get to it with a browser?')

    print('Found and verified', filename)
    return filename

# Download text8.zip if it is not already present, then check it is exactly 31344016 bytes.
filename = maybe_download('text8.zip', 31344016)

Found and verified text8.zip


也可以直接在瀏覽器輸入 http://mattmahoney.net/dc/text8.zip 下載資料集。

接下來解壓縮資料集，並使用 tf.compat.as_str 將資料集轉成單詞的列表。

In [0]:
# Load the corpus into a flat list of words
def read_data(filename):
    """Return the whitespace-separated tokens of the first file in a zip archive.

    Args:
        filename: Path (or file-like object) accepted by `zipfile.ZipFile`.

    Returns:
        A list of str tokens obtained by splitting the decoded text on whitespace.
    """
    with zipfile.ZipFile(filename) as archive:
        first_member = archive.namelist()[0]
        raw_bytes = archive.read(first_member)
    text = tf.compat.as_str(raw_bytes)
    return text.split()

# Read the whole corpus into memory as a list of tokens and report its length.
words = read_data(filename)
print ('Data size', len(words))

Data size 17005207


通過輸出可知，資料集最後被轉換成一個包含 17005207 個單詞的列表。

In [0]:
vocabulary_size = 50000  # Vocabulary size: the most frequent words (plus the 'UNK' entry) are put into count and then into dictionary


def build_dataset(words, vocab_size=None):
    """Encode a list of tokens as integer ids over a frequency-capped vocabulary.

    Id 0 is reserved for 'UNK'; the remaining ids are handed out in descending
    frequency order to the `vocab_size - 1` most common tokens. Every other
    token is encoded as 0.

    Args:
        words: Sequence of str tokens (it is iterated more than once).
        vocab_size: Total number of vocabulary entries including 'UNK'.
            Defaults to the module-level `vocabulary_size`.

    Returns:
        A tuple `(data, count, dictionary, reverse_dictionary)`:
        data: list of int ids, one per token in `words`.
        count: `[['UNK', n_unknown], (word, frequency), ...]`.
        dictionary: mapping word -> id.
        reverse_dictionary: mapping id -> word.
    """
    if vocab_size is None:
        vocab_size = vocabulary_size

    frequencies = collections.Counter(words)
    # A literal 'UNK' token must not get a second vocabulary slot: re-inserting it
    # would overwrite id 0 and make the following word share its id. Treat it as
    # an unknown word instead.
    frequencies.pop('UNK', None)

    # First entry is the UNK placeholder; its -1 is replaced by the real count below.
    count = [['UNK', -1]]
    count.extend(frequencies.most_common(vocab_size - 1))  # -1 because UNK takes one slot

    # word -> id, ids follow the order of `count`
    dictionary = {word: index for index, (word, _) in enumerate(count)}

    # Encode the corpus; anything outside the vocabulary becomes 0 (UNK).
    data = [dictionary.get(word, 0) for word in words]

    # Only UNK maps to id 0, so the number of zeros is the number of unknown tokens.
    unk_count = data.count(0)
    count[0][1] = unk_count

    # id -> word
    reverse_dictionary = {index: word for word, index in dictionary.items()}
    return data, count, dictionary, reverse_dictionary


# Encode the corpus: data holds the word ids, count the (word, frequency) table,
# dictionary maps word -> id and reverse_dictionary maps id -> word.
data, count, dictionary, reverse_dictionary = build_dataset(words)

words[:10]

输出：
['anarchism',
 'originated',
 'as',
 'a',
 'term',
 'of',
 'abuse',
 'first',
 'used',
 'against']

data[:10] 

输出：
[5239, 3084, 12, 6, 195, 2, 3137, 46, 59, 156]

count[:10]

输出：
[['UNK', 418391],
 ('the', 1061396),
 ('of', 593677),
 ('and', 416629),
 ('one', 411764),
 ('in', 372201),
 ('a', 325873),
 ('to', 316376),
 ('zero', 264975),
 ('nine', 250430)]

dictionary # 词：编号

输出：
{'fawn': 45848,
 'homomorphism': 9648,
 'nordisk': 39343,
 'nunnery': 36075,
 'chthonic': 33554,
 'sowell': 40562,
 'sonja': 38175,
 'showa': 32906,
 'woods': 6263,
 'hsv': 44222,
 'spiders': 14623,
 'hanging': 8021,
 'woody': 11150,
 ...
}

dictionary['UNK']

输出：
0

dictionary['a']

输出：
6

reverse_dictionary # 编号：词

输出：
{0: 'UNK',
 1: 'the',
 2: 'of',
 3: 'and',
 4: 'one',
 5: 'in',
 6: 'a',
 7: 'to',
 8: 'zero',
 ...
}

In [0]:
del words  # Drop the raw word list to free memory
print ('Most common word (+UNK)', count[:5])  # Show the first five entries of count
print ('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]]) # First 10 word ids and the words they map to

Most common word (+UNK) [['UNK', 418391], ('the', 1061396), ('of', 593677), ('and', 416629), ('one', 411764)]
Sample data [5234, 3081, 12, 6, 195, 2, 3134, 46, 59, 156] ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against']


In [0]:
# Generate Word2Vec training samples.
# data_index is the read position in data; generate_batch advances it on every call.
data_index = 0


def generate_batch(batch_size, num_skips, skip_window):
    """Build one skip-gram training batch from the module-level `data` list.

    A window of `2 * skip_window + 1` word ids slides over `data`, starting at
    the module-level `data_index`. For each window position the centre word is
    paired with `num_skips` distinct, randomly chosen context words from the
    same window.

    Args:
        batch_size: Number of (centre, context) pairs to produce; must be a
            multiple of `num_skips`.
        num_skips: Number of pairs generated per centre word; at most
            `2 * skip_window`.
        skip_window: Number of words considered on each side of the centre word.

    Returns:
        A tuple `(batch, labels)`: `batch` is an int32 array of shape
        `(batch_size,)` holding centre word ids, `labels` is an int32 array of
        shape `(batch_size, 1)` holding the matching context word ids.

    Side effects:
        Advances the global `data_index` (wrapping around the end of `data`) so
        that the next call continues with the next centre word.
    """
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window

    # Every element is written below, so uninitialised storage is fine.
    batch = np.empty(batch_size, dtype=np.int32)
    labels = np.empty((batch_size, 1), dtype=np.int32)

    # Window size: the centre word plus skip_window words on each side.
    span = 2 * skip_window + 1

    # A deque with maxlen=span drops its oldest element on append once full,
    # which makes it a sliding window over the word ids.
    buffer = collections.deque(maxlen=span)
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)

    # Window positions that may serve as context, i.e. all but the centre.
    context_positions = [pos for pos in range(span) if pos != skip_window]

    # One centre word per iteration; the assertion above guarantees exact division.
    for i in range(batch_size // num_skips):
        # Draw num_skips distinct context positions for this centre word.
        chosen = random.sample(context_positions, num_skips)
        for j, context_pos in enumerate(chosen):
            batch[i * num_skips + j] = buffer[skip_window]  # feature: the centre word
            labels[i * num_skips + j, 0] = buffer[context_pos]  # label: a context word

        # Slide the window one word forward.
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)

    # The window has been read `span` words past the last centre that was used.
    # Step back by that amount so the next call refills the window right after
    # it instead of skipping `span` centre words between batches.
    data_index = (data_index - span) % len(data)
    return batch, labels

調用 generate_batch 函數測試一下功能。

In [0]:
# Produce a tiny batch and print each (centre word -> context word) pair with its ids.
batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
for center_id, context_id in zip(batch, labels[:, 0]):
    print(center_id, reverse_dictionary[center_id], '->', context_id, reverse_dictionary[context_id])

3081 originated -> 12 as
3081 originated -> 5234 anarchism
12 as -> 3081 originated
12 as -> 6 a
6 a -> 12 as
6 a -> 195 term
195 term -> 6 a
195 term -> 2 of


In [0]:
# Training hyperparameters
batch_size = 128
embedding_size = 128  # Dimensionality of the dense vector each word is mapped to
skip_window = 1  # Number of words considered on each side of the target word
num_skips = 2   # Number of samples generated per target word

# Validation data: randomly pick some of the most frequent words to check whether their nearest neighbours in vector space are related
valid_size = 16  # Number of validation words to draw
valid_window = 100  # Validation ids are drawn only from the 100 lowest ids, i.e. the most frequent entries
valid_examples = np.random.choice(valid_window, valid_size, replace=False)  # Random draw without replacement
num_sampled = 64  # Number of noise words used as negative samples during training

In [0]:
# Build the skip-gram model: embedding lookup, NCE loss, SGD step and a
# cosine-similarity op for the validation words.
graph = tf.Graph()
with graph.as_default():

    # Inputs for one training batch: centre word ids and their context word ids.
    train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    # The randomly drawn validation word ids as a graph constant.
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

    # Place the embedding variables and the lookup on the CPU
    # (use '/gpu:0' here instead to place them on a GPU).
    with tf.device('/cpu:0'):
        # Randomly initialised embedding matrix of shape [vocabulary_size, embedding_size].
        embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
        # Embedding vectors of the words in train_inputs.
        embed = tf.nn.embedding_lookup(embeddings, train_inputs)

        # Weights and biases used by the NCE loss.
        nce_weights = tf.Variable(
            tf.truncated_normal([vocabulary_size, embedding_size], stddev=1.0 / math.sqrt(embedding_size)))
        nce_bias = tf.Variable(tf.zeros([vocabulary_size]))

    # NCE loss of the looked-up embeddings against the labels, averaged over the batch.
    loss = tf.reduce_mean(
        tf.nn.nce_loss(weights=nce_weights, biases=nce_bias, labels=train_labels, inputs=embed, num_sampled=num_sampled,
                       num_classes=vocabulary_size))

    # Plain SGD with a learning rate of 1.0.
    optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

    # L2 norm of every embedding row; `keepdims` replaces the deprecated `keep_dims`.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
    # Unit-length embeddings.
    normalized_embeddings = embeddings / norm
    # Similarity between each validation word and every word in the vocabulary
    # (dot product of unit vectors).
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
    similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)

    # Op that initialises all model variables.
    init = tf.global_variables_initializer()

Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [0]:
# Number of training steps (use 100001 for a longer run).
num_steps = 10001

with tf.Session(graph=graph) as session:
    init.run()
    print('Initialized')

    top_k = 8  # number of nearest neighbours listed per validation word
    average_loss = 0
    for step in range(num_steps):
        batch_inputs, batch_labels = generate_batch(batch_size, num_skips, skip_window)
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

        # One optimisation step; also fetch the loss of this batch.
        _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += loss_val

        # Every 2000 steps report the mean loss since the previous report
        # (at step 0 this is just the loss of the first batch).
        if step % 2000 == 0:
            if step > 0:
                average_loss /= 2000
            print('Average loss at step {} : {}'.format(step, average_loss))
            average_loss = 0

        # Every 10000 steps list the nearest neighbours of each validation word.
        if step % 10000 == 0:
            sim = similarity.eval()
            for i in range(valid_size):
                valid_word = reverse_dictionary[valid_examples[i]]
                # Sort by descending similarity; position 0 is the word itself, so skip it.
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                log_str = 'Nearest to {} :'.format(valid_word)

                for k in range(top_k):
                    close_word = reverse_dictionary[nearest[k]]
                    log_str = '{} {},'.format(log_str, close_word)
                print(log_str)

    # Evaluate the normalised embeddings once, after training has finished
    # (doing this inside the loop would recompute the full matrix on every step).
    final_embeddings = normalized_embeddings.eval()

Initialized
Average loss at step 0 : 272.80242919921875
Nearest to after : pacifists, callimico, hellboy, fibrous, retroactively, kodak, prot, galois,
Nearest to will : lifespan, durability, novelty, philippi, respects, canning, vespasian, breda,
Nearest to one : glaukos, minesweeper, roch, infiltration, researchers, greatness, mistrust, undesired,
Nearest to four : homonyms, motherboard, humphries, stenella, buchan, cans, eriksson, myasthenia,
Nearest to are : star, xenophobic, drusus, voiceless, pup, freeze, deposing, refreshed,
Nearest to is : sam, toothed, censuses, capricornus, weaknesses, subclass, papen, unlicensed,
Nearest to see : godavari, falconer, endothermic, covenants, shiite, mendes, metamorphic, caused,
Nearest to five : horace, contributing, logical, dreamt, ubangi, solvent, resorts, nicolau,
Nearest to have : covariant, polymorphic, convoys, overthrowing, rd, masculists, papacy, funniest,
Nearest to on : transitioning, numerology, genomes, shuffle, meaningless, torres

下面定義一個用來可視化 Word2Vec 效果的函數。這裡 low_dim_embs 是降維到 2 維 的單詞的空間向量，我們將在圖表中展示每個單詞的位置。使用 plt.scatter 顯示散點圖（單詞的位置），並用 plt.annotate 展示單詞本身，同時，使用 plt.savefig 保存圖片到本地文件。

In [0]:
def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
    """Draw labelled points on an 18x18 figure and save it to `filename`.

    Args:
        low_dim_embs: 2-D array whose rows unpack into an (x, y) pair.
        labels: Sequence of annotation texts; label i belongs to row i.
        filename: Path the figure is saved to.

    Raises:
        AssertionError: If there are more labels than rows in `low_dim_embs`.
    """
    n_points = low_dim_embs.shape[0]
    assert n_points >= len(labels), 'More labels then embeddings'

    plt.figure(figsize=(18, 18))

    # Placement options shared by every annotation.
    annotation_style = dict(
        xytext=(5, 2),
        textcoords='offset points',
        ha='right',
        va='bottom',
    )
    for row_idx, text in enumerate(labels):
        x, y = low_dim_embs[row_idx, :]
        plt.scatter(x, y)
        plt.annotate(text, xy=(x, y), **annotation_style)

    plt.savefig(filename)

我們使用 sklearn.manifold.TSNE 實現降維，這裡直接將原始的 128 維的嵌入向量降到 2 維，再用前面的 plot_with_labels 函數進行展示。這裡只展示詞頻最高的 100 個單詞的可視化結果。

In [0]:
from sklearn.manifold import TSNE
import matplotlib
# Select the Agg backend before pyplot is imported; the figure is written to a file by plot_with_labels.
matplotlib.use('Agg')
import matplotlib.pyplot as plt
# t-SNE projection down to 2 components, initialised with PCA.
tsne=TSNE(perplexity=30,n_components=2,init='pca',n_iter=5000)
# Only the first plot_only rows of final_embeddings (word ids 0..plot_only-1) are projected and drawn.
plot_only=100
low_dim_embs=tsne.fit_transform(final_embeddings[:plot_only,:])
# NOTE: this rebinds the global name `labels`, which earlier held the batch labels from generate_batch.
labels=[reverse_dictionary[i] for i in range(plot_only)]
plot_with_labels(low_dim_embs,labels)