In [1]:
import jieba
import collections
import math
import os
import random
import zipfile
import string
import numpy as np
import urllib.request
import tensorflow as tf
import zhon.hanzi as zh

### 1. 下载数据

In [2]:
url = 'http://mattmahoney.net/dc/'

def maybe_download(filename, expected_bytes):
  """Download a file if not present, and make sure it's the right size."""
  if not os.path.exists(filename):  # 如果不存在，就下载
    filename, _ = urllib.request.urlretrieve(url + filename, filename)
  statinfo = os.stat(filename)
  if statinfo.st_size == expected_bytes:
    print('Found and verified', filename)
  else:
    print(statinfo.st_size)
    raise Exception(
        'Failed to verify ' + filename + '. Can you get to it with a browser?')
  return filename

In [3]:
filename = maybe_download('text8.zip', 31344016)

Found and verified text8.zip


In [4]:
!unzip -l text8.zip

Archive:  text8.zip
  Length      Date    Time    Name
---------  ---------- -----   ----
100000000  2006-06-09 20:40   text8
---------                     -------
100000000                     1 file


读取成一个 list

```
def read_data(filename):
  """Extract the first file enclosed in a zip file as a list of words"""
  with zipfile.ZipFile(filename) as f:
    data = tf.compat.as_str(f.read(f.namelist()[0])).split()
  return data

words = read_data(filename)  # 读取成 list
print('Data size', len(words))  
```

In [5]:
def read_data(filename):
    """Read and cut texts, return a long string."""
    jieba.setLogLevel(20)
    jieba.enable_parallel(4)
    with open(filename, 'r') as f:
        data = f.read()
    texts = ' '.join(jieba.cut(data))
#     texts = re.split(r'[{}|\n]'.format(zh.punctuation), texts) # 按标点分会太稀疏
    texts = texts.split('\n')
    return texts

In [6]:
texts = read_data('tst_jin.txt')

In [7]:
with open('stop_words_chinese.txt') as f:
    chinese_stops = f.read()
    chinese_stops = chinese_stops.split('\n')

In [8]:
%%time
def clean_text(texts, chinese_stops, english_stops=None):
    # 把大写单词转为小写
    texts = [x.lower() for x in texts]
    # 去除英文标点
    texts = [''.join(c for c in x if c not in string.punctuation) for x in texts]
    # 去除中文标点符号, 上面以句子作为切分所以不需要再清楚中文标点了
    texts = [''.join(c for c in x if c not in zh.punctuation) for x in texts]
    # 去除中文停止词(停止词里面有数字，就不另外清理数字了)
    texts = [' '.join([word for word in x.split() if word not in (chinese_stops)]) for x in texts]
    # 去除英文停止词
#     texts = [' '.join([word for word in x.split() if word not in (english_stops)]) for x in texts]
    # 清楚多余的空格
    texts = [' '.join(x.split()) for x in texts]
    return texts

texts = clean_text(texts, chinese_stops)

CPU times: user 1.96 s, sys: 4 ms, total: 1.96 s
Wall time: 1.96 s


In [9]:
split_sentences = [s.split() for s in texts]
words = [x for sublist in split_sentences for x in sublist]

### 2. 创建一个字典

* data: 转换后的编码
* count：频数统计
* dictionary：词汇表
* reverse_dictionary：反转形式

In [10]:
vocabulary_size = 50000

In [11]:
def build_dataset(words):
  count = [['UNK', -1]]
  count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
  dictionary = dict()
  for word, _ in count:
    dictionary[word] = len(dictionary)  # 频率越高的词，在 dict 里面排的越前  
  data = list()
  unk_count = 0
  for word in words:
    if word in dictionary:
      index = dictionary[word]
    else:
      index = 0  # dictionary['UNK']
      unk_count += 1
    data.append(index)  # 最后返回的就是一个从 dict 获取的对 1700 万单词的编号
  count[0][1] = unk_count  # 修改 UNK 的值
  reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
  return data, count, dictionary, reverse_dictionary

data, count, dictionary, reverse_dictionary = build_dataset(words)
del words  # Hint to reduce memory.
print('Most common words (+UNK)', count[:5])
print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])

data_index = 0

Most common words (+UNK) [['UNK', 0], ('道', 1360), ('胡斐', 784), ('说', 562), ('便', 403)]
Sample data [11439, 534, 552, 11945, 391, 137, 125, 7124, 17289, 6] ['金庸', '飞狐', '外传', '第一章', '大雨', '商家堡', '胡一刀', '曲池', '天枢', '苗人凤']


### 3. 生成 Word2Vec 的训练样本

* batch_size：batch 的大小，
* skip_window: 指的是单词最远可以联系到的距离，设为 1，表示只能跟紧邻的两个单词生成样本
* num_skips：是对每个单词生成多少个样本，不能大于 skip_window 值的两倍，并且 batch_size 必须是它的整数倍(确保每个 batch 包含了一个词汇对应的所有样本)
* span: 对每个单词创建的相关样本时会使用到的单词数量，包括目标单词本身和前后的单词
* buffer: 在对 deque 使用 append 方法时，只会保留最后插入的 span 个变量

In [12]:
def generate_batch(batch_size, num_skips, skip_window):
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1  # [ skip_window target skip_window ]
    buffer = collections.deque(maxlen=span)
    # _ 表示对循环中的实际值我们没有兴趣，为的是把 buffer 填充满,这里是 3
    for _ in range(span):  
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    for i in range(batch_size // num_skips):
        target = skip_window  # target label at the center of the buffer
        targets_to_avoid = [ skip_window ]
        for j in range(num_skips):
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)
            targets_to_avoid.append(target)  # 这个是过滤用的，如果没有过滤，底下 labels 肯定会重复
            batch[i * num_skips + j] = buffer[skip_window]  # skip_window 是不变的，但 buffer 变了
            # labels 在这里结束一次循环，target 因为上面过滤过，肯定没在targets_to_avoid里面的，不然到不了这里
            labels[i * num_skips + j, 0] = buffer[target]  
        buffer.append(data[data_index])  # 第二层循环完了刚好遍历完一个词，然后所有样本后读入下一个词
        data_index = (data_index + 1) % len(data) 
    return batch, labels

batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
for i in range(8):
    print(batch[i], reverse_dictionary[batch[i]], 
        '->', labels[i, 0], reverse_dictionary[labels[i, 0]])

534 飞狐 -> 11439 金庸
534 飞狐 -> 552 外传
552 外传 -> 534 飞狐
552 外传 -> 11945 第一章
11945 第一章 -> 391 大雨
11945 第一章 -> 552 外传
391 大雨 -> 11945 第一章
391 大雨 -> 137 商家堡


### 4. 构建模型

In [13]:
batch_size = 128
embedding_size = 128  # 即将单词转为稠密向量的维度
skip_window = 1       # How many words to consider left and right.
num_skips = 2         # How many times to reuse an input to generate a label.

# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent.
valid_size = 16     # 用来抽取的验证单词数
valid_window = 100  # 验证单词只从频数最高的100个单词中抽取
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
num_sampled = 64    # Number of negative examples to sample.


In [None]:
graph = tf.Graph()
with graph.as_default():

    # 输入数据
    train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

    # Ops and variables pinned to the CPU because of missing GPU implementation
    with tf.device('/cpu:0'):
        # 查找输入数据的 embeddings
        embeddings = tf.Variable(
            tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
        embed = tf.nn.embedding_lookup(embeddings, train_inputs)  # 矩阵取行
    
        # Construct the variables for the NCE loss
        # 初始化权重参数，并把 nce_biases 设为 0
        nce_weights = tf.Variable(
            tf.truncated_normal([vocabulary_size, embedding_size],
                                stddev=1.0 / math.sqrt(embedding_size)))
        nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

    # Compute the average NCE loss for the batch.
    # tf.nce_loss automatically draws a new sample of the negative labels each
    # time we evaluate the loss.
    loss = tf.reduce_mean(
        tf.nn.nce_loss(weights=nce_weights,
                        biases=nce_biases,
                        labels=train_labels,
                        inputs=embed,
                        num_sampled=num_sampled,
                        num_classes=vocabulary_size))
    
    # Construct the SGD optimizer using a learning rate of 1.0.
    optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
    
    # Compute the cosine similarity between minibatch examples and all embeddings.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))  # L2 范数
    normalized_embeddings = embeddings / norm  # 标准化后的 normalized_embeddings
    valid_embeddings = tf.nn.embedding_lookup(
        normalized_embeddings, valid_dataset)  # 验证单词的嵌入向量
    # 计算验证单词的嵌入向量与词汇表中所有单词的相似性
    similarity = tf.matmul(
        valid_embeddings, normalized_embeddings, transpose_b=True)  
    
    # Add variable initializer.
    init = tf.global_variables_initializer()

# Step 5: Begin training.
num_steps = 100001

### 5. 开始训练

In [None]:
with tf.Session(graph=graph) as session:
    # We must initialize all variables before we use them.
    init.run()
    print("Initialized")

    average_loss = 0
    for step in range(num_steps):
        batch_inputs, batch_labels = generate_batch(
            batch_size, num_skips, skip_window)
        feed_dict = {train_inputs : batch_inputs, train_labels : batch_labels}
    
        # We perform one update step by evaluating the optimizer op (including it
        # in the list of returned values for session.run()
        _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += loss_val  # 累积 loss
    
        if step % 2000 == 0:
            if step > 0:
                average_loss /= 2000
            # 2000 步后计算一次平均 loss
            # The average loss is an estimate of the loss over the last 2000 batches.
            print("Average loss at step ", step, ": ", average_loss)
            average_loss = 0

    # Note that this is expensive (~20% slowdown if computed every 500 steps)
    # 每 1 万次循环计算一次验证单词与全部单词的相似度
    if step % 10000 == 0:
        sim = similarity.eval()
        for i in range(valid_size):
            valid_word = reverse_dictionary[valid_examples[i]]
            top_k = 8 # number of nearest neighbors
            nearest = (-sim[i, :]).argsort()[1:top_k+1]
            log_str = "Nearest to %s:" % valid_word
            for k in range(top_k):
                close_word = reverse_dictionary[nearest[k]]
                log_str = "%s %s," % (log_str, close_word)
            print(log_str)
    final_embeddings = normalized_embeddings.eval()

Initialized
Average loss at step  0 :  289.953857422
Average loss at step  2000 :  128.231767511
Average loss at step  4000 :  53.9451430967
Average loss at step  6000 :  30.4074878168
Average loss at step  8000 :  19.211094013
