### 环境说明：

In [1]:
%load_ext watermark
%watermark -a 'Scott Ming' -v -m -d -p numpy,pandas,matplotlib,tensorflow

Scott Ming 2017-04-01 

CPython 3.5.2
IPython 5.2.2

numpy 1.12.1
pandas 0.19.2
matplotlib 2.0.0
tensorflow 1.0.1

compiler   : GCC 4.9.2
system     : Linux
release    : 3.16.0-4-amd64
machine    : x86_64
processor  : 
CPU cores  : 4
interpreter: 64bit


## 4. 神经网络语言模型

In [2]:
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import jieba
import random
import os
import re
import string
import requests
import collections
import io
import tarfile
import gzip
import zhon.hanzi as zh
from nltk.corpus import stopwords
from tensorflow.python.framework import ops
ops.reset_default_graph()

### 4.1 读取并清理数据

In [3]:
def read_data(filename):
    """Read and cut texts, return a long string."""
    jieba.setLogLevel(20)
    jieba.enable_parallel(4)
    with open(filename, 'r') as f:
        data = f.read()
    texts = ' '.join(jieba.cut(data))
#     texts = re.split(r'[{}|\n]'.format(zh.punctuation), texts) # 按标点分会太稀疏
    texts = texts.split('\n')
    return texts

读取 stop_words

In [4]:
with open('stop_words_chinese.txt') as f:
    chinese_stops = f.read()
    chinese_stops = chinese_stops.split('\n')
    
english_stops = stopwords.words('english')

In [5]:
%%time 
texts = read_data('tst_jin.txt')

CPU times: user 1.1 s, sys: 104 ms, total: 1.21 s
Wall time: 1.76 s


In [6]:
%%time
def clean_text(texts, chinese_stops, english_stops=None):
    # 把大写单词转为小写
    texts = [x.lower() for x in texts]
    # 去除英文标点
    texts = [''.join(c for c in x if c not in string.punctuation) for x in texts]
    # 去除中文标点符号, 上面以句子作为切分所以不需要再清楚中文标点了
    texts = [''.join(c for c in x if c not in zh.punctuation) for x in texts]
    # 去除中文停止词(停止词里面有数字，就不另外清理数字了)
    texts = [' '.join([word for word in x.split() if word not in (chinese_stops)]) for x in texts]
    # 去除英文停止词
    texts = [' '.join([word for word in x.split() if word not in (english_stops)]) for x in texts]
    # 清楚多余的空格
    texts = [' '.join(x.split()) for x in texts]
    return texts

texts = clean_text(texts, chinese_stops, english_stops)


# Texts must contain at least 3 words
texts = [x for x in texts if len(x.split()) > 2]

CPU times: user 2.13 s, sys: 4 ms, total: 2.14 s
Wall time: 2.13 s


### 4.2 定义模型参数

In [7]:
batch_size = 50         # 每次训练的 batch_size
embedding_size = 200    # 即将单词转为稠密向量的维度
vocabulary_size = 10000 # 预计进入训练的单词，这里设的是 top 10000
generations = 5000     # 训练次数
print_loss_every = 500  # 迭代 500 次，打印一次 loss

num_sampled = int(batch_size/2)  # 负样本的噪声单词熟练
window_size = 2         # How many words to consider left and right.

In [8]:
print_valid_every = 2000
valid_words = '说 杀 内力'.split()

### 4.3 生成字典

In [9]:
# Build dictionary of words
def build_dictionary(sentences, vocabulary_size):
    # Turn sentences (list of strings) into lists of words
    split_sentences = [s.split() for s in sentences]
    words = [x for sublist in split_sentences for x in sublist]
    
    # Initialize list of [word, word_count] for each word, starting with unknown
    count = [['RARE', -1]]
    
    # Now add most frequent words, limited to the N-most frequent (N=vocabulary size)
    count.extend(collections.Counter(words).most_common(vocabulary_size-1))
    
    # Now create the dictionary
    word_dict = {}
    # For each word, that we want in the dictionary, add it, then make it
    # the value of the prior dictionary length
    for word, word_count in count:
        word_dict[word] = len(word_dict)
    
    return word_dict, count  # 返回 count 方便查看频率和验证词

### 4.4 单词转为数字

In [10]:
def text_to_numbers(sentences, word_dict):
    # Initialize the returned data
    data = []
    for sentence in sentences:
        sentence_data = []
        # For each word, either use selected index or rare word index
        for word in sentence:
            if word in word_dict:
                word_ix = word_dict[word]
            else:
                word_ix = 0
            sentence_data.append(word_ix)
        data.append(sentence_data)
    return(data)

# Build our data set and dictionaries
word_dictionary, count = build_dictionary(texts, vocabulary_size)
word_dictionary_rev = dict(zip(word_dictionary.values(), word_dictionary.keys()))
text_data = text_to_numbers(texts, word_dictionary)

# Get validation word keys
valid_examples = [word_dictionary[x] for x in valid_words]

### 4.5 生成 word2vec 训练样本

In [11]:
# Generate data randomly (N words behind, target, N words ahead)
def generate_batch_data(sentences, batch_size, window_size, method='skip_gram'):
    # Fill up data batch
    batch_data = []
    label_data = []
    while len(batch_data) < batch_size:
        # select random sentence to start
        rand_sentence = np.random.choice(sentences)
        # Generate consecutive windows to look at
        window_sequences = [rand_sentence[max((ix-window_size),0):(ix+window_size+1)] for ix, x in enumerate(rand_sentence)]
        # Denote which element of each window is the center word of interest
        label_indices = [ix if ix<window_size else window_size for ix,x in enumerate(window_sequences)]
        
        # Pull out center word of interest for each window and create a tuple for each window
        if method=='skip_gram':
            batch_and_labels = [(x[y], x[:y] + x[(y+1):]) for x,y in zip(window_sequences, label_indices)]
            # Make it in to a big list of tuples (target word, surrounding word)
            tuple_data = [(x, y_) for x,y in batch_and_labels for y_ in y]
        elif method=='cbow':
            batch_and_labels = [(x[:y] + x[(y+1):], x[y]) for x,y in zip(window_sequences, label_indices)]
            # Make it in to a big list of tuples (target word, surrounding word)
            tuple_data = [(x_, y) for x,y in batch_and_labels for x_ in x]
        else:
            raise ValueError('Method {} not implemented yet.'.format(method))
            
        # extract batch and labels
        batch, labels = [list(x) for x in zip(*tuple_data)]
        batch_data.extend(batch[:batch_size])
        label_data.extend(labels[:batch_size])
    # Trim batch and label at the end
    batch_data = batch_data[:batch_size]
    label_data = label_data[:batch_size]
    
    # Convert to numpy array
    batch_data = np.array(batch_data)
    label_data = np.transpose(np.array([label_data]))
    
    return(batch_data, label_data)

In [12]:
batch_data, label_data = generate_batch_data(text_data, batch_size, window_size)

In [13]:
batch_data

array([2651, 2651,  316,  316,  316,  423,  423,  423,  423,    0,    0,
          0,    0,    0,    0,    0,    0,  948,  948,  948,  948,    0,
          0,    0,    0,  917,  917,  917,  917,    0,    0,    0,    0,
          0,    0,    0,    0,  902,  902,  902,  902,    0,    0,    0,
          0,  356,  356,  356,  356,    0])

### 4.6 构建模型

#### 1. 定义神经网络的结果和前向传播的输出结果

In [14]:
# Define Embeddings:
embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))

# NCE loss parameters
nce_weights = tf.Variable(tf.truncated_normal([vocabulary_size, embedding_size],
                                               stddev=1.0 / np.sqrt(embedding_size)))
nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

#### 2. 定义前向传播过程

In [15]:
# Create data/target placeholders
x_inputs = tf.placeholder(tf.int32, shape=[batch_size])
y_target = tf.placeholder(tf.int32, shape=[batch_size, 1])
valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

# Lookup the word embedding:
embed = tf.nn.embedding_lookup(embeddings, x_inputs)

#### 3. 定义损失函数

In [16]:
# Get loss from prediction
loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weights,
                                     biases=nce_biases,
                                     labels=y_target,
                                     inputs=embed,
                                     num_sampled=num_sampled,
                                     num_classes=vocabulary_size))

# Create optimizer
optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0).minimize(loss)

# Cosine similarity between words
norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
normalized_embeddings = embeddings / norm
valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)

#### 4. 创建一个会话来运行TensorFlow程序。

In [17]:
with tf.Session() as sess:
    init = tf.global_variables_initializer()
    sess.run(init)
    # Run the skip gram model.
    loss_vec = []
    loss_x_vec = []
    for i in range(generations):
        batch_inputs, batch_labels = generate_batch_data(text_data, batch_size, window_size)
        feed_dict = {x_inputs : batch_inputs, y_target : batch_labels}
    
        # Run the train step
        sess.run(optimizer, feed_dict=feed_dict)
    
        # Return the loss
        if (i+1) % print_loss_every == 0:
            loss_val = sess.run(loss, feed_dict=feed_dict)
            loss_vec.append(loss_val)
            loss_x_vec.append(i+1)
            print("Loss at step {} : {}".format(i+1, loss_val))
          
        # Validation: Print some random words and top 5 related words
        if (i+1) % print_valid_every == 0:
            sim = sess.run(similarity, feed_dict=feed_dict)
            for j in range(len(valid_words)):
                valid_word = word_dictionary_rev[valid_examples[j]]
                top_k = 5 # number of nearest neighbors
                nearest = (-sim[j, :]).argsort()[1:top_k+1]
                log_str = "Nearest to {}:".format(valid_word)
                for k in range(top_k):
                    close_word = word_dictionary_rev[nearest[k]]
                    log_str = "%s %s," % (log_str, close_word)
                print(log_str)

Loss at step 500 : 63.2232780456543
Loss at step 1000 : 3.889535903930664
Loss at step 1500 : 18.402950286865234
Loss at step 2000 : 22.192529678344727
Nearest to 说: 紧, 怔, 底, 田, 露,
Nearest to 杀: 渐, 般, 显, 养, 前,
Nearest to 内力: 想不起, 一说, 这时候, 洞小头, 没法,
Loss at step 2500 : 28.179908752441406
Loss at step 3000 : 12.110696792602539
Loss at step 3500 : 2.838730573654175
Loss at step 4000 : 5.9628400802612305
Nearest to 说: RARE, 紧, 底, 握, 怔,
Nearest to 杀: RARE, 养, 渐, 虚, 前,
Nearest to 内力: 想不起, 一说, 这时候, 洞小头, 没法,
Loss at step 4500 : 13.232660293579102
Loss at step 5000 : 7.305245876312256


## Refrences:

* [原始论文](https://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf)
* [Approximating the Softmax for Learning Word Embeddings](http://sebastianruder.com/word-embeddings-softmax/)
* [word2vec 中的数学原理详解（多图，WIFI下阅读） - 机器学习 - 算法组](http://suanfazu.com/t/word2vec-zhong-de-shu-xue-yuan-li-xiang-jie-duo-tu-wifixia-yue-du/178)
* [Word2Vec原理之层次Softmax算法 | 一灯@qiancy.com](http://qiancy.com/2016/08/17/word2vec-hierarchical-softmax/)
* [Tensorflow 的Word2vec demo解析 - 阁子 - 博客园](http://www.cnblogs.com/rocketfan/p/4976806.html)
* [Word2Vec-知其然知其所以然 - 作业部落 Cmd Markdown 编辑阅读器](https://www.zybuluo.com/Dounm/note/591752#322-使用negative-sampling优化)