## word2vec_skip-gram

In [3]:
'''Importing the required packages'''
import random
import collections
import math
import os
import zipfile
import time
import re
import numpy as np
import tensorflow as tf

from matplotlib import pylab
%matplotlib inline

from six.moves import range
from six.moves.urllib.request import urlretrieve

'''Make sure the dataset link is copied correctly'''
dataset_link = 'http://mattmahoney.net/dc/'
zip_file = 'text8.zip'

In [2]:
# 下载有Matt Mahoney 收集和清理的危机百科文章数据集，并将其存储为当前工作目录下的单独文件
def data_download(zip_file):
    """Download the required file"""
    if not os.path.exists(zip_file):
        zip_file, _ = urlretrieve(dataset_link + zip_file, zip_file)
        print('File downloaded successfully!')
    return None
data_download(zip_file)

In [4]:
# 压缩的文本数据集在内部文件夹数据集中提取，稍后将用于训练模型
"""Extracting the dataset in separate folder"""
extracted_folder = 'dataset'

if not os.path.isdir(extracted_folder):
    with zipfile.ZipFile(zip_file) as zf:
        zf.extractall(extracted_folder)
        
with open('dataset/text8') as ft_:
    full_text = ft_.read()

In [5]:
# 由于输入数据的文本中有多个标点符号和其他符号，相同的符号将被替换为带有标点符号名称和符号类型的相应字符
# 有助于让模型单独识别每个标点符号和其他符号并生成向量
def text_processing(ft8_text):
    """Replacing punctuation marks with tokens"""
    ft8_text = ft8_text.lower()
    ft8_text = ft8_text.replace('.', '<period>')
    ft8_text = ft8_text.replace(',', '<comma>')
    ft8_text = ft8_text.replace('"', '<quotation>')
    ft8_text = ft8_text.replace(';', '<semicolon>')
    ft8_text = ft8_text.replace('!', '<exclamation>')
    ft8_text = ft8_text.replace('?', '<question>')
    ft8_text = ft8_text.replace('(', '<paren_l>')
    ft8_text = ft8_text.replace(')', '<paren_r>')
    ft8_text = ft8_text.replace('--', '<hyphen>')
    ft8_text = ft8_text.replace(':', '<colon>')
    ft8_text_tokens = ft8_text.split()
    return ft8_text_tokens

ft_tokens = text_processing(full_text)

In [6]:
# 为了提高所产生的向量表示的质量，建议去除与单词相关的噪音，即输入数据集中词频小于7的单词，因为这些单词没有足够的信息来提供它们的上下文
# 可以通过检查单词数和数据集中的分布来调整此阈值，在此处设为7
"""Shortlisting words with frequency more than 7"""
word_cnt = collections.Counter(ft_tokens)
shortlisted_words = [w for w in ft_tokens if word_cnt[w] > 7]

# 列出数据集中词频最高的几个单词
print(shortlisted_words[:15])

['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against', 'early', 'working', 'class', 'radicals', 'including']


In [7]:
# 检查数据集中所有单词的统计信息
print("Total number of shortlisted words : ", len(shortlisted_words))
print("Unique number of shortlisted_words: ", len(set(shortlisted_words)))

Total number of shortlisted words :  16616688
Unique number of shortlisted_words:  53721


In [8]:
# 为了处理语料库中存在的独特单词，我们制作了一组单词和它们在训练数据集中的词频
# 创建一个字典并将单词转换为整数，反之，将整数转换为单词
# 词频最高的单词被赋予最小值0， 其他单词也通过相似方式被赋予数值，从单词转换而来的整数倍存储在一个单独的数组中
def dict_creation(shortlisted_words):
    """The function creates a dictionary of the words present in dataset along with their frequency order"""
    counts = collections.Counter(shortlisted_words)
    vocabulary = sorted(counts, key=counts.get, reverse=True)
    rev_dictionary_ = {ii: word for ii, word in enumerate(vocabulary)}
    # print(rev_dictionary_)
    dictionary_ = {word: ii for ii, word in rev_dictionary_.items()}
    # print(dictionary_)
    return dictionary_, rev_dictionary_

dictionary_, rev_dictionary_ = dict_creation(shortlisted_words)
words_cnt = [dictionary_[word] for word in shortlisted_words]

### skip-gram 模型采用子采样的方法来处理文本中的停止词
### 通过在词频上设置阈值，可以消除所有那些词频较高且中心词周围没有任何重要上下文的单词，这带来了更快的训练速度和更好的词向量表示

### skip-gram论文中给出的概率分数函数，对于训练集中的每个单词，我们将根据以下公式给定的概率来决定是否将其移除
$$ P(w_{i}) = 1-\left( \sqrt\frac{t}{f(w_{i})}\right)$$
### 其中, t是阈值参数，$f(w_{i})$是单词$w_i$在总数据集中的词频

In [38]:
"""Creaing the threshold and performing the subsampling"""
thresh = 0.00005
word_counts = collections.Counter(words_cnt)
total_count = len(words_cnt)
freqs = {word: count/total_count for word, count in word_counts.items()}
p_drop = {word: 1-np.sqrt(thresh/freqs[word]) for word in word_counts}
train_words = [word for word in words_cnt if p_drop[word] < random.random()]

### 当skip-gram模型接受中心词并预测其周围单词时，skipG_target_set_generation()函数以所需格式创建skip-gram模型的输入

In [39]:
def skipG_target_set_generation(batch_, batch_index, word_window):
    """The function combines the words of given word_window size next to the index,
    for the SkipGram model"""
    random_num = np.random.randint(1, word_window+1)
    words_start = batch_index - random_num if (batch_index - random_num) > 0 else 0
    words_stop = batch_index + random_num
    window_target = set(batch_[words_start:batch_index] + batch_[batch_index+1:words_stop+1])
    return list(window_target)

### skipG_batch_creation()函数调用skipG_target_set_generation()函数，并创建中心词及其周围单词的组合格式，将其作为目标文本并返回批输出

In [40]:
def skipG_batch_creation(short_words, batch_length, word_window):
    """The function internally makes use of the skipG_target_set_generation() function and
    combines each of the label words in the shortlisted_words with the words of word_window size around"""
    batch_cnt = len(short_words)//batch_length
    short_words = short_words[:batch_cnt*batch_length]
    
    for word_index in range(0, len(short_words), batch_length):
        input_words, label_words = [], []
        word_batch = short_words[word_index:word_index+batch_length]

    for index_ in range(len(word_batch)):
        batch_input = word_batch[index_]
        batch_label = skipG_target_set_generation(word_batch, index_, word_window)
        # Appending the label and inputs to the initial list. Replicating input to the size of labels in he window
        label_words.extend(batch_label)
        input_words.extend([batch_input]*len(batch_label))
        yield input_words, label_words

### 注册一个用于skip-gram实现的TensorFlow图，并声明变量的输入和标签占位符，他们将用于按照中心词和周围单词的组合为输入单词和大小不同的批量分配单热编码常量

In [41]:
tf_graph = tf.Graph()
with tf_graph.as_default():
    input_ = tf.placeholder(tf.int32, [None], name='input_')
    label_ = tf.placeholder(tf.int32, [None, None], name='label_')

### 下面的代码声明嵌入矩阵的变量，该矩阵的维度等于词汇表的大小和词嵌入向量的维度

In [42]:
with tf_graph.as_default():
    word_embed = tf.Variable(tf.random_uniform((len(rev_dictionary_), 300), -1, -1))
    embedding = tf.nn.embedding_lookup(word_embed, input_)

### tf.train.AdamOprimier使用Adam算法来控制学习率

In [43]:
"""The code includes the following :
    # Initializing weights and bias to be used in the softmax layer
    # Loss function calculation using the Negative Sampling 
    # Usage of Adam Optimizer
    # Negative sampling on 100 words, to be included in the loss function
    # 300 is the word embedding vector size"""
vocabulary_size = len(rev_dictionary_)

with tf_graph.as_default():
    sf_weights = tf.Variable(tf.truncated_normal((vocabulary_size, 300), stddev=0.1))
    sf_bias = tf.Variable(tf.zeros(vocabulary_size))
    
    loss_fn = tf.nn.sampled_softmax_loss(weights=sf_weights, biases=sf_bias, labels=label_,
                                         inputs=embedding, num_sampled=100, num_classes=vocabulary_size)
    cost_fn = tf.reduce_mean(loss_fn)
    optim = tf.train.AdamOptimizer().minimize(cost_fn)

### 为了确保单词的向量表示保持了单词之间的语义相似性，我们在下面的代码部分生成一个验证集。
### 它将在语料库中选择常见和不常见词的组合，并基于词向量之间的余弦相似性返回最接近它们的单词

In [44]:
"""The below code performs the following operations : 
    # Performing validation here by making use of a random selection of 16 words from the dictionary of desired size
    # Selecting 8 words randomly from range of 1000
    # Using the cosine distance to calculate the similarity between the words"""
with tf_graph.as_default():
    validation_cnt = 16
    validation_dict = 100
    
    validation_words = np.array(random.sample(range(validation_dict), validation_cnt//2))
    validation_words = np.append(validation_words, random.sample(range(1000, 1000+validation_dict), validation_cnt//2))
    validation_data = tf.constant(validation_words, dtype=tf.int32)
    
    normalization_embed = word_embed / (tf.sqrt(tf.reduce_sum(tf.square(word_embed), 1, keepdims=True)))
    validation_embed = tf.nn.embedding_lookup(normalization_embed, validation_data)
    word_similarity = tf.matmul(validation_embed, tf.transpose(normalization_embed))

### 在当前工作目录中创建文件夹model_checkpoint以存储模型检查点

In [45]:
"""Creating the model checkpoint directory"""
# !mkdir model_checkpoint # if the dir do not exist, run the code
epochs = 2 # Increase it as per computation resources. It has been kept low here for users to replicate the process, increase to 100 or more.
batch_length = 1000
word_window = 10

with tf_graph.as_default():
    saver = tf.train.Saver()

with tf.Session(graph=tf_graph) as sess:
    iteration = 1
    loss = 0
    sess.run(tf.global_variables_initializer())
    
    for e in range(1, epochs+1):
        batches = skipG_batch_creation(train_words, batch_length, word_window)
        start = time.time()
        for x, y in batches:
            train_loss, _ = sess.run([cost_fn, optim], feed_dict={input_:x, label_:np.array(y)[:, None]})
            
            loss += train_loss
            print(loss)
            if iteration % 100 == 0:
                end = time.time()
                print("Epoch {}/{}".format(e, epochs), ",Iteration: {}".format(iteration), 
                      ", Avg. Training loss: {:.4f}".format(loss/100), ", Processing : {:.4f} sec/batch".format((end-start)/100))
                loss = 0
                start = time.time()
                
            if iteration % 2000 == 0:
                similarity_ = word_similarity.eval()
                for i in range(validation_cnt):
                    validated_words = rev_dictionary_[validation_words[i]]
                    top_k = 8 # number of nearest neighbors
                    nearest = (-similarity_[i, :]).argsort()[1:top_k+1]
                    log = 'Nearest to %s: ' % validated_words
                    for k in range(top_k):
                        close_word = rev_dictionary_[nearest[k]]
                        log = '%s %s, ' % (log, close_word)
                    print(log)
            iteration += 1
    save_path = saver.save(sess, "model_checkpoint/skipGram_test8.ckpt")
    embed_mat = sess.run(normalization_embed)

7.568842887878418
13.640870094299316
21.048076629638672
27.72291851043701
34.18672800064087
40.04692459106445
47.378024101257324
53.42362117767334
59.93447208404541
65.40264081954956
70.12728977203369
75.31607007980347
80.2936224937439
84.5669298171997
88.76335191726685
92.29589533805847
95.77223372459412
99.04659080505371
102.69746041297913
106.1497175693512
109.70588755607605
112.56033873558044
115.28139472007751
118.2389907836914
121.42175269126892
124.29704594612122
127.2165846824646
130.08540105819702
132.50801157951355
135.02510333061218
137.24190139770508
139.11446678638458
141.51191818714142
143.93525302410126
145.9112514257431
147.97297751903534
149.7645285129547
151.6667252779007
153.5666607618332
155.40721476078033
157.13450229167938
158.8536570072174
160.54844629764557
162.00453579425812
164.53635728359222
166.13683414459229
167.88430547714233
169.3502563238144
171.09871542453766
172.48077714443207
174.19820153713226
175.75819671154022
177.21798646450043
178.47900366783142


KeyboardInterrupt: 

### 所有其他迭代也将打印出类似的输出结果，经过训练的网络将被还原，供以后使用

In [None]:
"""The Saver class adds ops to save and restore variables to and from checkpoints."""
with tf_graph.as_default():
    saver = tf.train.Saver()
    
with tf.Session(graph = tf_graph) as sess:
    """Restaring the trained network"""
    saver.restore(sess, tf.train.latest_checkpoint('model_checkpoint'))
    embed_mat = sess.run(word_embed)

### 使用t分布随机邻嵌入(t-SNE)来实现可视化
### 250个随机单词的300度高维向量表示已经在二维向量空间中使用
### t-SNE确保了向量的初始结构可以在新维度中被保留，甚至是在转换后

In [36]:
word_graph = 250
tsne = TSNE()
word_embedding_tsne = tsne.fit_transform(embed_mat[:word_graph, :])

NameError: name 'TSNE' is not defined

### 具有语义相似性的单词在其二维空间中表示中彼此更接近，从而及时在维度进一步减小之后也保持着它们的相似性。
### 注入year、years和age之类的单词的位置较为接近，并且与international和religious等单词距离较远。
### 训练模型是可以采用更多迭代，一实现更好的词嵌入表示，并且通过改变阈值来调整结果。