# 训练词向量



## 利用`gensim.models.word2vec`训练词向量
- 原始语料为[中文维基](https://dumps.wikimedia.org/zhwiki/)

0. 原始语料为 `xml` 格式，需要提取出正文，使用 `WikiExtractor` 包
    0. 命令行提取正文：`python WikiExtractor.py -b 500M -o wiki zhwiki-20190720-pages-articles-multistream.xml.bz2`
    0. 获得的文件中，正文被包含在 `<doc></doc>` 标签内
0. 或者使用 `gensim.corpora.WikiCorpus` 直接处理 `xml.bz2` 文件
0. 对以上任一方式获得的文本进行预处理，使其满足：**每一行一句话，单词间用空格隔开**

In [None]:
# Shell command: run WikiExtractor on the compressed dump, writing the extracted
# text under datasets/wiki (-o) with the -b 500M size option.
!python WikiExtractor.py -b 500M -o datasets/wiki datasets/zhwiki-20190720.xml.bz2

In [None]:
import re

def preprocess_zhwiki_v1():
    """Turn WikiExtractor output into a one-sentence-per-line corpus.

    Reads the module-level ``input_file_path`` and writes to the module-level
    ``output_file_path``. Blank lines and the ``<doc ...>`` / ``</doc>``
    marker lines are dropped. Every remaining line is split on the Chinese
    sentence terminators, each sentence is converted from Traditional to
    Simplified Chinese with ``zhconv.convert(..., 'zh-cn')``, segmented with
    ``jieba.cut`` and written as one line of space-joined tokens.

    Empty fragments produced by the split (for example the remainder after a
    line's final terminator) are skipped, so no blank lines are written.

    NOTE(review): ``jieba`` and ``zhconv`` are used here but no import for
    them is visible in this file -- confirm they are imported in the session.
    """
    doc_marker = re.compile("(^<doc.*>$)|(^</doc>$)")
    sent_spliter = re.compile("。|！|？")

    # Context managers close both files even if conversion or segmentation
    # raises part-way through the input.
    with open(input_file_path, 'r', encoding='utf-8') as input_file, \
            open(output_file_path, 'w', encoding='utf-8') as output_file:
        for line in input_file:
            if not line.strip() or doc_marker.match(line):
                continue
            for s in sent_spliter.split(line):
                s = zhconv.convert(s, 'zh-cn').strip('\n')
                if not s.strip():
                    # Nothing left to segment; avoid emitting an empty line.
                    continue
                output_file.write(' '.join(jieba.cut(s)) + '\n')

In [None]:
from gensim.corpora import WikiCorpus


def preprocess_zhwiki_v2():
    """Convert a Wikipedia dump into a space-separated corpus, one article per line.

    Reads the module-level ``input_file_path`` through ``WikiCorpus`` and
    writes to the module-level ``output_file_path``. Every text fragment that
    ``WikiCorpus.get_texts()`` yields for an article is converted from
    Traditional to Simplified Chinese with ``zhconv.convert(..., 'zh-cn')``
    and segmented with ``jieba.cut``; all tokens of the article are joined
    with single spaces and written as one line. Progress is printed every
    200 articles.

    WikiCorpus removes punctuation, so the text is not split into sentences
    here.

    NOTE(review): ``jieba`` and ``zhconv`` are used here but no import for
    them is visible in this file -- confirm they are imported in the session.
    NOTE(review): ``lemmatize=False`` is kept from the original call; confirm
    the installed gensim version still accepts that argument.
    """
    article_count = 0

    wiki = WikiCorpus(input_file_path, lemmatize=False, dictionary={})
    # The context manager replaces the original `f.close()`, which referred to
    # an undefined name and therefore never closed the output file.
    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        for text in wiki.get_texts():
            tokens = []
            for fragment in text:
                # Fixed: the original converted an undefined name `s` instead
                # of the fragment being iterated.
                simplified = zhconv.convert(fragment, 'zh-cn')
                tokens.extend(jieba.cut(simplified))
            output_file.write(' '.join(tokens) + '\n')
            article_count += 1

            if article_count % 200 == 0:
                print('Saved ' + str(article_count) + ' articles')


# Paths read as module-level globals by the preprocessing functions.
# NOTE(review): wiki_00 is under the directory WikiExtractor writes its extracted
# text to, while preprocess_zhwiki_v2 passes input_file_path to WikiCorpus, which
# the notes at the top of this file describe as processing the xml.bz2 dump
# directly -- confirm which input/function pairing is intended.
input_file_path = r'datasets/wiki/AA/wiki_00'
output_file_path = r'datasets/wiki/AA/wiki_corpus'
preprocess_zhwiki_v2()

- 利用上一步预处理得到的、满足 `LineSentence` 格式的文本创建模型

In [None]:
from gensim.models import word2vec

# Train on the corpus file written by the preprocessing step.
corpus_path = output_file_path
model_path = r"models/wiki_corpus.model"


def build_model(corpus_path):
    """Train a CBOW word2vec model on a LineSentence corpus and save it.

    Args:
        corpus_path: Path to a text file with one sentence per line and
            tokens separated by spaces.

    Returns:
        The trained ``Word2Vec`` model. It is also saved to the module-level
        ``model_path``.
    """
    wiki_news = word2vec.LineSentence(corpus_path)
    model = word2vec.Word2Vec(
        wiki_news,
        sg=0,  # model type: CBOW
        size=50,  # dimensionality of the word vectors
        window=5,  # context window size
        min_count=5,  # ignore words that occur fewer than 5 times
        workers=9)
    model.save(model_path)
    return model


# Fixed: the original called build_model() without its required argument,
# which raises TypeError.
build_model(corpus_path)

- 验证训练得到的模型

In [None]:
from gensim.models import word2vec
# Load a saved Word2Vec model for the checks in the following cells.
# NOTE(review): this is a different file from the one the training cell saves
# (models/wiki_corpus.model) -- confirm that is intended.
model_path = r"models/zhwiki.50d.word2vec"
model = word2vec.Word2Vec.load(model_path)

In [None]:
# Words whose vectors are most similar to that of '数学'.
model.wv.most_similar('数学')

In [None]:
# Words whose vectors are most similar to that of '哲学'.
model.wv.most_similar('哲学')

In [None]:
# Analogy query: '女人' and '国王' contribute positively, '男人' negatively.
model.wv.most_similar(positive=['女人', '国王'], negative=['男人'])

In [None]:
# Similarity score between the vectors of two specific words.
two_corpus = ["腾讯", "阿里巴巴"]
first_word, second_word = two_corpus
res = model.wv.similarity(first_word, second_word)
print("similarity:%.4f" % res)

- 将词向量降维后进行可视化

In [None]:
from sklearn.decomposition import TruncatedSVD
import matplotlib.pyplot as plt
import numpy as np

# Select the SimHei font -- presumably so the Chinese labels in the plots render.
plt.rcParams['font.sans-serif'] = ['SimHei']
# Notebook magic: show matplotlib figures inline.
%matplotlib inline

# Word vectors of the loaded model, used by the cells below.
word_vectors = model.wv


def get_model_matrix(word_vectors, required_words, sample_size=10000):
    """Build a matrix of word vectors from a random vocabulary sample plus required words.

    Each included word gets exactly one row. A required word that is already
    part of the random sample is not appended a second time (the original
    added a duplicate row for it).

    Args:
        word_vectors: Object exposing ``vocab`` (a mapping keyed by word) and
            ``word_vec(word)``, which returns the vector of a word and raises
            ``KeyError`` for an unknown word.
        required_words: Words to include in addition to the random sample.
            Words for which ``word_vec`` raises ``KeyError`` are skipped and
            are therefore absent from the returned index.
        sample_size: Maximum number of vocabulary words to sample at random.

    Returns:
        A tuple ``(M, word2Ind)``: ``M`` stacks one vector per row and
        ``word2Ind`` maps every included word to its row index in ``M``.

    Raises:
        ValueError: Raised by ``np.stack`` when no vector could be collected.
    """
    import random

    vocabulary = list(word_vectors.vocab.keys())
    words = random.sample(vocabulary, min(sample_size, len(vocabulary)))
    print("Putting %i words into word2Ind and matrix M..." % len(words))

    word2Ind = {}
    M = []
    # Sampled words first, then the required ones, sharing one code path.
    for w in words + list(required_words):
        if w in word2Ind:
            continue
        try:
            vector = word_vectors.word_vec(w)
        except KeyError:
            continue
        # The next free row is always the current length of M.
        word2Ind[w] = len(M)
        M.append(vector)
    M = np.stack(M)
    print("Done.")
    return M, word2Ind


# Words whose embeddings are labelled in the plots below; they are passed to
# get_model_matrix as the required words.
words = [
    '数学', '算术', '公理', '积分', '统计', '善恶', '哲学', '伦理', '中国政府', '美国国会', '武侠小说',
    '风靡', '海内外', '受欢迎', '通俗小说', '中华人民共和国', '文化大革命', '反思', '伤痕', '一批', '白话文',
    '诗人', '古诗', '欢迎', '中华民国', '撤退', '台湾', '区别', '思潮', '过渡时期', '通称', '文献', '兴趣',
    '钻研', '语言学', '神秘主义', '更加', '经典', '历史学', '文学', '学术界', '享有', '前所未有', '趋势',
    '受到', '人文主义者', '巨量', '规则', '机器人', '精准', '身躯', '脑', '视频', '确保', '高质量', '适中',
    '价格', '软件设计', '构成', '互补', '并行', '系统分析', '程序设计', '支持', '高级', '课程', '训练',
    '工业', '技能', '羧酸', '柠檬酸', '高效率', '肽键', '细胞骨架', '细胞周期', '氯仿', '甘油', '变型',
    '鞘', '类固醇', '醛', '酮', '糖原', '单糖', '半乳糖', '葡萄糖', '糖苷键', '含氮', '杂环', '嘌呤',
    '辅酶', '底物', '化学能', '磷酸化', '哈康', '延斯', '挪威海', '捕鲸', '挪威政府', '成人礼', '巴伦支海',
    '哥德堡', '区域规划', '润州', '邳州市', '东海县', '丹阳市', '武进区', '临河', '嘈杂', '霰弹枪', '讲席',
    '一滴', '调换', '香港金融管理局', '美圆', '金管局', '毫', '大额', '铜币', '一圆', '镍币', '爆竹',
    '管理科', '中区', '收兑', '财政司'
]

# Build the vector matrix and the word -> row index used by the plotting cells.
M, word2Ind = get_model_matrix(word_vectors, words)

In [None]:
# Reduce dimensionality with truncated SVD
def reduce_to_k_dim(M, k=2):
    """Project the rows of ``M`` onto ``k`` dimensions with truncated SVD.

    Args:
        M: Matrix with one word vector per row.
        k: Number of output dimensions.

    Returns:
        The result of ``TruncatedSVD(...).fit_transform(M)``.
    """
    print("Running Truncated SVD over %i words..." % (M.shape[0]))
    reducer = TruncatedSVD(n_components=k, n_iter=10)
    embedded = reducer.fit_transform(M)
    print("Done.")
    return embedded

M_reduced = reduce_to_k_dim(M)

In [None]:
def plot_embeddings(M_reduced, word2Ind, words):
    """Scatter-plot the 2-D embeddings of ``words`` with a text label per point.

    Args:
        M_reduced: Indexable by row; each row unpacks into an ``(x, y)`` pair.
        word2Ind: Mapping from word to its row index in ``M_reduced``.
        words: Words to plot. Words missing from ``word2Ind`` are skipped and
            reported instead of raising ``KeyError``.
    """
    _, ax = plt.subplots(1, 1, figsize=(24, 24))
    missing = []
    for word in words:
        index = word2Ind.get(word)
        if index is None:
            missing.append(word)
            continue
        x, y = M_reduced[index]
        # Draw on the axes created above rather than via the implicit
        # current-axes state of pyplot.
        ax.scatter(x, y, marker='o', color='red')
        ax.text(x, y, word, fontsize=9)
    if missing:
        print("Skipped %i words without an embedding: %s"
              % (len(missing), ' '.join(missing)))


plot_embeddings(M_reduced, word2Ind, words)

In [None]:
# 利用 TSNE 算法进行降维
from sklearn.manifold import TSNE

def tsne_plot(M, word2Ind, words):
    """Reduce ``M`` to two dimensions with t-SNE and plot the selected words.

    Args:
        M: Matrix with one word vector per row.
        word2Ind: Mapping from word to its row index in ``M``.
        words: Words to plot. Words missing from ``word2Ind`` are skipped and
            reported instead of raising ``KeyError``.
    """
    # A fixed random_state keeps the layout reproducible between runs.
    tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=23)
    M_reduced = tsne_model.fit_transform(M)

    _, ax = plt.subplots(1, 1, figsize=(32, 32))
    missing = []
    for word in words:
        index = word2Ind.get(word)
        if index is None:
            missing.append(word)
            continue
        x, y = M_reduced[index]
        ax.scatter(x, y, marker='o', color='red')
        ax.text(x, y, word, fontsize=9)
    if missing:
        print("Skipped %i words without an embedding: %s"
              % (len(missing), ' '.join(missing)))


tsne_plot(M, word2Ind, words)

# t-SNE gives a better reduction than SVD here, but it is less efficient

- 关键词提取，从 `wv.most_similar()` 出发获取给定单词的相关单词
    - 通过 `wv.most_similar()` 得到的是出现在相似上下文中的同类词，并不一定是通常语义上的近义词

In [None]:
from collections import defaultdict


def get_related_words(initial_words, model, max_size=500, topn=20):
    """Collect words related to a seed by breadth-first expansion over ``most_similar``.

    Starting from the seed word(s), every word taken from the queue is counted,
    and the first time a word is taken its ``topn`` most similar words are
    appended to the queue. The search stops when the queue is empty or when
    ``max_size`` distinct words have been counted.

    A word is expanded only once. The original re-expanded a word every time
    it was dequeued, which never terminates when the reachable words form a
    closed set smaller than ``max_size``; as a consequence the counts here no
    longer include those repeated expansions.

    Args:
        initial_words: A single seed word, or an iterable of seed words.
        model: Object exposing ``most_similar(word, topn=...)`` that returns
            an iterable of ``(word, score)`` pairs.
        max_size: Maximum number of distinct words to collect.
        topn: Number of neighbours requested per expanded word.

    Returns:
        A ``defaultdict(int)`` mapping each visited word to the number of
        times it was taken from the queue.
    """
    from collections import deque

    if isinstance(initial_words, str):
        initial_words = [initial_words]
    # deque gives O(1) pops from the left; list.pop(0) shifts the whole list.
    unseen = deque(initial_words)
    seen = defaultdict(int)

    while unseen and len(seen) < max_size:
        node = unseen.popleft()
        if node not in seen:
            if len(seen) % 50 == 0:
                print('search length: {}'.format(len(seen)))
            unseen.extend(w for w, _ in model.most_similar(node, topn=topn))
        seen[node] += 1
    return seen


# Collect words related to the seed word "说" and display the resulting counts.
actions = get_related_words("说", word_vectors)
actions

- `wordcloud` 实现词云

In [None]:
# Word cloud of Bacon's Essays
data_path = r'datasets/Bacon Francis - Essays.txt'

import os

from os import path
from wordcloud import WordCloud

# Read the whole text; the context manager closes the file handle, which the
# original `open(data_path).read()` left open.
with open(data_path) as text_file:
    text = text_file.read()

# First cloud: WordCloud constructed with no arguments.
wordcloud = WordCloud().generate(text)

import matplotlib.pyplot as plt
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")

# Second cloud: max_font_size set to 40, drawn on a new figure.
wordcloud = WordCloud(max_font_size=40).generate(text)
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()