In [2]:
import pandas as pd

# 0. gensim实践：

1. 读取预处理好的数据
2. 训练
3. 完事

# 1. 数据集路径

In [3]:
# Relative path of the preprocessed corpus file that the cells below read and train on.
merger_data_path = 'data/merged_train_test_seg_data.csv'

# 2. 载入数据

In [4]:
# header=None: the first line of the file is treated as data, not as column names.
merger_df = pd.read_csv(merger_data_path, header=None)

# Report the number of rows that were loaded.
print('merger_data_path data size {}'.format(merger_df.shape[0]))

# Preview the first few rows as the cell output.
merger_df.head()

merger_data_path data size 102871


Unnamed: 0,0
0,方向机 重 助力 泵 方向机 都 换 新 都 换 助力 泵 方向机 换 方向机 带 助力 重...
1,奔驰 ML500 排气 凸轮轴 调节 错误 有没有 电脑 检测 故障 代码 有发 一下 发动...
2,2010 款 宝马X1 2011 年 出厂 20 排量 通用 6L45 变速箱 原地 换挡 ...
3,30V6 发动机 号 位置 照片 最好 右侧 排气管 上方 缸体 上 靠近 变速箱 是不是 ...
4,2012 款 奔驰 c180 维修保养 动力 值得 拥有 家庭 用车 入手 维修保养 费用 ...


# 3. 模型创建

Gensim 中 Word2Vec 模型的期望输入是经过分词的句子列表，即一个二维数组（每个句子是一个词列表）。最简单的做法是直接传入 Python 内置的列表，不过其在输入数据集较大的情况下会占用大量的 RAM。Gensim 本身只要求输入是可以迭代的有序句子序列，因此在工程实践中我们可以使用自定义的可迭代对象，只在内存中保存单条语句；本文下面使用的 LineSentence 就是这样按行从文件中流式读取句子的。

## Word2Vec 参数
+ min_count

在不同大小的语料集中，我们对于基准词频的需求也是不一样的。譬如在较大的语料集中，我们希望忽略那些只出现过一两次的单词，这里我们就可以通过设置min_count参数进行控制。一般而言，合理的参数值会设置在0~100之间。

+ size

size参数用来设置词向量的维度（也就是神经网络隐藏层的神经元个数，而不是网络的层数），Word2Vec 中的默认值是100。更大的维度需要更多的训练数据，不过也能提升整体的准确度，合理的设置范围为 10~数百。注意：从 gensim 4.0 开始，该参数更名为 vector_size。

+ workers

workers参数用于设置并发训练时候的线程数，不过仅当Cython安装的情况下才会起作用：

In [5]:
# Import word2vec (the model module and the line-by-line corpus reader)
from gensim.models.word2vec import LineSentence
from gensim.models import word2vec
import gensim

# Import and configure logging
import logging

# INFO level with a timestamped format, so library progress messages are shown in the cell output.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)



# 构建训练

In [6]:
# Display the corpus path as a quick sanity check before training.
merger_data_path

'data/merged_train_test_seg_data.csv'

In [7]:
# The embedding-dimension keyword was renamed from `size` to `vector_size` in
# gensim 4.0; passing `size=` there raises TypeError. Pick whichever keyword the
# installed gensim accepts so this cell runs on both 3.x and 4.x.
_dim_kwarg = 'vector_size' if int(gensim.__version__.split('.')[0]) >= 4 else 'size'

# Train Word2Vec directly from the corpus file. LineSentence iterates over the
# file line by line instead of loading every sentence into memory at once.
#   workers=8   -> number of training threads
#   min_count=5 -> ignore words that occur fewer than 5 times
#   200         -> dimensionality of the learned word vectors
model = word2vec.Word2Vec(
    LineSentence(merger_data_path),
    workers=8,
    min_count=5,
    **{_dim_kwarg: 200},
)

2019-11-18 12:28:24,372 : INFO : collecting all words and their counts
2019-11-18 12:28:24,377 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-11-18 12:28:24,627 : INFO : PROGRESS: at sentence #10000, processed 941592 words, keeping 36787 word types
2019-11-18 12:28:24,838 : INFO : PROGRESS: at sentence #20000, processed 1897707 words, keeping 54137 word types
2019-11-18 12:28:25,045 : INFO : PROGRESS: at sentence #30000, processed 2842327 words, keeping 66972 word types
2019-11-18 12:28:25,250 : INFO : PROGRESS: at sentence #40000, processed 3758961 words, keeping 77905 word types
2019-11-18 12:28:25,469 : INFO : PROGRESS: at sentence #50000, processed 4736128 words, keeping 87815 word types
2019-11-18 12:28:25,698 : INFO : PROGRESS: at sentence #60000, processed 5774810 words, keeping 97787 word types
2019-11-18 12:28:25,935 : INFO : PROGRESS: at sentence #70000, processed 6836809 words, keeping 107409 word types
2019-11-18 12:28:26,145 : INFO : PROGRE

2019-11-18 12:28:47,948 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-11-18 12:28:47,949 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-11-18 12:28:47,957 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-11-18 12:28:47,958 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-11-18 12:28:47,958 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-11-18 12:28:47,959 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-11-18 12:28:47,959 : INFO : EPOCH - 5 : training on 9748048 raw words (8612407 effective words) took 4.2s, 2070204 effective words/s
2019-11-18 12:28:47,960 : INFO : training on a 48740240 raw words (43056727 effective words) took 20.9s, 2062033 effective words/s


# 查找最近的词

In [8]:
# Look up the 10 vocabulary entries whose vectors are closest to the query word.
model.wv.most_similar(
    positive=['奇瑞'],
    topn=10,
)

2019-11-18 12:28:47,963 : INFO : precomputing L2-norms of word weight vectors


[('名爵', 0.8551163673400879),
 ('东南', 0.8445316553115845),
 ('海马', 0.8362988233566284),
 ('二代', 0.8353272676467896),
 ('江淮', 0.8290162086486816),
 ('东风风行', 0.8268694281578064),
 ('猎豹', 0.8268465995788574),
 ('铃木', 0.823360025882721),
 ('瑞虎', 0.8204773664474487),
 ('帕杰罗', 0.8157345056533813)]

# 保存模型

In [9]:
# Relative path of the file the trained Word2Vec model is saved to and later reloaded from.
save_model_path='data/wv/word2vec.model'

In [10]:
from pathlib import Path

# Create the target directory first: on a fresh checkout `data/wv/` may not
# exist yet, and saving into a missing directory would fail.
Path(save_model_path).parent.mkdir(parents=True, exist_ok=True)

# Persist the full Word2Vec object to disk.
model.save(save_model_path)

2019-11-18 12:28:48,018 : INFO : saving Word2Vec object under data/wv/word2vec.model, separately None
2019-11-18 12:28:48,021 : INFO : not storing attribute vectors_norm
2019-11-18 12:28:48,023 : INFO : not storing attribute cum_table
2019-11-18 12:28:48,382 : INFO : saved data/wv/word2vec.model


# 载入模型

In [11]:
# Reload the saved model from disk; this rebinds `model` to the loaded copy.
# NOTE(review): gensim's load() is understood to rely on pickle — only load model files from a trusted source.
model = word2vec.Word2Vec.load(save_model_path)

2019-11-18 12:28:48,392 : INFO : loading Word2Vec object from data/wv/word2vec.model
2019-11-18 12:28:48,715 : INFO : loading wv recursively from data/wv/word2vec.model.wv.* with mmap=None
2019-11-18 12:28:48,715 : INFO : setting ignored attribute vectors_norm to None
2019-11-18 12:28:48,716 : INFO : loading vocabulary recursively from data/wv/word2vec.model.vocabulary.* with mmap=None
2019-11-18 12:28:48,716 : INFO : loading trainables recursively from data/wv/word2vec.model.trainables.* with mmap=None
2019-11-18 12:28:48,717 : INFO : setting ignored attribute cum_table to None
2019-11-18 12:28:48,717 : INFO : loaded data/wv/word2vec.model


In [12]:
# Repeat the nearest-neighbour query on the reloaded model to confirm the round trip.
model.wv.most_similar(
    positive=['奇瑞'],
    topn=10,
)

2019-11-18 12:28:48,761 : INFO : precomputing L2-norms of word weight vectors


[('名爵', 0.8551163673400879),
 ('东南', 0.8445316553115845),
 ('海马', 0.8362988233566284),
 ('二代', 0.8353272676467896),
 ('江淮', 0.8290162086486816),
 ('东风风行', 0.8268694281578064),
 ('猎豹', 0.8268465995788574),
 ('铃木', 0.823360025882721),
 ('瑞虎', 0.8204773664474487),
 ('帕杰罗', 0.8157345056533813)]

# 参考

1. https://radimrehurek.com/gensim/models/word2vec.html 