In [2]:
from gensim.test.utils import common_texts
from gensim.models import TfidfModel, LdaModel, LsiModel
from gensim.corpora import Dictionary

In [3]:
common_texts

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]

# 一、数据加载

In [4]:
# 构建字典
common_dictionary = Dictionary(common_texts)
# 各个文本对应的词袋法的值
common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]
print("原始数据:\n{}".format(common_texts))
print("\n词袋法后的值:\n{}".format(common_corpus))
print(f"文本数目:{len(common_texts)}")
print(f"去重后单词数目:{len(common_dictionary)}")

原始数据:
[['human', 'interface', 'computer'], ['survey', 'user', 'computer', 'system', 'response', 'time'], ['eps', 'user', 'interface', 'system'], ['system', 'human', 'system', 'eps'], ['user', 'response', 'time'], ['trees'], ['graph', 'trees'], ['graph', 'minors', 'trees'], ['graph', 'minors', 'survey']]

词袋法后的值:
[[(0, 1), (1, 1), (2, 1)], [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)], [(2, 1), (5, 1), (7, 1), (8, 1)], [(1, 1), (5, 2), (8, 1)], [(3, 1), (6, 1), (7, 1)], [(9, 1)], [(9, 1), (10, 1)], [(9, 1), (10, 1), (11, 1)], [(4, 1), (10, 1), (11, 1)]]
文本数目:9
去重后单词数目:12


In [5]:
other_texts = [
    ['computer', 'time', 'graph'],  # 文本1
    ['survey', 'response', 'eps'],  # 文本2
    ['human', 'system', 'computer']  # 文本3
]
other_corpus = [common_dictionary.doc2bow(text) for text in other_texts]
print("测试数据对应的词袋法的值:\n{}".format(other_corpus))

测试数据对应的词袋法的值:
[[(0, 1), (6, 1), (10, 1)], [(3, 1), (4, 1), (8, 1)], [(0, 1), (1, 1), (5, 1)]]


# 二、TF-IDF Model

In [5]:
# 模型构建
model = TfidfModel(corpus=common_corpus)

In [6]:
# 预测
vectors = model[other_corpus]
for vector in vectors:
    print(vector)

[(0, 0.6282580468670046), (6, 0.6282580468670046), (10, 0.45889394536615247)]
[(3, 0.5773502691896257), (4, 0.5773502691896257), (8, 0.5773502691896257)]
[(0, 0.6282580468670046), (1, 0.6282580468670046), (5, 0.45889394536615247)]


# 三、LDA Model

In [7]:
# 模型构建&训练
model = LdaModel(common_corpus, num_topics=4)

In [8]:
# 模型保存
model.save('./datas/lda_model.pkl')

In [9]:
# 模型加载
lda = LdaModel.load('./datas/lda_model.pkl')

In [10]:
# 模型结果获取(文本向量)
vectors = lda[other_corpus]
for vector in vectors:
    print(vector)

[(0, 0.062689744), (1, 0.39406234), (2, 0.06856733), (3, 0.47468057)]
[(0, 0.31178728), (1, 0.06719473), (2, 0.06908342), (3, 0.55193454)]
[(0, 0.065304086), (1, 0.06668729), (2, 0.06265142), (3, 0.8053572)]


In [11]:
# 更新模型（在当前模型基础上继续更新模型参数）
lda.update(other_corpus)

In [12]:
# 更新后模型结果获取(文本向量)
vectors = lda[other_corpus]
for vector in vectors:
    print(vector)

[(0, 0.062604725), (1, 0.44880527), (2, 0.06382402), (3, 0.424766)]
[(0, 0.3178306), (1, 0.06355622), (2, 0.064631194), (3, 0.55398196)]
[(0, 0.06322161), (1, 0.06382847), (2, 0.06261651), (3, 0.81033343)]


# 四、Other

官网文档：https://radimrehurek.com/gensim/apiref.html

In [13]:
print("各个单词对应的主题向量:")
word_embedding_tabel = model.get_topics().T
print(type(word_embedding_tabel))
print(word_embedding_tabel.shape)
print(word_embedding_tabel)

各个单词对应的主题向量:
<class 'numpy.ndarray'>
(12, 4)
[[0.03636932 0.01629709 0.04301159 0.18496804]
 [0.03635216 0.07850609 0.04243105 0.10349216]
 [0.17556524 0.01643681 0.04359925 0.10232528]
 [0.03657178 0.07820462 0.04238318 0.10378326]
 [0.03657138 0.01643091 0.20198143 0.10398141]
 [0.17578383 0.1414825  0.04364618 0.10424715]
 [0.0367495  0.07797532 0.04268932 0.10382433]
 [0.17487854 0.07846285 0.04342537 0.10450925]
 [0.17512098 0.07856907 0.04264564 0.02139184]
 [0.0402857  0.20012091 0.04582827 0.02280044]
 [0.03784909 0.13937141 0.20610845 0.02274404]
 [0.03790245 0.07814243 0.20225023 0.02193274]]


In [14]:
common_dictionary.token2id

{'computer': 0,
 'human': 1,
 'interface': 2,
 'response': 3,
 'survey': 4,
 'system': 5,
 'time': 6,
 'user': 7,
 'eps': 8,
 'trees': 9,
 'graph': 10,
 'minors': 11}