In [1]:
import os

from gensim.corpora import Dictionary
from gensim.models import TfidfModel, LdaModel, LsiModel
from gensim.test.utils import common_texts

In [2]:
# Display the tokenised sample documents imported from gensim.test.utils:
# a list of documents, each document being a list of word tokens.
common_texts

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]

# 一、数据加载

In [3]:
# Build the vocabulary: every distinct token gets an integer id.
common_dictionary = Dictionary(common_texts)
# Bag-of-words encoding of each training document: a list of (token_id, count) pairs.
common_corpus = list(map(common_dictionary.doc2bow, common_texts))
# Show the raw documents, their encoded form, and the corpus / vocabulary sizes.
print("原始数据:\n{}".format(common_texts))
print("\n词袋法后的值:\n{}".format(common_corpus))
print(f"文本数目:{len(common_texts)}")
print(f"去重后单词数目:{len(common_dictionary)}")

原始数据:
[['human', 'interface', 'computer'], ['survey', 'user', 'computer', 'system', 'response', 'time'], ['eps', 'user', 'interface', 'system'], ['system', 'human', 'system', 'eps'], ['user', 'response', 'time'], ['trees'], ['graph', 'trees'], ['graph', 'minors', 'trees'], ['graph', 'minors', 'survey']]

词袋法后的值:
[[(0, 1), (1, 1), (2, 1)], [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)], [(2, 1), (5, 1), (7, 1), (8, 1)], [(1, 1), (5, 2), (8, 1)], [(3, 1), (6, 1), (7, 1)], [(9, 1)], [(9, 1), (10, 1)], [(9, 1), (10, 1), (11, 1)], [(4, 1), (10, 1), (11, 1)]]
文本数目:9
去重后单词数目:12


In [4]:
# Three additional tokenised documents used as test input for the models below.
other_texts = [
    ['computer', 'time', 'graph'],  # document 1
    ['survey', 'response', 'eps'],  # document 2
    ['human', 'system', 'computer']  # document 3
]
# Encode them with the training dictionary so token ids match those in common_corpus.
other_corpus = list(map(common_dictionary.doc2bow, other_texts))
print("测试数据对应的词袋法的值:\n{}".format(other_corpus))

测试数据对应的词袋法的值:
[[(0, 1), (6, 1), (10, 1)], [(3, 1), (4, 1), (8, 1)], [(0, 1), (1, 1), (5, 1)]]


# 二、TF-IDF Model

In [6]:
# Build the TF-IDF model from the bag-of-words training corpus.
# NOTE: the name `model` is rebound to an LdaModel in the LDA section further down,
# so after that cell runs `model` no longer refers to this TF-IDF model.
model = TfidfModel(corpus=common_corpus)

In [7]:
# Apply the TF-IDF model to the test corpus and print one weighted vector per document.
# Each printed vector is a list of (token_id, tf-idf weight) pairs.
vectors = model[other_corpus]
for vector in vectors:
    print(vector)

[(0, 0.6282580468670046), (6, 0.6282580468670046), (10, 0.45889394536615247)]
[(3, 0.5773502691896257), (4, 0.5773502691896257), (8, 0.5773502691896257)]
[(0, 0.6282580468670046), (1, 0.6282580468670046), (5, 0.45889394536615247)]


# 三、LDA Model

In [17]:
# Build and train the LDA model on the bag-of-words training corpus.
# - id2word links the model's term ids to the dictionary's tokens, so the model
#   is not left to infer an identity id mapping from the corpus alone.
# - random_state fixes the random initialisation; without it every run trains a
#   different model and the topic vectors shown below cannot be reproduced.
model = LdaModel(
    common_corpus,
    id2word=common_dictionary,
    num_topics=5,
    random_state=42,
)

In [18]:
# Persist the trained LDA model to disk.
# Make sure the target directory exists first: save() writes to the given path and
# does not create missing parent directories, so a fresh checkout would fail here.
os.makedirs('./datas', exist_ok=True)
model.save('./datas/lda_model.pkl')

In [19]:
# Load the LDA model back from the file written by the save cell above.
# NOTE(review): gensim's load() is pickle-based — only load model files from a trusted source.
lda = LdaModel.load('./datas/lda_model.pkl')

In [20]:
# Infer the topic distribution of each test document with the loaded LDA model.
# Each printed vector is a list of (topic_id, probability) pairs for one document.
vectors = lda[other_corpus]
for vector in vectors:
    print(vector)

[(0, 0.05001298), (1, 0.40197802), (2, 0.05164329), (3, 0.05006429), (4, 0.44630143)]
[(0, 0.29893586), (1, 0.05063484), (2, 0.050723635), (3, 0.050070092), (4, 0.5496356)]
[(0, 0.41517657), (1, 0.05001251), (2, 0.4329673), (3, 0.05006193), (4, 0.05178168)]


In [21]:
# Topic-term matrix: one row per topic, one column per dictionary term.
lda.get_topics()

array([[0.01955971, 0.11496038, 0.11521181, 0.01951915, 0.01954965,
        0.30639446, 0.01948438, 0.11488739, 0.21023877, 0.02060254,
        0.02002409, 0.01956766],
       [0.01965677, 0.0195905 , 0.01971216, 0.11507357, 0.01958347,
        0.0196957 , 0.11518445, 0.11534413, 0.01963847, 0.21045387,
        0.21097454, 0.11509235],
       [0.12738489, 0.12779273, 0.12713522, 0.02166208, 0.12778345,
        0.02178541, 0.02166858, 0.02181517, 0.02174724, 0.12590733,
        0.12747896, 0.1278389 ],
       [0.08254214, 0.08244435, 0.08338046, 0.08251099, 0.08247575,
        0.08360141, 0.08238702, 0.08319774, 0.08269517, 0.08730888,
        0.08451759, 0.08293849],
       [0.14225376, 0.02416207, 0.02415838, 0.14229538, 0.14193347,
        0.1423937 , 0.14223087, 0.14255324, 0.0241116 , 0.02525066,
        0.02439517, 0.02426168]], dtype=float32)

In [11]:
# Continue training the loaded model on the test corpus, starting from its current
# parameters. The call updates `lda` itself, so re-running this cell trains it again
# and the results below depend on how many times it has been executed.
lda.update(other_corpus)

In [12]:
# Re-infer the topic distribution of each test document after the update step.
# Each printed vector is a list of (topic_id, probability) pairs for one document.
vectors = lda[other_corpus]
for vector in vectors:
    print(vector)

[(0, 0.062604725), (1, 0.44880527), (2, 0.06382402), (3, 0.424766)]
[(0, 0.3178306), (1, 0.06355622), (2, 0.064631194), (3, 0.55398196)]
[(0, 0.06322161), (1, 0.06382847), (2, 0.06261651), (3, 0.81033343)]


# 四、Other

官网文档：https://radimrehurek.com/gensim/apiref.html

In [15]:
# Transpose the topic-term matrix so that each row is one dictionary term and each
# column one topic, i.e. a table of per-word topic vectors.
# NOTE(review): this reads `model`, not `lda`. Run top to bottom, `model` is the
# LdaModel built in the LDA section with num_topics=5, which would give shape (12, 5);
# the recorded (12, 6) output presumably comes from an earlier out-of-order run.
# Confirm which model is intended here.
# The variable name keeps its original spelling ("tabel") in case other cells use it.
print("各个单词对应的主题向量:")
word_embedding_tabel = model.get_topics().T
print(type(word_embedding_tabel))
print(word_embedding_tabel.shape)
print(word_embedding_tabel)

各个单词对应的主题向量:
<class 'numpy.ndarray'>
(12, 6)
[[0.04181563 0.01860848 0.1297038  0.08307857 0.02791506 0.10596586]
 [0.04196068 0.0187414  0.1293612  0.08355447 0.19393642 0.01528224]
 [0.04198239 0.12948352 0.12958613 0.08312406 0.02792645 0.0152902 ]
 [0.04192816 0.1292706  0.018688   0.08316412 0.02794438 0.10604487]
 [0.04177731 0.0186189  0.01863831 0.08297346 0.02783804 0.19674736]
 [0.04201462 0.12957418 0.01871953 0.08331634 0.36022606 0.10606261]
 [0.04180914 0.12947886 0.01862992 0.0829841  0.02786918 0.10603964]
 [0.0419612  0.24036069 0.01871369 0.08317102 0.02796732 0.10608108]
 [0.04178712 0.12959756 0.01865163 0.08304744 0.19387208 0.01521703]
 [0.29087844 0.0188076  0.23988827 0.08434808 0.02829659 0.01554708]
 [0.29024214 0.0187848  0.12972496 0.08400527 0.02820492 0.1058884 ]
 [0.04184326 0.0186733  0.12969454 0.08323296 0.02800352 0.1058336 ]]


In [14]:
# Token -> integer id mapping of the dictionary; the ids are the ones used in the
# bag-of-words pairs above.
common_dictionary.token2id

{'computer': 0,
 'human': 1,
 'interface': 2,
 'response': 3,
 'survey': 4,
 'system': 5,
 'time': 6,
 'user': 7,
 'eps': 8,
 'trees': 9,
 'graph': 10,
 'minors': 11}