In [66]:
import jieba
from gensim import corpora,models,similarities

In [67]:
doc0 = "我不喜欢上海"
doc1 = "上海是一个好地方"
doc2 = "北京是一个好地方"
doc3 = "上海好吃的在哪里"
doc4 = "上海好玩的在哪里"
doc5 = "上海是好地方"
doc6 = "上海路和上海人"
doc7 = "喜欢小吃"
doc_test="我喜欢上海的小吃"

In [68]:
all_doc = []
all_doc.append(doc0)
all_doc.append(doc1)
all_doc.append(doc2)
all_doc.append(doc3)
all_doc.append(doc4)
all_doc.append(doc5)
all_doc.append(doc6)
all_doc.append(doc7)

In [69]:
all_doc

['我不喜欢上海',
 '上海是一个好地方',
 '北京是一个好地方',
 '上海好吃的在哪里',
 '上海好玩的在哪里',
 '上海是好地方',
 '上海路和上海人',
 '喜欢小吃']

In [70]:
all_doc_list = []
for doc in all_doc:
    doc_list = [word for word in jieba.cut(doc)]
    all_doc_list.append(doc_list)

In [71]:
print(all_doc_list)

[['我', '不', '喜欢', '上海'], ['上海', '是', '一个', '好', '地方'], ['北京', '是', '一个', '好', '地方'], ['上海', '好吃', '的', '在', '哪里'], ['上海', '好玩', '的', '在', '哪里'], ['上海', '是', '好', '地方'], ['上海', '路', '和', '上海', '人'], ['喜欢', '小吃']]


In [72]:
doc_test_list = [word for word in jieba.cut(doc_test)]
doc_test_list

['我', '喜欢', '上海', '的', '小吃']

In [73]:
#首先用dictionary方法获取词袋（bag-of-words)
dictionary = corpora.Dictionary(all_doc_list)

In [74]:
#词袋中用数字对所有词进行了编号
dictionary.keys()

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]

In [75]:
#编号与词之间的对应关系
dictionary.token2id

{'一个': 4,
 '上海': 0,
 '不': 1,
 '人': 14,
 '北京': 8,
 '和': 15,
 '哪里': 9,
 '喜欢': 2,
 '在': 10,
 '地方': 5,
 '好': 6,
 '好吃': 11,
 '好玩': 13,
 '小吃': 17,
 '我': 3,
 '是': 7,
 '的': 12,
 '路': 16}

In [76]:
#以下使用doc2bow制作语料库

corpus = [dictionary.doc2bow(doc) for doc in all_doc_list]

In [77]:
print(corpus)

[[(0, 1), (1, 1), (2, 1), (3, 1)], [(0, 1), (4, 1), (5, 1), (6, 1), (7, 1)], [(4, 1), (5, 1), (6, 1), (7, 1), (8, 1)], [(0, 1), (9, 1), (10, 1), (11, 1), (12, 1)], [(0, 1), (9, 1), (10, 1), (12, 1), (13, 1)], [(0, 1), (5, 1), (6, 1), (7, 1)], [(0, 2), (14, 1), (15, 1), (16, 1)], [(2, 1), (17, 1)]]


In [78]:
#把测试文档也转换为二元组的向量

doc_test_vec = dictionary.doc2bow(doc_test_list)
doc_test_vec

[(0, 1), (2, 1), (3, 1), (12, 1), (17, 1)]

In [79]:
#对每个目标文档，分析测试文档的相似度
#Compute similarity against a corpus of documents by storing the sparse index matrix in memory. 
#The similarity measure used is cosine between two vectors.
index = similarities.SparseMatrixSimilarity(corpus, num_features=len(dictionary.keys()))
sim = index[doc_test_vec]
sim

array([0.67082036, 0.19999999, 0.        , 0.39999998, 0.39999998,
       0.2236068 , 0.3380617 , 0.6324555 ], dtype=float32)

余弦相似度的计算：
test和doc0总共有三个词相同，所以，分子是3；分母分别计算test和doc0的长度，分别是sqrt(5)和2；所以是3/(2*sqrt(5))=-.6708

In [80]:
#根据相似度排序

sorted(enumerate(sim), key=lambda item: -item[1])

[(0, 0.67082036),
 (7, 0.6324555),
 (3, 0.39999998),
 (4, 0.39999998),
 (6, 0.3380617),
 (5, 0.2236068),
 (1, 0.19999999),
 (2, 0.0)]

In [81]:
#使用TF-IDF模型对语料库建模
tfidf = models.TfidfModel(corpus)

#获取测试文档中，每个词的TF-IDF值
tfidf[doc_test_vec]

[(0, 0.08112725037593049),
 (2, 0.3909393754390612),
 (3, 0.5864090631585919),
 (12, 0.3909393754390612),
 (17, 0.5864090631585919)]

In [82]:
index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=len(dictionary.keys()))
sim = index[tfidf[doc_test_vec]]
sim

array([0.54680777, 0.01055349, 0.        , 0.17724207, 0.17724207,
       0.01354522, 0.01279765, 0.70477605], dtype=float32)

In [83]:
#根据相似度排序

sorted(enumerate(sim), key=lambda item: -item[1])

[(7, 0.70477605),
 (0, 0.54680777),
 (3, 0.17724207),
 (4, 0.17724207),
 (5, 0.013545224),
 (6, 0.01279765),
 (1, 0.010553493),
 (2, 0.0)]

很有意思的一点是，使用tf-idf，句子7的相似度最高

In [84]:
sent1 = "Alice比Bob跑得快"
sent2 = "Bob比Alice跑得快"
sent3 = "小红比小明跑得快"
all_sent = []
all_sent.append(sent1)
all_sent.append(sent3)
all_sent_list = []
for sent in all_sent:
    sent_list = [word for word in jieba.cut(sent)]
    all_sent_list.append(sent_list)
    
sent_test_list = [word for word in jieba.cut(sent2)]
sent_test_list

['Bob', '比', 'Alice', '跑得快']

In [85]:
dictionary = corpora.Dictionary(all_sent_list)
corpus = [dictionary.doc2bow(sent) for sent in all_sent_list]
sent_test_vec = dictionary.doc2bow(sent_test_list)


In [86]:
index = similarities.SparseMatrixSimilarity(corpus, num_features=len(dictionary.keys()))
sim = index[sent_test_vec]
sim

array([1.        , 0.28867513], dtype=float32)

In [87]:
#使用TF-IDF模型对语料库建模
tfidf = models.TfidfModel(corpus)

#获取测试文档中，每个词的TF-IDF值
tfidf[sent_test_vec]

[(0, 0.5773502691896258), (1, 0.5773502691896258), (2, 0.5773502691896258)]

In [88]:
index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=len(dictionary.keys()))
sim = index[tfidf[doc_test_vec]]
sim

array([0.81649655, 0.        ], dtype=float32)

可以讨论下为啥sent2和sent3的相似度为0？