# 词袋模型及相似度

In [24]:
# import nltk
# nltk.download('punkt')
from collections import Counter
from math import sqrt

from nltk import word_tokenize

In [4]:
# The two example sentences that the rest of the notebook compares.
sent1, sent2 = (
    "I love sky, I love sea.",
    "I like running, I love reading.",
)

In [14]:
# word_tokenize only handles English text; for Chinese text a segmenter such
# as jieba can be used instead.
sents = [sent1, sent2]
texts = []
for sent in sents:
    # Store each sentence as its own list of tokens.
    texts.append(list(word_tokenize(sent)))
print("texts:", texts)

texts: [['I', 'love', 'sky', ',', 'I', 'love', 'sea', '.'], ['I', 'like', 'running', ',', 'I', 'love', 'reading', '.']]


In [16]:
# Build the vocabulary: gather every token of every sentence, then keep only
# the distinct ones.
# NOTE(review): a set has no defined order, and string hashing is randomized
# per interpreter process by default, so the printed order (and any indices
# derived from iterating this set) may differ between kernel sessions.
all_list = []
for text in texts:
    all_list.extend(text)
corpus = set(all_list)
print(corpus)


{',', '.', 'like', 'love', 'sea', 'reading', 'running', 'sky', 'I'}


In [21]:
# Map each vocabulary token to an integer index: its position while iterating
# over the corpus.
corpus_dict = {token: index for index, token in enumerate(corpus)}
print(corpus_dict)

{',': 0, '.': 1, 'like': 2, 'love': 3, 'sea': 4, 'reading': 5, 'running': 6, 'sky': 7, 'I': 8}


In [23]:
# Build the sentence vector
def sentense_vector(text, corpus_dict):
    """Turn a tokenized sentence into a bag-of-words count vector.

    Args:
        text: List of (hashable) tokens making up one sentence.
        corpus_dict: Mapping from each vocabulary token to its integer index.

    Returns:
        A list of ``(index, count)`` tuples, one per entry of ``corpus_dict``
        and sorted by index. ``count`` is the number of times the token
        occurs in ``text`` (0 when it does not occur at all).
    """
    # Count all tokens in a single pass instead of calling text.count() once
    # per vocabulary entry, which rescanned the whole sentence every time.
    token_counts = Counter(text)
    # A Counter yields 0 for missing keys, so tokens absent from the sentence
    # need no separate branch.
    vec = [(index, token_counts[token]) for token, index in corpus_dict.items()]
    vec.sort(key=lambda pair: pair[0])
    return vec

# Vectorise both tokenized sentences against the shared vocabulary mapping and
# print the resulting (index, count) lists.
vec1 = sentense_vector(texts[0], corpus_dict)
vec2 = sentense_vector(texts[1], corpus_dict)
print("vec1: ", vec1)
print("vec2: ", vec2)

vec1:  [(0, 1), (1, 1), (2, 0), (3, 2), (4, 1), (5, 0), (6, 0), (7, 1), (8, 2)]
vec2:  [(0, 1), (1, 1), (2, 1), (3, 1), (4, 0), (5, 1), (6, 1), (7, 0), (8, 2)]


In [26]:
# Compute the cosine similarity
def similarity_with_2_vector(vec1, vec2):
    """Return the cosine similarity of two bag-of-words vectors.

    Args:
        vec1: Sequence of ``(index, count)`` tuples; only element ``[1]``
            (the count) of each tuple is used.
        vec2: Sequence in the same layout as ``vec1``. The two sequences are
            paired position by position, so they are expected to be ordered
            by the same indices; pairing stops at the shorter one.

    Returns:
        ``dot(vec1, vec2) / (|vec1| * |vec2|)`` as a float. When either vector
        has zero length (all counts are 0, or the sequence is empty) the
        similarity is undefined and 0.0 is returned instead of raising
        ``ZeroDivisionError``.
    """
    inner_product = 0
    square_length_vec1 = 0
    square_length_vec2 = 0
    for tup1, tup2 in zip(vec1, vec2):
        inner_product += tup1[1] * tup2[1]
        square_length_vec1 += tup1[1] ** 2
        square_length_vec2 += tup2[1] ** 2

    norm_product = sqrt(square_length_vec1 * square_length_vec2)
    # Guard against a zero vector: there is no angle to measure, so report
    # "no similarity" rather than dividing by zero.
    if norm_product == 0:
        return 0.0
    return inner_product / norm_product

# Cosine similarity of the two hand-built sentence vectors, printed with four
# decimal places.
cosine_sim = similarity_with_2_vector(vec1, vec2)
print('两个句子的余弦相似度为： %.4f。'%cosine_sim)

两个句子的余弦相似度为： 0.7303。


In [27]:
# 使用gensim包

In [28]:
from gensim import corpora
from gensim.similarities import Similarity

In [33]:
# Corpus dictionary: build a gensim Dictionary over the tokenized sentences.
# It is used below (via doc2bow) to turn token lists into integer-id counts.
dictionary = corpora.Dictionary(texts)
print("type(dictionary): ", type(dictionary))
print("dictionary: ", dictionary)

type(dictionary):  <class 'gensim.corpora.dictionary.Dictionary'>
dictionary:  Dictionary<9 unique tokens: [',', '.', 'I', 'love', 'sea']...>


In [37]:
# doc2bow serves as the bag-of-words model: each tokenized sentence becomes a
# list of (token_id, count) pairs.
# NOTE(review): this rebinds `corpus`, which earlier held the vocabulary set;
# re-running the earlier dictionary-mapping cell after this one would pick up
# this list instead of the set.
corpus = [dictionary.doc2bow(text) for text in texts]
# The first argument appears to be the prefix under which the index is stored
# (it shows up in the printed repr) — confirm against the gensim docs.
similarity = Similarity('-Similarity-index', corpus, num_features=len(dictionary))

print("corpus: ", corpus)
print("similarity: ", similarity)

corpus:  [[(0, 1), (1, 1), (2, 2), (3, 2), (4, 1), (5, 1)], [(0, 1), (1, 1), (2, 2), (3, 1), (6, 1), (7, 1), (8, 1)]]
similarity:  Similarity<2 documents in 0 shards stored under -Similarity-index>


In [40]:
# Compute the similarity with gensim.
# The query is sent1 itself, converted to bag-of-words with the same dictionary.
new_sensence = sent1
test_corpus_1 = dictionary.doc2bow(word_tokenize(new_sensence))

# Querying the index yields one score per indexed document; [1] selects the
# score against the second one (built from texts[1], i.e. sent2).
cosine_sim = similarity[test_corpus_1][1]
print("利用gensim计算得到两个句子的相似度： %.4f。" % cosine_sim)

利用gensim计算得到两个句子的相似度： 0.7303。


In [41]:
# Show the full query result: one similarity score per indexed document.
similarity[test_corpus_1]

array([0.99999994, 0.73029673], dtype=float32)