In [1]:
from datasketch import MinHash, MinHashLSH, MinHashLSHForest
from sklearn.feature_extraction.text import TfidfVectorizer
import jieba.posseg as pseg
import re

In [2]:
# 读取文件
f = open('weibos.txt', 'r', encoding='UTF-8')
text = f.read()
# 以句号, 叹号, 问号作为分隔, 去掉\n换行符号
sentences = re.split('[。！？]', text.replace('\n', ''))

In [3]:
# 如果最后一行为空, 则删除
if sentences[len(sentences)-1] == '':
    sentences.pop()

In [4]:
# 将item_text进行分词
def get_item_str(item_text):
    item_str = ""
    item = (pseg.cut(item_text))
    for i in list(item):
        # 去掉停用词
        if i.word not in list(stop):
            item_str += i.word
            # tfidf_vectorizer.fit_transform的输入需要空格分隔的单词
            item_str += " "
    return item_str

In [5]:
# 对item_str创建MinHash
def get_minhash(item_str):
    temp = MinHash()
    for d in item_str:
        temp.update(d.encode('utf8'))
    return temp

In [6]:
# 设置停用词
stop = [line.strip().encode('utf-8') for line in open('stopword.txt').readlines()]
# 得到分词后的documents
documents = []
for item_text in sentences:
    # 将item_text进行分词
    item_str = get_item_str(item_text)
    documents.append(item_str)

Building prefix dict from the default dictionary ...
Dumping model to file cache C:\Users\Public\Documents\Wondershare\CreatorTemp\jieba.cache
Loading model cost 0.695 seconds.
Prefix dict has been built successfully.


In [7]:
# 创建LSH Forest及MinHash对象
minhash_list = []
forest = MinHashLSHForest()
for i in range(len(documents)):
    # 得到train_documents[i]的Minhash
    temp = get_minhash(documents[i])
    minhash_list.append(temp)
    forest.add(i, temp)
# index所有key, 以便可以进行检索
forest.index()

In [8]:
query = '爱护生命，远离男足'
# 将item_text进行分词
item_str = get_item_str(query)
# 得到item_str的MinHash
minhash_query = get_minhash(item_str)

In [9]:
# 查询forest中与m1相似的Top-K个邻居
result = forest.query(minhash_query, 3)
for i in range(len(result)):
    print(result[i], minhash_query.jaccard(minhash_list[result[i]]), documents[result[i]].replace(' ', ''))
print("Top 3 邻居", result)

34 0.234375 国足输给叙利亚后，里皮坐不住了，直接辞职了难怪有网友说，爱护生命，远离男足
35 0.2109375 男足的水平也就跟南极洲企鹅踢球
22 0.0703125 ”，球员委屈的说：“七十多分钟了，哪个晓得那个龟儿子往他们家球门踢嘛
Top 3 邻居 [34, 35, 22]
