In [1]:
import random
import numpy as np
import string
from zhon.hanzi import punctuation
import jieba
import textdistance
import sys

In [2]:
def load_sentence(path, encoding='utf-8'):
    
    transtable = str.maketrans('', '', string.punctuation + punctuation + ' ')
    
    sentences = []
    with open(path, encoding=encoding) as f:
        for line in f:
            line = json.loads(line)
            text = line["title"] + line['content']
            text = text.translate(transtable)
            text = jieba.lcut(text)
            sentences.append(text)
    return sentences

In [3]:
sentences = load_sentence("tag_news.json")

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\rovenr12\AppData\Local\Temp\jieba.cache
Loading model cost 0.447 seconds.
Prefix dict has been built successfully.


In [4]:
class StringKmeans:
    def __init__(self, data, cluster_mean_len, cluster_num = 20):
        self.data = data
        self.cluster_num = cluster_num
        self.cluster_mean_len = cluster_mean_len
        self.centers = np.random.choice(data, cluster_num, replace=False)
        self.centers = [self.center([center]) for center in self.centers]
        
    def distance(self, word1, word2):
        return textdistance.sorensen(word1, word2)
    
    def center(self, words):
        word_dict = {}
        for word_list in words:
            for word in word_list:
                word_dict[word] = word_dict.get(word, 0) + 1

        word_list = sorted(word_dict.items(), key=lambda x:x[1], reverse=True)
        center_len = int(min(self.cluster_mean_len, len(word_list)))
        
        # 取中間的詞 (避免取很常見的字)
        if center_len > len(word_list) * 0.8:
            start_idx = int(len(word_list) * 0.8) - 1
            word_list = [word[0] for word in word_list[start_idx:center_len + start_idx]]
        else:
            word_list = [word[0] for word in word_list[:center_len]]
            
        return word_list
    
    def cluster(self):
        result = []
        for i in range(self.cluster_num):
            result.append([])
        for item in self.data:
            distance_min = sys.maxsize
            index = -1
            for i in range(len(self.centers)):
                distance = self.distance(item, self.centers[i])
                if distance < distance_min:
                    distance_min = distance
                    index = i
            result[index].append(item)
            
        new_center = [self.center(words) for words in result]
        
        has_no_changed = True
        for old_center, new_center in zip(self.centers, new_center):
            if self.distance(old_center, new_center) < 0.8:
                has_no_changed = False
                break

        if not has_no_changed and len(new_center) == self.cluster_num:
            self.centers = new_center
            return self.cluster()
    

In [5]:
sk = StringKmeans(sentences, 20, 10)

In [6]:
sk.cluster()

In [7]:
center_word = ["/".join(center) for center in sk.centers]

In [8]:
center_word

['的/创意/中国/设计师/时尚/在/2010/作品/国际/奖/当代/设计/艺术/将/邀请赛/北京/和/使/与/创意设计',
 '的/刘翔/调查/退赛/影响/在/奥运会/110/米栏/比赛/中国/观众/奥运/对/你/有何/8/月/18/日',
 '的/是/最佳/电视剧/百强/华鼎奖/榜/中国/提名/男女/黄/渤/在/题材/黄渤/回应/个/失误/摘得/这',
 '的/战机/加油/在/试验/F35/进行/与/F35B/首次/空中加油/海军陆战队/美军/联合/成功/软管/将/陆战队/版/报道',
 '的/了/波什/也/是/热火/在/他/两连败/分/篮板/数据/让/但/比赛/有/个/不错/都/场上',
 '的/皮草/四十五度/了/广场/是/时尚/上/蓝色/港湾/品牌/给/消费者/和/在/人/将/都/价格/对于',
 '时间/的/艾弗森/在/推迟/贝西/克/塔斯/球队/他/土耳其/2/天/球迷/要/11/月/日/消息/俱乐部',
 '的/秋冬/军装/是/搭配/华丽/宫廷/典雅/应用/在/把/俏皮/一个/2010/五类/流行/复古/中裙/率性/风在',
 '的/赤鹿/在/雄鹿/争夺/鹿角/雌鹿/展开/大战/雄性/开始/是/英国/为/就/公园/野生/一头/14/照片',
 '的/最/黑暗/料理/你/看上去/奇葩/吃/是/就/鲱鱼/我们/这个/还/不错/12/个/世界各地/各国/特色']