In [1]:
import gensim
import logging
import re
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
import pymorphy2
from datetime import datetime



In [2]:
# Load the stop-word list; the file is read as one word per line.
with open('stopwords.txt', mode='r', encoding='utf-8') as f:
    txt = f.read()

# Keep the words in a set so the membership tests during tokenization are cheap.
stw = set()
stw.update(txt.split('\n'))

In [3]:
# Shared pymorphy2 analyzer; the preprocessing loop below calls morph.parse()
# on each token and keeps the normal form of the first parse.
morph = pymorphy2.MorphAnalyzer()

In [4]:
# Wall-clock start; the end of this cell prints the elapsed time.
start_time = datetime.now()

# Root logger at INFO level, each record prefixed with a timestamp and level.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

def preprocess(str):
    """Delete every character that is not a Cyrillic letter or a space.

    Runs of other characters (Latin letters, digits, punctuation, ...) are
    removed outright, not replaced, so the spaces around them are preserved.
    Link removal is not performed by this function.

    :param str: text to filter (the name shadows the builtin; kept for callers)
    :return: the filtered text
    """
    non_cyrillic = re.compile(r'[^а-яёА-ЯЁ ]+')
    return non_cyrillic.sub('', str)

class Documents(object):
    """Iterable wrapper that turns a collection of documents into TaggedDocuments.

    Every document is tagged with its zero-based position in ``documents``.
    """

    def __init__(self, documents):
        # Kept by reference (no copy); each element becomes the ``words`` of one TaggedDocument.
        self.documents = documents

    def __iter__(self):
        position = 0
        for tokens in self.documents:
            yield TaggedDocument(words=tokens, tags=[position])
            position += 1

file = 'train_headlines.txt'

# Read the whole training corpus; the with-block closes the handle even if
# reading fails.
with open(file, "r", encoding='utf-8') as corpus:
    lines = corpus.read().lower().split("\n")

preprocessed = []

# Lower-cased lines that were already processed; used to skip exact duplicates.
duplicate_dict = {}

# token -> lemma cache, so each distinct word form is passed to morph.parse()
# only once instead of once per occurrence.
lemma_cache = {}

for t in lines:
    if t in duplicate_dict:
        continue
    duplicate_dict[t] = True
    # Every character that is neither alphanumeric nor whitespace becomes a
    # separator, then the line is split into tokens.
    fixed = ''.join([x if x.isalnum() or x.isspace() else " " for x in t]).split()
    fixedNoStop = []
    for fix in fixed:
        # Stop words are matched against the raw token, before lemmatization.
        if fix in stw:
            continue
        lemma = lemma_cache.get(fix)
        if lemma is None:
            lemma = morph.parse(fix)[0].normal_form
            lemma_cache[fix] = lemma
        fixedNoStop.append(lemma)
    preprocessed.append(fixedNoStop)

# Number of documents actually kept after de-duplication. The training cell
# passes this as total_examples, so it has to match what Documents yields
# rather than the raw line count.
count = len(preprocessed)

documents = Documents(preprocessed)
end_time = datetime.now()
print('Duration: {}'.format(end_time - start_time))

Duration: 0:12:26.004794


In [34]:
#train model

# Single place for the checkpoint path used by every save in the loop.
MODEL_PATH = 'noStopLemma_PV-DBOW_wrd-vec_1it_2win_6mincount_alpha25-25_sz80.model'

#iter = 1, because we keep training ourselves :)
model = Doc2Vec(size=80, dbow_words=1, dm=0, iter=1, window=2, seed=1337, min_count=6,
                workers=4, alpha=0.025, min_alpha=0.025)
model.build_vocab(documents)
for epoch in range(10):
    print("epoch "+str(epoch))
    # corpus_count is recorded by build_vocab() and always equals the number of
    # documents the model saw, so it cannot drift out of sync the way a
    # separately maintained notebook global can.
    model.train(documents, total_examples=model.corpus_count, epochs=1)
    # Checkpoint after every pass so an interrupted run keeps its progress.
    model.save(MODEL_PATH)
    model.alpha -= 0.002  # decrease the learning rate
    model.min_alpha = model.alpha # fix the learning rate, no decay

2018-05-06 21:32:27,402 : INFO : collecting all words and their counts
2018-05-06 21:32:27,406 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2018-05-06 21:32:27,612 : INFO : PROGRESS: at example #10000, processed 70313 words (347365/s), 9299 word types, 10000 tags
2018-05-06 21:32:27,757 : INFO : PROGRESS: at example #20000, processed 140114 words (490024/s), 12398 word types, 20000 tags
2018-05-06 21:32:27,916 : INFO : PROGRESS: at example #30000, processed 209942 words (445929/s), 14741 word types, 30000 tags
2018-05-06 21:32:28,046 : INFO : PROGRESS: at example #40000, processed 279980 words (549897/s), 16598 word types, 40000 tags
2018-05-06 21:32:28,226 : INFO : PROGRESS: at example #50000, processed 349973 words (393458/s), 18080 word types, 50000 tags
2018-05-06 21:32:28,399 : INFO : PROGRESS: at example #60000, processed 419910 words (411510/s), 19352 word types, 60000 tags
2018-05-06 21:32:28,559 : INFO : PROGRESS: at example #70000, processed

epoch 0


2018-05-06 21:32:39,268 : INFO : PROGRESS: at 0.95% examples, 9246 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:32:40,978 : INFO : PROGRESS: at 8.70% examples, 33649 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:32:42,744 : INFO : PROGRESS: at 16.48% examples, 39243 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:32:44,448 : INFO : PROGRESS: at 24.24% examples, 42134 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:32:45,480 : INFO : PROGRESS: at 30.06% examples, 44924 words/s, in_qsize 8, out_qsize 0
2018-05-06 21:32:47,171 : INFO : PROGRESS: at 35.88% examples, 43597 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:32:48,315 : INFO : PROGRESS: at 39.75% examples, 42874 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:32:49,430 : INFO : PROGRESS: at 46.56% examples, 45250 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:32:50,910 : INFO : PROGRESS: at 51.41% examples, 44180 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:32:51,979 : INFO : PROGRESS: at 58.13% examples, 46147 words/s, in_qsiz

epoch 1


2018-05-06 21:33:03,651 : INFO : PROGRESS: at 0.95% examples, 9252 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:33:04,713 : INFO : PROGRESS: at 4.82% examples, 24206 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:33:05,889 : INFO : PROGRESS: at 8.70% examples, 28433 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:33:07,093 : INFO : PROGRESS: at 12.58% examples, 30295 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:33:08,139 : INFO : PROGRESS: at 17.45% examples, 34120 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:33:09,206 : INFO : PROGRESS: at 23.27% examples, 38229 words/s, in_qsize 8, out_qsize 0
2018-05-06 21:33:10,877 : INFO : PROGRESS: at 28.12% examples, 36947 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:33:11,927 : INFO : PROGRESS: at 33.94% examples, 39630 words/s, in_qsize 8, out_qsize 0
2018-05-06 21:33:13,001 : INFO : PROGRESS: at 38.79% examples, 40646 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:33:14,163 : INFO : PROGRESS: at 42.67% examples, 40249 words/s, in_qsize

epoch 2


2018-05-06 21:33:29,188 : INFO : PROGRESS: at 1.91% examples, 20543 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:33:30,194 : INFO : PROGRESS: at 7.73% examples, 41859 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:33:31,818 : INFO : PROGRESS: at 12.59% examples, 37853 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:33:32,978 : INFO : PROGRESS: at 16.48% examples, 37590 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:33:34,029 : INFO : PROGRESS: at 21.33% examples, 39932 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:33:35,611 : INFO : PROGRESS: at 28.12% examples, 41462 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:33:36,664 : INFO : PROGRESS: at 33.94% examples, 43859 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:33:38,188 : INFO : PROGRESS: at 39.76% examples, 43563 words/s, in_qsize 8, out_qsize 0
2018-05-06 21:33:39,224 : INFO : PROGRESS: at 46.56% examples, 46214 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:33:40,602 : INFO : PROGRESS: at 51.41% examples, 45370 words/s, in_qsi

epoch 3


2018-05-06 21:33:54,010 : INFO : PROGRESS: at 4.82% examples, 32724 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:33:55,768 : INFO : PROGRESS: at 12.59% examples, 40936 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:33:57,498 : INFO : PROGRESS: at 20.35% examples, 43739 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:33:58,551 : INFO : PROGRESS: at 25.21% examples, 44890 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:34:00,139 : INFO : PROGRESS: at 32.00% examples, 45325 words/s, in_qsize 8, out_qsize 0
2018-05-06 21:34:01,750 : INFO : PROGRESS: at 39.75% examples, 46634 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:34:03,417 : INFO : PROGRESS: at 47.54% examples, 47318 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:34:05,068 : INFO : PROGRESS: at 55.25% examples, 47879 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:34:06,953 : INFO : PROGRESS: at 62.92% examples, 47522 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:34:07,976 : INFO : PROGRESS: at 69.82% examples, 49173 words/s, in_qs

epoch 4


2018-05-06 21:34:16,731 : INFO : PROGRESS: at 0.97% examples, 8271 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:34:17,857 : INFO : PROGRESS: at 5.79% examples, 26461 words/s, in_qsize 8, out_qsize 0
2018-05-06 21:34:19,643 : INFO : PROGRESS: at 12.58% examples, 33007 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:34:20,647 : INFO : PROGRESS: at 16.48% examples, 34818 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:34:21,723 : INFO : PROGRESS: at 22.30% examples, 39037 words/s, in_qsize 8, out_qsize 0
2018-05-06 21:34:23,385 : INFO : PROGRESS: at 28.12% examples, 38910 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:34:24,485 : INFO : PROGRESS: at 33.94% examples, 41258 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:34:25,695 : INFO : PROGRESS: at 37.82% examples, 40547 words/s, in_qsize 8, out_qsize 0
2018-05-06 21:34:27,194 : INFO : PROGRESS: at 43.64% examples, 40814 words/s, in_qsize 8, out_qsize 0
2018-05-06 21:34:28,328 : INFO : PROGRESS: at 49.48% examples, 42185 words/s, in_qsiz

epoch 5


2018-05-06 21:34:42,086 : INFO : PROGRESS: at 2.89% examples, 29403 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:34:43,150 : INFO : PROGRESS: at 6.76% examples, 34654 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:34:44,942 : INFO : PROGRESS: at 12.59% examples, 35129 words/s, in_qsize 8, out_qsize 0
2018-05-06 21:34:45,986 : INFO : PROGRESS: at 16.48% examples, 36323 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:34:46,988 : INFO : PROGRESS: at 22.30% examples, 40921 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:34:48,557 : INFO : PROGRESS: at 28.12% examples, 40876 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:34:49,570 : INFO : PROGRESS: at 34.91% examples, 44768 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:34:50,962 : INFO : PROGRESS: at 39.76% examples, 43867 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:34:52,016 : INFO : PROGRESS: at 43.64% examples, 43534 words/s, in_qsize 8, out_qsize 0
2018-05-06 21:34:53,019 : INFO : PROGRESS: at 49.48% examples, 45225 words/s, in_qsi

epoch 6


2018-05-06 21:35:05,996 : INFO : PROGRESS: at 4.82% examples, 30733 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:35:07,700 : INFO : PROGRESS: at 12.58% examples, 40321 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:35:08,732 : INFO : PROGRESS: at 16.48% examples, 40532 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:35:10,343 : INFO : PROGRESS: at 24.24% examples, 43797 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:35:11,988 : INFO : PROGRESS: at 32.00% examples, 45518 words/s, in_qsize 8, out_qsize 0
2018-05-06 21:35:13,693 : INFO : PROGRESS: at 39.75% examples, 46281 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:35:15,115 : INFO : PROGRESS: at 47.53% examples, 48061 words/s, in_qsize 8, out_qsize 0
2018-05-06 21:35:16,177 : INFO : PROGRESS: at 53.33% examples, 49150 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:35:17,615 : INFO : PROGRESS: at 59.09% examples, 48648 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:35:19,068 : INFO : PROGRESS: at 66.83% examples, 49573 words/s, in_qs

epoch 7


2018-05-06 21:35:26,771 : INFO : PROGRESS: at 4.82% examples, 36038 words/s, in_qsize 8, out_qsize 0
2018-05-06 21:35:28,130 : INFO : PROGRESS: at 12.58% examples, 48699 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:35:29,521 : INFO : PROGRESS: at 20.37% examples, 52849 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:35:31,052 : INFO : PROGRESS: at 28.12% examples, 53592 words/s, in_qsize 8, out_qsize 0
2018-05-06 21:35:32,787 : INFO : PROGRESS: at 35.88% examples, 52612 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:35:33,818 : INFO : PROGRESS: at 41.70% examples, 53777 words/s, in_qsize 8, out_qsize 0
2018-05-06 21:35:35,351 : INFO : PROGRESS: at 47.53% examples, 51922 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:35:36,346 : INFO : PROGRESS: at 53.33% examples, 52989 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:35:37,970 : INFO : PROGRESS: at 59.09% examples, 51291 words/s, in_qsize 8, out_qsize 0
2018-05-06 21:35:39,141 : INFO : PROGRESS: at 62.92% examples, 50002 words/s, in_qs

epoch 8


2018-05-06 21:35:49,098 : INFO : PROGRESS: at 0.95% examples, 9355 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:35:50,131 : INFO : PROGRESS: at 7.73% examples, 39511 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:35:51,783 : INFO : PROGRESS: at 12.59% examples, 36371 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:35:53,521 : INFO : PROGRESS: at 20.35% examples, 40337 words/s, in_qsize 8, out_qsize 0
2018-05-06 21:35:55,220 : INFO : PROGRESS: at 28.12% examples, 42630 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:35:56,361 : INFO : PROGRESS: at 32.96% examples, 43197 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:35:58,052 : INFO : PROGRESS: at 39.76% examples, 43364 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:35:59,780 : INFO : PROGRESS: at 47.53% examples, 44231 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:36:00,836 : INFO : PROGRESS: at 54.29% examples, 46391 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:36:02,567 : INFO : PROGRESS: at 59.09% examples, 44525 words/s, in_qsiz

epoch 9


2018-05-06 21:36:12,939 : INFO : PROGRESS: at 2.87% examples, 26716 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:36:14,577 : INFO : PROGRESS: at 8.70% examples, 33858 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:36:16,272 : INFO : PROGRESS: at 16.48% examples, 40046 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:36:17,898 : INFO : PROGRESS: at 24.24% examples, 43299 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:36:19,591 : INFO : PROGRESS: at 32.00% examples, 44824 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:36:21,313 : INFO : PROGRESS: at 39.75% examples, 45662 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:36:22,488 : INFO : PROGRESS: at 43.64% examples, 44637 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:36:23,575 : INFO : PROGRESS: at 50.45% examples, 46842 words/s, in_qsize 8, out_qsize 0
2018-05-06 21:36:25,071 : INFO : PROGRESS: at 55.25% examples, 45581 words/s, in_qsize 7, out_qsize 0
2018-05-06 21:36:26,770 : INFO : PROGRESS: at 62.92% examples, 46087 words/s, in_qsi

In [35]:
# Reload the checkpoint written by the training loop (same file name as the model.save call).
model = Doc2Vec.load('noStopLemma_PV-DBOW_wrd-vec_1it_2win_6mincount_alpha25-25_sz80.model')

2018-05-06 21:53:04,414 : INFO : loading Doc2Vec object from noStopLemma_PV-DBOW_wrd-vec_1it_2win_6mincount_alpha25-25_sz80.model
2018-05-06 21:53:04,649 : INFO : loading wv recursively from noStopLemma_PV-DBOW_wrd-vec_1it_2win_6mincount_alpha25-25_sz80.model.wv.* with mmap=None
2018-05-06 21:53:04,649 : INFO : setting ignored attribute syn0norm to None
2018-05-06 21:53:04,664 : INFO : loading docvecs recursively from noStopLemma_PV-DBOW_wrd-vec_1it_2win_6mincount_alpha25-25_sz80.model.docvecs.* with mmap=None
2018-05-06 21:53:04,664 : INFO : loading doctag_syn0 from noStopLemma_PV-DBOW_wrd-vec_1it_2win_6mincount_alpha25-25_sz80.model.docvecs.doctag_syn0.npy with mmap=None
2018-05-06 21:53:04,836 : INFO : setting ignored attribute cum_table to None
2018-05-06 21:53:04,836 : INFO : loaded noStopLemma_PV-DBOW_wrd-vec_1it_2win_6mincount_alpha25-25_sz80.model


In [38]:
# Quick sanity check of the embeddings: list the nearest neighbours of the token 'ои'.
model.most_similar('ои')

[('олимпиада', 0.7108860015869141),
 ('пи', 0.7027755975723267),
 ('паралимпиада', 0.6913758516311646),
 ('пхенчхан', 0.6395276784896851),
 ('финал', 0.6351224780082703),
 ('хоккеист', 0.6099418997764587),
 ('1992', 0.6050775647163391),
 ('керлингист', 0.5994738340377808),
 ('лёд', 0.5924407243728638),
 ('немец', 0.5889979600906372)]

In [1]:
import nltk, math, codecs
from gensim.models import Doc2Vec
from nltk.cluster.kmeans import KMeansClusterer
import re
import pymorphy2
from datetime import datetime

from datetime import datetime

# Path of the trained Doc2Vec model to load.
fname = 'noStopLemma_PV-DBOW_wrd-vec_1it_2win_6mincount_alpha25-25_sz80.model'

# The loaded model is used further down to infer vectors for the test headlines.
model = Doc2Vec.load(fname)

# Shared pymorphy2 analyzer used to lemmatize tokens in the cells below.
morph = pymorphy2.MorphAnalyzer()



In [5]:
# The stop-word file is read as one word per line.
with open('stopwords.txt', mode='r', encoding='utf-8') as f:
    raw_stopwords = f.read()

# txt keeps the words as a list; stw is the set used for membership tests.
txt = raw_stopwords.split('\n')
stw = set(txt)

In [42]:
#lemmatized test

start_time = datetime.now()

# Text-mode open() translates "\r\n" to "\n" while reading, so splitting on
# "\n" works for both CRLF and LF files; the with-block closes the handle.
with open('test_headlines_short.txt', mode="r", encoding="utf-8") as corpus:
    lines = corpus.read().lower().split('\n')

lemm_lines = []
for line in lines:
    # Characters that are neither alphanumeric nor whitespace act as separators.
    fixed = ''.join([x if x.isalnum() or x.isspace() else " " for x in line]).split()
    fixedNoStop = []
    for fix in fixed:
        # Stop words are matched against the raw token, before lemmatization.
        if fix not in stw:
            fix = morph.parse(fix)[0].normal_form
            fixedNoStop.append(fix)
    lemm_lines.append(fixedNoStop)

end_time = datetime.now()
print('Duration: {}'.format(end_time - start_time))

Duration: 0:00:13.619497


In [47]:
# Show the first ten lower-cased test headlines as a quick visual check.
print(lines[:10])

['актера бена аффлека обвинили в сексуальных домогательствах', 'рфпи заключит сделку с ближневосточными фондами по покупке доли в "пулково"', 'вс признал законным приговор жителю эстонии за шпионаж в россии', 'марсово поле в петербурге исключили из списка гайд-парков', 'при землетрясении в китае могли погибнуть сотни человек', 'выработка электроэнергии станциями сгк в первом полугодии снизилась на 8%', 'битва буша с дождевиком на инаугурации трампа вызвала волну шуток в интернете', 'собчак открыла штаб в екатеринбурге', 'в правительстве объяснили расширение списка продэмбарго', 'источник: пять человек остаются в больницах после аварии поездов москве']


In [9]:
# Number of clusters passed to KMeansClusterer; distanceToCentroid() also loops over it.
NUM_CLUSTERS = 30  # 25 appears to be a previously tried value

def preprocess(str):
    """Remove http/https links from the text.

    A link is ``http://`` or ``https://`` followed by any run of
    non-whitespace characters. The single space after a link is removed with
    it. A link at the very end of the text is removed as well; the previous
    pattern required a trailing space and left such links in place.

    :param str: text to clean (the name shadows the builtin; kept for callers)
    :return: the text with links removed
    """
    # remove links
    return re.sub(r'https?://\S*(?: |$)', "", str)


def preprocess_document(text):
    """Tokenize ``text``, drop stop words and lemmatize what is left.

    Characters that are neither alphanumeric nor whitespace are treated as
    token separators. Stop words are matched against the raw token (before
    lemmatization) using the module-level ``stw``; every surviving token is
    replaced by the normal form of the first parse returned by the
    module-level ``morph``.

    :param text: text to process
    :return: list of normal forms, in their original order
    """
    cleaned_chars = []
    for ch in text:
        cleaned_chars.append(ch if (ch.isalnum() or ch.isspace()) else " ")
    tokens = ''.join(cleaned_chars).split()
    return [morph.parse(token)[0].normal_form for token in tokens if token not in stw]

start_time = datetime.now()

# Text-mode open() translates "\r\n" to "\n" while reading, so splitting on
# "\n" works for both CRLF and LF files; the with-block closes the handle.
with open('test_headlines_short.txt', mode="r", encoding="utf-8") as corpus:
    lines = corpus.read().lower().split('\n')
count = len(lines)

vectors = []

print("inferring vectors")
# Lower-cased headlines already seen; exact duplicates are skipped.
duplicate_dict = {}
# used_lines[k] is the headline whose inferred vector is vectors[k].
used_lines = []
for t in lines:
    if t not in duplicate_dict:
        duplicate_dict[t] = True
        used_lines.append(t)
        vectors.append(model.infer_vector(preprocess_document(t)))

print("done")

# k-means with cosine distance; cluster() is called with assign_clusters=True
# and its result is indexed in parallel with vectors / used_lines further down.
kclusterer = KMeansClusterer(NUM_CLUSTERS, distance=nltk.cluster.util.cosine_distance, repeats=25)
assigned_clusters = kclusterer.cluster(vectors, assign_clusters=True)
print('Cluster assigning done!')

end_time = datetime.now()
print('Duration: {}'.format(end_time - start_time))

inferring vectors
done
Cluster assigning done!
Duration: 0:18:38.780532


In [26]:
#with open('stopwords.txt', 'r', encoding='utf-8') as f:
#    txt = f.read()
#stw = txt.split('\n')

# Size of every cluster, indexed by cluster id; filled by distanceToCentroid().
clustersizes = []

def distanceToCentroid():
    """Print, for every cluster, the RMS cosine distance of its members to the centroid.

    Reads the module-level ``NUM_CLUSTERS``, ``assigned_clusters``, ``vectors``
    and ``kclusterer``. As a side effect ``clustersizes`` is refilled with the
    member count of each cluster; it is cleared in place first, so calling the
    function again does not append a second copy of the sizes.

    :return: None (results are printed)
    """
    clustersizes.clear()
    # Fetch the centroids once instead of on every loop iteration.
    centroids = kclusterer.means()
    for i in range(NUM_CLUSTERS):
        # Indices of the vectors assigned to cluster i.
        members = [j for j, label in enumerate(assigned_clusters) if label == i]
        clustersize = len(members)
        clustersizes.append(clustersize)
        dist = 0.0
        centr = centroids[i]
        # Mean of the squared distances, accumulated term by term; an empty
        # cluster skips the loop, so there is no division by zero.
        for j in members:
            dist += pow(nltk.cluster.util.cosine_distance(vectors[j], centr), 2) / clustersize
        dist = math.sqrt(dist)
        print("distance cluster: "+str(i)+" RMSE: "+str(dist)+" clustersize: "+str(clustersize))



def get_titles_by_cluster(id):
    """Return the headlines whose vectors were assigned to cluster ``id``.

    Position ``k`` of the module-level ``assigned_clusters`` is matched with
    position ``k`` of the module-level ``used_lines``.

    :param id: cluster index to select (the name shadows the builtin; kept for callers)
    :return: list of matching headlines, in corpus order
    """
    return [used_lines[pos] for pos, label in enumerate(assigned_clusters) if label == id]

def get_topics(titles, stw):
    """Print the five most frequent tokens found across ``titles``.

    Each title is run through ``preprocess_document``; tokens contained in
    ``stw`` are ignored when counting.

    :param titles: iterable of headline strings
    :param stw: collection of tokens to exclude from the counts
    :return: None (the ``(token, count)`` pairs are printed)
    """
    from collections import Counter

    tally = Counter()
    for title in titles:
        for token in preprocess_document(title):
            if token not in stw:
                tally[token] += 1
    print(tally.most_common()[:5])


def cluster_to_topics(id):
    """Print the most frequent tokens of the headlines in cluster ``id``.

    :param id: cluster index (the name shadows the builtin; kept for callers)
    :return: None
    """
    titles = get_titles_by_cluster(id)
    get_topics(titles, stw)

In [11]:
# Print the top tokens of every cluster. The bound comes from NUM_CLUSTERS so
# the loop stays in sync with the clustering configuration instead of
# repeating the literal 30.
for clstr in range(NUM_CLUSTERS):
    cluster_to_topics(clstr)

[('рубль', 9), ('миллиард', 8), ('рассказать', 7), ('долг', 4), ('доллар', 4)]
[('теракт', 14), ('иго', 9), ('заявить', 8), ('турция', 7), ('боевик', 7)]
[('каталония', 8), ('оппозиция', 7), ('заявить', 5), ('венесуэла', 5), ('поддержка', 4)]
[('рубль', 11), ('миллион', 7), ('вырасти', 7), ('миллиард', 7), ('2017', 6)]
[('память', 4), ('рассказать', 4), ('игра', 4), ('путин', 4), ('назвать', 4)]
[('метро', 7), ('московский', 6), ('станция', 5), ('поезд', 4), ('петербург', 4)]
[('путин', 24), ('встреча', 22), ('сша', 12), ('мид', 11), ('тиллерсон', 10)]
[('рассказать', 9), ('рост', 8), ('путин', 7), ('нефть', 7), ('заявить', 7)]
[('выборы', 14), ('выбор', 12), ('макрон', 10), ('президент', 9), ('франция', 8)]
[('ес', 15), ('мид', 13), ('заявить', 12), ('сша', 11), ('сотрудничество', 9)]
[('полиция', 11), ('задержать', 10), ('ск', 6), ('убийство', 6), ('подозревать', 6)]
[('кндр', 35), ('сша', 26), ('заявить', 15), ('китай', 12), ('япония', 10)]
[('заявить', 11), ('путин', 10), ('рассказ

In [17]:
# Inspect the headlines assigned to cluster 8.
get_titles_by_cluster(8)

['в германии проходят парламентские выборы',
 'аналитик: скандал вокруг жены может помешать фийону в президентской гонке',
 'почти половина британцев поддержала отставку мэй, показал опрос',
 'академия наук франции призвала голосовать за макрона во втором туре',
 'по оценке телеканала france 2, фийон и меланшон не проходят во второй тур',
 'компромисс или раздор: как в молдавии утверждали смешанную систему выборов',
 'рейтинг ле пен упал на один пункт перед вторым туром выборов, показал опрос',
 'глава белоруссии поздравил жээнбекова с избранием президентом киргизии',
 'путин гарантировал абхазии независимость и самостоятельность',
 'москве импонирует победа вучича на выборах в сербии, заявили в кремле',
 'депутат бундестага: германия не забудет о роли коля в объединении страны',
 'путин: нет никаких доказательств вмешательства россии в выборы в сша',
 'лавров прокомментировал слова макрона про sputnik и rt',
 'более трети россиян надеются на улучшение жизни через пять лет, показал опр

In [27]:
# Print the size of every cluster and the RMS cosine distance of its members to the centroid.
distanceToCentroid()

distance cluster: 0 RMSE: 0.34556933743794466 clustersize: 54
distance cluster: 1 RMSE: 0.3528981227558469 clustersize: 63
distance cluster: 2 RMSE: 0.3390824584165182 clustersize: 40
distance cluster: 3 RMSE: 0.3220099381149507 clustersize: 78
distance cluster: 4 RMSE: 0.3634730756012227 clustersize: 70
distance cluster: 5 RMSE: 0.33283500446726594 clustersize: 58
distance cluster: 6 RMSE: 0.3100571960331825 clustersize: 80
distance cluster: 7 RMSE: 0.3241008842732013 clustersize: 84
distance cluster: 8 RMSE: 0.31014477452878925 clustersize: 63
distance cluster: 9 RMSE: 0.3242564380523431 clustersize: 102
distance cluster: 10 RMSE: 0.33650713890331685 clustersize: 59
distance cluster: 11 RMSE: 0.3269431076504982 clustersize: 87
distance cluster: 12 RMSE: 0.3558726625155311 clustersize: 61
distance cluster: 13 RMSE: 0.3365885567978864 clustersize: 65
distance cluster: 14 RMSE: 0.3093931754714953 clustersize: 124
distance cluster: 15 RMSE: 0.3249165101006558 clustersize: 90
distance clu