In [1]:
import nltk, math, codecs
from gensim.models import Doc2Vec
from nltk.cluster.kmeans import KMeansClusterer
import re
import pymorphy2
from datetime import datetime

# Path of the pre-trained Doc2Vec model. The file name appears to encode the
# training settings (PV-DBOW, window 2, min_count 6, vector size 80) — inferred
# from the name only, confirm against the training script.
fname = 'noStopLemma_PV-DBOW_wrd-vec_1it_2win_6mincount_alpha25-25_sz80.model'

# NOTE(review): gensim model files are pickle-based as far as I know — only load
# model files that come from a trusted source.
model = Doc2Vec.load(fname)

# Morphological analyzer used by preprocess_document() to obtain normal forms.
morph = pymorphy2.MorphAnalyzer()



In [2]:
# Load the stopword list: one entry per line of a UTF-8 text file.
with open('stopwords.txt', 'r', encoding='utf-8') as f:
    txt = f.read().split('\n')
# A set gives constant-time membership tests in preprocess_document().
stw = set(txt)

In [3]:
# Number of clusters passed to the k-means clusterer below; the values in the
# trailing comment look like settings tried in earlier runs.
NUM_CLUSTERS = 35 #25 #35 #25 #20 #40 #30 

def preprocess(str):
    """Strip http/https links from a string.

    Each link is removed together with one directly following space, if there
    is one. Unlike the earlier pattern, which required a literal space after
    the URL, this also removes links at the very end of the string or followed
    by a tab or newline (that whitespace is left in place).

    Args:
        str: Input text. The parameter name shadows the builtin ``str`` and is
            kept only so existing keyword callers keep working.

    Returns:
        The text with all links removed.
    """
    # remove links
    return re.sub(r'https?://\S* ?', "", str)


def preprocess_document(text):
    """Tokenize a headline and return the normal forms of its non-stopword tokens.

    Every character that is neither alphanumeric nor whitespace is replaced by
    a space, the result is split on whitespace, tokens found in the module-level
    stopword set ``stw`` are dropped, and each remaining token is replaced by
    the ``normal_form`` of its first ``morph.parse`` result.

    Note that the stopword test is applied to the raw token, before it is
    normalized. Link stripping via ``preprocess`` is not applied here.

    Args:
        text: The headline string.

    Returns:
        List of normalized tokens, in their original order.
    """
    cleaned_chars = []
    for ch in text:
        keep = ch.isalnum() or ch.isspace()
        cleaned_chars.append(ch if keep else " ")
    tokens = ''.join(cleaned_chars).split()
    return [morph.parse(tok)[0].normal_form for tok in tokens if tok not in stw]

# Timestamp used to report the total duration of inference + clustering.
start_time = datetime.now()

#data = <sparse matrix that you would normally give to scikit>.toarray()

# Read the whole headline file into memory. The context manager closes the
# handle right after reading (previously it was never closed).
# codecs.open does no newline translation, so the text is split on the literal
# '\r\n' sequence; a file with bare '\n' endings would come back as one line.
with codecs.open('test_headlines_short.txt', mode="r", encoding="utf-8") as corpus:
    lines = corpus.read().lower().split('\r\n')
count = len(lines)

vectors = []

print("inferring vectors")
# duplicate_dict records headlines already seen so each distinct headline is
# inferred only once; used_lines[k] is the headline behind vectors[k].
duplicate_dict = {}
used_lines = []
for i, t in enumerate(lines):
    # (an `i % 2 == 0` subsampling condition was disabled here)
    if t not in duplicate_dict:
        duplicate_dict[t] = True
        used_lines.append(t)
        vectors.append(model.infer_vector(preprocess_document(t)))

print("done")



# k-means over the inferred vectors using cosine distance, 20 repeats.
kclusterer = KMeansClusterer(NUM_CLUSTERS, distance=nltk.cluster.util.cosine_distance, repeats=20)
# assigned_clusters[k] is the cluster id assigned to vectors[k] / used_lines[k].
assigned_clusters = kclusterer.cluster(vectors, assign_clusters=True)
print('Cluster assigning done!')

end_time = datetime.now()
print('Duration: {}'.format(end_time - start_time))

# Did you overfit or what???

inferring vectors
done
Cluster assigning done!
Duration: 0:19:41.869816


In [4]:
def distanceToCentroid():
    """Print, for every cluster, the RMS cosine distance of its members to the centroid.

    For each cluster id in ``range(NUM_CLUSTERS)`` the function counts the
    members, appends that count to the module-level list ``clustersizes``
    (which must already exist when this is called), and prints the square root
    of the mean squared cosine distance between the member vectors and the
    cluster mean, together with the cluster size.

    Returns:
        None. Results are printed and the sizes are appended to ``clustersizes``.
    """
    for cluster_id in range(NUM_CLUSTERS):
        # Positions in `vectors` that were assigned to this cluster.
        member_positions = [
            pos for pos, label in enumerate(assigned_clusters) if label == cluster_id
        ]
        clustersize = len(member_positions)
        clustersizes.append(clustersize)

        centr = kclusterer.means()[cluster_id]
        dist = 0.0
        for pos in member_positions:
            # Each squared distance is divided by the size as it is accumulated.
            dist += pow(nltk.cluster.util.cosine_distance(vectors[pos], centr), 2) / clustersize
        dist = math.sqrt(dist)
        print("distance cluster: "+str(cluster_id)+" RMSE: "+str(dist)+" clustersize: "+str(clustersize))

def nClosestToCentroid(cluster_id, n):
    """Return the ``n`` headlines of a cluster that lie nearest to its centroid.

    Args:
        cluster_id: Id of the cluster, used as an index into ``kclusterer.means()``.
        n: Number of entries to keep; applied as the slice ``[:n]``.

    Returns:
        List of ``(headline, cosine_distance_to_centroid)`` tuples sorted by
        ascending distance.
    """
    centr = kclusterer.means()[cluster_id]
    ranked = [
        (used_lines[pos], nltk.cluster.util.cosine_distance(vectors[pos], centr))
        for pos, label in enumerate(assigned_clusters)
        if label == cluster_id
    ]
    # Stable in-place sort by distance; ties keep their original order.
    ranked.sort(key=lambda pair: pair[1])
    return ranked[:n]


def get_titles_by_cluster(id):
    """Collect the headlines that were assigned to one cluster.

    Args:
        id: Cluster id to select. The name shadows the builtin ``id`` and is
            kept for compatibility with existing callers.

    Returns:
        List of headlines from ``used_lines`` whose entry in
        ``assigned_clusters`` equals ``id``, in their original order.
    """
    return [
        used_lines[pos]
        for pos, label in enumerate(assigned_clusters)
        if label == id
    ]

def get_topics(titles):
    """Print the five most frequent normalized tokens across the given headlines.

    Each headline is run through ``preprocess_document`` (which already drops
    stopwords), the resulting tokens are counted, and the first five
    ``(token, count)`` pairs of the frequency ranking are printed.

    Args:
        titles: Iterable of headline strings.

    Returns:
        None. The result is printed.
    """
    from collections import Counter

    # Tokenize every headline first, then count tokens in headline order.
    tokenized = [preprocess_document(title) for title in titles]
    token_counts = Counter()
    for tokens in tokenized:
        token_counts.update(tokens)
    print(token_counts.most_common()[:5])


def cluster_to_topics(id):
    """Print the most frequent tokens among the headlines of cluster ``id``."""
    cluster_titles = get_titles_by_cluster(id)
    get_topics(cluster_titles)

In [15]:
# 35 clusters, 20 repeats
# distanceToCentroid() appends to this module-level list, so it has to exist
# before that function is called.
clustersizes = []
# Preview every cluster: print the third of its headlines (rounded) that lie
# closest to the centroid, each with its cosine distance.
for clstr in range(NUM_CLUSTERS):
    n = round(len(get_titles_by_cluster(clstr))/3)
    print(str(clstr)+'\t', nClosestToCentroid(clstr, n))
    print('####\n')

0	 [('в каталонии задержали мужчину, ранившего полицейских', 0.14698683472591556), ('в москве арестовали мужчину, обвиняемого в изнасиловании девочки в подъезде', 0.16682536250271818), ('в нижнем новгороде мужчину подозревают в убийства соседа, нагрубившего его бабушке', 0.17747970462095719), ('в петербурге задержали подростка, стрелявшего из пневматики по трамваю', 0.17981296384948198), ('одного из задержанных в ходе расследования теракта в манчестере освободили', 0.19579815396133216), ('задержанный по подозрению в подготовке теракта в вене признал связь с иг', 0.20175297506031076), ('в зеленограде мужчина устроил стрельбу в кафе', 0.20240975055330346), ('в петербурге задержали организаторов тренингов из-за самоубийства участницы', 0.20613291640880627), ('в мексике задержали предполагаемого убийцу модели-иностранки', 0.21098254750655565), ('в чечне торговцев оружием подозревают в связях с террористами', 0.21178258065621003), ('в балашихе арестовали мужчину, агитировавшего в соцсети за

In [43]:
# Manually chosen ids of the clusters that look polemic; type the ids into this list.
# NOTE(review): these ids refer to the clusters of one particular clustering run —
# they presumably have to be re-selected whenever the clustering is re-run.
indxs = [6,7,12,16,20,21,24,30,33,34]
# all_polemics collects (cluster_id, headlines) pairs for the chosen clusters.
all_polemics = []
for indx in indxs:
    arr = []
    # Asking for as many entries as the cluster has members returns every
    # headline of the cluster, ordered by distance to the centroid.
    n = len(get_titles_by_cluster(indx))
    head_dists = nClosestToCentroid(indx, n)
    for head in head_dists:
        # Keep only the headline text; drop the distance.
        arr.append(head[0])
    all_polemics.append((indx, arr))

In [49]:
# Print each selected cluster id together with its number of headlines.
for topic in all_polemics:
    print(topic[0],len(topic[1]))

6 60
7 52
12 59
16 59
20 79
21 107
24 44
30 104
33 86
34 67


In [46]:
# Write the selected clusters to a text file. Each cluster becomes one block:
# the cluster id on the first line, one headline per following line, and an
# empty line that terminates the block.
with open('polemic_headlines_by_themes.txt', 'w', encoding='utf-8') as f:
    for topic in all_polemics:
        f.write(str(topic[0])+'\n')
        for headline in topic[1]:
            f.write(headline+'\n')
        f.write('\n')

# Manual step: don't forget to clean the written output file by hand, removing
# noisy (off-topic) headlines.

In [52]:
# Read the topic file back; topic blocks are separated by an empty line.
# The writer terminates every block — including the last one — with an empty
# line, so a plain split leaves an empty trailing chunk on an unedited file.
# Whitespace-only chunks are therefore dropped.
with open('polemic_headlines_by_themes.txt', 'r', encoding='utf-8') as f1:
    topics = [chunk for chunk in f1.read().split('\n\n') if chunk.strip()]

In [53]:
# Number of topic blocks read back from the file.
len(topics)

10

In [55]:
# For each topic block print its first line (the cluster id written by the
# export cell) and the number of lines that follow it.
for topic in topics:
    # Rebinds the loop variable from the raw block string to its list of lines.
    topic = topic.split('\n')
    print(topic[0], len(topic[1:]))

6 18
7 28
12 13
16 7
20 32
21 20
24 8
30 68
33 17
34 17


In [57]:
# Load another text file into a list with one entry per line.
# Presumably all headlines with their links, de-duplicated — inferred from the
# file name only, confirm against the file.
with open('all_heads_links_nodups.txt', 'r', encoding='utf-8') as f1:
    just_all = f1.read().split('\n')

JSONDecodeError: Expecting value: line 1 column 2 (char 1)