In [1]:
from sklearn import cluster
import networkx as nx
from collections import defaultdict
from sklearn.metrics.cluster import normalized_mutual_info_score
from sklearn.metrics.cluster import adjusted_rand_score
import pickle
from datetime import datetime
from sklearn.cluster import KMeans

In [2]:
# Load the pickled `polemic_comments` object from the working directory.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so this
# file must come from a trusted source.
with open ('polemic_comments.pkl', 'rb') as fp:
    polemic_comments = pickle.load(fp)

In [4]:
# Count cluster contents and collect the comment texts of every cluster.
# Each entry of polemic_comments is indexed as (cluster_id, articles); each
# article keeps its comment groups at index 2, and each group keeps an iterable
# of comment texts at index 1.
only_comments_by_cluster = {}
#prints 'cluster_id', 'num_articles_in_cluster', 'num_comments_in_cluster', 'mean_comments_in_cluster'
for clstr in polemic_comments:
    clstr_comments = []
    cntr_articles = 0
    for article in clstr[1]:
        cntr_articles += 1
        # An empty comment array simply contributes nothing, so no length check is needed.
        for inner_tuple in article[2]:
            clstr_comments.extend(inner_tuple[1])
    cntr_comments = len(clstr_comments)
    # A cluster without articles has no meaningful mean; report NaN instead of
    # crashing with ZeroDivisionError.
    mean_comments = cntr_comments/cntr_articles if cntr_articles else float('nan')
    print(clstr[0],'\t',cntr_articles,'\t',cntr_comments,'\t',mean_comments)
    only_comments_by_cluster[clstr[0]] = clstr_comments

6 	 18 	 74 	 4.111111111111111
7 	 28 	 87 	 3.107142857142857
12 	 13 	 30 	 2.3076923076923075
16 	 7 	 31 	 4.428571428571429
20 	 32 	 443 	 13.84375
21 	 20 	 248 	 12.4
24 	 8 	 47 	 5.875
30 	 68 	 1521 	 22.36764705882353
33 	 17 	 985 	 57.94117647058823
34 	 16 	 334 	 20.875


In [5]:
import nltk, math, codecs
from gensim.models import Doc2Vec
import re
import pymorphy2
from datetime import datetime

# File name of the saved Doc2Vec model, resolved relative to the working directory.
# NOTE(review): the name appears to encode the training settings (PV-DBOW,
# window 2, min_count 6, vector size 80) — presumably; confirm against the training run.
fname = 'noStopLemma_PV-DBOW_wrd-vec_1it_2win_6mincount_alpha25-25_sz80.model'

# Load the trained model; later cells call model.infer_vector on preprocessed comments.
model = Doc2Vec.load(fname)

# Morphological analyzer; preprocess_document uses it to obtain the normal form of each token.
morph = pymorphy2.MorphAnalyzer()



In [6]:
# Read the stop-word file, split it on '\n', and keep the entries in a set so
# that the `not in stw` checks in preprocess_document are cheap.
with open('stopwords.txt', 'r', encoding='utf-8') as f:
    txt = f.read().split('\n')
stw = set(txt)

In [8]:
def distanceToCentroid():
    '''Return, for each cluster, the root-mean-square cosine distance to its centroid.

    Reads the module-level NUM_CLUSTERS, assigned_clusters, vectors and km.
    As a side effect, appends every cluster's member count to the module-level
    clustersizes list. A cluster without members yields 0.0.
    '''
    rms_distances = []
    for cluster_idx in range(NUM_CLUSTERS):
        # Positions of all vectors that k-means assigned to this cluster.
        member_positions = [
            pos for pos in range(len(assigned_clusters))
            if assigned_clusters[pos] == cluster_idx
        ]
        member_count = len(member_positions)
        clustersizes.append(member_count)

        centroid = km.cluster_centers_[cluster_idx]
        # Accumulate the mean of the squared distances term by term.
        mean_squared = 0.0
        for pos in member_positions:
            gap = nltk.cluster.util.cosine_distance(vectors[pos], centroid)
            mean_squared += gap ** 2 / member_count
        rms_distances.append(math.sqrt(mean_squared))
    return rms_distances

def nClosestToCentroid(cluster_id, n):
    '''Return the n lines of cluster `cluster_id` that are closest to its centroid.

    Reads the module-level km, assigned_clusters, used_lines and vectors.

    cluster_id -- index of the k-means cluster to inspect
    n          -- maximum number of entries to return

    Returns a list of (line, cosine_distance) tuples sorted by ascending
    distance; it is shorter than n when the cluster has fewer members.
    '''
    # Look the centroid up with the parameter. The previous code read the
    # unrelated global `clstr`, which only worked when the caller's loop
    # variable happened to carry that name and value.
    centr = km.cluster_centers_[cluster_id]
    distances = [
        (used_lines[j], nltk.cluster.util.cosine_distance(vectors[j], centr))
        for j in range(len(assigned_clusters))
        if assigned_clusters[j] == cluster_id
    ]
    distances.sort(key=lambda tup: tup[1])
    return distances[:n]


def get_titles_by_cluster(id):
    '''Return the entries of used_lines whose assigned cluster equals `id`, in order.'''
    return [
        used_lines[pos]
        for pos, label in enumerate(assigned_clusters)
        if label == id
    ]

def get_topics(titles):
    '''Print the five most frequent preprocessed words found in `titles`.

    Every title is passed through preprocess_document; no further stop-word
    filtering is applied here. Nothing is returned.
    '''
    from collections import Counter
    frequencies = Counter()
    for title in titles:
        # Counting title by title keeps the same first-seen order for ties.
        frequencies.update(preprocess_document(title))
    print(frequencies.most_common()[:5])


def cluster_to_topics(id):
    '''Print the words that best describe cluster `id`.

    Collects the cluster's lines with get_titles_by_cluster and hands them to
    get_topics; the result is printed, not returned.
    '''
    cluster_titles = get_titles_by_cluster(id)
    get_topics(cluster_titles)

In [21]:
def preprocess(str):
    '''Return `str` with http/https URLs removed.

    A single space directly after a URL is removed together with it. The old
    pattern required that trailing space, so a URL at the very end of the text
    or followed by a newline or tab was left in place; the space is now optional.
    '''
    # \S* consumes the whole URL; the optional space keeps the old result for
    # URLs that are followed by a space.
    return re.sub(r'https?://\S* ?', "", str)


def preprocess_document(text):
    '''Tokenize `text` and return the normal forms of its non-stop-word tokens.

    Every character that is neither alphanumeric nor whitespace is replaced by
    a space, the result is split on whitespace, tokens found in the
    module-level stw set are dropped, and each remaining token is replaced by
    the normal form of its first morph.parse result.
    '''
    cleaned_chars = []
    for ch in text:
        keep = ch.isalnum() or ch.isspace()
        cleaned_chars.append(ch if keep else " ")
    tokens = "".join(cleaned_chars).split()
    # The stop-word check is applied to the raw token, before lemmatization.
    return [morph.parse(tok)[0].normal_form for tok in tokens if tok not in stw]

# Cluster the comments of every selected polemic cluster into themes and keep,
# for each theme, the comments closest to the k-means centroid.
start_time = datetime.now()

comm_clusters = []    # list of (polemic cluster id, list of per-theme comment lists)
chosen_clstrs = ['20', '21', '30', '33', '34']
NUM_CLUSTERS = 5      # number of k-means themes per polemic cluster
divider = 6           # keep round(theme size / divider) comments per theme
RANDOM_SEED = 42      # fixed k-means seed so the clustering step is repeatable

for polem_clstr in chosen_clstrs:
    # Lower-case into a new list so the lists cached in only_comments_by_cluster
    # are not modified by this cell.
    lines = [comment.lower() for comment in only_comments_by_cluster[polem_clstr]]

    print("inferring vectors")
    # Drop duplicate comments while keeping first-seen order; vectors[k] is the
    # inferred vector of used_lines[k].
    used_lines = list(dict.fromkeys(lines))
    # NOTE(review): Doc2Vec.infer_vector may itself be non-deterministic between
    # runs; seeding k-means only fixes the clustering step — confirm.
    vectors = [model.infer_vector(preprocess_document(t)) for t in used_lines]

    # Reset per polemic cluster; distanceToCentroid appends to this list.
    clustersizes = []
    km = KMeans(n_clusters=NUM_CLUSTERS, init='k-means++', max_iter=100, n_init=8,
                random_state=RANDOM_SEED)
    km.fit(vectors)
    assigned_clusters = km.labels_

    theme_clstrs = []
    for clstr in range(NUM_CLUSTERS):
        # Number of representative comments scales with the theme's size.
        n = round(len(get_titles_by_cluster(clstr))/divider)
        nclose = nClosestToCentroid(clstr, n)
        # Keep only the comment text of each (text, distance) pair.
        theme_clstrs.append([close_comm[0] for close_comm in nclose])
    comm_clusters.append((polem_clstr, theme_clstrs))
print("done")

inferring vectors
inferring vectors
inferring vectors
inferring vectors
inferring vectors
done


In [30]:
# Write every polemic cluster id followed by its theme groups, one comment per
# line, with a blank line after each theme group and after each polemic cluster.
with open('comments_by_themes.txt', 'w', encoding='utf-8') as f:
    for topic_id, theme_groups in comm_clusters:
        f.write(str(topic_id)+'\n')
        for counter, comments in enumerate(theme_groups):
            f.write('Cluster_id {}\n'.format(str(counter)))
            f.writelines(comm_cluster+'\n' for comm_cluster in comments)
            f.write('\n')
        f.write('\n')

#don't forget to clean output files from noise