In [77]:
from gensim.models import LdaMulticore
from os import path
from collections import defaultdict
from nltk import word_tokenize
from nltk.corpus import stopwords
from gensim.corpora import Dictionary
from random import choice

import re
import numpy as np
import pyLDAvis.gensim

# Russian stop words from NLTK's stopwords corpus (the call returns a list of strings).
ru_stopwords = stopwords.words('russian')

In [33]:
# Group tokenized citations by key. Each input line is split once on the first
# space into "<key> <citation text>"; the text is tokenized and stop words dropped.
# NOTE(review): the file is opened with the platform default encoding. The text is
# presumably Russian (it is filtered with Russian stop words) -- confirm the file's
# actual encoding and pass it explicitly if it differs from the locale default.
citations = defaultdict(list)

# Set copy hoisted out of the loop: membership tests against the original list
# are linear in its length and run once per token.
ru_stopword_set = set(ru_stopwords)

with open(path.join('data', 'fixes_handle_only.stem.txt')) as f:
    for text in f:
        try:
            key, citation = text.split(' ', 1)
        except ValueError:
            # No space on this line (e.g. blank or key-only), so there is no
            # citation text to record. Only the split is guarded so that
            # unrelated errors from tokenization are not silently hidden.
            continue
        citations[key].append(
            [word for word in word_tokenize(citation) if word not in ru_stopword_set]
        )

In [71]:
def pretty_print_topics(topics):
    """Condense a sequence of topic entries into one summary string.

    Element 1 of every entry in ``topics`` is treated as a string of terms
    separated by ' + '. From each term the first double-quoted substring is
    kept. Topics are numbered by their position in ``topics``.

    Returns:
        The concatenation of 'Topic <i>: <w1>, <w2>, ...; ' fragments in
        input order ('' when ``topics`` is empty).

    Raises:
        IndexError: if a term contains no double-quoted substring.
    """
    quoted = re.compile('"([^"]*)"')
    fragments = []
    for position, entry in enumerate(topics):
        words = []
        for term in entry[1].split(' + '):
            words.append(quoted.findall(term)[0])
        fragments.append('Topic {}: {}; '.format(position, ', '.join(words)))
    return ''.join(fragments)

In [None]:
# Fit a small LDA model per key and keep a one-line topic summary for each.
NUM_TOPICS = 3        # topics fitted per key
WORDS_PER_TOPIC = 5   # top words kept in each topic summary

topics = {}

for key, citation in citations.items():
    # `citation` is the list of token lists collected for this key.
    dictionary = Dictionary(citation)
    if len(dictionary) == 0:
        # Every document for this key is empty, so there is no vocabulary.
        # gensim raises ValueError when asked to fit LDA over zero terms,
        # which would abort the whole loop; skip the key instead.
        continue
    bow_corpus = [dictionary.doc2bow(doc) for doc in citation]
    lda_model = LdaMulticore(bow_corpus, num_topics=NUM_TOPICS, id2word=dictionary, passes=2, workers=2)
    # Request exactly the number of topics that were fitted; asking for that
    # many (or more) makes gensim return all of them.
    topics[key] = pretty_print_topics(lda_model.print_topics(num_topics=NUM_TOPICS, num_words=WORDS_PER_TOPIC))

In [76]:
# Persist the summaries, one "<key> <topic summary>" line per key.
# UTF-8 is set explicitly: the summaries presumably contain non-ASCII (Russian)
# words, and the platform default codec may be unable to encode them.
with open(path.join('data', 'topics.txt'), 'w', encoding='utf-8') as f:
    for key, topic in topics.items():
        f.write('{} {}\n'.format(key, topic))

In [82]:
# Fit LDA for one randomly chosen key and export a pyLDAvis HTML page for it.
random_key = choice(list(citations.keys()))
dictionary = Dictionary(citations[random_key])
bow_corpus = [dictionary.doc2bow(doc) for doc in citations[random_key]]
lda_model = LdaMulticore(bow_corpus, num_topics=3, id2word=dictionary, passes=2, workers=2)
# Store the summary under the key that was actually modelled. Using a leftover
# loop variable here would overwrite an unrelated entry in `topics`.
topics[random_key] = pretty_print_topics(lda_model.print_topics(num_topics=10, num_words=5))
visdata = pyLDAvis.gensim.prepare(lda_model, bow_corpus, dictionary)
pyLDAvis.save_html(visdata, path.join('data', '{}_vis.html'.format(random_key)))