In [4]:
from gensim.models import LdaMulticore
from os import path
from collections import defaultdict
from nltk import word_tokenize
from nltk.corpus import stopwords
from gensim.corpora import Dictionary
from random import choice

import re
import numpy as np
import pyLDAvis.gensim

# Russian stop-word list taken from NLTK's `stopwords` corpus; used below to
# filter tokens out of each citation.
# NOTE(review): presumably requires the corpus to be available locally
# (e.g. via nltk.download('stopwords')) — confirm for a fresh environment.
ru_stopwords = stopwords.words('russian')

In [15]:
# Maps each key (the first space-delimited field of a line) to a list of
# tokenized citations, one token list per input line, with Russian stop words
# removed.
citations = defaultdict(list)

# `ru_stopwords` is a list, so `word not in ru_stopwords` is a linear scan for
# every token; a set makes the membership test constant-time.
ru_stopword_set = set(ru_stopwords)

# The encoding is given explicitly so that reading Cyrillic text does not
# depend on the platform's locale default.
# NOTE(review): assumes the data file is UTF-8 encoded — confirm.
with open(path.join('..', 'data', 'fixes_handle_only.stem.txt'), encoding='utf-8') as f:
    for text in f:
        try:
            key, citation = text.split(' ', 1)
        except ValueError:
            # The line contains no space, i.e. it has a key but no citation
            # text (or is blank); skip it. Only the unpacking is guarded so
            # that unrelated errors raised during tokenization are not hidden.
            continue
        citations[key].append(
            [word for word in word_tokenize(citation) if word not in ru_stopword_set]
        )

In [36]:
def pretty_print_topics(topics):
    """Turn topic descriptions into a readable summary.

    Args:
        topics: iterable of pairs whose second element is a string of terms
            joined by ' + ', each term containing a double-quoted word
            (e.g. '0.050*"foo" + 0.040*"bar"').

    Returns:
        A tuple ``(pretty_output, topics_list)``. ``topics_list`` holds one
        comma-separated string of words per topic, in input order.
        ``pretty_output`` is the concatenation of 'Topic <position>: <words>; '
        for every topic, where <position> is the topic's index in the input.

    Raises:
        IndexError: if a term contains no double-quoted word.
    """
    quoted_word = re.compile('"([^"]*)"')

    topics_list = []
    for entry in topics:
        terms = entry[1].split(' + ')
        # Only the first quoted string of each term is kept.
        words = [quoted_word.findall(term)[0] for term in terms]
        topics_list.append(', '.join(words))

    pretty_output = ''.join(
        'Topic {}: {}; '.format(position, words)
        for position, words in enumerate(topics_list)
    )
    return pretty_output, topics_list

In [65]:
def print_topics_by_ids(ids, topic_list):
    """Describe a topic distribution as text.

    Args:
        ids: iterable of ``(topic_index, probability)`` pairs.
        topic_list: sequence indexed by ``topic_index`` giving the text to
            show for each topic.

    Returns:
        The concatenation of '<topic text>, probability: <p>; ' for every
        pair, with the probability formatted to two decimal places. An empty
        ``ids`` gives an empty string.
    """
    return ''.join(
        '{}, probability: {:0.2f}; '.format(topic_list[topic_index], probability)
        for topic_index, probability in ids
    )

In [18]:
import warnings
# Installs an 'ignore' filter with no category or message restriction, so every
# warning raised after this cell runs is suppressed for the rest of the session.
# NOTE(review): presumably meant to silence library noise during model
# training; it also hides genuine deprecation/convergence warnings — confirm
# that a blanket filter is intended.
warnings.filterwarnings('ignore')

In [None]:
# Model settings gathered in one place so the values passed to the model and
# to `print_topics` cannot drift apart.
NUM_TOPICS = 3
NUM_WORDS = 5
# Fixed seed so repeated runs start from the same initial state. With several
# worker processes results may still vary slightly between runs.
RANDOM_SEED = 42

# topics:      key -> one-line summary of the topics fitted for that key.
# topics_dist: key -> list with one topic-distribution string per citation.
topics = {}
topics_dist = defaultdict(list)

# A separate LDA model is fitted for every key, using only that key's citations.
for key, citation in citations.items():
    dictionary = Dictionary(citation)
    if len(dictionary) == 0:
        # All citations for this key are empty after stop-word removal. The
        # model cannot be built on an empty vocabulary (gensim raises
        # ValueError), which would abort the whole loop, so skip the key.
        continue
    bow_corpus = [dictionary.doc2bow(doc) for doc in citation]
    lda_model = LdaMulticore(
        bow_corpus,
        num_topics=NUM_TOPICS,
        id2word=dictionary,
        passes=2,
        workers=2,
        random_state=RANDOM_SEED,
    )
    topics[key], topics_list = pretty_print_topics(
        lda_model.print_topics(num_topics=NUM_TOPICS, num_words=NUM_WORDS)
    )
    # One distribution string per citation, in the same order as the input.
    for bow in bow_corpus:
        topics_dist[key].append(print_topics_by_ids(lda_model[bow], topics_list))

#     visdata = pyLDAvis.gensim.prepare(lda_model, bow_corpus, dictionary)
#     pyLDAvis.save_html(visdata, path.join('..', 'data', 'vis', '{}_vis.html'.format(key)))

In [None]:
# Write one line per key in the form "<key> <topic summary>".
# The encoding is explicit because the summaries contain Cyrillic words and the
# platform default encoding may be unable to represent them.
# NOTE(review): the input file is read from '../data' but this writes to
# 'data' relative to the working directory — confirm the intended location.
with open(path.join('data', 'topics_lda.txt'), 'w', encoding='utf-8') as f:
    for key, topic in topics.items():
        f.write('{} {}\n'.format(key, topic))

In [None]:
# Write one line per (key, citation) pair in the form
# "<key> <topic distribution>"; a key appears once for each of its citations.
# The encoding is explicit because the text contains Cyrillic words and the
# platform default encoding may be unable to represent them.
# NOTE(review): the input file is read from '../data' but this writes to
# 'data' relative to the working directory — confirm the intended location.
with open(path.join('data', 'topics_lda_dist.txt'), 'w', encoding='utf-8') as f:
    # Iterate the mapping directly; copying it into a plain dict first is unnecessary.
    for key, topics_ in topics_dist.items():
        for topic in topics_:
            f.write('{} {}\n'.format(key, topic))