In [None]:
import re
from collections import defaultdict
from json import dump
from os import path

from gensim.corpora import Dictionary
from gensim.models import LdaMulticore
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from pymorphy2 import MorphAnalyzer
from textdistance import levenshtein

# Russian stop-word list shipped with the NLTK corpus.
ru_stopwords = stopwords.words('russian')
# Raw string so `\w` reaches the regex engine untouched instead of relying on
# Python passing an unrecognised string escape through. Ё/ё are listed
# explicitly because they lie outside the А-я code-point range; without them
# a word starting with ё would be tokenised with its first letter missing.
# The pattern still requires at least two characters per token.
alpha_tokenizer = RegexpTokenizer(r'[A-Za-zА-Яа-яЁё]\w+')
# Morphological analyser used to reduce tokens to their normal form.
morph = MorphAnalyzer()

In [None]:
# Read the whole citation-bundle dump into memory.
# The encoding is stated explicitly so decoding does not depend on the
# platform locale. NOTE(review): assumes the file is UTF-8 — confirm.
with open(path.join('..', 'data', 'citcon4bundles.txt'), 'r', encoding='utf-8') as f:
    data = f.read()

In [None]:
# One entry per newline-separated row of the raw dump; a trailing newline in
# the file leaves an empty string as the final entry.
lines = data.split(sep='\n')

In [None]:
# Maps context-group id -> {citation code -> list of normalised tokens}.
context_groups = defaultdict(dict)
# Raw lines that could not be split into the expected fields.
errors = []

for line in lines:
    try:
        # First field is the context group; the remainder holds three code
        # fields followed by the free-text citation.
        context_group, text = line.split(' ', 1)
        # Unpacking the four fields (instead of indexing the split result)
        # turns a too-short line into ValueError, so it is collected in
        # `errors` rather than aborting the loop with IndexError.
        code_a, code_b, code_c, citation_raw = text.split(' ', 3)
        # Lower-case before the stop-word test so capitalised stop words
        # (e.g. at the start of a sentence) are filtered out as well.
        tokens = [token.lower() for token in alpha_tokenizer.tokenize(citation_raw)]
        citation_text = [
            morph.parse(token)[0].normal_form
            for token in tokens
            if token not in ru_stopwords
        ]
        citation_code = '_'.join((code_a, code_b, code_c))
        context_groups[context_group][citation_code] = citation_text
    except ValueError:
        errors.append(line)

In [None]:
def pretty_print_topics(topics):
    """Condense topic descriptions into plain comma-separated term lists.

    Args:
        topics: iterable of sequences whose second item is a description
            string made of ' + '-separated pieces, each carrying one term in
            double quotes, e.g. ``'0.050*"cat" + 0.040*"dog"'``.

    Returns:
        A ``(pretty_output, topics_list)`` tuple. ``topics_list`` holds one
        ``'term, term, ...'`` string per topic, in input order.
        ``pretty_output`` is the concatenation of ``'Topic <i>: <terms>; '``
        for every topic, where ``<i>`` is the position in the input.
    """
    topics_list = []
    for topic in topics:
        terms = []
        for piece in topic[1].split(' + '):
            # Only the first quoted term of each piece is used; a piece with
            # no quoted term is skipped instead of raising IndexError.
            match = re.search('"([^"]*)"', piece)
            if match:
                terms.append(match.group(1))
        topics_list.append(', '.join(terms))

    # Single join rather than repeated string concatenation.
    pretty_output = ''.join(
        'Topic {}: {}; '.format(i, topic) for i, topic in enumerate(topics_list)
    )
    return pretty_output, topics_list

In [None]:
def print_topics_by_ids(old_lda_topics, ids, topic_list, ref_key):
    """Describe the topic assignments of a single citation.

    Args:
        old_lda_topics: currently unused; kept in the signature for callers.
        ids: iterable of ``(topic_index, probability)`` pairs.
        topic_list: sequence of topic descriptions indexed by topic index.
        ref_key: identifier of the citation the assignments belong to.

    Returns:
        A list with one dict per pair in ``ids``, holding ``ref_key``, the
        matching entry of ``topic_list`` under ``topic``, and the probability
        rounded to two decimals and rendered as a string.
    """
    return [
        {
            'ref_key': ref_key,
            'topic': topic_list[topic_index],
            'probability': str(round(weight, 2)),
        }
        for topic_index, weight in ids
    ]

In [None]:
# Load the earlier LDA topic distribution, one record per line.
# The encoding is stated explicitly so decoding does not depend on the
# platform locale. NOTE(review): assumes the file is UTF-8 — confirm.
with open(path.join('..', 'data', 'topics_lda_dist.txt'), 'r', encoding='utf-8') as f:
    lda_dist = f.read().split('\n')

In [None]:
# Maps a citing id (the text after 'citing:' in a line's first field) to a
# comma-separated string of every topic description recorded for that id.
old_lda_topics = {}

for line in lda_dist:
    # Skip blank entries (e.g. the empty string left by a trailing newline)
    # instead of blindly dropping the last row, which may hold real data.
    if not line.strip():
        continue
    key, text = line.split(' ', 1)
    citing_id = key.split('citing:')[1]
    # Keep only the topic description that precedes the probability part.
    topic_text = text.split(', probability')[0]
    # Look up the same key that is stored, in the dict that is being filled,
    # so repeated ids accumulate their topics; every line is processed.
    if citing_id in old_lda_topics:
        old_lda_topics[citing_id] = '{}, {}'.format(old_lda_topics[citing_id], topic_text)
    else:
        old_lda_topics[citing_id] = topic_text

In [None]:
import warnings

# Install a blanket "ignore" filter: from here on, warnings of every category
# and from every module are suppressed for the rest of the session.
warnings.simplefilter('ignore')

In [None]:
# Fixed seed handed to the LDA model so repeated runs start from the same
# random initialisation.
LDA_SEED = 42

# Context-group id -> one-line summary of the topics found for that group.
topics = {}
# Context-group id -> list (one item per citation) of per-topic records.
topics_dist = defaultdict(list)
# Context groups for which modelling raised ValueError and was skipped.
skipped_groups = []

for key, citation in context_groups.items():
    # Materialise keys and documents once; both keep the dict's order, so
    # position i of one corresponds to position i of the other.
    ref_keys = list(citation)
    documents = list(citation.values())
    try:
        dictionary = Dictionary(documents)
        bow_corpus = [dictionary.doc2bow(doc) for doc in documents]
        lda_model = LdaMulticore(
            bow_corpus,
            num_topics=3,
            id2word=dictionary,
            passes=2,
            workers=2,
            random_state=LDA_SEED,
        )
        group_summary, topics_list = pretty_print_topics(
            lda_model.print_topics(num_topics=3, num_words=5)
        )
        group_dist = [
            print_topics_by_ids(old_lda_topics, lda_model[bow], topics_list, ref_key)
            for ref_key, bow in zip(ref_keys, bow_corpus)
        ]
    except ValueError:
        # Best effort: a group that cannot be modelled is skipped, but its id
        # is remembered so the omission is visible afterwards.
        skipped_groups.append(key)
        continue
    # Results are stored only after the whole group succeeded, so a failure
    # part-way through never leaves partial entries behind.
    topics[key] = group_summary
    topics_dist[key] = group_dist

In [None]:
# ensure_ascii=False writes non-ASCII characters verbatim, so the file is
# opened as UTF-8 explicitly; with a locale-dependent default encoding the
# write could fail or produce JSON that is not UTF-8.
with open('topic_output.json', 'w', encoding='utf-8') as f:
    dump(dict(topics_dist), f, ensure_ascii=False)