In [11]:
import artm

from os import path
from collections import Counter, defaultdict
from nltk import word_tokenize
from nltk.corpus import stopwords

# Russian stop words, stored as a frozenset: the rest of the notebook only
# performs `word not in ru_stopwords` checks, which are O(1) on a set instead
# of a linear scan over the list that `stopwords.words` returns.
ru_stopwords = frozenset(stopwords.words('russian'))

In [12]:
# Maps each key (the text before the first space on a line) to a list of token
# lists, one per input line carrying that key, with Russian stop words removed.
citations = defaultdict(list)

# The encoding is given explicitly so the result does not depend on the
# platform's locale default.
# NOTE(review): assumes the stemmed corpus is stored as UTF-8 - confirm.
with open(path.join('..', 'data', 'fixes_handle_only.stem.txt'), encoding='utf-8') as f:
    for text in f:
        try:
            key, citation = text.split(' ', 1)
        except ValueError:
            # No space on the line, i.e. there is no text after the key: skip it.
            continue
        # Tokenization is deliberately outside the `try` so that an unrelated
        # ValueError raised while tokenizing is not silently discarded.
        citations[key].append(
            [word for word in word_tokenize(citation) if word not in ru_stopwords]
        )

In [None]:
# Results of this cell:
#   obtained_topics      - key -> top-token summary after the first 10 passes
#   obtained_topics_reg  - key -> top-token summary after 10 further passes
#                          with the three regularizers enabled
#   unparsed             - keys for which artm.BatchVectorizer raised
obtained_topics = {}
obtained_topics_reg = {}
unparsed = []

# Loop-invariant locations of the intermediate files.
vw_path = path.join('..', 'data', 'citations.vw')
batches_dir = 'batches'


def word_filter(word):
    """Return True for tokens that are not Russian stop words and have length >= 3."""
    return word not in ru_stopwords and len(word) >= 3


def _vw_line(doc_id, token_counts):
    """Render one document as the line '<doc_id> | tok:count tok:count ...' plus newline."""
    features = ''.join(
        '{}:{} '.format(token, count) for token, count in token_counts.items()
    )
    return '{} | {}\n'.format(doc_id, features)


def _format_top_tokens(model):
    """Return 'Topic <i>: tok, tok, ...; ' for every topic, using the model's
    'top_tokens_score' tracker as of the most recent fit."""
    top_tokens = model.score_tracker['top_tokens_score'].last_tokens
    return ''.join(
        'Topic {}: {}; '.format(topic_id, ', '.join(top_tokens[topic_name]))
        for topic_id, topic_name in enumerate(model.topic_names)
    )


for citation, texts_ in citations.items():

    # All token lists stored under this key are merged into ONE bag of words,
    # so the collection written below always contains exactly one document.
    # NOTE(review): if one document per stored text was intended, the Counter
    # has to be built per `line` instead - confirm.
    lemmas = [word for line in texts_ for word in line if word_filter(word)]
    texts = [Counter(lemmas)]

    # The same file is overwritten on every iteration. UTF-8 is requested
    # explicitly so the bytes do not depend on the platform's locale default.
    with open(vw_path, 'w', encoding='utf-8') as output:
        for i, text in enumerate(texts):
            output.write(_vw_line(i, text))

    # NOTE(review): `batches_dir` is reused across iterations and never
    # cleared; confirm that batch files left by earlier keys cannot be picked
    # up by the vectorizer or by `dictionary.gather` below.
    try:
        batch_vectorizer = artm.BatchVectorizer(
            data_path=vw_path,
            data_format='vowpal_wabbit',
            target_folder=batches_dir,
        )
    except (KeyboardInterrupt, SystemExit):
        # Interrupting the kernel must stop the loop, not be recorded as a
        # parsing failure.
        raise
    except BaseException:
        # Deliberately broader than `Exception`: the original bare `except`
        # caught everything, and ARTM's own error classes are not guaranteed
        # to derive from `Exception` (TODO confirm against the installed artm).
        unparsed.append(citation)
        continue

    dictionary = artm.Dictionary()
    dictionary.gather(data_path=batches_dir)
    dictionary.save_text(dictionary_path=path.join(batches_dir, 'dict.txt'))
    model = artm.ARTM(num_topics=3, dictionary=dictionary)

    scores = [
        artm.PerplexityScore(name='perp_score', dictionary=dictionary),
        artm.SparsityPhiScore(name='phi_sparsity_score'),
        artm.SparsityThetaScore(name='theta_sparsity_score'),
        artm.TopTokensScore(name='top_tokens_score'),
    ]
    for score in scores:
        model.scores.add(score)

    # First stage: 10 passes without any regularizer.
    model.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=10)
    obtained_topics[citation] = _format_top_tokens(model)

    regularizers = [
        artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_words'),
        artm.SmoothSparsePhiRegularizer(name='sparse_phi_regularizer_words'),
        artm.SmoothSparseThetaRegularizer(name='sparse_theta_regularizer'),
    ]
    for regularizer in regularizers:
        model.regularizers.add(regularizer, overwrite=True)

    model.regularizers['decorrelator_phi_words'].tau = 1e+6
    model.regularizers['sparse_phi_regularizer_words'].tau = 1e+4
    model.regularizers['sparse_theta_regularizer'].tau = 1e+2

    # Second stage: 10 more passes on the same model object, now regularized.
    model.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=10)
    obtained_topics_reg[citation] = _format_top_tokens(model)

In [None]:
# Write the topic summaries collected before regularization, one
# '<key> <topics>' line per entry of `obtained_topics`.
# UTF-8 is requested explicitly: the tokens originate from the Russian-language
# input, and the locale default encoding may be unable to represent them.
with open(path.join('..', 'data', 'topics_bigartm.txt'), 'w', encoding='utf-8') as f:
    for key, topic in obtained_topics.items():
        f.write('{} {}\n'.format(key, topic))

In [None]:
# Write the topic summaries collected after the regularized passes, one
# '<key> <topics>' line per entry of `obtained_topics_reg`.
# UTF-8 is requested explicitly: the tokens originate from the Russian-language
# input, and the locale default encoding may be unable to represent them.
with open(path.join('..', 'data', 'topics_bigartm_reg.txt'), 'w', encoding='utf-8') as f:
    for key, topic in obtained_topics_reg.items():
        f.write('{} {}\n'.format(key, topic))