In [3]:
!pip install pymorphy2



In [53]:
from typing import List
import nltk
import string
import pymorphy2
import codecs

class PrepareForTopics():
    """Turn raw news texts into lists of lemmas suitable for topic modelling.

    Every text is lower-cased, split with ``nltk.WordPunctTokenizer`` and
    lemmatized with ``pymorphy2``. A lemma is kept only if it is alphabetic,
    longer than one character, not a stop word and tagged with one of
    ``ALLOWED_POS``.
    """

    # pymorphy2 POS tags of the lemmas that are kept.
    ALLOWED_POS = frozenset({'ADJF', 'NOUN'})
    # Set form of string.punctuation for constant-time character checks.
    _PUNCTUATION = frozenset(string.punctuation)

    def __init__(self, stopwords_path: str = 'rus_stopwords.txt'):
        """Create the analyzer and the tokenizer and load the stop words.

        Args:
            stopwords_path: Text file with one stop word per line, decoded
                with the ``utf_8_sig`` codec.
        """
        self.morph = pymorphy2.MorphAnalyzer()
        self.tokenizer = nltk.WordPunctTokenizer()
        # The context manager closes the file as soon as the words are read.
        with codecs.open(stopwords_path, "r", "utf_8_sig") as stopwords_file:
            self.stopwords = {line.strip() for line in stopwords_file.readlines()}

    def prepare_corp(self, news_list: List[str]) -> List[List[str]]:
        """Return the filtered lemma list of every text in *news_list*."""
        return [self.newstext2token(news_text) for news_text in news_list]

    def newstext2token(self, news_text: str) -> List[str]:
        """Return the filtered lemmas of a single text, in their original order.

        Args:
            news_text: Raw text of one document.

        Returns:
            Lemmas that passed the punctuation, stop-word, alphabetic, length
            and part-of-speech filters.
        """
        kept_lemmas = []
        for token in self.tokenizer.tokenize(news_text.lower()):
            # Drop tokens containing any ASCII punctuation before lemmatizing.
            if any(char in self._PUNCTUATION for char in token):
                continue
            lemma = self.morph.parse(token)[0].normal_form
            if len(lemma) <= 1 or not lemma.isalpha() or lemma in self.stopwords:
                continue
            # The POS is read from a parse of the lemma itself, not of the
            # surface form, so the filter applies to the normalized word.
            if self.morph.parse(lemma)[0].tag.POS in self.ALLOWED_POS:
                kept_lemmas.append(lemma)
        return kept_lemmas


In [34]:
import re
# Legal disclaimer that has to be removed from the news texts.
FOREIGN_AGENT_NOTICE = "Данное сообщение (материал) создано и (или) распространено иностранным средством массовой информации, выполняющим функции иностранного агента, и (или) российским юридическим лицом, выполняющим функции иностранного агента."

# One document per non-blank line of the dataset file.
# The notice is removed with a literal str.replace: used as a regex pattern,
# its parentheses would act as groups and never match the literal "(материал)".
# Surrounding whitespace left behind by the removal is stripped afterwards.
# NOTE(review): a line consisting only of the notice becomes an empty string;
# it is kept so that documents stay aligned with the non-blank input lines.
with codecs.open('emb_dataset.txt', "r", "utf_8_sig") as dataset_file:
    raw_data = [
        stripped.replace(FOREIGN_AGENT_NOTICE, '').strip()
        for stripped in (line.strip() for line in dataset_file.readlines())
        if stripped
    ]


In [57]:
# Build the preprocessor once, then convert every raw document into its
# list of filtered lemmas.
topic_preprocessor = PrepareForTopics()
processed_docs = topic_preprocessor.prepare_corp(raw_data)

In [58]:
# Show the lemma lists of the first two documents as a quick sanity check.
print(processed_docs[:2])

[['газохранилище', 'казань', 'улица', 'западный', 'взрыв', 'мчс', 'следственный', 'комитет', 'россия', 'татарстан', 'взрыв', 'газовый', 'заправка', 'уголовный', 'дело', 'нарушение', 'правило', 'безопасность', 'взрывоопасный', 'объект', 'ст', 'ук', 'рф'], ['результат', 'хлопок', 'газозаправочный', 'станция', 'рабочий', 'количество', 'факт', 'следственный', 'орган', 'республика', 'татарстан', 'уголовный', 'дело', 'пресс', 'релиз', 'ведомство']]


In [59]:
# Number of preprocessed documents in the corpus.
len(processed_docs)

512487

In [60]:
'''
Build a gensim Dictionary from 'processed_docs' (it maps every distinct token
to an integer id) and call it 'dictionary'.
'''
from gensim.corpora.dictionary import Dictionary
dictionary = Dictionary(processed_docs)

In [61]:
# Sanity check: print the first 11 (id, token) pairs of the dictionary.
count = 0
for count, (token_id, token) in enumerate(dictionary.iteritems(), start=1):
    print(token_id, token)
    if count > 10:
        break

0 безопасность
1 взрыв
2 взрывоопасный
3 газовый
4 газохранилище
5 дело
6 западный
7 заправка
8 казань
9 комитет
10 мчс


In [62]:
'''
OPTIONAL STEP
Remove very rare and very common words:

- words appearing in fewer than 10 documents (no_below=10)
- words appearing in more than 10% of all documents (no_above=0.1)
- of the remaining words, keep at most the 100000 most frequent (keep_n)
'''
dictionary.filter_extremes(no_below=10, no_above=0.1, keep_n= 100000)

In [63]:
# Bag-of-words corpus: every document is converted by dictionary.doc2bow into
# a list of (token id, count) pairs. The result is stored in 'bow_corpus'.
bow_corpus = list(map(dictionary.doc2bow, processed_docs))

In [64]:
# Preview the bag-of-words vector of one sample preprocessed document.
document_num = 20
bow_doc_x = bow_corpus[document_num]

# Each entry is a (token id, count) pair; the id is resolved back to its token.
for token_id, occurrences in bow_doc_x:
    print("Word {} (\"{}\") appears {} time.".format(token_id, dictionary[token_id], occurrences))

Word 79 ("сообщение") appears 1 time.
Word 82 ("страна") appears 1 time.
Word 198 ("больший") appears 2 time.
Word 202 ("мир") appears 1 time.
Word 203 ("надпись") appears 1 time.
Word 204 ("название") appears 1 time.
Word 206 ("самый") appears 2 time.
Word 210 ("граффити") appears 2 time.
Word 211 ("искусство") appears 1 time.
Word 212 ("мировой") appears 1 time.
Word 213 ("патриотический") appears 1 time.


In [None]:
from gensim.models import LdaMulticore

# Train an LDA model on the bag-of-words corpus: 8 topics, 10 passes over the
# corpus, 2 workers; 'dictionary' supplies the id -> token mapping.
lda_params = dict(num_topics=8, id2word=dictionary, passes=10, workers=2)
lda_model = LdaMulticore(bow_corpus, **lda_params)

In [None]:
# For each topic, show the words occurring in that topic and their relative
# weights. A blank separator is printed after every topic.
for topic_id, topic_terms in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(topic_id, topic_terms))
    print("\n")

In [None]:
# Persist the trained LDA model to disk under the name "LDAmodel".
# NOTE(review): datapath() comes from gensim.test.utils and presumably resolves
# the name inside gensim's bundled test-data directory - confirm that this is
# the intended (and writable) location for a real model.
from gensim.test.utils import datapath
temp_file = datapath("LDAmodel")
lda_model.save(temp_file)

In [None]:
# Sanity check: print the POS tag pymorphy2 assigns to the first parse of
# "произойти" (presumably to confirm that such lemmas are not tagged ADJF/NOUN
# and are therefore dropped by PrepareForTopics.newstext2token - TODO confirm).
morph = pymorphy2.MorphAnalyzer()
print(morph.parse("произойти")[0].tag.POS)