## Import Dataset and EDA 

In [1]:
#Modules
import pandas as pd
import numpy as np
from ast import literal_eval
from pprint import pprint
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import gensim.test.utils 
import tqdm
%matplotlib inline
import matplotlib.pyplot as plt
import pickle

In [2]:
kremlin = pd.read_excel('data/putin_corpus.xlsx')
kremlin.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33189 entries, 0 to 33188
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   date          33189 non-null  object
 1   URL           33189 non-null  object
 2   description   30316 non-null  object
 3   introduction  30316 non-null  object
 4   id            33189 non-null  int64 
 5   speaker       33189 non-null  object
 6   p             33189 non-null  object
dtypes: int64(1), object(6)
memory usage: 1.8+ MB


In [3]:
kremlin.rename(columns = {'p':'text'}, inplace = True)

In [5]:
print(kremlin.iloc[2666]["text"])

Ежели эти люди, о которых Вы сейчас сказали, пройдя через это, придут к этому, мы будем только аплодировать. Понимаете, некоторые, на мой взгляд, может быть, вы на меня сильно обидитесь, но пиарятся в тюрьмах. Вот это страшно.


In [10]:
print(kremlin.iloc[140]["description"])

Владимир Путин поздравил Патриарха Кирилла с Днём тезоименитства, праздником Вознесения и Днём славянской письменности. Глава государства преподнёс в подарок предстоятелю Русской православной церкви икону «Вознесение».Затем Владимир Путин и Патриарх Московский и всея Руси Кирилл встретились с архиепископом Афинским и всея Эллады Иеронимом II, совершающим визит в Россию.


In [None]:
#Identify Noise with Regular Expression
import re 
RE_SUSPICIOUS = re.compile(r'[Äô&#<>{}\[\]\\]')

def impurity(text, min_len=10): 
    """returns the share of suspicious characters in a text""" 
    if text == None or len(text) < min_len: 
        return 0 
    else: 
        return len(RE_SUSPICIOUS.findall(text))/len(text)


In [None]:
# add new column to data frame
kremlin['impurity'] = kremlin['text'].apply(impurity, min_len=10)

In [None]:
kremlin['impurity'].sort_values().value_counts()

In [None]:
# get the top 3 records 
#critical part is just []...I will leave as it is for the moment
kremlin[['text', 'impurity']].sort_values(by='impurity', ascending=False).head(3)

In [None]:
def clean(text):
     # everything non-alpahnumeric with a space
     text = re.sub(r'\W+',' ', text)
     # Two or more dots with one
     text = re.sub(r'\.{2,}', ' ', text)
     # sequences of white spaces
     text = re.sub(r'\s+', ' ', text)
     #convert text to lowercase
     text = text.lower()
     return text.strip()

In [None]:
kremlin['text'] = kremlin['text'].map(clean)

In [None]:
# get the top 3 records 
#critical part is just []...I will leave as it is for the moment
kremlin[['text', 'impurity']].sort_values(by='impurity', ascending=False).head(3)

In [None]:
kremlin.drop(columns=['impurity'], inplace=True)

In [None]:
print(repr(kremlin.iloc[14288]["text"]))

In [None]:
print(kremlin.iloc[29683]["text"])

## Text Preprocessing

In [None]:
import spacy
from spacy.tokenizer import Tokenizer
from spacy.lang.ru.stop_words import STOP_WORDS

In [None]:
nlp_spacy = spacy.load('ru_core_news_sm', exclude=["ner"])
nlp_spacy.disable_pipe("parser")
nlp_spacy.enable_pipe("senter")

In [None]:
#let’s build phrase models that we can clean and use to create the corpus for our analysis
bigram = gensim.models.Phrases(kremlin['text'], min_count = 5,
                               threshold = 100)

In [None]:
bigram_mod = gensim.models.phrases.Phraser(bigram)

In [None]:
#Stopwords
from spacy.lang.ru import stop_words
nlp_spacy.Defaults.stop_words |= { 'дорогой', 'день', 'коллега', 'добрый', 'два', 'раз', 'сегодня', 'спасибо', 'уважаемый', 'уважаемыe'}
stop_words = stop_words.STOP_WORDS

In [None]:
#remove stopwords
def remove_stopwords(texts):
    return[[word for word in simple_preprocess(str(doc)) if word not in stop_words]
            for doc in texts]
def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]
#Turn words into lemmas
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp_spacy(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [None]:
data_words_nostops = remove_stopwords(kremlin['text'])

In [None]:
data_words_bigram = make_bigrams(data_words_nostops)

In [None]:
data_lemma = lemmatization(data_words_bigram, 
                           allowed_postags=['NOUN', 'ADJ', 'VERB'])

In [None]:
lemma = pd.DataFrame({'lemma': data_lemma})

In [None]:
lemma

In [None]:
lemma.to_pickle('data/lemma.pkl')

In [None]:
read_lemma = pd.read_pickle('data/lemma.pkl')

In [None]:
read_lemma

In [10]:
#if read_lemma works, then it's necessary to create again a list of a list
read_lemma = pd.read_pickle('data/lemma.pkl')
data_lemma = read_lemma['lemma'].tolist()

In [86]:
#dictionary
id2word = corpora.Dictionary(data_lemma)
#corpus
texts = data_lemma
#term document matrix
corpus = [id2word.doc2bow(text) for text in texts]

In [118]:
[[(id2word[i], freq) for i, freq in doc] for doc in corpus[:2]]

[[('гражданин', 1), ('дорогой', 1), ('друг', 1), ('уважаемые', 1)],
 [('гражданин', 1),
  ('внешний', 1),
  ('внутренний', 1),
  ('вступление', 1),
  ('гарант', 1),
  ('глава', 1),
  ('государство', 1),
  ('должность', 1),
  ('жизнь', 1),
  ('избрать', 1),
  ('история', 1),
  ('конституция', 1),
  ('направление', 1),
  ('начало', 1),
  ('новый', 1),
  ('определять', 1),
  ('основный', 1),
  ('политика', 1),
  ('право', 1),
  ('президент', 2),
  ('свобода', 1),
  ('человек', 1),
  ('этап', 1),
  ('являться', 1)]]

In [87]:
print('Number of unique tokens: %d' % len(id2word))
print('Number of documents: %d' % len(corpus))

Number of unique tokens: 26650
Number of documents: 33189


## Training the Models

### num_of_topics=10 

In [88]:
#TRAINING THE MODEL (num_of_topics=10)
num_topics = 10
chunksize = 100
passes = 10 #controls how often we train the model on the entire corpus.
iterations = 200
eval_every = None  # Don't evaluate model perplexity, takes too much time.

In [89]:
lda_model_10 = gensim.models.LdaMulticore(corpus = corpus,
                                       id2word = id2word,
                                       num_topics = num_topics,
                                       random_state = 42,
                                       chunksize = chunksize,
                                       passes = passes,
                                       per_word_topics=True,
                                       minimum_probability = 0)

In [90]:
pprint(lda_model_10.print_topics())
doc_lda = lda_model_10[corpus]

[(0,
  '0.027*"страна" + 0.020*"экономический" + 0.017*"международный" + '
  '0.016*"отношение" + 0.015*"форум" + 0.015*"российский" + 0.012*"регион" + '
  '0.011*"развитие" + 0.011*"республика" + 0.010*"связь"'),
 (1,
  '0.029*"центр" + 0.017*"медицинский" + 0.015*"новый" + 0.013*"школа" + '
  '0.012*"образование" + 0.011*"область" + 0.011*"наука" + 0.011*"современный" '
  '+ 0.011*"товарищ" + 0.011*"научный"'),
 (2,
  '0.033*"друг" + 0.026*"уважаемые" + 0.025*"коллега" + 0.023*"успех" + '
  '0.021*"дорогой" + 0.020*"президент" + 0.019*"поблагодарить" + 0.016*"слово" '
  '+ 0.015*"праздник" + 0.015*"поздравлять"'),
 (3,
  '0.050*"внимание" + 0.050*"большой" + 0.043*"ребёнок" + 0.040*"город" + '
  '0.037*"семья" + 0.026*"благодарить" + 0.025*"здоровье" + 0.015*"игра" + '
  '0.013*"дом" + 0.012*"обратить"'),
 (4,
  '0.020*"народ" + 0.017*"страна" + 0.015*"история" + 0.014*"человек" + '
  '0.012*"великий" + 0.012*"искусственный" + 0.011*"война" + 0.011*"сила" + '
  '0.009*"жизнь" + 0.009

In [91]:
#save model in pickle format to working directory
lda_model_10.save("models/lda_model_10.pkl")

In [92]:
#load model back into your workspace from working directory
lda_10 = gensim.models.LdaModel.load("models/lda_model_10.pkl")

In [93]:
#u_mass
coherence_model_lda_10 = CoherenceModel(model=lda_model_10, corpus=corpus, coherence='u_mass')
coherence_lda_10 = coherence_model_lda_10.get_coherence()
print('\nCoherence Score: ', coherence_lda_10)


Coherence Score:  -3.726749524769879


In [119]:
#c_v
coherence_model_lda_10 = CoherenceModel(model=lda_model_10, texts=texts, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda_10.get_coherence()
print('nCoherence Score: ', coherence_lda)

  m_lr_i = np.log(numerator / denominator)
  return cv1.T.dot(cv2)[0, 0] / (_magnitude(cv1) * _magnitude(cv2))


nCoherence Score:  nan


### num_of_topics = 30

In [94]:
#TRAINING THE MODEL (num_of_topics=30)
num_topics = 30
chunksize = 100
passes = 10 #controls how often we train the model on the entire corpus.
iterations = 200
eval_every = None  # Don't evaluate model perplexity, takes too much time.

In [95]:
lda_model_30 = gensim.models.LdaMulticore(corpus = corpus,
                                       id2word = id2word,
                                       num_topics = num_topics,
                                       random_state = 42,
                                       chunksize = chunksize,
                                       passes = passes,
                                       per_word_topics=True,
                                       minimum_probability = 0)

In [96]:
pprint(lda_model_30.print_topics())
doc_lda = lda_model_30[corpus]

[(16,
  '0.067*"успех" + 0.052*"поздравлять" + 0.043*"праздник" + 0.043*"желать" + '
  '0.033*"добрый" + 0.032*"новый" + 0.030*"здоровье" + 0.030*"чемпионат" + '
  '0.029*"восточный" + 0.025*"работа"'),
 (2,
  '0.049*"спорт" + 0.042*"команда" + 0.039*"выступление" + 0.038*"школа" + '
  '0.034*"игра" + 0.028*"язык" + 0.025*"спортивный" + 0.022*"единство" + '
  '0.021*"относиться" + 0.021*"массовый"'),
 (20,
  '0.037*"компания" + 0.036*"экономика" + 0.033*"рынок" + 0.026*"рост" + '
  '0.026*"бизнес" + 0.020*"мировой" + 0.017*"отрасль" + 0.017*"процент" + '
  '0.017*"малый" + 0.016*"цена"'),
 (12,
  '0.256*"год" + 0.053*"тысяча" + 0.044*"процент" + 0.040*"миллиард" + '
  '0.032*"последний" + 0.032*"прошлый" + 0.028*"доллар" + 0.023*"миллион" + '
  '0.021*"рубль" + 0.015*"принять"'),
 (9,
  '0.060*"человек" + 0.047*"говорить" + 0.039*"знать" + 0.032*"сделать" + '
  '0.029*"первый" + 0.029*"сказать" + 0.027*"работать" + 0.026*"думать" + '
  '0.025*"делать" + 0.023*"хотеть"'),
 (17,
  '0.061

In [108]:
#save model in pickle format to working directory
lda_model_30.save("models/lda_model_30.pkl")

In [109]:
#load model back into your workspace from working directory
lda_30 = gensim.models.LdaModel.load("models/lda_model_30.pkl")

In [99]:
coherence_model_lda_30 = CoherenceModel(model=lda_model_30, corpus=corpus, coherence='u_mass')
coherence_lda_30 = coherence_model_lda_30.get_coherence()
print('\nCoherence Score: ', coherence_lda_30)


Coherence Score:  -5.105336650761064


### num_of_topics = 50

In [105]:
#TRAINING THE MODEL (num_of_topics=50)
num_topics = 50
chunksize = 100
passes = 10 #controls how often we train the model on the entire corpus.
iterations = 200
eval_every = None  # Don't evaluate model perplexity, takes too much time.

In [106]:
lda_model_50 = gensim.models.LdaMulticore(corpus = corpus,
                                       id2word = id2word,
                                       num_topics = num_topics,
                                       random_state = 42,
                                       chunksize = chunksize,
                                       passes = passes,
                                       per_word_topics=True,
                                       minimum_probability = 0)

In [110]:
pprint(lda_model_50.print_topics())
doc_lda = lda_model_50[corpus]

[(0,
  '0.057*"интересный" + 0.050*"образ" + 0.042*"помощь" + 0.038*"оказывать" + '
  '0.036*"увидеть" + 0.035*"взгляд" + 0.031*"церковь" + 0.029*"собраться" + '
  '0.028*"передать" + 0.023*"представитель"'),
 (11,
  '0.144*"слово" + 0.067*"сказать" + 0.057*"благодарность" + 0.052*"контакт" + '
  '0.047*"дать" + 0.046*"присутствовать" + 0.043*"зал" + 0.038*"уважаемый" + '
  '0.030*"террорист" + 0.026*"отдельный"'),
 (2,
  '0.070*"спорт" + 0.057*"вид" + 0.054*"следующий" + 0.049*"посмотреть" + '
  '0.046*"вызов" + 0.036*"выступление" + 0.033*"спортивный" + 0.027*"целом" + '
  '0.023*"увеличиться" + 0.018*"приходить"'),
 (15,
  '0.046*"создавать" + 0.043*"задача" + 0.041*"решать" + 0.031*"информация" + '
  '0.030*"момент" + 0.027*"перспектива" + 0.027*"угроза" + 0.025*"океан" + '
  '0.022*"содействие" + 0.021*"народный"'),
 (22,
  '0.061*"общий" + 0.061*"страна" + 0.049*"регион" + 0.043*"усилие" + '
  '0.043*"развитие" + 0.036*"сотрудничество" + 0.032*"диалог" + 0.029*"мир" + '
  '0.029*

In [111]:
#save model in pickle format to working directory
lda_model_50.save("models/lda_model_50.pkl")

In [112]:
#load model back into your workspace from working directory
lda_50 = gensim.models.LdaModel.load("models/lda_model_50.pkl")

In [113]:
coherence_model_lda_50 = CoherenceModel(model=lda_model_50, corpus=corpus, coherence='u_mass')
coherence_lda_50 = coherence_model_lda_50.get_coherence()
print('\nCoherence Score: ', coherence_lda_50)


Coherence Score:  -6.037390920317644


### visualize topic models

In [7]:
from imp import reload
import pyLDAvis
import pyLDAvis.gensim_models

  from imp import reload


In [8]:
pyLDAvis.enable_notebook()

In [101]:
from imp import reload
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
pyLDAvis.gensim_models.prepare(lda_10, corpus, id2word)

In [114]:
from imp import reload
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
pyLDAvis.gensim_models.prepare(lda_30, corpus, id2word)

  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload


In [115]:
pyLDAvis.gensim_models.prepare(lda_50, corpus, id2word)

In [116]:
pyLDAvis.save_html(lda_10, 'graphs/lda_10.html')

AttributeError: 'LdaMulticore' object has no attribute 'to_json'