# LDA and Top-30 words visualization

In this exercise, I use my Russian-language Twitter archive (about 22,000 tweets) as the corpus. LDA topic modelling is done with the Gensim library.

In [16]:
import nltk
import string
import pandas as pd
from gensim.corpora import Dictionary
from gensim import corpora
import pickle
import gensim

# Define all words and signs that are to be removed from the text.
# NOTE: nltk.corpus.stopwords needs the NLTK "stopwords" corpus to be available locally
# (nltk.download('stopwords')).
stopwords = nltk.corpus.stopwords.words('russian')

# Words that NLTK treats as stopwords but that we want to keep in the texts.
stopwords_delete = ['никогда', 'нельзя', 'всегда', "между", "сейчас", "хорошо", "перед"]

# Extra stopwords (Russian and English) on top of the NLTK list.
# Every line of the literal must end with a comma: adjacent string literals are silently
# concatenated by Python, so a missing comma after "are" would yield the single entry "areis".
stopwords_add = ['это', 'который', "хотя", "кстати", "обо", "ещё", "меж", "зато", "пусть", "ага", "этот", "это", "почему", "видимо", "кажется",
                 "весь", "ты", "он", "она", "оно", "мы", "вы", "кто", "что", "сам", "сама", "само", "свой", "наш", "ваш", "их", "тот", "та", "те",
                 "то", "раз", "твой", "мой", "кой", "кое", "все", "весь", "всё", "быть", "тот", "кроме", "причем", "наверное", "около",
                 "таки", "такой", "какой", "каждый", "который", "вместо", "например", "вообще", "значит", "б", "д",
                 "е", "ж", "з", "к", "л", "м", "н", "п", "р", "с", "ф", "ч", "ц", "ш", "щ", "ь", "ъ", "э", "ю", "i", "the", "on", "a", "are",
                 "is", "was", "were", "to", "as", "so", "am", "about", "for", "re", "you", "we", "they", "us", "them", "me", "though", "although",
                 "all", "or", "and", "some", "until", "an", "will", "no", "yes", "not", "with", "your", "this"]

# Drop the words we want to keep, then append our own additions.
new_stopwords = [word for word in stopwords if word not in stopwords_delete]
stopwords = new_stopwords
stopwords.extend(stopwords_add)  # extending with an empty list is a no-op, so no guard is needed

# Punctuation tokens: ASCII punctuation plus dashes, inverted marks and tokenizer-style quotes.
punctuation = list(string.punctuation)
punctuation += ['–', '—', '"', "¿", "¡", "``", "''", "..."]

# Everything that must be filtered out of the tokenized texts.
stop = stopwords + punctuation

For LDA, we just need all texts to be tokenized. 

In [17]:
def prepare_text_for_lda(text, min_length=5, stop_tokens=None):
    """Tokenize one text for LDA.

    The text is split on whitespace; short tokens and stop tokens are dropped.
    Token order is preserved and no case folding is applied.

    Parameters
    ----------
    text : str
        The text to tokenize.
    min_length : int, optional
        Minimum number of characters a token must have to be kept. The default
        of 5 keeps only "long" words, which are usually more significant.
    stop_tokens : container of str, optional
        Tokens to discard (stopwords and punctuation). When omitted, the
        module-level ``stop`` list is used.

    Returns
    -------
    list of str
        The tokens that passed both filters.
    """
    if stop_tokens is None:
        # Fall back to the notebook-wide stopword + punctuation list.
        stop_tokens = stop
    return [
        token
        for token in text.split()
        if len(token) >= min_length and token not in stop_tokens
    ]

In [18]:
# Load the CSV table with the archive of tweets. Previously, the text was cleaned of usernames, smileys and hashtags,
# and lemmatized to reduce the number of tokens. You can see how to do this for Russian here: https://github.com/ZotovaElena/Tweets-Preprocessing
tweets = pd.read_csv('tweets_clean.csv', sep='\t')
tweets = tweets.fillna('')  # missing values become empty strings, so every row can be split into tokens
tweets_text = list(tweets.text_lemmatized.values)

# Create a list of tokenized tweets (one token list per tweet).
text_data = [prepare_text_for_lda(line) for line in tweets_text]

# Create a dictionary from the data, then convert to a bag-of-words corpus
# and save the dictionary and corpus for future use.
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]

# Use a context manager so the pickle file is flushed and closed right away.
with open('corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)
dictionary.save('dictionary.gensim')

# Ask LDA to find some topics in the data.
NUM_TOPICS = 15
RANDOM_STATE = 42  # fixed seed so that re-running the notebook yields the same topics
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=15,
    random_state=RANDOM_STATE,
)
ldamodel.save('model15.gensim')

# Show the 10 highest-weighted words of each topic.
topics = ldamodel.print_topics(num_words=10)
for topic in topics:
    print(topic)

(0, '0.037*"песня" + 0.030*"находить" + 0.030*"барнаул" + 0.029*"место" + 0.026*"рядом" + 0.022*"показывать" + 0.020*"несколько" + 0.019*"машина" + 0.016*"центр" + 0.015*"радио"')
(1, '0.039*"русский" + 0.031*"нравиться" + 0.029*"видеть" + 0.029*"испанский" + 0.029*"москва" + 0.027*"слово" + 0.027*"очень" + 0.020*"ездить" + 0.020*"дорога" + 0.019*"становиться"')
(2, '0.046*"читать" + 0.031*"оказываться" + 0.027*"обычный" + 0.024*"книга" + 0.022*"мужчина" + 0.020*"слышать" + 0.017*"завтра" + 0.017*"ребенок" + 0.014*"заниматься" + 0.014*"отлично"')
(3, '0.052*"телефон" + 0.041*"купить" + 0.037*"смотреть" + 0.035*"магазин" + 0.035*"новый" + 0.034*"деньги" + 0.027*"фильм" + 0.019*"покупать" + 0.018*"страна" + 0.016*"довольно"')
(4, '0.085*"хорошо" + 0.062*"работа" + 0.038*"ходить" + 0.038*"ехать" + 0.021*"собака" + 0.018*"карта" + 0.016*"нужно" + 0.015*"проходить" + 0.015*"интересно" + 0.015*"белый"')
(5, '0.098*"хороший" + 0.028*"сколько" + 0.028*"правда" + 0.023*"испания" + 0.023*"погода

It is not easy to make Gensim's LDA topics human-readable. Here is a nice way to visualize them.

In [19]:
import pyLDAvis.gensim

# Prepare the pyLDAvis data for the trained model; as the cell's last expression,
# the result is what the notebook displays.
pyLDAvis.gensim.prepare(topic_model=ldamodel, corpus=corpus, dictionary=dictionary)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
