# DOU.ua Topic Modeling

(c) Yuriy Guts, 2016

Using Latent Dirichlet Allocation (LDA), we'll explore the topic structure of the comments on each post.

## Imports

In [1]:
from __future__ import absolute_import, division, print_function

In [2]:
import codecs
import glob
import logging
import os
import re

In [3]:
import numpy as np

In [4]:
import gensim
from gensim import corpora, models

In [5]:
import nltk
from nltk.corpus import stopwords

In [6]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


**Set up logging**

In [7]:
# Log at INFO and above, one "<LEVEL> : <message>" line per record.
logging.basicConfig(
    level=logging.INFO,
    format='%(levelname)s : %(message)s',
)
# IPython sometimes interferes with the logging setup, so the root logger's
# level is set explicitly after basicConfig.
logging.getLogger().level = logging.INFO

## Prepare Corpus

**Read crawled comments from files**

In [8]:
# Every "clean-*.txt" file under data/clean is one document. `filenames[i]`
# is the path that `texts[i]` was read from.
# NOTE(review): glob.glob returns paths in arbitrary, filesystem-dependent
# order, and that order carries through to everything built from `texts`.
data_path = os.path.join("data", "clean")
filenames = glob.glob(os.path.join(data_path, "clean-*.txt"))
texts = []
for comment_file in filenames:
    # Decode explicitly as UTF-8 so the documents are Unicode text.
    with codecs.open(comment_file, "r", "utf-8") as f:
        texts.append(f.read())

**Collect stopwords for English, Russian and Ukrainian**

In [9]:
# Start from NLTK's Russian and English stopword lists, merged into one set.
stopwords_combined = set(stopwords.words("russian")) | set(stopwords.words("english"))

In [10]:
# Size of the NLTK-only set; the custom Russian/Ukrainian lists are merged in afterwards.
print("Stopword count:", len(stopwords_combined))

Stopword count: 304


In [11]:
# Extend the NLTK stopwords with the project's own Russian and Ukrainian lists.
# The files are decoded explicitly as UTF-8 (the same way the comment texts are
# read) so the entries are Unicode strings that can match the tokens, whatever
# the platform's default encoding or the Python version.
for stopword_file in ("stopwords-ru.txt", "stopwords-uk.txt"):
    with codecs.open(os.path.join("utils", stopword_file), "r", "utf-8") as swf:
        # Each line is treated as one stopword. Strip stray whitespace (e.g.
        # "\r" from CRLF files) and skip blank lines so that no empty string
        # ends up in the set.
        stopwords_combined = stopwords_combined.union(
            line.strip() for line in swf.read().splitlines() if line.strip()
        )

**Split texts into words**

In [12]:
# Tokens are runs of word characters (`\w+`); punctuation and whitespace only separate them.
tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+")

In [13]:
# Snowball stemmer for Russian.
# NOTE(review): the tokenization cell passes every surviving token through this
# stemmer, including English and Ukrainian words — confirm that is intended.
stemmer = nltk.stem.snowball.RussianStemmer()

In [None]:
# Turn each raw document into a list of stemmed tokens, dropping stopwords.
# This rebinds `texts` from raw strings to token lists, so the cell cannot be
# re-run on its own: re-run the file-reading cell first.
# NOTE(review): the stopword check is done on the token exactly as it appears
# in the text (no case folding), so a capitalized stopword is not filtered —
# confirm whether that is intended.
texts = [
    [
        stemmer.stem(token)
        for token in tokenizer.tokenize(raw_text)
        if token not in stopwords_combined
    ]
    for raw_text in texts
]

**Create the corpus**

In [None]:
# Build the gensim dictionary (token <-> integer id mapping) from the tokenized documents.
dictionary = corpora.Dictionary(texts)

In [None]:
# Sanity check: number of documents the dictionary was built from.
print("The corpus contains", dictionary.num_docs, "documents.")

In [None]:
# One bag-of-words vector per document, in the same order as `texts`.
corpus = [dictionary.doc2bow(text) for text in texts]

In [None]:
# Fix the seed of NumPy's global random number generator so that results are
# reproducible.
# NOTE(review): this only helps if the LDA training draws from NumPy's global
# generator and nothing else consumes random numbers in between — verify for
# the installed gensim version.
np.random.seed(2016)

## Train LDA Model

# Fit an LDA model with 18 topics on the bag-of-words corpus, making 70 passes
# over it. `id2word` lets the model report topics as words rather than ids.
ldamodel = models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=18,
    passes=70,
)

In [None]:
# Load a previously saved 18-topic model from disk. This rebinds `ldamodel`,
# replacing any model trained earlier in the notebook.
# NOTE(review): only load checkpoint files from a trusted source.
ldamodel = models.ldamodel.LdaModel.load(os.path.join("trained", "checkpoint-18topics.lda"))

In [None]:
# Show the 10 highest-weighted words for each of the 18 topics.
ldamodel.print_topics(num_topics=18, num_words=10)

In [None]:
# Uncomment to save the current model to the checkpoint path that the load cell reads.
#ldamodel.save(os.path.join("trained", "checkpoint-18topics.lda"))

**Assign topic names for better interpretation later**

In [None]:
# Hand-assigned labels for the model's topics; the position in the list is the
# topic id the label belongs to.
topic_names = [
    "business1",       # 0
    "taxes",           # 1
    "mentor",          # 2
    "qa",              # 3
    "fraud",           # 4
    "ukrainian",       # 5
    "courses",         # 6
    "recruitment",     # 7
    "edu-corruption",  # 8
    "roads-cars",      # 9
    "cashflow",        # 10
    "knowledge",       # 11
    "business2",       # 12
    "tractor",         # 13
    "moral-rights",    # 14
    "politics",        # 15
    "code",            # 16
    "relationships",   # 17
]

**Print topic structure for every post from our corpus**

In [None]:
# For every post, print its file name followed by the (topic label, probability)
# pairs that the model assigns to it.
# Uses `ldamodel`, the only model name this notebook defines (the previous
# `ldamodel_big` was never assigned and raised NameError on a fresh kernel).
# NOTE(review): `dictionary` is rebuilt in this notebook while `ldamodel` may
# come from a saved checkpoint; the token ids must match the ones the model was
# trained with — confirm.
for filename, text in zip(filenames, texts):
    print(filename)
    print([
        (topic_names[idx], prob)
        for idx, prob in ldamodel[dictionary.doc2bow(text)]
    ])