# DOU.ua Topic Modeling

(c) Yuriy Guts, 2016

Using Latent Dirichlet Allocation (LDA), we'll find out which topics dominate in the comments for each post.

## Imports

In [1]:
from __future__ import absolute_import, division, print_function

In [2]:
import codecs
import glob
import logging
import os
import re

In [3]:
import numpy as np

In [4]:
import gensim
from gensim import corpora, models

In [5]:
import nltk
from nltk.corpus import stopwords

In [6]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


**Set up logging**

In [7]:
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
logging.root.level = logging.INFO  # ipython sometimes messes up the logging setup; restore

## Prepare Corpus

**Read crawled comments from files**

In [8]:
texts = []
filenames = []
data_path = os.path.join("data", "clean")
for comment_file in glob.glob(os.path.join(data_path, "clean-*.txt")):
    filenames.append(comment_file)
    with codecs.open(comment_file, "r", "utf-8") as f:
        texts.append(f.read())

**Collect stopwords for English, Russian and Ukrainian**

In [9]:
stopwords_combined = set(stopwords.words("russian")).union(stopwords.words("english"))

In [10]:
print("Stopword count:", len(stopwords_combined))

Stopword count: 304


In [11]:
with open(os.path.join("utils", "stopwords-ru.txt")) as swf:
    stopwords_combined = stopwords_combined.union(set(swf.read().split("\n")))
with open(os.path.join("utils", "stopwords-uk.txt")) as swf:
    stopwords_combined = stopwords_combined.union(set(swf.read().split("\n")))

**Split texts into words**

In [12]:
tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+")

In [13]:
stemmer = nltk.stem.snowball.RussianStemmer()

In [14]:
texts = [
    [
        stemmer.stem(word)
        for word in tokenizer.tokenize(document)
        if not word in stopwords_combined
    ]
    for document in texts
]

**Create the corpus**

In [15]:
dictionary = corpora.Dictionary(texts)

In [16]:
print("The corpus contains", dictionary.num_docs, "documents.")

The corpus contains 519 documents.


In [17]:
corpus = [dictionary.doc2bow(text) for text in texts]

## Train LDA Model

In [18]:
# Fixing the seed to make the results reproducible.
np.random.seed(2016)

In [19]:
# Uncomment to retrain the model instead of loading it from file.

# ldamodel = models.ldamodel.LdaModel(
#     corpus,
#     num_topics=18,
#     id2word=dictionary,
#     passes=70
# )

In [20]:
ldamodel = models.ldamodel.LdaModel.load(os.path.join("trained", "checkpoint-18topics.lda"))

In [21]:
ldamodel.print_topics(num_topics=18, num_words=12)

[(0,
  '0.003*продукт + 0.003*подкаст + 0.002*бізнес + 0.002*фізик + 0.002*коуч + 0.002*ведущ + 0.002*ирин + 0.001*продуктов + 0.001*ісм + 0.001*аутсорсингов + 0.001*сапр + 0.001*ринк'),
 (1,
  '0.014*налог + 0.011*налогов + 0.009*доход + 0.008*един + 0.008*плат + 0.008*счет + 0.007*добр + 0.007*групп + 0.006*фоп + 0.006*отчет + 0.006*вопрос + 0.005*сумм'),
 (2,
  '0.016*курс + 0.010*jav + 0.009*ищ + 0.008*работ + 0.007*ментор + 0.006*знан + 0.004*проект + 0.004*html + 0.004*месяц + 0.004*работа + 0.004*css + 0.004*добр'),
 (3,
  '0.007*box + 0.005*whit + 0.003*testing + 0.002*black + 0.002*ap + 0.002*полиграф + 0.002*автоматизац + 0.001*interna + 0.001*автотест + 0.001*автоматизатор + 0.001*structures + 0.001*selenium'),
 (4,
  '0.005*телефон + 0.004*мошенник + 0.004*карт + 0.004*монитор + 0.004*номер + 0.004*деньг + 0.004*цен + 0.003*приложен + 0.003*ноут + 0.003*куп + 0.003*дан + 0.002*нов'),
 (5,
  '0.008*мов + 0.007*українськ + 0.006*украинск + 0.005*язык + 0.005*як + 0.005*й + 0.

In [22]:
#ldamodel.save(os.path.join("trained", "checkpoint-18topics.lda"))

**Assign topic names for better interpretation later**

In [23]:
topic_names = [
    # 00..04
    "market-1", "taxes", "mentor", "qa", "fraud",
    # 05..09
    "ukrainian", "courses", "recruitment", "edu-corruption", "living-conditions",
    # 10..14
    "cashflow", "knowledge", "tractor", "market-2", "moral-rights",
    # 15..17
    "politics", "work", "relationships"
]

**Print topic structure for every post from our corpus**

In [31]:
for i, text in enumerate(texts):
    print(filenames[i])
    print()
    
    # Print topics ordered by presence strength
    topics_in_this_document = sorted(
        [
            (topic_names[idx], presence_strength)
            for idx, presence_strength in ldamodel[dictionary.doc2bow(texts[i])]
            if presence_strength >= 0.05
        ],
        key=lambda pair: pair[1],
        reverse=True
    )
    
    for topic, strength in topics_in_this_document:
        print(topic.rjust(25, ' '), "  {0:.0f}%".format(strength * 100))
    
    print()

data/clean/clean-comments-100-days.txt

                  tractor   51%
                     work   28%
                    fraud   15%
        living-conditions   6%

data/clean/clean-comments-10213.txt

                  tractor   43%
                     work   40%
                    fraud   16%

data/clean/clean-comments-10245.txt

                     work   42%
                 market-2   33%
                  tractor   14%
                ukrainian   5%

data/clean/clean-comments-10268.txt

                     work   34%
                 cashflow   22%
                 market-2   19%
                  tractor   13%

data/clean/clean-comments-10331.txt

                     work   50%
        living-conditions   17%
            relationships   15%
                 market-2   8%

data/clean/clean-comments-10405.txt

                     work   51%
        living-conditions   17%
            relationships   16%
                 cashflow   6%

data/clean/clean-comments-10445.txt

