In [2]:
import re
import string
import graphlab as gl
import pandas as pd
import funcy as fp
import pyLDAvis
import pyLDAvis.graphlab

# Set GraphLab's GRAPHLAB_DEFAULT_NUM_PYLAMBDA_WORKERS runtime option to 24
# (presumably the number of worker processes used for Python lambdas such as
# SArray.apply -- TODO confirm against the GraphLab docs and the host's core count).
gl.set_runtime_config('GRAPHLAB_DEFAULT_NUM_PYLAMBDA_WORKERS', 24)
# Turn on pyLDAvis notebook integration (presumably so prepared visualizations
# render inline when a cell evaluates to them -- TODO confirm).
pyLDAvis.enable_notebook()

In [3]:
# Load the stories SFrame from disk; later cells read its "processed" column.
stories_sf = gl.load_sframe("hn_processed.sframe")

In [4]:
import spacy.parts_of_speech as pos

# Part-of-speech tags of interest: nouns, verbs, adverbs and adjectives (spaCy constants).
# NOTE(review): not referenced by any of the cells visible in this notebook excerpt.
DEFAULT_POS = set([pos.NOUN, pos.VERB, pos.ADV, pos.ADJ])
# GraphLab's built-in stopword set extended with a few extra tokens
# (time markers, contraction fragments and the generic word 'thing').
# NOTE(review): also not referenced by the visible cells.
DEFAULT_STOPWORDS = gl.text_analytics.stopwords() | set(['pm','am', "'re", "'ve", "n't", 'thing'])

In [7]:
from gensim.corpora import Dictionary

In [8]:
# Load the previously saved gensim Dictionary from 'unfiltered.dict'.
# filter_dict() uses this object as its default source dictionary.
unfiltered_dictionary = Dictionary.load('unfiltered.dict')

[INFO] loading Dictionary object from unfiltered.dict


In [14]:
from copy import deepcopy

def filter_dict(no_below=5, no_above=0.5, keep_n=None, bad_tokens=None, good_tokens=None, orig_dict=None):
    """Return a filtered copy of a gensim ``Dictionary``; the source is not modified.

    Filtering happens in two stages on a deep copy of ``orig_dict``:

    1. If ``bad_tokens`` and/or ``good_tokens`` is given, the tokens are
       translated to ids and passed to ``Dictionary.filter_tokens``.
    2. ``Dictionary.filter_extremes`` is applied with ``no_below``,
       ``no_above`` and ``keep_n``.

    Parameters
    ----------
    no_below : int
        Forwarded to ``filter_extremes``: minimum number of documents a token
        must appear in to be kept.
    no_above : float
        Forwarded to ``filter_extremes``: maximum fraction of documents a
        token may appear in to be kept.
    keep_n : int or None
        Forwarded unchanged to ``filter_extremes``.
    bad_tokens : iterable of str or None
        Tokens whose ids are passed as ``bad_ids`` to ``filter_tokens``.
    good_tokens : iterable of str or None
        Tokens whose ids are passed as ``good_ids`` to ``filter_tokens``.
    orig_dict : Dictionary or None
        Dictionary to copy and filter. ``None`` (the default) means the
        notebook-level ``unfiltered_dictionary``.

    Returns
    -------
    Dictionary
        The filtered deep copy.

    Raises
    ------
    KeyError
        If a token in ``bad_tokens`` / ``good_tokens`` is not in the dictionary.
    """
    if orig_dict is None:
        # Resolved at call time (not at definition time) so that reloading
        # `unfiltered_dictionary` is picked up without re-running this cell.
        orig_dict = unfiltered_dictionary
    dictionary = deepcopy(orig_dict)

    def tokens2ids(words):
        # Return None (not an empty list) for "no tokens" so that
        # filter_tokens skips that filter entirely.
        if not words:
            return None
        # Look ids up in the dictionary being filtered, not in a global one.
        return [dictionary.token2id[w] for w in words]

    if good_tokens or bad_tokens:
        # Both id lists are computed before filter_tokens runs, so they refer
        # to the ids of the unmodified copy.
        dictionary.filter_tokens(bad_ids=tokens2ids(bad_tokens), good_ids=tokens2ids(good_tokens))
        dictionary.compactify()
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=keep_n)
    dictionary.compactify()
    return dictionary

In [15]:
%%time
# Build the working vocabulary: keep tokens that occur in at least 10 documents
# and in at most 30% of documents; keep_n=100000 is forwarded to filter_extremes.
dictionary = filter_dict(no_below=10, no_above=0.3, keep_n=100000)

[INFO] discarding 552490 tokens: [(u'mr._costolo', 2), (u'gaillard/reuters', 1), (u'rob_glaser', 8), (u'black_cab', 8), (u'leo_kelion_technology', 7), (u'london_taxi', 4), (u'kalanick', 6), (u'tfl', 5), (u'bertram', 9), (u'ltda', 3)]...
[INFO] keeping 67069 tokens which were in no less than 10 and no more than 76191 (=30.0%) documents
[INFO] resulting dictionary: Dictionary(67069 unique tokens: [u'fawn', u'aishwarya_rai', u'joi_ito', u'mainly_consist', u'python_interpreter']...)


CPU times: user 5.58 s, sys: 97.3 ms, total: 5.68 s
Wall time: 5.61 s


In [16]:
def doc2bags(dictionary, doc):
    """Map a tokenized document to a ``{token: count}`` dict.

    ``dictionary.doc2bow(doc)`` supplies ``(token_id, count)`` pairs; each id
    is translated back to its token by indexing ``dictionary``.
    """
    token_counts = {}
    for token_id, freq in dictionary.doc2bow(doc):
        token_counts[dictionary[token_id]] = freq
    return token_counts

In [17]:
%%time
# Convert every entry of the "processed" column into a {token: count} dict,
# using doc2bags with the current `dictionary` bound as its first argument.
bows = stories_sf["processed"].apply(fp.partial(doc2bags, dictionary))

CPU times: user 1.6 s, sys: 147 ms, total: 1.74 s
Wall time: 3 s


In [18]:
%%time
# Short trial run: 100 topics, only 10 iterations, method='cgs'
# (the progress log reports collapsed Gibbs sampling).
tm_cgs = gl.topic_model.create(bows, num_topics=100, num_iterations=10, method='cgs')

PROGRESS: Learning a topic model
PROGRESS:        Number of documents    253973
PROGRESS:            Vocabulary size     67069
PROGRESS:    Running collapsed Gibbs sampling
PROGRESS: +-----------+---------------+----------------+-----------------+
PROGRESS: | Iteration | Elapsed Time  | Tokens/Second  | Est. Perplexity |
PROGRESS: +-----------+---------------+----------------+-----------------+
PROGRESS: | 10        | 51.06s        | 2.36102e+06    | 0               |
PROGRESS: +-----------+---------------+----------------+-----------------+
CPU times: user 111 ms, sys: 14.7 ms, total: 126 ms
Wall time: 1min 12s


In [19]:
%%time
# Prepare pyLDAvis data for the trial model; the result is not stored in a variable.
pyLDAvis.graphlab.prepare(tm_cgs, bows)

CPU times: user 56.3 s, sys: 1.81 s, total: 58.1 s
Wall time: 1min 50s


In [20]:
# NOTE: this cell rebinds `dictionary` and `bows`. The filter is looser than
# before (no_below=5 instead of 10, keep_n left at its default), so every
# model trained after this point uses the larger vocabulary, while `tm_cgs`
# above was trained on the earlier, smaller one.
dictionary = filter_dict(no_below=5, no_above=0.3)
bows = stories_sf["processed"].apply(fp.partial(doc2bags, dictionary))

[INFO] discarding 503561 tokens: [(u'mr._costolo', 2), (u'gaillard/reuters', 1), (u'london_taxi', 4), (u'ltda', 3), (u'confusables', 1), (u'confusable', 4), (u'utr', 1), (u'chgrp', 4), (u'runhost', 1), (u'filemap', 1)]...
[INFO] keeping 115998 tokens which were in no less than 5 and no more than 76191 (=30.0%) documents
[INFO] resulting dictionary: Dictionary(115998 unique tokens: [u'fawn', u'unsupportable', u'joi_ito', u'mainly_consist', u'sonja']...)


In [23]:
# Full run: 100 topics, 200 iterations on the rebuilt `bows`.
# No `method` is passed; the progress log reports collapsed Gibbs sampling.
topic_model100 = gl.topic_model.create(bows, num_topics=100, num_iterations=200)

PROGRESS: Learning a topic model
PROGRESS:        Number of documents    253973
PROGRESS:            Vocabulary size    115998
PROGRESS:    Running collapsed Gibbs sampling
PROGRESS: +-----------+---------------+----------------+-----------------+
PROGRESS: | Iteration | Elapsed Time  | Tokens/Second  | Est. Perplexity |
PROGRESS: +-----------+---------------+----------------+-----------------+
PROGRESS: | 10        | 55.15s        | 2.49533e+06    | 0               |
PROGRESS: | 20        | 1m 42s        | 2.55197e+06    | 0               |
PROGRESS: | 30        | 2m 29s        | 2.42161e+06    | 0               |
PROGRESS: | 40        | 3m 17s        | 2.40682e+06    | 0               |
PROGRESS: | 50        | 4m 4s         | 2.51469e+06    | 0               |
PROGRESS: | 60        | 4m 51s        | 2.4741e+06     | 0               |
PROGRESS: | 70        | 5m 38s        | 2.47171e+06    | 0               |
PROGRESS: | 80        | 6m 26s        | 2.37051e+06    | 0               |
PR

In [24]:
%%time
# Prepare (and keep) the pyLDAvis data for the 100-topic model; this took
# about 5 minutes of wall time in the recorded run, so it is stored for reuse.
vis_data100 = pyLDAvis.graphlab.prepare(topic_model100, bows)

CPU times: user 3min 31s, sys: 7.04 s, total: 3min 38s
Wall time: 5min 10s




In [31]:
# Display the prepared 100-topic visualization.
# NOTE(review): execution count 31 shows this cell was run after the
# 75-topic cells that follow it (counts 26 and 27).
vis_data100

In [26]:
# Same corpus and iteration count as topic_model100, but with 75 topics, for comparison.
topic_model75 = gl.topic_model.create(bows, num_topics=75, num_iterations=200)

PROGRESS: Learning a topic model
PROGRESS:        Number of documents    253973
PROGRESS:            Vocabulary size    115998
PROGRESS:    Running collapsed Gibbs sampling
PROGRESS: +-----------+---------------+----------------+-----------------+
PROGRESS: | Iteration | Elapsed Time  | Tokens/Second  | Est. Perplexity |
PROGRESS: +-----------+---------------+----------------+-----------------+
PROGRESS: | 10        | 45.43s        | 2.89845e+06    | 0               |
PROGRESS: | 20        | 1m 25s        | 2.78779e+06    | 0               |
PROGRESS: | 30        | 2m 5s         | 2.98204e+06    | 0               |
PROGRESS: | 40        | 2m 46s        | 2.91351e+06    | 0               |
PROGRESS: | 50        | 3m 27s        | 2.84212e+06    | 0               |
PROGRESS: | 60        | 4m 8s         | 2.87685e+06    | 0               |
PROGRESS: | 70        | 4m 49s        | 2.95537e+06    | 0               |
PROGRESS: | 80        | 5m 30s        | 2.72292e+06    | 0               |
PR

In [27]:
%%time
# Prepare (and keep) the pyLDAvis data for the 75-topic model.
vis_data75 = pyLDAvis.graphlab.prepare(topic_model75, bows)

CPU times: user 2min 58s, sys: 6.12 s, total: 3min 4s
Wall time: 4min 20s


In [32]:
# Display the prepared 75-topic visualization.
vis_data75