In [1]:
import re
import string
import graphlab as gl # this import will be assumed from now on
import pandas as pd
import funcy as fp
import pyLDAvis
import pyLDAvis.graphlab

[INFO] This trial license of GraphLab Create is assigned to ben@benmabey.com and will expire on August 05, 2015. Please contact trial@dato.com for licensing options or to request a free non-commercial license for personal or academic use.

[INFO] Start server at: ipc:///tmp/graphlab_server-1473 - Server binary: /Users/bmabey/.virtualenvs/rbl-data/lib/python2.7/site-packages/graphlab/unity_server - Server log: /tmp/graphlab_server_1436666813.log
[INFO] GraphLab Server Version: 1.4.1


In [2]:
pyLDAvis.enable_notebook()

In [3]:
stories_sf = gl.load_sframe("hacker_news_stories.sframe") #gl.load_sframe("http://s3.amazonaws.com/dato-datasets/hacker_news/stories_with_text.sframe")

In [4]:
re_print = re.compile(r'[^' + string.printable + ']')
re_contraction = re.compile(r'(\w+)\s+(t|ve|d|ll|m|re)\b')

def combine_and_clean(row):
    cat = row["title"] + ' ' + row["text"]
    ascii = re_print.sub(' ', cat)
    fixed = re_contraction.sub(lambda m: "'".join(m.groups()), ascii)
    return fixed

stories_sf["text_title"] = stories_sf.apply(combine_and_clean)

In [5]:
stories_sf.save("hn_cleaned.sframe")

In [6]:
sample = stories_sf# stories_sf.sample(0.4, seed=42)
del sample["text"]
del sample["title"]

In [7]:
df = sample.to_dataframe()

In [8]:
from joblib import Parallel, delayed, cpu_count

def _series_chunks(s, n_jobs):
    if n_jobs < 0:
        # so, have n chunks if we are using all n cores/cpus = cpu_count() + 1 + n_jobs
        n_jobs = cpu_count() + 1 + n_jobs
    n = len(s)
    n_chunks = int(n / n_jobs)
    return (s.iloc[ilocs] for ilocs in fp.chunks(n_chunks, range(n)))

def series_pmap(s, f, n_jobs=-1):
    if n_jobs == 0:
        return s.map(f)
    return pd.concat(Parallel(n_jobs=n_jobs)(delayed(series_pmap)(sub_series, f, n_jobs=0) \
                                                 for sub_series in _series_chunks(s, n_jobs)))

In [9]:
from spacy.en import English
nlp = English()

In [10]:
def tokenize(s):
    tokens = nlp(unicode(s))
    # graphlab doesn't handle namedtuples (or even tuples) so we have to use a list :(
    return [[t.orth_.lower(), t.lemma_.lower(), t.pos, t.tag] for t in tokens if t.orth_.strip()]

In [None]:
def tokenize_d(s):
    tokens = nlp(unicode(s))
    # graphlab doesn't handle namedtuples (or even tuples) so we have to use a list :(
    return [[t.orth_, t.lemma_.lower(), t.pos_, t.tag_] for t in tokens if t.orth_.strip()]

In [None]:
%%time
tokens = series_pmap(df['text_title'], tokenize)

In [None]:
tokens.to_pickle('hn_tokens.pkl')

In [None]:
import spacy.parts_of_speech as pos

DEFAULT_POS = set([pos.NOUN, pos.VERB, pos.ADV, pos.ADJ])
DEFAULT_STOPWORDS = gl.text_analytics.stopwords() | set(['pm','am', "'re", "'ve", "n't", 'thing'])

In [None]:
# chunks to train Phrases with
def chunk(tokens, allowed_pos=DEFAULT_POS):
    split = []
    for token in tokens:
        orth, lemma, pos, tag = token
        if pos in allowed_pos:
            split.append(lemma)
        else:
            # break detected!
            if len(split) > 1:
                yield split
            if len(split) > 0:
                split = []
        

In [None]:
%%time
def chunk_doc(tokens):
    return list(chunk(tokens))
    
chunked_docs = tokens.map(chunk_doc)

In [None]:
contiguous_chunks = fp.cat(chunked_docs)

In [None]:
from gensim.models.phrases import Phrases

In [None]:
%%time
# train chunker now
bigram = Phrases(contiguous_chunks)
trigram = Phrases(bigram[contiguous_chunks])


In [None]:
%%time
tokenized_and_filtered_docs = chunked_docs.map(fp.cat)

In [None]:
def extract_chunks(doc, min_len=3):
    return [w for w in trigram[bigram[doc]] if len(w) >= min_len and w not in DEFAULT_STOPWORDS]

In [None]:
%%time
processed_docs = series_pmap(tokenized_and_filtered_docs, extract_chunks)

In [None]:
from gensim.corpora import Dictionary, MmCorpus


In [None]:
%%time
dictionary = Dictionary(processed_docs)

In [None]:
dictionary.filter_extremes(no_below=5, no_above=0.3, keep_n=None)
dictionary.compactify()

In [None]:
%%time
corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

In [None]:
def gensim2bows(corpus, dictionary):
    return [{dictionary[id]: count for id, count in doc} for doc in corpus]
gensim2gl = fp.compose(gl.SArray, gensim2bows)

In [None]:
bows = gensim2gl(corpus, dictionary)

In [None]:
topic_model100 = gl.topic_model.create(bows, num_topics=100, num_iterations=200)

In [None]:
%%time
vis_data100 = pyLDAvis.graphlab.prepare(topic_model100, bows)

In [None]:
topic_model75 = gl.topic_model.create(bows, num_topics=75, num_iterations=200)

In [None]:
%%time
vis_data75 = pyLDAvis.graphlab.prepare(topic_model75, bows)

In [160]:
pyLDAvis.show(vis_data)

127.0.0.1 - - [11/Jul/2015 18:15:40] "GET /d3.js HTTP/1.1" 200 -
127.0.0.1 - - [11/Jul/2015 18:15:40] "GET /LDAvis.js HTTP/1.1" 200 -



stopping Server...


127.0.0.1 - - [11/Jul/2015 18:15:40] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [11/Jul/2015 18:15:40] "GET /LDAvis.css HTTP/1.1" 200 -



Note: if you're in the IPython notebook, pyLDAvis.show() is not the best command
      to use. Consider using pyLDAvis.display(), or pyLDAvis.enable_notebook().
      See more information at http://pyLDAvis.github.io/quickstart.html .

You must interrupt the kernel to end this command

Serving to http://127.0.0.1:8889/    [Ctrl-C to exit]
