In [2]:
import re
import string
import graphlab as gl # this import will be assumed from now on
import pandas as pd
import funcy as fp

gl.set_runtime_config('GRAPHLAB_DEFAULT_NUM_PYLAMBDA_WORKERS', 24)

In [7]:
stories_sf = gl.load_sframe("hacker_news_stories.sframe") #gl.load_sframe("http://s3.amazonaws.com/dato-datasets/hacker_news/stories_with_text.sframe")

In [10]:
re_print = re.compile(r'[^' + string.printable + ']')
re_contraction = re.compile(r'(\w+)\s+(t|ve|d|ll|m|re)\b')

def combine_and_clean(row):
    cat = row["title"] + ' ' + row["text"]
    ascii = re_print.sub(' ', cat)
    # removing the odd charaters resulted in invalid space-filled contractions
    fixed = re_contraction.sub(lambda m: "'".join(m.groups()), ascii)
    return fixed

stories_sf["text_title"] = stories_sf.apply(combine_and_clean)

I am using [spaCy](https://honnibal.github.io/spaCy/) to do tokenization and POS tagging. It did not work well with GraphLab and so I have convert the dataframe to a pandas one and process it locally. spaCy is a fast but only uses a single core. Since processing the entire dataset is embarrassingly parallel I'll be using [joblib](https://pythonhosted.org/joblib/) to use all the cores to process all the documents.

(This takes ~19GB of memory to do the entire dataset, so use a sample of the dataset if you are following along and have less memory.)

In [12]:
#Use all of it!
sample = stories_sf #  stories_sf.sample(0.4, seed=42)

In [13]:
del sample["text"]
del sample["title"]
df = sample.to_dataframe()

In [14]:
from joblib import Parallel, delayed, cpu_count

def _series_chunks(s, n_jobs):
    if n_jobs < 0:
        # so, have n chunks if we are using all n cores/cpus = cpu_count() + 1 + n_jobs
        n_jobs = cpu_count() + 1 + n_jobs
    n = len(s)
    n_chunks = int(n / n_jobs)
    return (s.iloc[ilocs] for ilocs in fp.chunks(n_chunks, range(n)))

def series_pmap(s, f, n_jobs=-1):
    if n_jobs == 0:
        return s.map(f)
    return pd.concat(Parallel(n_jobs=n_jobs)(delayed(series_pmap)(sub_series, f, n_jobs=0) \
                                                 for sub_series in _series_chunks(s, n_jobs)))

In [15]:
from spacy.en import English
nlp = English()

In [16]:
def tokenize(s):
    tokens = nlp(unicode(s))
    # the native spaCy tuples don't pickle so we extract the data into our own structure
    # Also, since graphlab doesn't handle namedtuples (or tuples) so we have to use a list :(
    return [[t.orth_.lower(), t.lemma_.lower(), t.pos, t.tag] for t in tokens if t.orth_.strip()]

In [18]:
%%time
tokens = series_pmap(df['text_title'], tokenize)

CPU times: user 3min 21s, sys: 39.6 s, total: 4min 1s
Wall time: 25min 10s


In [19]:
stories_sf["tokens"] = gl.SArray(tokens)

Now that we have tokenized and tagged the docs we now want to do some simple phrase detection (aka [collocation extraction](https://en.wikipedia.org/wiki/Collocation_extraction)) using ngrams. We'll be using gensim's [Phrase](http://radimrehurek.com/gensim/models/phrases.html) class to do that. It takes a stream of sentences. We only want to consider certain types of words as part of phrases so we will use the others (including punctiontion) as a sentence chunk boundary. Meaning, a single sentence will be segmented into potentialy various chunks.  I opted to write my own chunker since it was easy and spaCy doesn't have one ([nltk](http://www.nltk.org/) does however).

In [21]:
import spacy.parts_of_speech as pos

DEFAULT_POS = set([pos.NOUN, pos.VERB, pos.ADV, pos.ADJ])
DEFAULT_STOPWORDS = gl.text_analytics.stopwords() | set(['pm','am', "'re", "'ve", "n't", 'thing'])

In [22]:
 def chunk(tokens, allowed_pos=DEFAULT_POS):
    split = []
    for token in tokens:
        orth, lemma, pos, tag = token
        if pos in allowed_pos:
            split.append(lemma)
        else:
            # break detected!
            if len(split) > 1:
                yield split
            if len(split) > 0:
                split = []

In [23]:
%%time
def chunk_doc(tokens):
    return list(chunk(tokens))
    
chunked_docs = tokens.map(chunk_doc)

CPU times: user 28.5 s, sys: 2.19 s, total: 30.7 s
Wall time: 30.4 s


In [24]:
contiguous_chunks = fp.cat(chunked_docs)

In [4]:
from gensim.models.phrases import Phrases

In [26]:
%%time
# train chunker now
bigram = Phrases(contiguous_chunks)
trigram = Phrases(bigram[contiguous_chunks])


[INFO] collecting all words and their counts
[INFO] PROGRESS: at sentence #0, processed 0 words and 0 word types
[INFO] PROGRESS: at sentence #10000, processed 17397 words and 18410 word types
[INFO] PROGRESS: at sentence #20000, processed 34388 words and 32442 word types
[INFO] PROGRESS: at sentence #30000, processed 52011 words and 45045 word types
[INFO] PROGRESS: at sentence #40000, processed 69618 words and 57212 word types
[INFO] PROGRESS: at sentence #50000, processed 87074 words and 68103 word types
[INFO] PROGRESS: at sentence #60000, processed 104624 words and 78584 word types
[INFO] PROGRESS: at sentence #70000, processed 122055 words and 89103 word types
[INFO] PROGRESS: at sentence #80000, processed 139804 words and 100264 word types
[INFO] PROGRESS: at sentence #90000, processed 157386 words and 109643 word types
[INFO] PROGRESS: at sentence #100000, processed 174403 words and 119006 word types
[INFO] PROGRESS: at sentence #110000, processed 191929 words and 128681 word t

CPU times: user 5min 49s, sys: 26.6 s, total: 6min 16s
Wall time: 5min 59s


In [27]:
%%time
tokenized_and_filtered_docs = chunked_docs.map(fp.cat)

CPU times: user 5.08 s, sys: 984 ms, total: 6.06 s
Wall time: 5.62 s


In [28]:
def extract_chunks(doc, min_len=3):
    return [w for w in trigram[bigram[doc]] if len(w) >= min_len and w not in DEFAULT_STOPWORDS]

In [29]:
%%time
processed_docs = series_pmap(tokenized_and_filtered_docs, extract_chunks)

CPU times: user 1min 8s, sys: 27.2 s, total: 1min 35s
Wall time: 1min 59s


In [35]:
stories_sf["processed"] = gl.SArray(processed_docs)
stories_sf.save("hn_processed.sframe")

In [5]:
from gensim.corpora import Dictionary, MmCorpus

In [13]:
%%time
dictionary = Dictionary(stories_sf["processed"])

[INFO] adding document #0 to Dictionary(0 unique tokens: [])
[INFO] adding document #10000 to Dictionary(63652 unique tokens: [u'fawn', u'faecal_transplant', u'name&#x27;s', u'joi_ito', u'machine&#x27;s']...)
[INFO] adding document #20000 to Dictionary(83250 unique tokens: [u'fawn', u'sucess', u'faecal_transplant', u'name&#x27;s', u'joi_ito']...)
[INFO] adding document #30000 to Dictionary(110342 unique tokens: [u'fawn', u'verplank', u'joi_ito', u'machine&#x27;s', u'relevant)<p']...)
[INFO] adding document #40000 to Dictionary(191538 unique tokens: [u'verplank', u'hands.<p>i&#x27;m', u'mainly_consist', u'sowell', u'polling/voting']...)
[INFO] adding document #50000 to Dictionary(228457 unique tokens: [u'm_context', u'verplank', u'hands.<p>i&#x27;m', u'piston_console', u'mainly_consist']...)
[INFO] adding document #60000 to Dictionary(241406 unique tokens: [u'm_context', u'verplank', u'hands.<p>i&#x27;m', u'piston_console', u'mainly_consist']...)
[INFO] adding document #70000 to Diction

CPU times: user 1min 27s, sys: 2.06 s, total: 1min 29s
Wall time: 1min 39s


In [32]:
dictionary.save('unfiltered.dict')

[INFO] saving Dictionary object under unfiltered.dict, separately None
