In [68]:
import nltk
import nltk.tokenize
import nltk.corpus
import nltk.stem

import gensim.models.wrappers
import gensim.parsing.preprocessing
import gensim

import time
import string
import pickle
import matplotlib.pyplot as plt
import multiprocessing

In [2]:
# Part-of-speech tags whose tokens debate_tokenize keeps, grouped by word class.
relevant_pos = (
    {"FW"}                                        # foreign words
    | {"JJ", "JJR", "JJS"}                        # adjectives
    | {"NN", "NNS"}                               # common nouns
    # NOTE(review): "NP"/"NPS" are not Penn Treebank tags (proper nouns there
    # are "NNP"/"NNPS"), so these two presumably never match the output of
    # nltk.pos_tag -- confirm whether proper nouns were meant to be kept.
    | {"NP", "NPS"}
    | {"RB", "RBR", "RBS"}                        # adverbs
    | {"VB", "VBD", "VBG", "VBN", "VBP", "VBZ"}   # verbs
)

# Despite its name this is a Snowball *stemmer*, not a WordNet lemmatizer.
# The name is kept because debate_tokenize refers to it.
wordnet_lemmatizer = nltk.stem.SnowballStemmer("english")

# Tokens that are always discarded: the NLTK and gensim English stopword
# lists plus every single punctuation character.
stops = set().union(
    nltk.corpus.stopwords.words('english'),
    gensim.parsing.preprocessing.STOPWORDS,
    string.punctuation,
)

# A token is rejected when it contains ANY of these characters ...
any_forbidden = set(string.digits) | {"*", "+"}
# ... or when it is made up ONLY of these characters.
all_forbidden = set(string.punctuation)

In [3]:
def debate_tokenize(text):
    """Turn the raw text of one debate into a list of stemmed content tokens.

    Processing steps:
      1. Insert a space after every "." so that sentences glued together
         without whitespace are split by the tokenizer.
      2. Word-tokenize and POS-tag the text; keep only words whose tag is in
         ``relevant_pos``.
      3. Drop stopwords.  The check is made on the lower-cased surface form
         AND on the stem: the stop lists contain unstemmed words, so testing
         only the stem lets stopwords through whenever stemming changes them
         (e.g. "because" is stemmed to "becaus", which is not in ``stops``).
      4. Drop stems that contain any character from ``any_forbidden`` or that
         consist solely of characters from ``all_forbidden``.

    Every token removed by the stem-only stopword check is still removed; the
    surface-form check only removes additional stopwords.  Caches such as
    data/stemmed.p written before this change still contain those tokens.

    Parameters
    ----------
    text : str
        Raw text of a single debate.

    Returns
    -------
    list of str
        Lower-cased stems in their original order.
    """
    text = text.replace(".", ". ")  # happens often enough that it becomes a problem
    tagged = nltk.pos_tag(nltk.tokenize.word_tokenize(text))

    tokens = []
    for word, tag in tagged:
        if tag not in relevant_pos:
            continue
        # Stopword lists hold surface forms, so test before stemming ...
        if word.lower() in stops:
            continue
        stem = wordnet_lemmatizer.stem(word).lower()
        # ... and after, in case the stem itself collides with a stopword.
        if stem in stops:
            continue
        if any(c in any_forbidden for c in stem):
            continue
        # all() is True for an empty stem, so empty strings are dropped too.
        if all(c in all_forbidden for c in stem):
            continue
        tokens.append(stem)
    return tokens

def stemmer(index, key):
    """Tokenize one tab-separated record and label it with its position.

    ``key`` must split on tabs into exactly three fields (otherwise the
    unpacking raises ValueError): a header field, a field that is ignored,
    and the text to tokenize.

    Returns the tuple ``(index, header[:10], header[10:], tokens)`` where
    ``tokens`` is the result of ``debate_tokenize`` on the text field.
    """
    header, _ignored, body = key.split("\t")
    # NOTE(review): presumably the header is a 10-character date followed by a
    # name -- later cells unpack slot 1 as `date` and slot 2 as `name`; confirm
    # against the TSV.
    return (index, header[:10], header[10:], debate_tokenize(body))

In [4]:
# Fan the tokenization out to two worker processes: one task per line of the
# TSV, each called as stemmer(line_number, line).
pool = multiprocessing.Pool(processes=2)
with open("data/hansard_debates.tsv") as stream:
    # subdata = [next(stream) for i in range(5000)]
    results = [pool.apply_async(stemmer, v) for v in enumerate(stream)]
# Nothing else will be submitted.  close() does not block; it lets the worker
# processes exit once the queued tasks are finished instead of lingering for
# the lifetime of the kernel.  The AsyncResult objects in `results` stay valid.
pool.close()
start_time = time.time()

In [86]:
# Snapshot: how many of the submitted tasks have finished so far.
num_done = sum(task.ready() for task in results)
start_num = num_done
start_time = time.time()
# Number of 0.5 s polling rounds.  With 0 rounds the cell only takes the
# snapshot above; raise the count to watch progress live.
for i in range(0):
    time.sleep(0.5)
    # Cheap incremental update: only inspect the next 50 tasks after the ones
    # already counted (this assumes tasks finish roughly in submission order).
    # The window is clamped to len(results) so that it cannot run past the end
    # of the list when fewer than 50 tasks remain.
    num_done += sum(results[j].ready()
                    for j in range(num_done, min(num_done + 50, len(results))))
    avg = (num_done-start_num) / (time.time() - start_time)
    print(i, num_done, avg, end=" "*12+"\r")

0 150928 25.18673611911309            

IndexError: list index out of range

In [88]:
# True once every submitted task has finished.
all(task.ready() for task in results)

True

In [91]:
# Collect each task's return value in submission order (get() re-raises any
# exception raised in the worker) and cache the list on disk.
data = [task.get() for task in results]
with open("data/stemmed.p", "wb") as sink:
    pickle.dump(data, sink)

In [2]:
# Reload the cached tokenization so the expensive pool run can be skipped.
# NOTE: unpickling can execute arbitrary code -- only load files you wrote.
with open("data/stemmed.p", "rb") as source:
    data = pickle.load(source)

In [4]:
# Corpus-wide frequency of every stemmed token; `counter` is the number of
# documents processed.  Prints one dot per 100 documents and the running
# total every 5000.
counts = {}
counter = 0
for _, _, _, tokens in data:
    for token in tokens:
        counts[token] = counts.get(token, 0) + 1
    counter += 1
    if counter % 100 == 0:
        print(".", end="")
    if counter % 5000 == 0:
        print(" ", counter)

# Tokens removed by hand in the filtering step.
# NOTE(review): "hon" is presumably the parliamentary honorific -- confirm.
excluded = {"hon"}

# The 30 most frequent tokens as (count, token) pairs.  This has to be the
# LAST expression of the cell, otherwise the notebook discards the value
# instead of displaying it.
sorted(((n, token) for token, n in counts.items()), reverse=True)[:30]

..................................................  5000
..................................................  10000
..................................................  15000
..................................................  20000
..................................................  25000
..................................................  30000
..................................................  35000
..................................................  40000
..................................................  45000
..................................................  50000
..................................................  55000
..................................................  60000
..................................................  65000
..................................................  70000
..................................................  75000
..................................................  80000
..................................................  85000
...............

In [5]:
# Stage 1 (lazy): prune each document's token list down to tokens that are
# longer than 3 characters, occur at least 200 times in the whole corpus and
# are not in `excluded`.
filtered = (
    (i, date, name, [tok for tok in toks
                     if len(tok) > 3
                     and counts[tok] >= 200
                     and tok not in excluded])
    for i, date, name, toks in data
)
# Stage 2: keep only documents that still have more than 100 tokens.
filtered = [doc for doc in filtered if len(doc[3]) > 100]


In [6]:
# Vocabulary: every distinct token that survived filtering.
wordlist = set().union(*(toks for *_, toks in filtered))
# idmap: id -> word (sorted, so ids are deterministic); wordmap: word -> id.
idmap    = sorted(wordlist)
wordmap  = dict(zip(idmap, range(len(idmap))))
# Replace every token by its integer id.
# NOTE: this rebinds `filtered`, so the cell is not idempotent -- re-run the
# filtering cell before running this one again.
filtered = [
    (i, date, name, [wordmap[tok] for tok in toks])
    for i, date, name, toks in filtered
]

In [7]:
# Cache the id-encoded corpus together with the id -> word table needed to
# decode it.
with open("data/filtered.p", "wb") as sink:
    pickle.dump(dict(idmap=idmap, filtered=filtered), sink)

In [9]:
# Restore the cached id-encoded corpus and rebuild the word -> id lookup.
# NOTE: unpickling can execute arbitrary code -- only load files you wrote.
with open("data/filtered.p", "rb") as f:
    d = pickle.load(f)
filtered = d["filtered"]
idmap = d["idmap"]
wordmap = dict(zip(idmap, range(len(idmap))))

In [None]:
# Recount frequencies over the filtered corpus; the last item of every entry
# in `filtered` is that document's token list.
# NOTE: this rebinds `counts`, which the filtering cell indexes by word
# string, so that cell cannot be re-run after this one without recomputing
# the word counts first.
counts = {}
counter = 0
for *_, ids in filtered:
    for wid in ids:
        if wid in counts:
            counts[wid] += 1
        else:
            counts[wid] = 1
    counter += 1
    # One dot per 100 documents, running total every 5000.
    if not counter % 100:
        print(".", end="")
    if not counter % 5000:
        print(" ", counter)

In [21]:
# Decode every document from integer ids back to word strings, then let
# gensim build its own Dictionary over those documents.
docs = [list(map(idmap.__getitem__, ids)) for *_, ids in filtered]
dictionary = gensim.corpora.Dictionary(docs)

In [22]:
# Bag-of-words representation of every document, in the same order as `docs`.
bow_corpus = list(map(dictionary.doc2bow, docs))

In [74]:
# Train a 200-topic model through gensim's MALLET wrapper.
# NOTE: the path to the MALLET executable is a hard-coded absolute path, and
# the saved output of this cell is a KeyboardInterrupt, i.e. the recorded run
# was stopped by hand.
mallet_settings = dict(
    corpus=bow_corpus,
    id2word=dictionary,
    num_topics=200,
    optimize_interval=20,
    iterations=30000,
    workers=1,
)
lda_model = gensim.models.wrappers.LdaMallet("/usr/local/bin/mallet", **mallet_settings)

KeyboardInterrupt: 

In [76]:
# Vocabulary ordered shortest word first (the sort is stable, so words of
# equal length keep their idmap order) -- used to eyeball junk tokens.
sorted(idmap, key=len)

['b',
 'c',
 'e',
 'f',
 'h',
 'j',
 'l',
 'n',
 'w',
 "'s",
 'a.',
 'ab',
 'ad',
 'ah',
 'al',
 'au',
 'aw',
 'ay',
 'c.',
 'd.',
 'e.',
 'ed',
 'en',
 'ep',
 'er',
 'et',
 'ex',
 'ft',
 'h.',
 'ha',
 'hi',
 'ho',
 'i.',
 'ii',
 'l.',
 'la',
 'lb',
 'le',
 'lo',
 'm.',
 'mo',
 'ne',
 'oh',
 'oz',
 'p.',
 'pf',
 'pp',
 'r.',
 'rt',
 's.',
 'se',
 'su',
 'tu',
 'v.',
 'ye',
 'abl',
 'ac-',
 'acr',
 'act',
 'ad-',
 'add',
 'age',
 'ago',
 'aid',
 'ail',
 'aim',
 'air',
 'al-',
 'ale',
 'alm',
 'an-',
 'anc',
 'ani',
 'ant',
 'ap-',
 'apt',
 'arc',
 'arm',
 'art',
 'as-',
 'ash',
 'ask',
 'ass',
 'at-',
 'ate',
 'awe',
 'axe',
 'aye',
 'bad',
 'bag',
 'ban',
 'bar',
 'bat',
 'bay',
 'be-',
 'be.',
 'bed',
 'bee',
 'beg',
 'ber',
 'bet',
 'bid',
 'big',
 'bit',
 'bog',
 'bon',
 'bow',
 'box',
 'boy',
 'bud',
 'buy',
 'bye',
 'cab',
 'cap',
 'car',
 'cat',
 'com',
 'cow',
 'cri',
 'cum',
 'cun',
 'cup',
 'cur',
 'cut',
 'cwt',
 'dam',
 'day',
 'de-',
 'deg',
 'den',
 'des',
 'die',
 'dig',
