In [58]:
import nltk
import nltk.tokenize
import nltk.corpus
import nltk.stem

import gensim.parsing.preprocessing
import gensim

import time
import string
import pickle
import matplotlib.pyplot as plt
import multiprocessing

In [2]:
# Part-of-speech tags whose words survive tokenization (foreign words,
# adjectives, nouns, adverbs and verbs in Penn Treebank naming).
# NOTE(review): "NP" and "NPS" look like Brown-corpus proper-noun tags. If
# nltk.pos_tag emits Penn Treebank tags (NNP/NNPS) these two entries never
# match anything -- confirm whether proper nouns are meant to be kept.
relevant_pos = set(
    "FW JJ JJR JJS NN NNS NP NPS RB RBR RBS VB VBD VBG VBN VBP VBZ".split()
)

# Despite the name, this is a Snowball *stemmer*, not a WordNet lemmatizer.
# The name is kept because the tokenizer refers to it.
wordnet_lemmatizer = nltk.stem.SnowballStemmer("english")

# Stop list: NLTK's and gensim's English stopwords plus every single
# punctuation character.
stops = set().union(
    nltk.corpus.stopwords.words('english'),
    gensim.parsing.preprocessing.STOPWORDS,
    string.punctuation,
)

# A token is rejected if it contains ANY of these characters ...
any_forbidden = set("0123456789*+")
# ... or if it consists ONLY of these characters.
all_forbidden = set(string.punctuation)

In [3]:
def debate_tokenize(text):
    """Convert raw debate text into a list of stemmed, filtered tokens.

    The text is word-tokenized and POS-tagged; only words whose tag is in
    ``relevant_pos`` are stemmed and lower-cased. A stem is then dropped when
    it is in ``stops``, contains any character from ``any_forbidden``, or is
    made up solely of characters from ``all_forbidden`` (this includes the
    empty string).

    Args:
        text: The raw text of one debate contribution.

    Returns:
        A list of stems in their original order.
    """
    # A full stop with no following space happens often enough to be a
    # problem, so every "." is padded (this also splits things like "3.5").
    spaced = text.replace(".", ". ")
    tagged = nltk.pos_tag(nltk.tokenize.word_tokenize(spaced))

    kept = []
    for word, tag in tagged:
        if tag not in relevant_pos:
            continue
        stem = wordnet_lemmatizer.stem(word).lower()
        if stem in stops:
            continue
        if any(ch in any_forbidden for ch in stem):
            continue
        if all(ch in all_forbidden for ch in stem):
            continue
        kept.append(stem)
    return kept

def stemmer(index, key):
    """Tokenize one tab-separated input record (worker-side task).

    ``key`` must split on tabs into exactly three fields, otherwise the
    unpacking raises ValueError. The middle field is ignored and the last one
    is passed to ``debate_tokenize``.

    Args:
        index: Position of the record in the input, passed through unchanged.
        key: One raw line of the input file.

    Returns:
        ``(index, first_field[:10], first_field[10:], tokens)``.

    NOTE(review): the original locals called the 10-character prefix ``name``
    and the remainder ``date``, while the filtering cell unpacks these two
    positions as ``(date, name)`` -- confirm which is which.
    """
    head, _ignored, body = key.split("\t")
    return (index, head[:10], head[10:], debate_tokenize(body))

In [4]:
# Fan the tokenization out over two worker processes: one task per input
# line, each invoked as stemmer(line_index, line).
# The clock starts before submission because workers begin processing as soon
# as the first task is queued.
start_time = time.time()
pool = multiprocessing.Pool(processes=2)
with open("data/hansard_debates.tsv") as stream:
    # enumerate() yields (index, line) tuples, which apply_async uses directly
    # as the positional-argument tuple for stemmer.
    results = [pool.apply_async(stemmer, v) for v in enumerate(stream)]
# Nothing else is submitted to this pool. close() lets the workers exit once
# the queued tasks finish instead of lingering for the life of the kernel;
# the AsyncResult objects in `results` stay valid.
pool.close()

In [None]:
# Poll the async results for up to ~10 seconds (20 polls, 0.5 s apart) and
# show how many tasks have finished. Re-run the cell to keep watching.
# The count is recomputed from scratch on every poll, so the cell needs no
# pre-existing `num_done`, never indexes past the end of `results`, and
# cannot count the same result twice.
num_done = 0
for i in range(20):
    time.sleep(0.5)
    num_done = sum(r.ready() for r in results)
    print(i, num_done, end="\r")
    if num_done == len(results):
        break

938 28021

In [None]:
# data = [i.get() for i in results]
# with open("stemmed.p", "wb") as f:
#     pickle.dump(data, f)

In [2]:
# with open("stemmed.p", "rb") as f:
#     data = pickle.load(f)

In [None]:
# Corpus-wide frequency of every stemmed token, with a progress trace:
# one dot per 100 documents and a running total every 5000.
counts = {}
counter = 0
# The token list is the last element of each record. `*_` absorbs however
# many leading fields there are, so both 3-field records and the 4-tuples
# returned by stemmer() are accepted.
for *_, tokens in data:
    for token in tokens:
        counts[token] = counts.get(token, 0) + 1
    counter += 1
    if counter % 100 == 0:
        print(".", end="")
    if counter % 5000 == 0:
        print(" ", counter)

# Tokens that the filtering cell drops regardless of their frequency.
excluded = {"hon"}

# Kept as the last expression so the notebook actually displays the 30 most
# frequent tokens as (count, token) pairs.
sorted(((n, token) for token, n in counts.items()), reverse=True)[:30]

In [None]:
# Vocabulary / document pruning thresholds.
MIN_WORD_COUNT = 200   # a token must occur at least this often corpus-wide
MIN_DOC_TOKENS = 100   # a document must keep MORE than this many tokens

# Step 1: drop rare and explicitly excluded tokens from every document.
# `*_` absorbs any leading fields (e.g. the index in stemmer()'s 4-tuples), so
# the last three fields are always read as (date, name, tokens).
filtered = [
    (date, name, [w for w in l if counts[w] >= MIN_WORD_COUNT and w not in excluded])
    for (*_, date, name, l) in data
]
# Step 2: drop documents that became too short. `i` is the document's position
# BEFORE this length filter, so the kept indices may have gaps.
filtered = [
    (i, date, name, l)
    for i, (date, name, l) in enumerate(filtered)
    if len(l) > MIN_DOC_TOKENS
]


In [59]:
# Replace every token in `filtered` by an integer id.
#   wordlist : set of all distinct tokens
#   idmap    : id -> token (sorted list, position is the id)
#   wordmap  : token -> id
# NOTE: this cell rebinds `filtered`. Running it a second time rebuilds
# `idmap` from the integer ids and so loses the id -> word mapping.
wordlist = set().union(*(doc[-1] for doc in filtered))
idmap = sorted(wordlist)
wordmap = dict(zip(idmap, range(len(idmap))))
filtered = [
    (i, date, name, [wordmap[token] for token in tokens])
    for (i, date, name, tokens) in filtered
]

In [65]:
# Persist the encoded documents together with the id -> token list so a later
# session can skip the preprocessing.
payload = {"idmap": idmap, "filtered": filtered}
with open("filtered.p", "wb") as out:
    pickle.dump(payload, out)

In [68]:
# Reload the encoded documents and vocabulary written by the dump cell.
# NOTE: pickle.load can execute arbitrary code; only load a file this
# notebook produced itself.
with open("filtered.p", "rb") as f:
    d = pickle.load(f)
filtered = d["filtered"]
idmap = d["idmap"]
# idmap is a plain list of tokens whose position is the id, so the inverse
# mapping has to come from enumerate(); iterating the list directly would try
# to unpack each token string into (i, w).
wordmap = {w: i for i, w in enumerate(idmap)}

..................................................  5000
..................................................  10000
..................................................  15000
..................................................  20000
..................................................  25000
..................................................  30000
..................................................  35000
..................................................  40000
..................................................  45000
..................................................  50000
..................................................  55000
............

In [None]:
# Recount frequencies over `filtered`. The keys of `counts` are whatever the
# last element of each record holds (integer ids once the encoding cell has
# run). Progress trace: a dot every 100 documents and the running total every
# 5000.
counts = {}
counter = 0
for *_, entries in filtered:
    for entry in entries:
        if entry in counts:
            counts[entry] += 1
        else:
            counts[entry] = 1
    counter += 1
    if not counter % 100:
        print(".", end="")
    if not counter % 5000:
        print(" ", counter)