# Testing gensim bigram implementations

## getting data


In [1]:
# Language name handed to the NLTK helpers (sentence splitting, stop words).
LANG = "english"

In [2]:
%%bash
# Fetch the English Wikinews CirrusSearch dump once; later runs reuse the
# local copy. Fail fast on any error instead of continuing with a bad file.
set -euo pipefail

fdate=20170327  # must match FDATE in the Python iterator cell
fname="enwikinews-${fdate}-cirrussearch-content.json.gz"
if [ ! -e "$fname" ]
then
    # Download under a temporary name: an interrupted transfer must not leave
    # a truncated "$fname" behind, since the existence check above would then
    # accept it on the next run.
    wget -O "${fname}.part" "https://dumps.wikimedia.org/other/cirrussearch/${fdate}/${fname}"
    mv "${fname}.part" "$fname"
fi


In [3]:
# iterator
import gzip
import json

# Dump date and file name; must match the values used by the download cell.
FDATE = 20170327
FNAME = "enwikinews-%s-cirrussearch-content.json.gz" % FDATE


def iter_texts(fpath=FNAME):
    """Yield the title, then the body text, of each document in a dump file.

    The file is read as gzip-compressed text with one JSON object per line.
    Lines whose object has no ``"title"`` key are skipped.

    Parameters
    ----------
    fpath : str
        Path to the gzip-compressed, line-delimited JSON file.

    Yields
    ------
    str
        For every titled document: ``data["title"]`` followed by
        ``data["text"]``.

    Raises
    ------
    KeyError
        If a titled document has no ``"text"`` key.
    """
    # Decode explicitly as UTF-8 (the JSON interchange encoding) instead of
    # inheriting whatever the platform locale happens to default to.
    with gzip.open(fpath, "rt", encoding="utf-8") as dump:
        for line in dump:
            data = json.loads(line)
            if "title" in data:
                yield data["title"]
                yield data["text"]

In [4]:
# Fetch the NLTK resources used further down: "punkt" (sentence splitting
# via sent_tokenize) and "stopwords" (the common-terms list).
# Packages that are already up to date are not downloaded again.
import nltk
nltk.download("punkt")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## preparing data

In [5]:
import re
from nltk.tokenize import sent_tokenize
from nltk.tokenize import RegexpTokenizer
# Word tokenizer shared by prepare(). The pattern is a raw string so that the
# regex escapes (\w, \d) reach the `re` module untouched instead of being
# treated as (invalid) Python string escapes.
# NOTE(review): because \w also matches digits, the first alternative always
# wins at a digit, so the `\d[\d,]*` branch never fires and "1,000" is split
# at the comma. Left unchanged so the recorded outputs stay reproducible.
fr_tokenizer = RegexpTokenizer(r'\w[\w-]*|\d[\d,]*')

In [6]:
def prepare(txt):
    """Lower-case ``txt`` and return it as a list of tokenized sentences.

    The text is split into sentences with ``sent_tokenize`` (using the
    notebook-wide ``LANG``), and each sentence is turned into a list of
    tokens by ``fr_tokenizer``.
    """
    sentences = sent_tokenize(txt.lower(), language=LANG)
    return list(map(fr_tokenizer.tokenize, sentences))

In [7]:
# The data set is small enough to keep entirely in RAM: build one flat list
# of tokenized sentences from every title and text.
corpus = [sentence for txt in iter_texts() for sentence in prepare(txt)]

In [8]:
# Corpus size: total number of tokens and number of sentences.
words_count = sum(map(len, corpus))
print("Corpus has %d words in %d sentences" % (words_count, len(corpus)))

Corpus has 1003521 words in 46159 sentences


## testing bigram

In [9]:
from gensim.models.phrases import Phrases
from gensim.models.phrases2 import Phrases as PhrasesCT

In [10]:
from nltk.corpus import stopwords
# Display the stop-word list for LANG; the same list is passed as
# `common_terms` to PhrasesCT in the benchmark cells below.
" ".join(stopwords.words(LANG))

'i me my myself we our ours ourselves you your yours yourself yourselves he him his himself she her hers herself it its itself they them their theirs themselves what which who whom this that these those am is are was were be been being have has had having do does did doing a an the and but if or because as until while of at by for with about against between into through during before after above below to from up down in out on off over under again further then once here there when where why how all any both each few more most other some such no nor not only own same so than too very s t can will just don should now d ll m o re ve y ain aren couldn didn doesn hadn hasn haven isn ma mightn mustn needn shan shouldn wasn weren won wouldn'

In [11]:
# Best %timeit result per variant: times[phase][variant], where phase is
# "create" (model construction) or "use" (applying the model to the corpus).
times = dict(create={}, use={})

### bigram std

#### construction

In [12]:
# Time construction of the standard Phrases model over the whole corpus.
# `-o` makes %timeit return its result object so the best run can be stored.
t = %timeit -o Phrases(corpus)
times["create"]["std"] = t.best

1 loop, best of 3: 1.34 s per loop


#### execution

In [13]:
# Build the model once, outside the timed statement, so that only applying
# it to the corpus is measured. list() forces the full pass over bigram[corpus].
bigram = Phrases(corpus)
t = %timeit -o list(bigram[corpus])
times["use"]["std"] = t.best



1 loop, best of 3: 2.96 s per loop


### bigram common terms, without using them

#### construction

In [14]:
# Time construction of the common-terms implementation with no common terms
# given, to compare it with the standard implementation on equal input.
t = %timeit -o PhrasesCT(corpus)
times["create"]["ct_none"] = t.best

1 loop, best of 3: 1.34 s per loop


#### execution

In [15]:
# Model is built outside the timed statement; only applying it to the corpus
# is measured. list() forces the full pass over bigram2[corpus].
bigram2 = PhrasesCT(corpus)
t = %timeit -o list(bigram2[corpus])
times["use"]["ct_none"] = t.best



1 loop, best of 3: 3.01 s per loop


### bigram common terms, actually using them

#### construction

In [16]:
# Time construction of the common-terms implementation with the NLTK
# stop words for LANG passed as common terms.
t = %timeit -o PhrasesCT(corpus, common_terms=stopwords.words(LANG))
times["create"]["ct"] = t.best

1 loop, best of 3: 1.52 s per loop


#### execution

In [17]:
bigram_ct = PhrasesCT(corpus, common_terms=stopwords.words(LANG))
t = %%timeit -o list(bigram_ct[corpus])
times["use"]["ct"] = t.best



1 loop, best of 3: 3.47 s per loop


## new phrases found thanks to common terms

In [18]:
# Unique (score, phrase) pairs for exported phrases longer than two tokens,
# sorted in ascending order (by score, then phrase text); the cell displays
# the 20 highest.
ct_ngrams = sorted({
    (entry[1], entry[0].decode("utf-8"))
    for entry in bigram_ct.export_phrases(corpus)
    if len(entry[0].split()) > 2
})
ct_ngrams[-20:]

[(1300.0463768115942, 'chamber of deputies'),
 (1632.9466019417475, 'globe and mail'),
 (1648.9558823529412, 'chosen to involve'),
 (1661.1703703703704, 'thoughts and prayers'),
 (1820.2759740259742, 'thank you for listening'),
 (2034.0861678004535, 'hall of fame'),
 (2038.7090909090912, 'saturn s rings'),
 (2046.7992063492065, 'colleges and universities'),
 (2777.188854489164, 'burst into flames'),
 (2920.026041666667, 'divide by zero'),
 (2950.7631578947367, 'emerges from recession'),
 (3136.4755244755243, 'serbia and montenegro'),
 (3219.014354066985, 'compilation of brief'),
 (3297.9117647058824, 'accessed on 2006-12-10'),
 (3297.9117647058824, 'accessed on 2006-12-11'),
 (4548.843813387424, 'parks and recreation'),
 (4651.277037037037, 'monsters and critics'),
 (5333.703783783783, 'disasters and accidents'),
 (5862.954248366013, 'skull and bones'),
 (7081.831578947369, 'click on the donate')]

## Time recap

In [19]:
# Summary table: for each phase, one row of best times in seconds and one
# row of the same values as a percentage of the "std" variant.
cols = ["std", "ct_none", "ct"]
print("\t" + "\t".join(cols))
for phase, best in times.items():
    seconds_row = [phase] + ["%.3f" % best[col] for col in cols]
    print("\t".join(seconds_row))
    percent_row = [phase + "%"] + ["%d%%" % (best[col] / best["std"] * 100) for col in cols]
    print("\t".join(percent_row))

	std	ct_none	ct
create	1.336	1.336	1.516
create%	100%	99%	113%
use	2.958	3.015	3.466
use%	100%	101%	117%
