# Testing gensim bigram implementations

## getting data

In [1]:
# Language name handed to NLTK's sentence tokenizer and stop-word list
LANG = "french"

In [2]:
%%bash
# Download the French Wikinews CirrusSearch content dump, unless it is already present.
# Abort on the first failing command / unset variable so a failed download is not silently ignored.
set -euo pipefail

fdate=20170327
fname="frwikinews-${fdate}-cirrussearch-content.json.gz"
if [ ! -e "$fname" ]
then
    # Download under a temporary name and rename on success, so that an
    # interrupted transfer is never mistaken for a complete dump on the next run.
    wget -O "$fname.part" "https://dumps.wikimedia.org/other/cirrussearch/$fdate/$fname"
    mv "$fname.part" "$fname"
fi


In [3]:
# iterator
import gzip
import json

FDATE = 20170327
FNAME = "frwikinews-%s-cirrussearch-content.json.gz" % FDATE

def iter_texts(fpath=FNAME):
    """Yield the title, then the text, of each document in a gzipped JSON-lines file.

    Lines whose JSON object has no "title" key are skipped.

    Args:
        fpath: path of a gzip-compressed file holding one JSON object per line.

    Yields:
        ``data["title"]`` followed by ``data["text"]`` for every kept line.

    Raises:
        KeyError: if an object has a "title" key but no "text" key.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform locale's
    # default encoding, which would garble or reject accented characters on
    # non-UTF-8 systems.
    with gzip.open(fpath, "rt", encoding="utf-8") as f:
        for line in f:
            data = json.loads(line)
            if "title" in data:
                yield data["title"]
                yield data["text"]

In [4]:
# Fetch the NLTK data packages used by the rest of the notebook.
# The value of the last call is what the cell displays as its output.
import nltk
nltk.download("punkt")  # presumably the model behind sent_tokenize — TODO confirm
nltk.download("stopwords")  # stop-word lists read through nltk.corpus.stopwords

[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## preparing data

In [5]:
import re
from nltk.tokenize import sent_tokenize
from nltk.tokenize import RegexpTokenizer
# Word tokenizer: a word character followed by any run of word characters or hyphens.
# Raw string so that \w and \d reach the regex engine without triggering
# "invalid escape sequence" warnings; the pattern itself is unchanged.
# NOTE(review): \w also matches digits, so the first alternative always wins and
# the `\d[\d,]*` branch never applies (e.g. "1,000" is split at the comma) —
# confirm whether the alternatives were meant to be in the other order.
fr_tokenizer = RegexpTokenizer(r'\w[\w-]*|\d[\d,]*')

In [6]:
def prepare(txt):
    """Turn a raw text into a list of tokenized sentences.

    The text is lower-cased, split into sentences with ``sent_tokenize``
    (using ``LANG``), and each sentence is tokenized with ``fr_tokenizer``.

    Returns:
        list: one list of tokens per sentence.
    """
    lowered = txt.lower()
    sentences = sent_tokenize(lowered, language=LANG)
    return [fr_tokenizer.tokenize(sentence) for sentence in sentences]

In [7]:
# The corpus is small enough to hold entirely in RAM:
# a flat list of tokenized sentences gathered from every text of the dump.
corpus = [sentence for txt in iter_texts() for sentence in prepare(txt)]

In [8]:
# Corpus size: total number of tokens and number of sentences
words_count = sum(map(len, corpus))
print("Corpus has %d words in %d sentences" % (words_count, len(corpus)))

Corpus has 571431 words in 26255 sentences


## testing bigram

In [9]:
from gensim.models.phrases import Phrases
from gensim.models.phrases2 import Phrases as PhrasesCT

In [10]:
# Display the NLTK stop-word list for LANG as a single space-separated string;
# this same list is later passed to PhrasesCT as `common_terms`.
from nltk.corpus import stopwords
" ".join(stopwords.words(LANG))

'au aux avec ce ces dans de des du elle en et eux il je la le leur lui ma mais me même mes moi mon ne nos notre nous on ou par pas pour qu que qui sa se ses son sur ta te tes toi ton tu un une vos votre vous c d j l à m n s t y été étée étées étés étant étante étants étantes suis es est sommes êtes sont serai seras sera serons serez seront serais serait serions seriez seraient étais était étions étiez étaient fus fut fûmes fûtes furent sois soit soyons soyez soient fusse fusses fût fussions fussiez fussent ayant ayante ayantes ayants eu eue eues eus ai as avons avez ont aurai auras aura aurons aurez auront aurais aurait aurions auriez auraient avais avait avions aviez avaient eut eûmes eûtes eurent aie aies ait ayons ayez aient eusse eusses eût eussions eussiez eussent'

In [11]:
# Best timings collected below, keyed by phase ("create" / "use") and then by variant name
times = {phase: {} for phase in ("create", "use")}

### bigram std

#### construction

In [12]:
# Time the construction of a standard Phrases model over the corpus.
# -o makes %timeit return its result object, whose best run time is kept for the recap.
t = %timeit -o Phrases(corpus)
times["create"]["std"] = t.best

1 loop, best of 3: 707 ms per loop


#### execution

In [13]:
# Build the model once, outside the timed statement, so that only applying it is measured.
bigram = Phrases(corpus)
# list() consumes the whole result of bigram[corpus] (presumably lazy — TODO confirm)
t = %timeit -o list(bigram[corpus])
times["use"]["std"] = t.best



1 loop, best of 3: 1.65 s per loop


### bigram common terms, without using them

#### construction

In [14]:
# Time the construction of the common-terms implementation without passing any common terms.
# -o makes %timeit return its result object, whose best run time is kept for the recap.
t = %timeit -o PhrasesCT(corpus)
times["create"]["ct_none"] = t.best

1 loop, best of 3: 703 ms per loop


#### execution

In [15]:
# Build the model once, outside the timed statement, so that only applying it is measured.
bigram2 = PhrasesCT(corpus)
# list() consumes the whole result of bigram2[corpus] (presumably lazy — TODO confirm)
t = %timeit -o list(bigram2[corpus])
times["use"]["ct_none"] = t.best



1 loop, best of 3: 1.7 s per loop


### bigram common terms, effectively using them

#### construction

In [16]:
# Time the construction of the common-terms implementation with the NLTK stop words as common terms.
# -o makes %timeit return its result object, whose best run time is kept for the recap.
t = %timeit -o PhrasesCT(corpus, common_terms=stopwords.words(LANG))
times["create"]["ct"] = t.best

1 loop, best of 3: 820 ms per loop


#### execution

In [17]:
bigram_ct = PhrasesCT(corpus, common_terms=stopwords.words(LANG))
t = %%timeit -o list(bigram_ct[corpus])
times["use"]["ct"] = t.best



1 loop, best of 3: 1.82 s per loop


## new bigrams found thanks to common terms

In [18]:
# Distinct (score, phrase) pairs exported by the common-terms model, sorted ascending.
# Only phrases made of more than two whitespace-separated tokens are kept; going by
# the section title, these are the ones joined through common terms.
ct_ngrams = sorted({
    (item[1], item[0].decode("utf-8"))
    for item in bigram_ct.export_phrases(corpus)
    if len(item[0].split()) > 2
})
# Show the 20 highest-scoring phrases
ct_ngrams[-20:]

[(1456.1700680272108, 'col du galibier'),
 (1486.5069444444443, 'feux d artifice'),
 (1585.6074074074074, 'déposé une plainte'),
 (1646.5923076923077, 'science et technologie'),
 (1832.4836811128944, 'traité de lisbonne'),
 (1899.914201183432, 'marine le pen'),
 (2123.581349206349, 'garde des sceaux'),
 (2363.0197944423294, 'vallée d aoste'),
 (2378.411111111111, 'outrage au drapeau'),
 (2503.590643274854, 'barrage de sivens'),
 (2623.247549019608, 'tribune de genève'),
 (2675.7125, 'dauphins de sète'),
 (2905.7511312217193, 'télégramme de brest'),
 (3057.957142857143, 'anneaux de saturne'),
 (3377.6252465483235, 'côte d ivoire'),
 (3822.4464285714284, 'cercle des nageurs'),
 (3822.4464285714284, 'pointée du doigt'),
 (4540.60303030303, 'défilé de robes'),
 (4896.728758169935, 'enrichissement d uranium'),
 (10614.396694214875, 'giscard d estaing')]

## Time recap

In [19]:
# Recap table: for each phase, one row of best times in seconds and one row
# expressing them as a percentage of the "std" implementation.
cols = ["std", "ct_none", "ct"]
print("\t" + "\t".join(cols))
for phase, timings in times.items():
    seconds = ["%.3f" % timings[col] for col in cols]
    relative = ["%d%%" % (timings[col] / timings["std"] * 100) for col in cols]
    print("\t".join([phase] + seconds))
    print("\t".join([phase + "%"] + relative))

	std	ct_none	ct
use	1.650	1.698	1.816
use%	100%	102%	110%
create	0.707	0.703	0.820
create%	100%	99%	115%
