In [1]:
import gensim.models.doc2vec
from gensim.models import Doc2Vec

stopwords_path = 'data/stopwords_german.txt'
doc_path = 'dewiki-preprocessed.txt'
model_path = 'dewiki.model'

model = Doc2Vec.load(model_path)
print(model)

Doc2Vec(dm/c,d100,n5,w5,mc10,s0.001,t20)


In [2]:
print('Vocab size: %d' % len(model.wv.index2word))
print('Vector size: %d' % model.vector_size)

# model.wv
# => gensim.models.keyedvectors.Word2VecKeyedVectors

# Lookup term by index:
# model.wv.index2word[0]
# => 'jahr'

# Lookup term:
# model.wv.vocab['jahr']
# => {'count': 305445, 'index': 0, 'sample_int': 3898599532}

# Get term vector:
# model.wv.vectors[0]

Vocab size: 528683
Vector size: 100


In [None]:
# This is the code that loads documents from train.py
# If we need this for evaluation we put this in a common file.
# Look at the length check, this has a potential to change the docIDs => prevent empty docs during pre-processing

from collections import namedtuple
import codecs

def read_lines(path):
    '''Return lines in file'''
    return [line.strip() for line in codecs.open(path, "r", "utf-8")]

print("Loading stopwords: {}".format(stopwords_path))
stopwords = read_lines(stopwords_path)
stopwords = dict(map(lambda w: (w.lower(), ''), stopwords))
    
TaggedDocument = namedtuple('TaggedDocument', 'tags words')

all_docs = []

print('Loading documents: %s' % doc_path)
doc_id = 0
for i, line in enumerate(open(doc_path, encoding="utf-8")):
    line = line.strip()
    words = gensim.utils.to_unicode(line).split()

    #words = [w for w in words if w not in stopwords and len(w) > 1]
    words = [w for w in words if w not in stopwords]

    all_docs.append(TaggedDocument([doc_id], words))
    doc_id += 1
    if doc_id % 50000 == 0:
        print(doc_id)
        
print('%d documents loaded' % len(all_docs))

## Are inferred document vectors close to the actual vectors?

In [42]:
import numpy as np

doc_idx = np.random.randint(len(all_docs))
doc_idx = 349192
inferred_doc_vec = model.infer_vector(all_docs[doc_idx].words)
print(doc_idx)
print('%s' % (model.docvecs.most_similar([inferred_doc_vec], topn=3)))

349192
[(349192, 0.7934771776199341), (396408, 0.5859248042106628), (56958, 0.5487353801727295)]


## Do close documents seem more related than distant ones?

In [44]:
import random
import numpy as np

doc_idx = np.random.randint(len(all_docs))
sims = model.docvecs.most_similar(doc_idx, topn=model.docvecs.count)  # get *all* similar documents
print(u'TARGET (%d): «%s»\n' % (doc_id, ' '.join(all_docs[doc_idx].words)))

print(u'SIMILAR/DISSIMILAR DOCS\n')
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(all_docs[sims[index][0]].words)))

TARGET (411594): «cheatham county cheatham county county bundesstaat tennessee vereinigten staaten . verwaltungssitz ( county seat ) ashland city . county nordwestlich geographischen zentrums tennessee , norden 40 km kentucky entfernt fläche 795 quadratkilometern , wovon 12 quadratkilometer wasserfläche . grenzt uhrzeigersinn countys : robertson county , davidson county , williamson county , dickson county montgomery county . cheatham county 28 . februar 1856 davidson county , dickson county , montgomery county robertson county gebildet . benannt edward saunders cheatham , politiker tennessee . volkszählung jahr 2000 lebten cheatham county 35 . 912 menschen 12 . 878 haushalten 10 . 160 familien . bevölkerungsdichte betrug 46 einwohner quadratkilometer . ethnisch betrachtet bevölkerung 96 , 86 prozent weißen , 1 , 48 prozent afroamerikanern , 0 , 38 prozent amerikanischen ureinwohnern , 0 , 18 prozent asiaten , 0 , 05 prozent bewohnern pazifischen inselraum 0 , 36 prozent ethnischen gru

## Do the word vectors show useful similarities?

In [48]:
import random


# pick a random word with a suitable number of occurences
while True:
    word = random.choice(model.wv.index2word)
    if model.wv.vocab[word].count > 10:
        break
        
print(word)
word='bitcoin'
model.wv.most_similar(word, topn=20)

rohstoffmangel


[('njc', 0.4766344726085663),
 ('blumenkorb', 0.4700588285923004),
 ('ziegenfuss', 0.46407201886177063),
 ('sankei', 0.4613894820213318),
 ('yaḥyā', 0.4587106704711914),
 ('uṣūl', 0.4571664333343506),
 ('katastrophenfilme', 0.456667423248291),
 ('sfc', 0.4526963233947754),
 ('almanachs', 0.4512360095977783),
 ('kapitaldienst', 0.4502202272415161),
 ('eggs', 0.44766002893447876),
 ('poker', 0.4469255805015564),
 ('erneuerbare-energien-gesetzes', 0.4467746615409851),
 ('adoptieren', 0.4457639455795288),
 ('spendete', 0.4452362358570099),
 ('amantadin', 0.4444889426231384),
 ('spice', 0.4441375136375427),
 ('feuerwerkskörpern', 0.44211944937705994),
 ('zeichentrickfilms', 0.4400610327720642),
 ('dx7', 0.43809282779693604)]

## Evaluate word vectors on syntactic analogies

In [7]:
sections = model.wv.accuracy('data/syntactic.questions')
for i, section in enumerate(sections):
    correct, incorrect = len(sections[i]['correct']), len(sections[i]['incorrect'])
    print('%s: %0.2f%% correct (%d of %d)' % (section['section'], float(correct*100)/(correct+incorrect), correct, correct+incorrect))

nouns: SI/PL: 47.03% correct (182 of 387)
nouns: PL/SI: 33.25% correct (128 of 385)
adjectives: GR/KOM: 26.76% correct (19 of 71)
adjectives: KOM/GR: 25.33% correct (19 of 75)
adjectives: GR/SUP: 25.00% correct (1 of 4)
adjectives: SUP/GR: 0.00% correct (0 of 2)
adjectives: KOM/SUP: 27.27% correct (3 of 11)
adjectives: SUP/KOM: 23.08% correct (3 of 13)
verbs (pres): INF/1SP: 16.00% correct (4 of 25)
verbs (pres): 1SP/INF: 24.32% correct (9 of 37)
verbs (pres): INF/2PP: 45.05% correct (41 of 91)
verbs (pres): 2PP/INF: 41.58% correct (42 of 101)
verbs (pres): 1SP/2PP: 19.05% correct (4 of 21)
verbs (pres): 2PP/1SP: 5.26% correct (1 of 19)
verbs (past): INF/3SV: 35.00% correct (49 of 140)
verbs (past): 3SV/INF: 28.78% correct (40 of 139)
verbs (past): INF/3PV: 48.89% correct (66 of 135)
verbs (past): 3PV/INF: 38.17% correct (50 of 131)
verbs (past): 3SV/3PV: 72.82% correct (150 of 206)
verbs (past): 3PV/3SV: 70.19% correct (146 of 208)
total: 43.48% correct (957 of 2201)


## Evaluate word vectors on semantic analogies

In [8]:
sections = model.wv.accuracy('data/semantic.bestmatch.questions')
for i, section in enumerate(sections):
    correct, incorrect = len(sections[i]['correct']), len(sections[i]['incorrect'])
    print('%s: %0.2f%% correct (%d of %d)' % (section['section'], float(correct*100)/(correct+incorrect), correct, correct+incorrect))

semantic analogies: 47.37% correct (207 of 437)
total: 47.37% correct (207 of 437)
