Doc2Vec Tutorial on the Lee Dataset

In [1]:
import gensim
import os
import collections
import smart_open
import random



In [3]:
# Set file names for train and test data.
# Both corpus files are located relative to the installed gensim package
# directory; os.path.join assembles the paths with the platform separator.
test_data_dir = os.path.join(gensim.__path__[0], 'test', 'test_data')
lee_train_file = os.path.join(test_data_dir, 'lee_background.cor')
lee_test_file = os.path.join(test_data_dir, 'lee.cor')

In [4]:
# Define a Function to Read and Preprocess Text
def read_corpus(fname, tokens_only=False):
    """Lazily read a one-document-per-line corpus and tokenize each line.

    Parameters
    ----------
    fname : str
        Path of the corpus file; it is decoded as ISO-8859-1.
    tokens_only : bool, optional
        If True, yield plain token lists (this is how the test corpus is read).
        If False (default), yield ``TaggedDocument`` objects whose single tag
        is the zero-based line number (this is how the training corpus is read).

    Yields
    ------
    list of str or gensim.models.doc2vec.TaggedDocument
        One item per line of the file, in file order.
    """
    # smart_open.open() is the supported replacement for the deprecated
    # smart_open.smart_open() entry point.
    with smart_open.open(fname, encoding="iso-8859-1") as f:
        for i, line in enumerate(f):
            tokens = gensim.utils.simple_preprocess(line)
            if tokens_only:
                yield tokens
            else:
                # For training data, add tags
                yield gensim.models.doc2vec.TaggedDocument(tokens, [i])

In [7]:
# Materialise both generators into lists so the corpora can be indexed and re-iterated later.
# Training documents are tagged; test documents are plain token lists (tokens_only=True).
train_corpus = list(read_corpus(lee_train_file))
test_corpus = list(read_corpus(lee_test_file, tokens_only=True))


In [8]:
### Let's take a look at the training corpus


# Display only the first two tagged training documents rather than the whole corpus.
train_corpus[:2]

[TaggedDocument(words=['hundreds', 'of', 'people', 'have', 'been', 'forced', 'to', 'vacate', 'their', 'homes', 'in', 'the', 'southern', 'highlands', 'of', 'new', 'south', 'wales', 'as', 'strong', 'winds', 'today', 'pushed', 'huge', 'bushfire', 'towards', 'the', 'town', 'of', 'hill', 'top', 'new', 'blaze', 'near', 'goulburn', 'south', 'west', 'of', 'sydney', 'has', 'forced', 'the', 'closure', 'of', 'the', 'hume', 'highway', 'at', 'about', 'pm', 'aedt', 'marked', 'deterioration', 'in', 'the', 'weather', 'as', 'storm', 'cell', 'moved', 'east', 'across', 'the', 'blue', 'mountains', 'forced', 'authorities', 'to', 'make', 'decision', 'to', 'evacuate', 'people', 'from', 'homes', 'in', 'outlying', 'streets', 'at', 'hill', 'top', 'in', 'the', 'new', 'south', 'wales', 'southern', 'highlands', 'an', 'estimated', 'residents', 'have', 'left', 'their', 'homes', 'for', 'nearby', 'mittagong', 'the', 'new', 'south', 'wales', 'rural', 'fire', 'service', 'says', 'the', 'weather', 'conditions', 'which', '

In [9]:
# The test corpus holds plain token lists (no tags); show the first two.
print(test_corpus[:2])

[['the', 'national', 'executive', 'of', 'the', 'strife', 'torn', 'democrats', 'last', 'night', 'appointed', 'little', 'known', 'west', 'australian', 'senator', 'brian', 'greig', 'as', 'interim', 'leader', 'shock', 'move', 'likely', 'to', 'provoke', 'further', 'conflict', 'between', 'the', 'party', 'senators', 'and', 'its', 'organisation', 'in', 'move', 'to', 'reassert', 'control', 'over', 'the', 'party', 'seven', 'senators', 'the', 'national', 'executive', 'last', 'night', 'rejected', 'aden', 'ridgeway', 'bid', 'to', 'become', 'interim', 'leader', 'in', 'favour', 'of', 'senator', 'greig', 'supporter', 'of', 'deposed', 'leader', 'natasha', 'stott', 'despoja', 'and', 'an', 'outspoken', 'gay', 'rights', 'activist'], ['cash', 'strapped', 'financial', 'services', 'group', 'amp', 'has', 'shelved', 'million', 'plan', 'to', 'buy', 'shares', 'back', 'from', 'investors', 'and', 'will', 'raise', 'million', 'in', 'fresh', 'capital', 'after', 'profits', 'crashed', 'in', 'the', 'six', 'months', 'to'

In [None]:
# Training the Model
## Instantiate a Doc2Vec Object


In [10]:
# Hyperparameters: 50-dimensional document vectors and 40 training epochs.
# min_count=2 presumably drops words seen fewer than twice - confirm against the gensim docs.
# NOTE(review): no seed is passed, so results differ from run to run.
model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)


In [None]:
# Build a Vocabulary


In [11]:
# Build the vocabulary from the training corpus. model.corpus_count, read by the
# training cell, is presumably populated here - confirm against the gensim docs.
model.build_vocab(train_corpus)


In [None]:
# Time to Train

In [12]:
# Train on the tagged corpus for the configured number of epochs; %time reports the wall-clock cost.
%time model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)


Wall time: 3.02 s


In [None]:
# Inferring a Vector


In [13]:
# Infer a vector for a new, already-tokenized document (a list of word strings).
model.infer_vector(['only', 'you', 'can', 'prevent', 'forest', 'fires'])


array([-0.10868635, -0.2956276 , -0.19921397,  0.3264171 ,  0.12784891,
        0.02770999, -0.19867364, -0.04864173,  0.03895487, -0.07293084,
       -0.11969864, -0.0588683 , -0.12900402, -0.10300121,  0.14849016,
        0.07740878, -0.00046946,  0.01123086,  0.0999772 ,  0.02508878,
        0.13587904,  0.03768358,  0.02772718,  0.04581225,  0.04193378,
       -0.16614155, -0.05633305, -0.13858557,  0.1780703 ,  0.12953457,
        0.1228665 , -0.23995115,  0.27694485,  0.08209886, -0.1754283 ,
       -0.01667484,  0.02446443,  0.30923843, -0.03084652, -0.1624096 ,
       -0.14758769,  0.12783529,  0.05991   , -0.03790137, -0.10440305,
       -0.1815852 , -0.08259042,  0.01387192,  0.17346479,  0.06453985],
      dtype=float32)

In [None]:
# Assessing the Model


In [14]:
# Self-similarity sanity check: re-infer a vector for every training document
# and record where that document lands in the model's similarity ranking.
ranks = []
second_ranks = []
for doc_id, doc in enumerate(train_corpus):
    inferred_vector = model.infer_vector(doc.words)
    # Request as many neighbours as there are stored document vectors so the
    # document's own tag can be located in the result.
    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
    ranked_tags = [tag for tag, _score in sims]
    rank = ranked_tags.index(doc_id)
    ranks.append(rank)

    # Keep the runner-up (tag, similarity) pair for later inspection.
    second_ranks.append(sims[1])

  if np.issubdtype(vec.dtype, np.int):


In [15]:
# Tally the rank each training document achieved against its own re-inferred vector;
# rank 0 means the document was its own most similar match.
collections.Counter(ranks)  # Results vary between runs due to random seeding and very small corpus


Counter({0: 292, 1: 8})

In [16]:
# Report on the document left over from the evaluation loop: doc_id and sims
# still hold the values from that loop's final iteration.
doc_text = ' '.join(train_corpus[doc_id].words)
print('Document ({}): «{}»\n'.format(doc_id, doc_text))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)

# Positions in the similarity ranking to display alongside the document.
checkpoints = [('MOST', 0), ('SECOND-MOST', 1), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]
for label, index in checkpoints:
    neighbour = sims[index]  # (tag, similarity) pair
    neighbour_text = ' '.join(train_corpus[neighbour[0]].words)
    print(u'%s %s: «%s»\n' % (label, neighbour, neighbour_text))

Document (299): «australia will take on france in the doubles rubber of the davis cup tennis final today with the tie levelled at wayne arthurs and todd woodbridge are scheduled to lead australia in the doubles against cedric pioline and fabrice santoro however changes can be made to the line up up to an hour before the match and australian team captain john fitzgerald suggested he might do just that we ll make team appraisal of the whole situation go over the pros and cons and make decision french team captain guy forget says he will not make changes but does not know what to expect from australia todd is the best doubles player in the world right now so expect him to play he said would probably use wayne arthurs but don know what to expect really pat rafter salvaged australia davis cup campaign yesterday with win in the second singles match rafter overcame an arm injury to defeat french number one sebastien grosjean in three sets the australian says he is happy with his form it not v

In [17]:
# Pick a random training document; no new vector is inferred here, the
# runner-up recorded by the evaluation loop is looked up instead.
doc_id = random.randint(0, len(train_corpus) - 1)
train_text = ' '.join(train_corpus[doc_id].words)

# Compare and print the second-most-similar document.
# Despite its name, sim_id is a (tag, similarity) pair, hence the [0] below.
print('Train Document ({}): «{}»\n'.format(doc_id, train_text))
sim_id = second_ranks[doc_id]
similar_text = ' '.join(train_corpus[sim_id[0]].words)
print('Similar Document {}: «{}»\n'.format(sim_id, similar_text))


Train Document (276): «defence minister robert hill has confirmed australian troops arrived in afghanistan this morning senator hill says it is an advance party and the rest of the troops will arrive within the next few days he says australian forces will operate with us troops in southern afghanistan to fight the taliban and al qaeda networks senator hill says the operation could take several months»

Similar Document (88, 0.8597304224967957): «the coroner investigating the death of race marshal at the australian formula one grand prix in melbourne has indicated he will not stand in the way of next year race on the last day of hearings into the death of race marshall graham beveridge the grand prix corporation legal team sought judicial assurance next year event would be able to go ahead coroner graeme johnston said his recommendations were unlikely to alter the corporation plans for the race and they would include nothing that could not reasonably be dealt with before the forthcoming

In [None]:
# Testing the Model

In [18]:
# Draw a random test document, infer its vector, and rank every training
# document against that vector.
doc_id = random.randint(0, len(test_corpus) - 1)
test_tokens = test_corpus[doc_id]
inferred_vector = model.infer_vector(test_tokens)
sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))

# Print the test document followed by its most, median and least similar
# documents from the train corpus.
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(test_tokens)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
checkpoints = [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]
for label, index in checkpoints:
    neighbour = sims[index]  # (tag, similarity) pair
    neighbour_text = ' '.join(train_corpus[neighbour[0]].words)
    print(u'%s %s: «%s»\n' % (label, neighbour, neighbour_text))

Test Document (37): «robert mugabe strengthened his hold on the zimbabwean government yesterday by retaining the most combative hardliner ministers in cabinet shuffle which offered little hope of moderation of the land seizures and other policies that have kept zimbabwe in crisis and brought international condemnation»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d50,n5,w5,mc2,s0.001,t3):

MOST (66, 0.8237478137016296): «argentina government has crumbled after at least people were killed and hundreds injured in nationwide riots argentina president fernando de la rua has resigned and called for national unity government with the opposition peronists the president resignation followed hours of rioting across the country people took to the streets protesting against the government economic austerity program argentina is now on the brink of defaulting on its next debt repayment which could be the largest default ever the opposition parties are reported to have rejected the call for nati