[gensim tutorial Doc2Vec](https://radimrehurek.com/gensim_3.8.3/auto_examples/tutorials/run_doc2vec_lee.html)

In [1]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
import os
import gensim

test_data_dir = os.path.join(gensim.__path__[0], 'test', 'test_data')
lee_train_file = os.path.join(test_data_dir, 'lee_background.cor')
lee_test_file = os.path.join(test_data_dir, 'lee.cor')

In [3]:
import smart_open

def read_corpus(fname, tokens_only=False):
    with smart_open.open(fname, encoding="iso-8859-1") as f:
        for i, line in enumerate(f):
            tokens = gensim.utils.simple_preprocess(line)
            if tokens_only:
                yield tokens
            else:
                yield gensim.models.doc2vec.TaggedDocument(tokens, [i])

train_corpus = list(read_corpus(lee_train_file))
test_corpus = list(read_corpus(lee_test_file, tokens_only=True))

In [4]:
print(train_corpus[:2])

[TaggedDocument(words=['hundreds', 'of', 'people', 'have', 'been', 'forced', 'to', 'vacate', 'their', 'homes', 'in', 'the', 'southern', 'highlands', 'of', 'new', 'south', 'wales', 'as', 'strong', 'winds', 'today', 'pushed', 'huge', 'bushfire', 'towards', 'the', 'town', 'of', 'hill', 'top', 'new', 'blaze', 'near', 'goulburn', 'south', 'west', 'of', 'sydney', 'has', 'forced', 'the', 'closure', 'of', 'the', 'hume', 'highway', 'at', 'about', 'pm', 'aedt', 'marked', 'deterioration', 'in', 'the', 'weather', 'as', 'storm', 'cell', 'moved', 'east', 'across', 'the', 'blue', 'mountains', 'forced', 'authorities', 'to', 'make', 'decision', 'to', 'evacuate', 'people', 'from', 'homes', 'in', 'outlying', 'streets', 'at', 'hill', 'top', 'in', 'the', 'new', 'south', 'wales', 'southern', 'highlands', 'an', 'estimated', 'residents', 'have', 'left', 'their', 'homes', 'for', 'nearby', 'mittagong', 'the', 'new', 'south', 'wales', 'rural', 'fire', 'service', 'says', 'the', 'weather', 'conditions', 'which', '

In [5]:
print(test_corpus[:2])

[['the', 'national', 'executive', 'of', 'the', 'strife', 'torn', 'democrats', 'last', 'night', 'appointed', 'little', 'known', 'west', 'australian', 'senator', 'brian', 'greig', 'as', 'interim', 'leader', 'shock', 'move', 'likely', 'to', 'provoke', 'further', 'conflict', 'between', 'the', 'party', 'senators', 'and', 'its', 'organisation', 'in', 'move', 'to', 'reassert', 'control', 'over', 'the', 'party', 'seven', 'senators', 'the', 'national', 'executive', 'last', 'night', 'rejected', 'aden', 'ridgeway', 'bid', 'to', 'become', 'interim', 'leader', 'in', 'favour', 'of', 'senator', 'greig', 'supporter', 'of', 'deposed', 'leader', 'natasha', 'stott', 'despoja', 'and', 'an', 'outspoken', 'gay', 'rights', 'activist'], ['cash', 'strapped', 'financial', 'services', 'group', 'amp', 'has', 'shelved', 'million', 'plan', 'to', 'buy', 'shares', 'back', 'from', 'investors', 'and', 'will', 'raise', 'million', 'in', 'fresh', 'capital', 'after', 'profits', 'crashed', 'in', 'the', 'six', 'months', 'to'

In [6]:
model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)



In [7]:
model.build_vocab(train_corpus)

2020-11-09 02:52:03,072 : INFO : collecting all words and their counts
2020-11-09 02:52:03,076 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2020-11-09 02:52:03,117 : INFO : collected 6981 word types and 300 unique tags from a corpus of 300 examples and 58152 words
2020-11-09 02:52:03,120 : INFO : Loading a fresh vocabulary
2020-11-09 02:52:03,206 : INFO : effective_min_count=2 retains 3955 unique words (56% of original 6981, drops 3026)
2020-11-09 02:52:03,208 : INFO : effective_min_count=2 leaves 55126 word corpus (94% of original 58152, drops 3026)
2020-11-09 02:52:03,242 : INFO : deleting the raw counts dictionary of 6981 items
2020-11-09 02:52:03,244 : INFO : sample=0.001 downsamples 46 most-common words
2020-11-09 02:52:03,245 : INFO : downsampling leaves estimated 42390 word corpus (76.9% of prior 55126)
2020-11-09 02:52:03,270 : INFO : estimated required memory for 3955 words and 50 dimensions: 3619500 bytes
2020-11-09 02:52:03,272 : INFO : res

In [8]:
model.wv.vocab

{'hundreds': <gensim.models.keyedvectors.Vocab at 0x7f8718aee4c0>,
 'of': <gensim.models.keyedvectors.Vocab at 0x7f8718aeee20>,
 'people': <gensim.models.keyedvectors.Vocab at 0x7f86f4a72670>,
 'have': <gensim.models.keyedvectors.Vocab at 0x7f86f4a72460>,
 'been': <gensim.models.keyedvectors.Vocab at 0x7f86f4a723a0>,
 'forced': <gensim.models.keyedvectors.Vocab at 0x7f86f4a72b80>,
 'to': <gensim.models.keyedvectors.Vocab at 0x7f86f4a72160>,
 'their': <gensim.models.keyedvectors.Vocab at 0x7f86f4a726d0>,
 'homes': <gensim.models.keyedvectors.Vocab at 0x7f8718ae5ac0>,
 'in': <gensim.models.keyedvectors.Vocab at 0x7f86f467c070>,
 'the': <gensim.models.keyedvectors.Vocab at 0x7f86f467c0d0>,
 'southern': <gensim.models.keyedvectors.Vocab at 0x7f86f467c1c0>,
 'highlands': <gensim.models.keyedvectors.Vocab at 0x7f86f467c1f0>,
 'new': <gensim.models.keyedvectors.Vocab at 0x7f86f467c250>,
 'south': <gensim.models.keyedvectors.Vocab at 0x7f86f467c2b0>,
 'wales': <gensim.models.keyedvectors.Vocab

In [9]:
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

2020-11-09 02:52:05,585 : INFO : training model with 3 workers on 3955 vocabulary and 50 features, using sg=0 hs=0 sample=0.001 negative=5 window=5
2020-11-09 02:52:05,736 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-11-09 02:52:05,747 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-11-09 02:52:05,759 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-11-09 02:52:05,761 : INFO : EPOCH - 1 : training on 58152 raw words (42545 effective words) took 0.2s, 265771 effective words/s
2020-11-09 02:52:05,882 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-11-09 02:52:05,888 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-11-09 02:52:05,893 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-11-09 02:52:05,899 : INFO : EPOCH - 2 : training on 58152 raw words (42732 effective words) took 0.1s, 328222 effective words/s
2020-11-09 02:52:06,026 : INFO : worker 

In [10]:
vector = model.infer_vector(['only', 'you', 'can', 'prevent', 'forest', 'fires'])
print(vector)

[ 0.03191052  0.03715688 -0.03362894  0.05324172 -0.11474621 -0.2124891
  0.12339746 -0.13223116  0.10381526  0.13897069  0.06673327 -0.05396178
 -0.15554695 -0.15100278 -0.00852595  0.10703195 -0.03238697  0.12748212
 -0.11176948 -0.05673671  0.10647999 -0.36569402 -0.12285884  0.11293185
  0.22817574 -0.04702117 -0.19333778 -0.2987403  -0.04195669 -0.12866144
  0.07344174  0.0478657  -0.08353905 -0.10920544 -0.02880618 -0.09043071
  0.02198034 -0.14181916  0.17157027  0.04261709 -0.09738749  0.02116145
 -0.2849392   0.00606085 -0.11495837  0.1742237   0.08485585 -0.05797894
  0.07795745  0.14133991]


In [11]:
ranks = []
second_ranks = []
for doc_id in range(len(train_corpus)):
    inferred_vector = model.infer_vector(train_corpus[doc_id].words)
    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)
    
    second_ranks.append(sims[1])

2020-11-09 02:52:11,584 : INFO : precomputing L2-norms of doc weight vectors


In [12]:
import collections

counter = collections.Counter(ranks)
print(counter)

Counter({0: 292, 1: 8})


In [13]:
sims

[(299, 0.9420672059059143),
 (104, 0.82389235496521),
 (112, 0.8222939968109131),
 (146, 0.810670018196106),
 (55, 0.7861096858978271),
 (17, 0.7598564028739929),
 (139, 0.7563958764076233),
 (16, 0.7463501691818237),
 (182, 0.74010169506073),
 (275, 0.7230762839317322),
 (191, 0.7226172089576721),
 (6, 0.7175352573394775),
 (37, 0.7160442471504211),
 (47, 0.694209098815918),
 (72, 0.6800458431243896),
 (59, 0.6731588840484619),
 (215, 0.6658599376678467),
 (280, 0.6514462828636169),
 (283, 0.6416264772415161),
 (293, 0.6282510757446289),
 (118, 0.6206558346748352),
 (165, 0.6124496459960938),
 (294, 0.6045972108840942),
 (158, 0.6030454635620117),
 (258, 0.6003173589706421),
 (42, 0.5998677015304565),
 (70, 0.592827558517456),
 (96, 0.589391827583313),
 (111, 0.5782062411308289),
 (132, 0.5647280216217041),
 (246, 0.5558435320854187),
 (241, 0.5496294498443604),
 (71, 0.548494815826416),
 (232, 0.5275859832763672),
 (282, 0.5262740850448608),
 (190, 0.5254759788513184),
 (266, 0.52343

In [14]:
print('Document ({}): «{}»\n'.format(doc_id, ' '.join(train_corpus[doc_id].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('SECOND-MOST', 1), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))

Document (299): «australia will take on france in the doubles rubber of the davis cup tennis final today with the tie levelled at wayne arthurs and todd woodbridge are scheduled to lead australia in the doubles against cedric pioline and fabrice santoro however changes can be made to the line up up to an hour before the match and australian team captain john fitzgerald suggested he might do just that we ll make team appraisal of the whole situation go over the pros and cons and make decision french team captain guy forget says he will not make changes but does not know what to expect from australia todd is the best doubles player in the world right now so expect him to play he said would probably use wayne arthurs but don know what to expect really pat rafter salvaged australia davis cup campaign yesterday with win in the second singles match rafter overcame an arm injury to defeat french number one sebastien grosjean in three sets the australian says he is happy with his form it not v

In [15]:
import random
doc_id = random.randint(0, len(train_corpus) - 1)

print('Train Document ({}): «{}»\n'.format(doc_id, ' '.join(train_corpus[doc_id].words)))
sim_id = second_ranks[doc_id]
print('Similar Document {}: «{}»\n'.format(sim_id, ' '.join(train_corpus[sim_id[0]].words)))

Train Document (271): «director of defunct swiss company that organised canyoning trip in that ended with people dying of them australians has denied responsibility for the tragedy along with two co directors who are also charged with manslaughter adventure world director stephan friedli appeared in court on the first day of their trial he described the deaths of people in the saxeten river gorge as an accident that was unforeseeable and not preventable friedli said he was aware of the possibility the river could flood but when asked whether his company carried out risk analysis he replied we know the region we live here to the question you know what you are accused of have you made any mistakes mr friedli replied no don think so»

Similar Document (271, 0.9434817433357239): «director of defunct swiss company that organised canyoning trip in that ended with people dying of them australians has denied responsibility for the tragedy along with two co directors who are also charged with m

In [16]:
doc_id = random.randint(0, len(test_corpus) - 1)

In [17]:
doc_id = random.randint(0, len(test_corpus) - 1)
inferred_vector = model.infer_vector(test_corpus[doc_id])
sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))

print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(test_corpus[doc_id])))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))

Test Document (49): «labor needed to distinguish itself from the government on the issue of asylum seekers greens leader bob brown has said his senate colleague kerry nettle intends to move motion today on the first anniversary of the tampa crisis condemning the government over its refugee policy and calling for an end to mandatory detention we greens want to bring the government to book over its serial breach of international obligations as far as asylum seekers in this country are concerned senator brown said today»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d50,n5,w5,mc2,s0.001,t3):

MOST (106, 0.8003504276275635): «the federal government has called on labor not to delay its plans to increase the australian security intelligence organisation asio powers to combat terrorism labor wants parliamentary inquiry to be set up to examine proposals to significantly increase asio powers to detain and interrogate suspects under proposed legislation to go before parliament next year asio w