## Doc2Vec tutorial (Lee corpus)
This notebook follows the gensim Doc2Vec tutorial: https://radimrehurek.com/gensim/auto_examples/tutorials/run_doc2vec_lee.html#sphx-glr-auto-examples-tutorials-run-doc2vec-lee-py

In [17]:
import logging

# Show INFO-level log records (e.g. the Doc2Vec progress messages) prefixed
# with a timestamp and the severity level.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [18]:
import os
import gensim


In [19]:
# Locate the Lee corpus files that ship inside the installed gensim package
# (under <gensim>/test/test_data): one file for training, one for testing.
test_data_dir = os.path.join(gensim.__path__[0], 'test', "test_data")
lee_train_file, lee_test_file = (
    os.path.join(test_data_dir, corpus_name)
    for corpus_name in ('lee_background.cor', 'lee.cor')
)


In [20]:
import smart_open

def read_corpus(fname, tokens_only=False):
    """Lazily yield one preprocessed document per line of ``fname``.

    The file is opened with ``smart_open.open`` as ISO-8859-1 text and every
    line is tokenized with ``gensim.utils.simple_preprocess``.

    Args:
        fname: Path of the corpus file; each line is treated as one document.
        tokens_only: If True, yield the plain token list for each line.
            If False (default), yield a ``TaggedDocument`` whose single tag
            is the zero-based line number.

    Yields:
        A list of tokens, or a ``gensim.models.doc2vec.TaggedDocument``,
        depending on ``tokens_only``.
    """
    with smart_open.open(fname, encoding="iso-8859-1") as corpus_file:
        for line_number, raw_line in enumerate(corpus_file):
            words = gensim.utils.simple_preprocess(raw_line)
            if not tokens_only:
                # Training documents carry their line number as the only tag.
                yield gensim.models.doc2vec.TaggedDocument(words, [line_number])
                continue
            yield words

# Materialize both corpora as lists: tagged documents for training,
# bare token lists for testing.
train_corpus = list(read_corpus(lee_train_file, tokens_only=False))
test_corpus = list(read_corpus(lee_test_file, tokens_only=True))

In [21]:
# Let's take a look at the first two documents of the training corpus.
train_sample = train_corpus[:2]
print(train_sample)

[TaggedDocument(words=['hundreds', 'of', 'people', 'have', 'been', 'forced', 'to', 'vacate', 'their', 'homes', 'in', 'the', 'southern', 'highlands', 'of', 'new', 'south', 'wales', 'as', 'strong', 'winds', 'today', 'pushed', 'huge', 'bushfire', 'towards', 'the', 'town', 'of', 'hill', 'top', 'new', 'blaze', 'near', 'goulburn', 'south', 'west', 'of', 'sydney', 'has', 'forced', 'the', 'closure', 'of', 'the', 'hume', 'highway', 'at', 'about', 'pm', 'aedt', 'marked', 'deterioration', 'in', 'the', 'weather', 'as', 'storm', 'cell', 'moved', 'east', 'across', 'the', 'blue', 'mountains', 'forced', 'authorities', 'to', 'make', 'decision', 'to', 'evacuate', 'people', 'from', 'homes', 'in', 'outlying', 'streets', 'at', 'hill', 'top', 'in', 'the', 'new', 'south', 'wales', 'southern', 'highlands', 'an', 'estimated', 'residents', 'have', 'left', 'their', 'homes', 'for', 'nearby', 'mittagong', 'the', 'new', 'south', 'wales', 'rural', 'fire', 'service', 'says', 'the', 'weather', 'conditions', 'which', '

In [22]:
# Let's take a look at the first two documents of the test corpus.
test_sample = test_corpus[:2]
print(test_sample)

[['the', 'national', 'executive', 'of', 'the', 'strife', 'torn', 'democrats', 'last', 'night', 'appointed', 'little', 'known', 'west', 'australian', 'senator', 'brian', 'greig', 'as', 'interim', 'leader', 'shock', 'move', 'likely', 'to', 'provoke', 'further', 'conflict', 'between', 'the', 'party', 'senators', 'and', 'its', 'organisation', 'in', 'move', 'to', 'reassert', 'control', 'over', 'the', 'party', 'seven', 'senators', 'the', 'national', 'executive', 'last', 'night', 'rejected', 'aden', 'ridgeway', 'bid', 'to', 'become', 'interim', 'leader', 'in', 'favour', 'of', 'senator', 'greig', 'supporter', 'of', 'deposed', 'leader', 'natasha', 'stott', 'despoja', 'and', 'an', 'outspoken', 'gay', 'rights', 'activist'], ['cash', 'strapped', 'financial', 'services', 'group', 'amp', 'has', 'shelved', 'million', 'plan', 'to', 'buy', 'shares', 'back', 'from', 'investors', 'and', 'will', 'raise', 'million', 'in', 'fresh', 'capital', 'after', 'profits', 'crashed', 'in', 'the', 'six', 'months', 'to'

In [23]:
# Create the (still untrained) Doc2Vec model.
model = gensim.models.doc2vec.Doc2Vec(
    vector_size=50,  # length of each document vector
    min_count=2,     # words seen fewer than 2 times are dropped from the vocabulary
    epochs=40,       # number of training epochs, reused later via model.epochs
)

2023-05-04 16:03:34,562 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec(dm/m,d50,n5,w5,mc2,s0.001,t3)', 'datetime': '2023-05-04T16:03:34.562925', 'gensim': '4.1.2', 'python': '3.9.12 (main, Apr  5 2022, 01:53:17) \n[Clang 12.0.0 ]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'created'}


In [24]:
# Build the vocabulary from the training corpus. This pass collects the word
# types and document tags and applies the model's min_count filter (see the
# log output below); it must run before the model is trained.
model.build_vocab(train_corpus)

2023-05-04 16:03:34,618 : INFO : collecting all words and their counts
2023-05-04 16:03:34,621 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2023-05-04 16:03:34,629 : INFO : collected 6981 word types and 300 unique tags from a corpus of 300 examples and 58152 words
2023-05-04 16:03:34,644 : INFO : Creating a fresh vocabulary
2023-05-04 16:03:34,657 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=2 retains 3955 unique words (56.653774530869505%% of original 6981, drops 3026)', 'datetime': '2023-05-04T16:03:34.657796', 'gensim': '4.1.2', 'python': '3.9.12 (main, Apr  5 2022, 01:53:17) \n[Clang 12.0.0 ]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'prepare_vocab'}
2023-05-04 16:03:34,664 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=2 leaves 55126 word corpus (94.79639565277205%% of original 58152, drops 3026)', 'datetime': '2023-05-04T16:03:34.664861', 'gensim': '4.1.2', 'python': '3.9.12 (main, Apr  5 2022, 01:53

In [25]:
# Look up how many times a given word appeared in the training corpus,
# using the 'count' attribute stored in the model's vocabulary.
word = "river"
word_count = model.wv.get_vecattr(word, 'count')
print(f"Word {word} appeared {word_count} times in the training corpus.")


Word river appeared 29 times in the training corpus.


In [None]:
# Train the model on the tagged training documents, reusing the document
# count and epoch count already stored on the model.
model.train(
    train_corpus,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)


In [27]:
# Infer a vector for a new document, given as a list of tokens.
query_tokens = ['only', 'you', 'can', 'prevent', 'forest', 'fires']
vector = model.infer_vector(query_tokens)
print(vector)

[-0.1812215  -0.2852917  -0.04234434  0.13818139 -0.03384136 -0.08112288
 -0.07822665 -0.04190934 -0.2978265  -0.1343627   0.10365524 -0.02020241
 -0.02950263  0.00174129 -0.07419611 -0.06989134  0.08999297  0.30330315
  0.11803856 -0.14431839  0.04221455 -0.00588043  0.14562517 -0.01240869
 -0.02576952  0.0130606  -0.25196114  0.0176945  -0.06980794 -0.10956214
  0.3694717  -0.04494542  0.06686682  0.22764955  0.21305315  0.15566029
  0.04199717 -0.2514844  -0.13784435  0.02060667  0.02420529 -0.02626253
 -0.06177605  0.01050091  0.15215768 -0.03006554 -0.1274753  -0.07440846
  0.06637181 -0.0203342 ]


In [28]:
# Scratch check of list.index(): it returns the position of the first
# matching element. The evaluation loop uses the same call to find a
# document's rank in its own similarity list.
x = [
    'only', 'you', 'can',
    'prevent', 'forest', 'fires',
]
x.index('only')

0

In [29]:
# Assess the model: re-infer a vector for every training document and record
# at which position that same document shows up in the similarity ranking.
ranks = []
second_ranks = []
for doc_id, document in enumerate(train_corpus):
    inferred_vector = model.infer_vector(document.words)
    # (tag, similarity) pairs for all trained document vectors.
    sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))
    ranked_ids = [docid for docid, _ in sims]
    rank = ranked_ids.index(doc_id)
    ranks.append(rank)

    # Also remember the entry at position 1 (the second-ranked document).
    second_ranks.append(sims[1])

In [30]:
import collections

# Tally how often each rank occurred; rank 0 means a document was found to be
# most similar to itself.
counter = collections.Counter()
counter.update(ranks)
print(counter)

Counter({0: 292, 1: 8})


In [31]:
# NOTE: doc_id and sims are the values left behind by the evaluation loop,
# i.e. the last training document and its similarity ranking.
document_text = ' '.join(train_corpus[doc_id].words)
print('Document ({}): «{}»\n'.format(doc_id, document_text))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
# Positions in the ranking to show: best, runner-up, middle and worst match.
showcase = [('MOST', 0), ('SECOND-MOST', 1), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]
for label, index in showcase:
    match = sims[index]
    match_text = ' '.join(train_corpus[match[0]].words)
    print(u'%s %s: «%s»\n' % (label, match, match_text))

Document (299): «australia will take on france in the doubles rubber of the davis cup tennis final today with the tie levelled at wayne arthurs and todd woodbridge are scheduled to lead australia in the doubles against cedric pioline and fabrice santoro however changes can be made to the line up up to an hour before the match and australian team captain john fitzgerald suggested he might do just that we ll make team appraisal of the whole situation go over the pros and cons and make decision french team captain guy forget says he will not make changes but does not know what to expect from australia todd is the best doubles player in the world right now so expect him to play he said would probably use wayne arthurs but don know what to expect really pat rafter salvaged australia davis cup campaign yesterday with win in the second singles match rafter overcame an arm injury to defeat french number one sebastien grosjean in three sets the australian says he is happy with his form it not v

In [32]:
# Pick a random document from the training corpus.
import random
doc_id = random.randint(0, len(train_corpus)-1)

# Print it next to the second-ranked document recorded for it during the
# evaluation loop. sim_id is a (tag, similarity) pair, so its first element
# is used to look the document up.
train_text = ' '.join(train_corpus[doc_id].words)
print('Train Document ({}): <<{}>>\n'.format(doc_id, train_text))
sim_id = second_ranks[doc_id]
similar_text = ' '.join(train_corpus[sim_id[0]].words)
print('Similar Document {}: <<{}>>\n'.format(sim_id, similar_text))

Train Document (134): <<israel has reacted with caution to promise from palestinian leader yasser arafat to hunt down suicide bombers and end armed attacks against israeli targets mr arafat made the commitments during speech broadcast on palestinian television the palestinian leader said israel was using suicide attacks as pretext for waging war on palestinians and that such operations were therefore against palestinian national interests israel will be looking to see whether mr arafat is offering anything more than words he has promised to round up suicide bombers before but very few in fact have been arrested mr arafat said peace was the only way of resolving the conflict and that the changed world situation since the attacks in the united states on september had to be taken into account the united states government says it is keenly watching to see whether mr arafat actions match his words the white house says it will continue to engage in the peace process despite the withdrawal of

In [None]:
# Testing the model on unseen data:
# pick a random document from the test corpus and infer a vector for it.
# (test_corpus entries are plain token lists, so no .words access is needed.)
doc_id = random.randint(0, len(test_corpus) - 1)
test_tokens = test_corpus[doc_id]
inferred_vector = model.infer_vector(test_tokens)