In [1]:
import datetime
import logging
import multiprocessing
import os
import pickle
import re
import signal
from pickle import PicklingError
from pprint import pprint

from gensim import utils
from gensim.models.callbacks import CallbackAny2Vec
from gensim.models.doc2vec import Doc2Vec
from gensim.test.utils import get_tmpfile

In [2]:
# Default token length bounds, in characters, used by `tokenize`.
TOKEN_MIN_LEN = 2
TOKEN_MAX_LEN = 15

# Namespace prefixes: a title starting with "<prefix>:" is skipped by `get_custom_texts`.
IGNORED_NAMESPACES = [
    'Wikipedia',
    'Category',
    'File',
    'Portal',
    'Template',
    'MediaWiki',
    'User',
    'Help',
    'Book',
    'Draft',
    'WikiProject',
    'Special',
    'Talk',
]

# ARTICLE_MIN_WORDS is not used as a filter threshold in this notebook;
# DESIRED_ARTICLE_MIN_WORDS is the minimum token count a text needs to be yielded.
ARTICLE_MIN_WORDS = 50
DESIRED_ARTICLE_MIN_WORDS = 5

logger = logging.getLogger(__name__)

In [3]:
# HTML comments: <!-- ... -->
RE_P0 = re.compile(r'<!--.*?-->', re.DOTALL | re.UNICODE)

# Footnotes: <ref ...>...</ref> or self-closing <ref ... />
RE_P1 = re.compile(r'<ref([> ].*?)(</ref>|/>)', re.DOTALL | re.UNICODE)

# Trailing run of inter-language links, one [[xx:Title]] per line at the end of the text
RE_P2 = re.compile(r'(\n\[\[[a-z][a-z][\w-]*:[^:\]]+\]\])+$', re.UNICODE)

# Template without nested braces: {{ ... }}
RE_P3 = re.compile(r'{{([^}{]*)}}', re.DOTALL | re.UNICODE)

# Template, allowing opening braces inside: {{ ... }}
RE_P4 = re.compile(r'{{([^}]*)}}', re.DOTALL | re.UNICODE)

# External link [scheme://url description]; group 3 holds the (space-prefixed) description
RE_P5 = re.compile(r'\[(\w+):\/\/(.*?)(( (.*?))|())\]', re.UNICODE)

# Piped link [target|description]; group 2 holds the description
RE_P6 = re.compile(r'\[([^][]*)\|([^][]*)\]', re.DOTALL | re.UNICODE)

# Image link at the start of a line; the last group holds the description
RE_P7 = re.compile(r'\n\[\[[iI]mage(.*?)(\|.*?)*\|(.*?)\]\]', re.UNICODE)

# File link at the start of a line; the last group holds the description
RE_P8 = re.compile(r'\n\[\[[fF]ile(.*?)(\|.*?)*\|(.*?)\]\]', re.UNICODE)

# <nowiki ...>...</nowiki> elements (previously labelled "External links")
RE_P9 = re.compile(r'<nowiki([> ].*?)(</nowiki>|/>)', re.DOTALL | re.UNICODE)

# Math content: <math ...>...</math>
RE_P10 = re.compile(r'<math([> ].*?)(</math>|/>)', re.DOTALL | re.UNICODE)

# Any other tag: <...>
RE_P11 = re.compile(r'<(.*?)>', re.DOTALL | re.UNICODE)

# Table formatting lines: "{|", "|-" (not followed by a digit) and "|}" up to the end of the line
RE_P12 = re.compile(r'(({\|)|(\|-(?!\d))|(\|}))(.*?)(?=\n)', re.UNICODE)

# Table cell formatting: leading "|" / "!" plus any attribute segments terminated by "|"
RE_P13 = re.compile(r'(?<=(\n[ ])|(\n\n)|([ ]{2})|(.\n)|(.\t))(\||\!)([^[\]\n]*?\|)*', re.UNICODE)

# Category links: [[Category:...]]
RE_P14 = re.compile(r'\[\[Category:[^][]*\]\]', re.UNICODE)

# File and Image links: [[File:...]] / [[Image...]]
RE_P15 = re.compile(r'\[\[([fF]ile:|[iI]mage)[^]]*(\]\])', re.UNICODE)

# Interlinks: captures everything between [[ and ]] (link target and text)
RE_P16 = re.compile(r'\[{2}(.*?)\]{2}', re.UNICODE)

# Table attribute lines (bgcolor, colspan, rowspan, style=, class=, align=, scope=)
RE_P17 = re.compile(
    r'(\n.{0,4}((bgcolor)|(\d{0,1}[ ]?colspan)|(rowspan)|(style=)|(class=)|(align=)|(scope=))(.*))|'
    r'(^.{0,2}((bgcolor)|(\d{0,1}[ ]?colspan)|(rowspan)|(style=)|(class=)|(align=))(.*))',
    re.UNICODE
)

In [4]:
def tokenize(content, token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True):
    """Split markup-free text into tokens, keeping only tokens of acceptable length.

    A token is kept when its length in characters lies within
    [`token_min_len`, `token_max_len`] and it does not start with an underscore.

    Parameters
    ----------
    content : str
        Text with wiki markup already removed (see :func:`filter_wiki`).
    token_min_len : int
        Shortest token length to keep.
    token_max_len : int
        Longest token length to keep.
    lower : bool
        Passed to :func:`gensim.utils.tokenize` to request lower-casing.

    Returns
    -------
    list of str
        The kept tokens, in order of appearance.

    """
    kept_tokens = []
    for candidate in utils.tokenize(content, lower=lower, errors='ignore'):
        length_ok = token_min_len <= len(candidate) <= token_max_len
        if length_ok and not candidate.startswith('_'):
            kept_tokens.append(utils.to_unicode(candidate))
    return kept_tokens

def init_to_ignore_interrupt():
    """Make the calling process ignore SIGINT (Ctrl+C).

    Used as the initializer for pool workers.

    Warnings
    --------
    Only appropriate when the parent process is prepared to handle
    termination of its child processes itself.

    """
    ignored_signal = signal.SIGINT
    signal.signal(ignored_signal, signal.SIG_IGN)

def _process_article(args):
    """Single-argument adapter around :func:`process_article`, suitable for ``Pool.imap``.

    Parameters
    ----------
    args : sequence
        The article fields expected by :func:`process_article`, followed by one
        final element holding the tokenization parameters
        ``(tokenizer_func, token_min_len, token_max_len, lower)``.

    Returns
    -------
    (list of str, str, int)
        Whatever :func:`process_article` returns: tokens, title and page id.

    Warnings
    --------
    Not meant to be called directly; use :func:`process_article` instead.

    """
    article_fields, tokenization_params = args[:-1], args[-1]
    tokenizer_func, token_min_len, token_max_len, lower = tokenization_params

    tokenizer_options = {
        'tokenizer_func': tokenizer_func,
        'token_min_len': token_min_len,
        'token_max_len': token_max_len,
        'lower': lower,
    }
    return process_article(article_fields, **tokenizer_options)


def process_article(args, tokenizer_func=tokenize, token_min_len=TOKEN_MIN_LEN,
                    token_max_len=TOKEN_MAX_LEN, lower=True):
    """Strip wiki markup from one article and turn its text into tokens.

    Notes
    -----
    Supply a custom `tokenizer_func` for languages that need different tokenization.
    It is called as ``tokenizer_func(text, token_min_len, token_max_len, lower)``.

    Parameters
    ----------
    args : (str, bool, str, int)
        Article text, lemmatize flag, article title and page identifier.
        When the flag is true, :func:`gensim.utils.lemmatize` is used instead of `tokenizer_func`.
    tokenizer_func : function
        Tokenizer with the interface
        tokenizer_func(text: str, token_min_len: int, token_max_len: int, lower: bool) -> list of str.
        Defaults to :func:`tokenize`.
    token_min_len : int
        Shortest token length to keep.
    token_max_len : int
        Longest token length to keep.
    lower : bool
        Whether the tokenizer should lower-case the text.

    Returns
    -------
    (list of str, str, int)
        Tokens of the article, its title and its page id.

    """
    raw_text, use_lemmatizer, title, pageid = args
    plain_text = filter_wiki(raw_text)
    tokens = (
        utils.lemmatize(plain_text)
        if use_lemmatizer
        else tokenizer_func(plain_text, token_min_len, token_max_len, lower)
    )
    return tokens, title, pageid

def remove_markup(text, promote_remaining=True, simplify_links=True):
    """Strip wiki markup from `text` and return the remaining plain text.

    The template patterns (RE_P3, RE_P4) and the file/image patterns
    (RE_P7, RE_P8, RE_P15) are not applied by this function.

    Parameters
    ----------
    text : str
        Text containing wiki markup.
    promote_remaining : bool
        If true, square brackets left over after filtering are dropped so that
        their content becomes plain text.
    simplify_links : bool
        If true, piped links are reduced to their description text.

    Returns
    -------
    str
        `text` with the markup removed.

    """
    # Drop the trailing list of inter-language links first.
    text = RE_P2.sub('', text)

    # Markup can be nested, so instead of a recursive grammar the same passes are
    # repeated, peeling inner expressions first: at most three rounds, stopping
    # early once a round leaves the text unchanged.
    for _ in range(3):
        before_round = text

        # comments, footnotes, nowiki, math, remaining tags, categories
        for pattern in (RE_P0, RE_P1, RE_P9, RE_P10, RE_P11, RE_P14):
            text = pattern.sub('', text)
        text = RE_P5.sub('\\3', text)  # external links: keep the description

        if simplify_links:
            text = RE_P6.sub('\\2', text)  # piped links: keep the description

        # table markup
        text = text.replace("!!", "\n|")  # one header cell per line
        text = text.replace("|-||", "\n|")  # cell whose content is '-'
        text = RE_P12.sub('\n', text)  # formatting lines
        text = text.replace('|||', '|\n|')  # one cell per line (|{{a|b}}||cell-content case)
        text = text.replace('||', '\n|')  # one cell per line
        text = RE_P13.sub('\n', text)  # keep only the cell content
        text = RE_P17.sub('\n', text)  # attribute lines

        # brackets emptied by the passes above
        text = text.replace('[]', '')

        if before_round == text:
            break

    if promote_remaining:
        # whatever markup survived is promoted to plain text
        text = text.replace('[', '').replace(']', '')

    return text

def filter_wiki(raw, promote_remaining=True, simplify_links=True):
    """Decode `raw`, resolve HTML entities and strip wiki markup.

    Parameters
    ----------
    raw : str
        Unicode or utf-8 encoded string.
    promote_remaining : bool
        Forwarded to :func:`remove_markup`.
    simplify_links : bool
        Forwarded to :func:`remove_markup`.

    Returns
    -------
    str
        `raw` without markup.

    """
    # The markup handling is approximate rather than a full parser.
    return remove_markup(
        # decode to unicode (undecodable bytes are ignored), then turn HTML
        # entities into the characters they stand for
        utils.decode_htmlentities(utils.to_unicode(raw, 'utf8', errors='ignore')),
        promote_remaining,
        simplify_links,
    )

def get_custom_texts(text):
    """Tokenize one raw text with the same pipeline used for Wikipedia articles.

    The text is wrapped as a single synthetic article (title ``'custom_title'``,
    page id ``'1'``), passed through :func:`_process_article` in a worker pool,
    and yielded as a token list if it is long enough.

    Parameters
    ----------
    text : str
        Raw text; wiki markup, if any, is stripped before tokenization.

    Yields
    ------
    list of str
        Tokens extracted from `text`. Nothing is yielded when fewer than
        `DESIRED_ARTICLE_MIN_WORDS` tokens are produced.

    Raises
    ------
    PicklingError
        If the work item cannot be sent to the worker processes.

    """
    articles, articles_all = 0, 0
    positions, positions_all = 0, 0

    tokenization_params = (tokenize, TOKEN_MIN_LEN, TOKEN_MAX_LEN, True)
    # One work item: (text, lemmatize flag, title, page id, tokenization params).
    # The lemmatize flag follows utils.has_pattern().
    texts = ((text, utils.has_pattern(), 'custom_title', '1', tokenization_params),)
    processes = max(1, multiprocessing.cpu_count() - 1)

    # Workers ignore SIGINT; the KeyboardInterrupt is handled below in this process.
    pool = multiprocessing.Pool(processes, init_to_ignore_interrupt)

    try:
        # Feed the pool in chunks rather than handing it the whole input at once.
        for group in utils.chunkize(texts, chunksize=10 * processes, maxsize=1):
            for tokens, title, _pageid in pool.imap(_process_article, group):
                articles_all += 1
                positions_all += len(tokens)
                # Texts that are too short or whose title is in an ignored namespace are pruned here.
                if len(tokens) < DESIRED_ARTICLE_MIN_WORDS or \
                        any(title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES):
                    continue
                articles += 1
                positions += len(tokens)
                yield tokens

    except KeyboardInterrupt:
        # logger.warn is a deprecated alias of logger.warning.
        logger.warning(
            "user terminated iteration over Wikipedia corpus after %i documents with %i positions "
            "(total %i articles, %i positions before pruning articles shorter than %i words)",
            articles, positions, articles_all, positions_all, DESIRED_ARTICLE_MIN_WORDS
        )
    except PicklingError as exc:
        # The original called an undefined `raise_from` helper here; native
        # exception chaining keeps the original cause attached.
        raise PicklingError(
            'Can not send tokenizer function {} to multiprocessing, '
            'make sure the function can be pickled.'.format(tokenize)
        ) from exc
    else:
        # Report the threshold that was actually applied above.
        logger.info(
            "finished iterating over Wikipedia corpus of %i documents with %i positions "
            "(total %i articles, %i positions before pruning articles shorter than %i words)",
            articles, positions, articles_all, positions_all, DESIRED_ARTICLE_MIN_WORDS
        )
    finally:
        pool.terminate()

In [5]:
class EpochLoggerDM(CallbackAny2Vec):
    """Training callback for the DM model: prints epoch timing and saves late-epoch checkpoints.

    Requires the ``datetime`` module to be imported at notebook level.

    Parameters
    ----------
    path_prefix : str
        Prefix of the checkpoint file name; checkpoints are named
        ``<path_prefix>_epoch<N>.model`` and placed at the path returned by ``get_tmpfile``.

    """

    # Checkpoints are saved for epochs whose 0-based index is at least this value.
    # Kept as a class attribute so instances restored without __init__ also see it.
    SAVE_FROM_EPOCH = 8

    def __init__(self, path_prefix):
        self.path_prefix = path_prefix
        self.epoch = 0  # 0-based index of the current epoch
        self.start = datetime.datetime.now()  # reset at the start of every epoch

    def on_epoch_begin(self, model):
        """Print the epoch number and its start time."""
        print("Epoch #{} start".format(self.epoch))
        self.start = datetime.datetime.now()
        print(self.start)

    def on_epoch_end(self, model):
        """Save a checkpoint for late epochs, then print the elapsed time of the epoch."""
        print("Epoch #{} end".format(self.epoch))
        if self.epoch >= self.SAVE_FROM_EPOCH:
            output_path = get_tmpfile('{}_epoch{}.model'.format(self.path_prefix, self.epoch))
            model.save(output_path)
        self.epoch += 1
        # Elapsed time includes the checkpoint save above.
        print(datetime.datetime.now() - self.start)

    def on_train_begin(self, model):
        """Announce the start of training."""
        print("Training for DM")

    def on_train_end(self, model):
        """Announce the end of training."""
        print("Training end for DM")

class EpochLoggerDBOW(CallbackAny2Vec):
    """Training callback for the DBOW model: prints epoch timing and saves late-epoch checkpoints.

    Requires the ``datetime`` module to be imported at notebook level.

    Parameters
    ----------
    path_prefix : str
        Prefix of the checkpoint file name; checkpoints are named
        ``<path_prefix>_epoch<N>.model`` and placed at the path returned by ``get_tmpfile``.

    """

    # Checkpoints are saved for epochs whose 0-based index is at least this value.
    # Kept as a class attribute so instances restored without __init__ also see it.
    SAVE_FROM_EPOCH = 8

    def __init__(self, path_prefix):
        self.epoch = 0  # 0-based index of the current epoch
        self.path_prefix = path_prefix
        self.start = datetime.datetime.now()  # reset at the start of every epoch

    def on_epoch_begin(self, model):
        """Print the epoch number and its start time."""
        print("Epoch #{} start".format(self.epoch))
        self.start = datetime.datetime.now()
        print(self.start)

    def on_epoch_end(self, model):
        """Save a checkpoint for late epochs, then print the elapsed time of the epoch."""
        print("Epoch #{} end".format(self.epoch))
        if self.epoch >= self.SAVE_FROM_EPOCH:
            output_path = get_tmpfile('{}_epoch{}.model'.format(self.path_prefix, self.epoch))
            model.save(output_path)
        self.epoch += 1
        # Elapsed time includes the checkpoint save above.
        print(datetime.datetime.now() - self.start)

    def on_train_begin(self, model):
        """Announce the start of training."""
        print("Training for DBOW")

    def on_train_end(self, model):
        """Announce the end of training."""
        print("Training end for DBOW")


In [6]:
# Load the two saved Doc2Vec models (DBOW and DM) from disk.
# NOTE(review): both paths are absolute and machine-specific; adjust them before running elsewhere.
# NOTE(review): the file names suggest these are the epoch-9 checkpoints written by the
# EpochLogger callbacks — TODO confirm.
# NOTE(review): only load model files from a trusted source.
model_dbow = Doc2Vec.load('/home/ubuntu/Notebooks/wiki_corpus_doc2vec_exp/modelsData/DBOW_epoch9.model')
model_dm = Doc2Vec.load('/home/ubuntu/Notebooks/wiki_corpus_doc2vec_exp/modelsData/DM_epoch9.model')

In [8]:
# Sample passage (economics); not referenced by the other cells visible in this notebook.
text = 'The epic treatise of modern economics, written by Adam Smith in 1776, was interestingly titled “An Inquiry into the Nature and Causes of the Wealth of Nations”. With India having become the fifth largest economy in the world in 2019 and aspiring to be the third largest by 2025, it is only befitting to go back to one of the foundational questions posed by Smith, “What causes wealth and prosperity of nations?” The Economic Survey 2019-20 makes a humble attempt to craft a framework of policies that can foster wealth creation in India. This inquiry is particularly critical at this stage as India aspires to become a $5 trillion economy by 2025 – an ambitious vision that should create, as Smith observed, “universal opulence which extends itself to the lowest ranks of the people.”'

In [9]:
# Sample passage (doc2vec / word2vec introduction); not referenced by the other cells visible in this notebook.
text1 = """Numeric representation of text documents is a challenging task in machine learning. Such a representation may be used for many purposes, for example: document retrieval, web search, spam filtering, topic modeling etc.
However, there are not many good techniques to do this. Many tasks use the well known but simplistic method of bag of words (BOW), but outcomes will be mostly mediocre, since BOW loses many subtleties of a possible good representation, e.g consideration of word ordering.
Latent Dirichlet Allocation (LDA) is also a common technique for topic modeling (extracting topics/keywords out of texts) but it’s very hard to tune, and results are hard to evaluate.
In this post, I will review the doc2vec method, a concept that was presented in 2014 by Mikilov and Le in this article, which we are going to mention many times through this post. Worth to mention that Mikilov is one of the authors of word2vec as well.
Doc2vec is a very nice technique. It’s easy to use, gives good results, and as you can understand from its name, heavily based on word2vec. so we’ll start with a short introduction about word2vec.
word2vec
word2vec is a well known concept, used to generate representation vectors out of words.
There are many good tutorials online about word2vec, like this one and this one, but describing doc2vec without word2vec will miss the point, so I’ll be brief.
In general, when you like to build some model using words, simply labeling/one-hot encoding them is a plausible way to go. However, when using such encoding, the words lose their meaning. e.g, if we encode Paris as id_4, France as id_6 and power as id_8, France will have the same relation to power as with Paris. We would prefer a representation in which France and Paris will be closer than France and power.
The word2vec, presented in 2013 in this article, intends to give you just that: a numeric representation for each word, that will be able to capture such relations as above. this is part of a wider concept in machine learning — the feature vectors.
Such representations, encapsulate different relations between words, like synonyms, antonyms, or analogies, such as this one"""

In [20]:
# Sample passage (Word2vec description); this is the text that is tokenized and used for inference below.
text2 = """Word2vec is a group of related models that are used to produce word embeddings. These models are shallow, two-layer neural networks that are trained to reconstruct linguistic contexts of words. Word2vec takes as its input a large corpus of text and produces a vector space, typically of several hundred dimensions, with each unique word in the corpus being assigned a corresponding vector in the space. Word vectors are positioned in the vector space such that words that share common contexts in the corpus are located close to one another in the space.[1]

Word2vec was created and published in 2013 by a team of researchers led by Tomas Mikolov at Google and patented.[2] The algorithm has been subsequently analysed and explained by other researchers.[3][4] Embedding vectors created using the Word2vec algorithm have many advantages compared to earlier algorithms[1] such as latent semantic analysis."""

In [21]:
# Tokenize the sample passage; the generator yields one token list per kept document,
# and the first (here: only) one is used for inference.
text_load1 = list(get_custom_texts(text2))
text_load = text_load1[0]
print(text_load)

['word', 'vec', 'is', 'group', 'of', 'related', 'models', 'that', 'are', 'used', 'to', 'produce', 'word', 'embeddings', 'these', 'models', 'are', 'shallow', 'two', 'layer', 'neural', 'networks', 'that', 'are', 'trained', 'to', 'reconstruct', 'linguistic', 'contexts', 'of', 'words', 'word', 'vec', 'takes', 'as', 'its', 'input', 'large', 'corpus', 'of', 'text', 'and', 'produces', 'vector', 'space', 'typically', 'of', 'several', 'hundred', 'dimensions', 'with', 'each', 'unique', 'word', 'in', 'the', 'corpus', 'being', 'assigned', 'corresponding', 'vector', 'in', 'the', 'space', 'word', 'vectors', 'are', 'positioned', 'in', 'the', 'vector', 'space', 'such', 'that', 'words', 'that', 'share', 'common', 'contexts', 'in', 'the', 'corpus', 'are', 'located', 'close', 'to', 'one', 'another', 'in', 'the', 'space', 'word', 'vec', 'was', 'created', 'and', 'published', 'in', 'by', 'team', 'of', 'researchers', 'led', 'by', 'tomas', 'mikolov', 'at', 'google', 'and', 'patented', 'the', 'algorithm', 'has

In [22]:
# Infer a document vector for the tokenized sample with each model:
# `x` comes from the DBOW model, `y` from the DM model.
# NOTE(review): the DM call passes epochs=40 while the DBOW call relies on the default —
# confirm the difference is intentional.
# NOTE(review): no seed is set in this notebook, so inferred vectors may differ between runs — TODO confirm.
x = model_dbow.infer_vector(text_load)
y = model_dm.infer_vector(text_load, epochs=40)

In [23]:
# Documents closest to the DBOW vector inferred for the sample passage.
pprint(model_dbow.docvecs.most_similar(positive=[x]))

[('Morphological analysis', 0.5724701285362244),
 ('Distributional–relational database', 0.5563048124313354),
 ('CmapTools', 0.5551131963729858),
 ('Sparse matrix-vector multiplication', 0.5522376894950867),
 ('Prototype methods', 0.5483927726745605),
 ('Word-sense induction', 0.5481163263320923),
 ('GloVe (machine learning)', 0.5481064319610596),
 ('Kernel-independent component analysis', 0.546776294708252),
 ('Vertex (computer graphics)', 0.5446189641952515),
 ('Latent semantic mapping', 0.5439185500144958)]


In [24]:
# Documents closest to the DM vector inferred for the sample passage.
pprint(model_dm.docvecs.most_similar(positive=[y]))

[('National Corpus of Polish', 0.5654904246330261),
 ('Word2vec', 0.5504993200302124),
 ('Word embedding', 0.49864524602890015),
 ('Tomas Mikolov', 0.4916972815990448),
 ('Pax Corpus', 0.49122172594070435),
 ('El Corpus', 0.47822171449661255),
 ('Bergen Corpus of London Teenage Language', 0.47560450434684753),
 ('Adam Kilgarriff', 0.4726002812385559),
 ('Persian Speech Corpus', 0.4718528389930725),
 ('Wellington Corpus of Spoken New Zealand English', 0.4697859287261963)]


In [25]:
# DBOW model: documents closest to the trained 'Word2vec' document.
pprint(model_dbow.docvecs.most_similar(positive=['Word2vec']))

[('Word embedding', 0.6466984748840332),
 ('Recursive neural network', 0.5964563488960266),
 ('Structured sparsity regularization', 0.5880581140518188),
 ('Parametric programming', 0.582451581954956),
 ('Algorithm selection', 0.5806574821472168),
 ('Structured prediction', 0.5804087519645691),
 ('Teknomo–Fernandez algorithm', 0.5788553953170776),
 ('Simulation-based optimization', 0.5785280466079712),
 ('Binary regression', 0.5777685642242432),
 ('Hopkins statistic', 0.576458215713501)]


In [26]:
# DM model: documents closest to the trained 'Word2vec' document.
pprint(model_dm.docvecs.most_similar(positive=['Word2vec']))

[('Word embedding', 0.6609228849411011),
 ('Language model', 0.6240352988243103),
 ('N-gram', 0.6167463660240173),
 ('Semantic folding', 0.5738292932510376),
 ('Paraphrasing (computational linguistics)', 0.5722178220748901),
 ('Semantic space', 0.5568692684173584),
 ('Statistical machine translation', 0.5510936975479126),
 ('Document-term matrix', 0.5505303740501404),
 ('Latent semantic analysis', 0.5490725040435791),
 ('Kneser–Ney smoothing', 0.547815203666687)]


In [27]:
# DM model: documents closest to the trained 'Word embedding' document.
pprint(model_dm.docvecs.most_similar(positive=['Word embedding']))

[('Word2vec', 0.6609229445457458),
 ('Language model', 0.5932155847549438),
 ('Semantic space', 0.5806470513343811),
 ('GloVe (machine learning)', 0.5724257230758667),
 ('Semantic folding', 0.5698139071464539),
 ('Query understanding', 0.5652347207069397),
 ('Statistical semantics', 0.5561745166778564),
 ('Word-sense induction', 0.5468202829360962),
 ('Deeplearning4j', 0.5417449474334717),
 ('N-gram', 0.5404171943664551)]


In [28]:
# DM model: documents closest to the trained 'Latent semantic analysis' document.
pprint(model_dm.docvecs.most_similar(positive=['Latent semantic analysis']))

[('Document-term matrix', 0.6433390378952026),
 ('Vector space model', 0.5866307616233826),
 ('Concept search', 0.5842148065567017),
 ('Tf–idf', 0.5753623247146606),
 ('Full-text search', 0.5615909099578857),
 ('Concept mining', 0.5588423013687134),
 ('Semantic similarity', 0.5494738817214966),
 ('Word2vec', 0.5490725040435791),
 ('Bag-of-words model', 0.5450059175491333),
 ('Enterprise search', 0.5332350134849548)]


In [29]:
# DM model: documents closest to the trained 'Tf–idf' document.
pprint(model_dm.docvecs.most_similar(positive=['Tf–idf']))

[('Document-term matrix', 0.7128222584724426),
 ('Okapi BM25', 0.67176353931427),
 ('Sentence extraction', 0.581932544708252),
 ('Divergence-from-randomness model', 0.5800539255142212),
 ('Latent semantic analysis', 0.5753623247146606),
 ('Vector space model', 0.5543138980865479),
 ('Document clustering', 0.5495814085006714),
 ('Bag-of-words model', 0.5433114171028137),
 ('Latent Dirichlet allocation', 0.5334317088127136),
 ('SMART Information Retrieval System', 0.5332077741622925)]


In [12]:
# model_dbow.docvecs.most_similar(positive=[x], topn=10)

In [13]:
# DBOW model: similarity between the documents at integer offsets 4772633 and 3.
model_dbow.docvecs.similarity(4772633, 3)

0.11670896

In [85]:
# Print the document tag (title) stored at each of two integer offsets.
for doc_offset in (1, 411653):
    print(model_dbow.docvecs.index_to_doctag(doc_offset))

Autism
Economy of India


In [38]:
# Print the DBOW model's `docvecs.count` (presumably the number of trained document vectors — TODO confirm).
print(model_dbow.docvecs.count)

4772637


In [55]:
# model_dbow.docvecs.rank(1, 2)

In [86]:
# Fetch the trained DBOW vector for "Economy of India".
# NOTE(review): this rebinds `y`, which an earlier cell set to the DM vector inferred for the
# sample passage; re-running the DM `most_similar(positive=[y])` cell afterwards would query
# with this vector instead. Consider a distinct name.
y = model_dbow.docvecs["Economy of India"]
# Show the Doctag record (offset, word count, doc count) of another document.
print(model_dbow.docvecs.doctags["Economic Advisory Council"])

Doctag(offset=2884531, word_count=1075, doc_count=1)


In [87]:
# Ask for the documents closer to offset 411653 ("Economy of India") than offset 2884531
# ("Economic Advisory Council") is.
# NOTE(review): the recorded output of this cell is `KeyError: 411653`, so the call fails as written.
# Presumably `closer_than` does not accept integer document offsets on `docvecs` in the gensim
# version used here — TODO confirm and replace with a similarity-based comparison.
model_dbow.docvecs.closer_than(411653, 2884531)

KeyError: 411653