# Importing neccessary libraries 

In [1]:
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

# Sample text 

In [2]:
text = "Doc2Vec is used for creating document embeddings. It captures the context of entire documents."

# Tokenize into sentences and words 

In [3]:
sentences = sent_tokenize(text)

In [4]:
tokenized_sentences = [word_tokenize(sentence.lower()) for sentence in sentences]

In [5]:
print("Tokenized sentences:", tokenized_sentences)

Tokenized sentences: [['doc2vec', 'is', 'used', 'for', 'creating', 'document', 'embeddings', '.'], ['it', 'captures', 'the', 'context', 'of', 'entire', 'documents', '.']]


# Prepare tagged documents 

In [6]:
tagged_data = [TaggedDocument(words=words, tags=[str(idx)]) for idx, words in enumerate(tokenized_sentences)]

# Train the Doc2Vec model 

In [7]:
model = Doc2Vec(vector_size=100, window=5, min_count=1, dm=1, epochs=20)

In [8]:
model.build_vocab(tagged_data)

In [9]:
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

In [10]:
print("Doc2Vec model trained successfully")

Doc2Vec model trained successfully


# Infer document vectors 

In [11]:
doc_vector = model.infer_vector(word_tokenize("Doc2Vec is a powerful tool for document embeddings."))

In [12]:
print("Inferred document vector:", doc_vector)

Inferred document vector: [-2.7706267e-03  1.3429527e-03  1.5869472e-03 -2.4584713e-03
 -2.3472891e-03  5.3375843e-04 -3.9743059e-03  5.2359996e-05
  2.9550598e-03 -2.7604417e-03 -3.7889674e-03 -1.9334015e-03
  4.2413096e-03  2.0465918e-03  1.3609939e-03 -4.8937523e-03
  8.2520867e-04  5.1127811e-04  6.9515430e-04 -2.3296105e-03
 -8.0556283e-04 -4.4799200e-03  2.2648200e-03 -4.9373851e-04
  5.2303501e-04 -2.0024767e-03 -5.0389739e-03 -3.6044037e-03
  1.2511757e-03 -1.3600584e-03  7.2938367e-04  1.1363412e-03
  4.0583215e-03  1.4998398e-03  2.0337615e-03  2.4031831e-03
 -3.7486546e-03  3.3169743e-03 -3.9227023e-03 -2.2029560e-03
 -3.6247552e-04  1.9469992e-03 -6.3266495e-04  3.0458376e-03
  3.0886731e-03 -2.9660142e-03 -8.6042535e-04 -4.4005495e-03
  1.7403174e-03  3.7389309e-03  1.0981503e-03 -2.7740896e-03
  2.3899253e-03  2.3515951e-03  6.1562960e-04  4.3117297e-03
  3.4102448e-03  6.6092581e-04  2.1585708e-03  2.7102887e-04
 -3.7000000e-03  1.4416573e-03  3.1770305e-03 -1.2749948e-0