Simple tutorial [link](https://medium.com/@mishra.thedeepak/doc2vec-simple-implementation-example-df2afbbfbad5)

In [39]:
#Import all the dependencies
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

In [40]:
data = ["love apple juice",
        "love orange juice",
        "miss Soviet Union",
        "miss European Union"]

tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(data)]


In [41]:
tagged_data

[TaggedDocument(words=['love', 'apple', 'juice'], tags=['0']),
 TaggedDocument(words=['love', 'orange', 'juice'], tags=['1']),
 TaggedDocument(words=['miss', 'soviet', 'union'], tags=['2']),
 TaggedDocument(words=['miss', 'european', 'union'], tags=['3'])]

> dm=1 means ‘distributed memory’ (PV-DM) and dm =0 means ‘distributed bag of words’ (PV-DBOW).

In [42]:
max_epochs = 100
vec_size = 20
alpha = 0.025

model = Doc2Vec(size=vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                dm =1)
  
model.build_vocab(tagged_data)

for epoch in range(max_epochs):
    if epoch% (max_epochs/5) == 0: print('iteration {0}'.format(epoch))
        
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.iter)
    
    # decrease the learning rate
    model.alpha -= 0.0002
    
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

model.save("d2v.model")

print("Model Saved")



iteration 0
iteration 20
iteration 40
iteration 60
iteration 80
Model Saved


In [43]:
from gensim.models.doc2vec import Doc2Vec

model= Doc2Vec.load("d2v.model")
#to find the vector of a document which is not in training data
test_data = word_tokenize("I love chatbots".lower())
v1 = model.infer_vector(test_data)
print("V1_infer", v1)
print('\n>> vector is ready for machine learning')

V1_infer [ 0.00939252 -0.00324851 -0.00789354  0.00744676 -0.01761055  0.00213193
  0.00639656  0.0239707  -0.00988599  0.02283477 -0.00472748  0.00618429
 -0.02016012  0.02286941  0.01486451 -0.00711495 -0.01363916  0.02092048
  0.01515018 -0.02057129]

>> vector is ready for machine learning


In [44]:
# to find most similar doc using tags
similar_doc = model.docvecs.most_similar('1')
print(similar_doc)

[('0', 0.856641948223114), ('2', 0.7180614471435547), ('3', 0.6701167225837708)]


In [45]:
# to find most similar doc using tags
similar_doc = model.docvecs.most_similar('3')
print(similar_doc)

[('0', 0.7460619807243347), ('2', 0.7106165289878845), ('1', 0.670116662979126)]


In [46]:
# to find vector of doc in training data using tags or in other words, 
# printing the vector of document at index 1 in training data
print(model.docvecs['1'])

[-0.04736115  0.01475147 -0.01801484 -0.03168618 -0.02599521  0.08510905
  0.06087071  0.0469239   0.04417982 -0.04136113 -0.00141001  0.02436803
 -0.01150584  0.0776168   0.01904231  0.01242081  0.03733888 -0.02499858
 -0.03769484  0.01486605]


In [54]:
#to find the vector of a document which is not in training data
test_data = word_tokenize("fruit juice".lower())
v1 = model.infer_vector(test_data)
print("V1_infer", v1)

V1_infer [-0.01262034  0.00080706 -0.01596102  0.02155032  0.00902727 -0.01349586
 -0.02266083 -0.01867182  0.01565458  0.02170718  0.02251704  0.01250898
  0.01488099  0.0227744   0.00697103 -0.01825041 -0.01403463  0.00473706
  0.00092103  0.0086099 ]


In [56]:
model.docvecs.most_similar([v1])

[('0', -0.0534810870885849),
 ('2', -0.06103336066007614),
 ('3', -0.11242544651031494),
 ('1', -0.1933521181344986)]

In [57]:
model.most_similar('soviet')

  """Entry point for launching an IPython kernel.


[('apple', 0.2626633644104004),
 ('european', -0.10963808745145798),
 ('love', -0.15924587845802307),
 ('orange', -0.1951785832643509),
 ('miss', -0.26286447048187256),
 ('juice', -0.2799147963523865),
 ('union', -0.42586463689804077)]