## Doc2Vec
In this notebook, we demonstrate how to train a Doc2Vec model on a custom corpus.

In [None]:
# To install only the requirements of this notebook, uncomment the lines below and run this cell

# ===========================

# !pip install gensim==3.6.0
# !pip install spacy==2.2.4
# !pip install nltk==3.2.5

# ===========================

In [None]:
# To install the requirements for the entire chapter, uncomment the lines below and run this cell

# ===========================

# try :
#     import google.colab
#     !curl https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch3/ch3-requirements.txt | xargs -n 1 -L 1 pip install
# except ModuleNotFoundError :
#     !pip install -r "ch3-requirements.txt"

# ===========================

In [2]:
import warnings
# Silence every warning category up front; this is installed before the
# third-party imports below so that anything they emit is hidden as well.
warnings.filterwarnings('ignore')
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
from pprint import pprint
import nltk
# Fetch NLTK's 'punkt' package (tokenizer data used by word_tokenize).
# The call reports "already up-to-date" and returns True when the data is present.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/achilleas.voutsas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Toy corpus: four short sentences that share most of their vocabulary.
data = [
    "dog bites man",
    "man bites dog",
    "dog eats meat",
    "man eats food",
]

# Wrap each sentence as a TaggedDocument: lower-cased tokens as `words`,
# and the sentence's position in `data` (as a string) as its single tag.
tagged_data = [
    TaggedDocument(
        words=word_tokenize(sentence.lower()),
        tags=[str(doc_id)],
    )
    for doc_id, sentence in enumerate(data)
]


In [4]:
# Display the tagged corpus: one TaggedDocument (tokens + tag) per input sentence.
tagged_data

[TaggedDocument(words=['dog', 'bites', 'man'], tags=['0']),
 TaggedDocument(words=['man', 'bites', 'dog'], tags=['1']),
 TaggedDocument(words=['dog', 'eats', 'meat'], tags=['2']),
 TaggedDocument(words=['man', 'eats', 'food'], tags=['3'])]

In [5]:
# Distributed Bag of Words (DBOW) flavour of Doc2Vec, selected with dm=0.
# Passing the corpus to the constructor trains the model immediately.
model_dbow = Doc2Vec(
    tagged_data,
    vector_size=20,  # dimensionality of the learned vectors
    min_count=1,     # keep every word, even ones that occur only once
    epochs=2,        # number of passes over the corpus
    dm=0,
)


In [6]:
# Infer a document vector for a tokenised sentence with the trained DBOW model.
print(model_dbow.infer_vector(['man','eats','food']))  # feature vector of "man eats food"

[-0.01203257  0.01399791  0.00436162 -0.00180027  0.01481865  0.00915204
 -0.00378098 -0.00889236  0.00451842  0.02051549  0.02342229  0.01624066
 -0.00929306 -0.01506979 -0.02199876  0.01465158  0.02258906 -0.02092641
  0.00850764 -0.01780725]


In [7]:
# Returns (word, similarity score) pairs, highest score first.
model_dbow.wv.most_similar("man",topn=5)  # top 5 most similar words to "man"

[('meat', 0.39641645550727844),
 ('bites', 0.05595850199460983),
 ('dog', 0.050179000943899155),
 ('food', -0.06502582132816315),
 ('eats', -0.2928891181945801)]

In [9]:
# Similarity between two lists of words; with one word on each side the result
# matches the 'dog' score reported by most_similar("man") above.
model_dbow.wv.n_similarity(["dog"],["man"])

0.050179023

In [11]:
# Distributed Memory (DM) flavour of Doc2Vec, selected with dm=1.
# All other hyper-parameters match the DBOW model so the two can be compared.
model_dm = Doc2Vec(
    tagged_data,
    min_count=1,
    vector_size=20,
    epochs=2,
    dm=1,
)

# Run the same three probes as for the DBOW model: an inferred document vector,
# the nearest neighbours of "man", and the dog/man similarity.
print(
    "Inference Vector of man eats food\n ",
    model_dm.infer_vector(['man', 'eats', 'food']),
)

print(
    "Most similar words to man in our corpus\n",
    model_dm.wv.most_similar("man", topn=5),
)
print(
    "Similarity between man and dog: ",
    model_dm.wv.n_similarity(["dog"], ["man"]),
)

Inference Vector of man eats food
  [-0.01203259  0.01399781  0.00436171 -0.00180043  0.01481868  0.00915196
 -0.00378094 -0.00889238  0.00451853  0.02051536  0.02342224  0.01624064
 -0.00929315 -0.01506988 -0.02199879  0.01465174  0.02258903 -0.02092638
  0.00850757 -0.01780711]
Most similar words to man in our corpus
 [('meat', 0.39641645550727844), ('bites', 0.05595850199460983), ('dog', 0.050179000943899155), ('food', -0.06502582132816315), ('eats', -0.2928891181945801)]
Similarity between man and dog:  0.050179023


What happens when we compare words that are not in the vocabulary? The model has no vector for a word it never saw during training, so the lookup fails with a `KeyError`:

In [12]:
# 'covid' never appeared in the training corpus, so the model has no vector for
# it and the lookup raises KeyError. The exception is the point of this cell, so
# it is caught and printed rather than left uncaught, which would stop
# "Restart & Run All" at this cell.
try:
    oov_similarity = model_dm.wv.n_similarity(['covid'], ['man'])
except KeyError as err:
    print(f"KeyError: {err}")
else:
    # Only reached if the word is in the vocabulary after all.
    print(oov_similarity)

KeyError: "Key 'covid' not present"