## doc2vec implementation with Python (& Gensim)
- Note: This code is written in Python 3.6.1 (+Gensim 2.3.0)

In [75]:
import re
import numpy as np

from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from nltk.corpus import gutenberg
from multiprocessing import Pool
from scipy import spatial

### Import training dataset
- Import Shakespeare's Hamlet corpus from nltk library

In [45]:
sentences = list(gutenberg.sents('shakespeare-hamlet.txt'))   # import the corpus and convert into a list

In [46]:
print('Type of corpus: ', type(sentences))
print('Length of corpus: ', len(sentences))

Type of corpus:  <class 'list'>
Length of corpus:  3106


In [47]:
print(sentences[0])    # title, author, and year
print(sentences[1])
print(sentences[10])

['[', 'The', 'Tragedie', 'of', 'Hamlet', 'by', 'William', 'Shakespeare', '1599', ']']
['Actus', 'Primus', '.']
['Fran', '.']


### Preprocess data
- Use re module to preprocess data
- Convert all letters into lowercase
- Remove punctuations, numbers, etc.
- For the doc2vec model, input data should be in format of **iterable TaggedDocuments"**
    - Each TaggedDocument instance comprises **words** and **tags**
    - Hence, each document (i.e., a sentence or paragraph) should have a unique tag which is identifiable

In [48]:
for i in range(len(sentences)):
    sentences[i] = [word.lower() for word in sentences[i] if re.match('^[a-zA-Z]+', word)]  

In [49]:
print(sentences[0])    # title, author, and year
print(sentences[1])
print(sentences[10])

['the', 'tragedie', 'of', 'hamlet', 'by', 'william', 'shakespeare']
['actus', 'primus']
['fran']


In [81]:
for i in range(len(sentences)):
    sentences[i] = TaggedDocument(words = sentences[i], tags = ['sent{}'.format(i)])    # converting each sentence into a TaggedDocument

In [82]:
sentences[0]

TaggedDocument(words=['the', 'tragedie', 'of', 'hamlet', 'by', 'william', 'shakespeare'], tags=['sent0'])

### Create and train model
- Create a doc2vec model and train it with Hamlet corpus
- Key parameter description (https://radimrehurek.com/gensim/models/doc2vec.html)
    - **documents**: training data (has to be iterable TaggedDocument instances)
    - **size**: dimension of embedding space
    - **dm**: DBOW if 0, distributed-memory if 1
    - **window**: number of words accounted for each context (if the window size is 3, 3 word in the left neighorhood and 3 word in the right neighborhood are considered)
    - **min_count**: minimum count of words to be included in the vocabulary
    - **iter**: number of training iterations
    - **workers**: number of worker threads to train

In [83]:
model = Doc2Vec(documents = sentences, dm = 1, size = 100, window = 3, min_count = 1, iter = 10, workers = Pool()._processes)

In [96]:
model.init_sims(replace = True)

### Save and load model
- doc2vec model can be saved and loaded locally
- Doing so can reduce time to train model again

In [99]:
model.save('doc2vec_model')

In [100]:
model = Doc2Vec.load('doc2vec_model')

### Similarity calculation
- Similarity between embedded words (i.e., vectors) can be computed using metrics such as cosine similarity
- For other metrics and comparisons between them, refer to: https://github.com/taki0112/Vector_Similarity

In [94]:
v1 = model.infer_vector('sent2')    # in doc2vec, infer_vector() function is used to infer the vector embedding of a document
v2 = model.infer_vector('sent3')

In [95]:
model.most_similar([v1])

[('seeke', 0.9795917272567749),
 ('hither', 0.9794537425041199),
 ('touching', 0.9791266918182373),
 ('spade', 0.9790579080581665),
 ('goes', 0.9789791107177734),
 ('hit', 0.9789602756500244),
 ('lose', 0.9786853790283203),
 ('countries', 0.9786409139633179),
 ('rash', 0.9785533547401428),
 ('honor', 0.978546142578125)]

In [97]:
# define a function that computes cosine similarity between two words
def cosine_similarity(v1, v2):
    return 1 - spatial.distance.cosine(v1, v2)

In [98]:
cosine_similarity(v1, v2)

0.95258642555608464