# Building paragraph vectors using Doc2Vec

### Import common text corpus, Doc2Vec algorithm and Tagged Document functionality from Gensim

In [11]:
!pip install gensim



In [1]:
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

### Corpus on which training will happen

In [2]:
common_texts

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]

### Building Tagged Documents from the corpus as that's an expectation from the Doc2Vec model

In [3]:
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(common_texts)]

In [4]:
documents

[TaggedDocument(words=['human', 'interface', 'computer'], tags=[0]),
 TaggedDocument(words=['survey', 'user', 'computer', 'system', 'response', 'time'], tags=[1]),
 TaggedDocument(words=['eps', 'user', 'interface', 'system'], tags=[2]),
 TaggedDocument(words=['system', 'human', 'system', 'eps'], tags=[3]),
 TaggedDocument(words=['user', 'response', 'time'], tags=[4]),
 TaggedDocument(words=['trees'], tags=[5]),
 TaggedDocument(words=['graph', 'trees'], tags=[6]),
 TaggedDocument(words=['graph', 'minors', 'trees'], tags=[7]),
 TaggedDocument(words=['graph', 'minors', 'survey'], tags=[8])]

### Building a basic Doc2Vec model

In [5]:
model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4)
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

### What's the vector size (should be 5 as we specified it on top)

In [6]:
model.vector_size

5

### How many document vectors did we train?

In [7]:
len(model.docvecs)

  len(model.docvecs)


9

### Let's check out the vocabulary information for the model we built

In [8]:
len(model.wv.index_to_key)

12

In [9]:
model.wv.key_to_index

{'system': 0,
 'graph': 1,
 'trees': 2,
 'user': 3,
 'minors': 4,
 'eps': 5,
 'time': 6,
 'response': 7,
 'survey': 8,
 'computer': 9,
 'interface': 10,
 'human': 11}

### Let's infer a vector based on the trained Doc2Vec model

In [10]:
vector = model.infer_vector(['user', 'interface', 'for', 'computer'])
print(vector)

[-0.03756111 -0.03715252  0.0129479   0.02861738  0.08362415]


### Building a new model changing vector size and minimum count eligibility

In [11]:
model = Doc2Vec(documents, vector_size=5, window=2, min_count=3, workers=4)
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

In [12]:
len(model.wv.index_to_key)

4

In [13]:
model.wv.key_to_index

{'system': 0, 'graph': 1, 'trees': 2, 'user': 3}

In [14]:
vector = model.infer_vector(['user', 'interface', 'for', 'computer'])
print(vector)

[-0.0371801  -0.03677846  0.01317028  0.02929444  0.08339311]


### Doc2Vec built next would be based on the distributed memory model (dm=1)

In [17]:
model = Doc2Vec(documents, vector_size=50, min_count=2, epochs=40, dm=1)
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

In [18]:
vector = model.infer_vector(['user', 'interface', 'for', 'computer'])
print(vector)

[-0.00453587 -0.00391252  0.00089709  0.00296409  0.00813381  0.00256962
  0.00136201  0.00242808 -0.00329857  0.00952082 -0.0017757   0.00799976
  0.00059354 -0.00814846  0.00873208 -0.00364986  0.00074912  0.0027355
  0.00131337  0.00229351 -0.00573052  0.0051463   0.00388442 -0.00533876
  0.00435362 -0.0018584   0.00395117 -0.00281297 -0.00025203  0.0020558
 -0.00932153 -0.00561514 -0.00431304 -0.00056523  0.00643483 -0.00356581
 -0.00962584 -0.00659742 -0.00779687 -0.0017348  -0.00125581  0.00776353
  0.00157168  0.00380536  0.00258829 -0.00814661 -0.00045274  0.00371402
 -0.00660128  0.00900414]


### Doc2Vec built next would be based on the distributed bag of words approach (dm=0)

In [19]:
model = Doc2Vec(documents, vector_size=50, min_count=2, epochs=40, dm=0)
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

In [25]:
vector = model.infer_vector(['user', 'interface', 'for', 'computer'])
print(vector)

[-0.00593539 -0.00425362  0.0004662   0.0026706   0.00770862  0.00273647
  0.00091485  0.00346302 -0.0038242   0.01032096 -0.00157123  0.00881096
  0.00088869 -0.00865343  0.00962424 -0.00427841 -0.00027584  0.00275743
  0.00066734  0.00265018 -0.00622919  0.00510398  0.0032966  -0.0051358
  0.00394173 -0.00155933  0.00362163 -0.00380825 -0.00030596  0.00192817
 -0.00831099 -0.00475652 -0.00538212 -0.00015652  0.00611886 -0.00251408
 -0.01026612 -0.00722642 -0.00749515 -0.00213776 -0.00172963  0.00873332
  0.00148657  0.00294841  0.0013323  -0.00739662  0.00019065  0.00405663
 -0.00620056  0.00857089]


### Adding the window size which controls the maximum distance between current and predicted word

In [26]:
model = Doc2Vec(documents, vector_size=50, min_count=2, epochs=40, window=2, dm=0)
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

In [27]:
vector = model.infer_vector(['user', 'interface', 'for', 'computer'])
print(vector)

[-5.85619686e-03 -4.14020661e-03  2.92560144e-04  2.62079341e-03
  7.67338788e-03  2.98890611e-03  8.56717757e-04  3.49160237e-03
 -3.95824527e-03  1.03212604e-02 -1.36668689e-03  8.95858556e-03
  1.07341912e-03 -8.61879252e-03  9.46658943e-03 -4.40538721e-03
  7.55539877e-05  2.91305920e-03  6.73631148e-04  2.50865007e-03
 -6.35773549e-03  5.19524980e-03  3.12256394e-03 -5.28495573e-03
  4.08078264e-03 -1.63315982e-03  3.49024963e-03 -3.97459324e-03
 -2.50483339e-04  1.84055394e-03 -8.26205779e-03 -4.62255860e-03
 -5.55908680e-03 -2.01697883e-04  6.21228153e-03 -2.48296838e-03
 -1.01173725e-02 -7.28166755e-03 -7.46667059e-03 -1.95274758e-03
 -2.01439532e-03  8.60384945e-03  1.23042718e-03  2.98013189e-03
  1.52203394e-03 -7.30147678e-03  1.30460947e-04  3.87898064e-03
 -5.91142336e-03  8.69590230e-03]


### Adding initial learning rate and to what value should the learning rate drop to linearly over training (alpha and min_alpha)

In [28]:
model = Doc2Vec(documents, vector_size=50, min_count=2, epochs=40, window=2, dm=1, alpha=0.3, min_alpha=0.05)
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

In [29]:
vector = model.infer_vector(['user', 'interface', 'for', 'computer'])
print(vector)

[-0.31974137 -0.06716121 -0.20223242 -0.00366339 -0.12749013  0.12073709
 -0.16609545  0.18861233 -0.27778292  0.12815428  0.23538662  0.27723283
  0.11906813 -0.25756946  0.18341936 -0.352204   -0.08047464  0.13784613
 -0.3398532  -0.03839013 -0.07702577  0.07554863 -0.16768833  0.1620395
 -0.04139753 -0.00550529 -0.20810111 -0.29029062  0.05777445 -0.18847813
  0.32191885  0.21942474 -0.3537838  -0.07794843 -0.08159574  0.17768037
 -0.03486383 -0.1888445  -0.03092271 -0.01426172 -0.12569518  0.15150201
  0.00671369 -0.31110078 -0.0699418   0.15969911  0.18711855 -0.00819251
  0.23305197 -0.16736859]


### Adding the dm_concat parameter to use concatenation of the word vectors

In [30]:
model = Doc2Vec(documents, vector_size=50, min_count=2, epochs=40, window=2, dm=1, alpha=0.3, min_alpha=0.05, dm_concat=1)
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

In [31]:
vector = model.infer_vector(['user', 'interface', 'for', 'computer'])
print(vector)

[ 0.04073207 -0.20443733 -0.11139946  0.01324134  0.16146013  0.07265161
  0.02887594 -0.2575949   0.00560988  0.1745744  -0.10879918 -0.0027538
 -0.1769514  -0.17444625 -0.00454347 -0.1584279   0.04258839  0.08098657
 -0.04190582 -0.19145466  0.00216675  0.02538604 -0.08776652  0.02294124
 -0.23279321 -0.08641739 -0.2657417  -0.01555547  0.0756209  -0.16681834
  0.16999461 -0.03340479  0.05899809 -0.20745786  0.10253324 -0.15544245
  0.05034002 -0.11959898 -0.01451971  0.03602321  0.20438002 -0.01023639
  0.2261081  -0.21462099  0.05096089 -0.08346988  0.04929426 -0.00307659
  0.00197548 -0.14021483]


### Adding the dm_mean parameter to use sum of the context word vectors (dm_mean=1)

In [32]:
model = Doc2Vec(documents, vector_size=50, min_count=2, epochs=40, window=2, dm=1, dm_concat=0, dm_mean=1, alpha=0.3, min_alpha=0.05)
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

In [33]:
vector = model.infer_vector(['user', 'interface', 'for', 'computer'])
print(vector)

[-0.31974137 -0.06716121 -0.20223242 -0.00366339 -0.12749013  0.12073709
 -0.16609545  0.18861233 -0.27778292  0.12815428  0.23538662  0.27723283
  0.11906813 -0.25756946  0.18341936 -0.352204   -0.08047464  0.13784613
 -0.3398532  -0.03839013 -0.07702577  0.07554863 -0.16768833  0.1620395
 -0.04139753 -0.00550529 -0.20810111 -0.29029062  0.05777445 -0.18847813
  0.32191885  0.21942474 -0.3537838  -0.07794843 -0.08159574  0.17768037
 -0.03486383 -0.1888445  -0.03092271 -0.01426172 -0.12569518  0.15150201
  0.00671369 -0.31110078 -0.0699418   0.15969911  0.18711855 -0.00819251
  0.23305197 -0.16736859]


### Adding the dm_mean parameter to use mean of the context word vectors (dm_mean=0)

In [34]:
model = Doc2Vec(documents, vector_size=50, min_count=2, epochs=40, window=2, dm=1, dm_concat=0, dm_mean=0, alpha=0.3, min_alpha=0.05)
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

In [35]:
vector = model.infer_vector(['user', 'interface', 'for', 'computer'])
print(vector)

[-0.31974137 -0.06716121 -0.20223242 -0.00366339 -0.12749013  0.12073709
 -0.16609545  0.18861233 -0.27778292  0.12815428  0.23538662  0.27723283
  0.11906813 -0.25756946  0.18341936 -0.352204   -0.08047464  0.13784613
 -0.3398532  -0.03839013 -0.07702577  0.07554863 -0.16768833  0.1620395
 -0.04139753 -0.00550529 -0.20810111 -0.29029062  0.05777445 -0.18847813
  0.32191885  0.21942474 -0.3537838  -0.07794843 -0.08159574  0.17768037
 -0.03486383 -0.1888445  -0.03092271 -0.01426172 -0.12569518  0.15150201
  0.00671369 -0.31110078 -0.0699418   0.15969911  0.18711855 -0.00819251
  0.23305197 -0.16736859]
