### Problem Statement

  - Use the Gensim library to implement Doc2Vec for generating document embeddings from a set of paragraphs.


### Importing Modules

In [11]:
import gensim
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Creating Tagged Documents

In [17]:
docs = ["Ballspielverein Borussia 09 e. V. Dortmund, often known simply as Borussia Dortmund or by its initialism BVB, is a German professional sports club based in Dortmund, North Rhine-Westphalia.",
        "Futbol Club Barcelona, commonly known as Barcelona and familiarly as Barça ([ˈbaɾsə]), is a professional football club based in Barcelona, Catalonia, Spain, that competes in La Liga, the top flight of Spanish football.",
        "Manchester City Football Club is a professional football club based in Manchester, England. The club competes in the Premier League, the top tier of English football."]

In [18]:
tagged_data = [TaggedDocument(words=word_tokenize(doc.lower()),
                              tags=[str(i)]) for i,
               doc in enumerate(docs)]
tagged_data

[TaggedDocument(words=['ballspielverein', 'borussia', '09', 'e.', 'v.', 'dortmund', ',', 'often', 'known', 'simply', 'as', 'borussia', 'dortmund', 'or', 'by', 'its', 'initialism', 'bvb', ',', 'is', 'a', 'german', 'professional', 'sports', 'club', 'based', 'in', 'dortmund', ',', 'north', 'rhine-westphalia', '.'], tags=['0']),
 TaggedDocument(words=['futbol', 'club', 'barcelona', ',', 'commonly', 'known', 'as', 'barcelona', 'and', 'familiarly', 'as', 'barça', '(', '[', 'ˈbaɾsə', ']', ')', ',', 'is', 'a', 'professional', 'football', 'club', 'based', 'in', 'barcelona', ',', 'catalonia', ',', 'spain', ',', 'that', 'competes', 'in', 'la', 'liga', ',', 'the', 'top', 'flight', 'of', 'spanish', 'football', '.'], tags=['1']),
 TaggedDocument(words=['manchester', 'city', 'football', 'club', 'is', 'a', 'professional', 'football', 'club', 'based', 'in', 'manchester', ',', 'england', '.', 'the', 'club', 'competes', 'in', 'the', 'premier', 'league', ',', 'the', 'top', 'tier', 'of', 'english', 'footba

### Gensim's Word2Vec Model

Model Loading and Training

In [20]:
model = Doc2Vec()
model.build_vocab(tagged_data)
model.train(tagged_data,
            total_examples=model.corpus_count,
            epochs=model.epochs)

Predictions

In [22]:
document_vectors = [model.infer_vector(
    word_tokenize(doc.lower())) for doc in docs]

In [24]:
for i, doc in enumerate(docs):
    print("Document", i+1, ":", doc)
    print("Vector:", document_vectors[i])
    print()

Document 1 : Ballspielverein Borussia 09 e. V. Dortmund, often known simply as Borussia Dortmund or by its initialism BVB, is a German professional sports club based in Dortmund, North Rhine-Westphalia.
Vector: [-3.7805834e-03 -4.5711710e-03 -4.8266998e-03  2.2331548e-03
 -1.3317827e-03 -2.3213839e-03 -3.0306876e-03 -3.5532808e-03
  2.9661776e-03  3.1900082e-03  9.3555707e-04  3.0771566e-03
  4.0184241e-03  4.1386541e-03 -2.2775665e-05 -3.4195499e-04
 -3.9196210e-03 -6.7737368e-05  1.5786339e-03 -1.0418436e-03
 -4.8898729e-03  4.6273433e-03  2.2956051e-03  4.0373481e-03
 -1.0116837e-03  3.4960478e-03  7.3933729e-04 -2.0626315e-03
  2.9986303e-03  1.5712701e-03 -1.8408824e-03 -4.6567898e-03
 -1.8198629e-03 -3.0896266e-03  2.2764867e-03 -2.6385789e-03
 -4.9505071e-03 -6.2630410e-05  3.4139114e-03  1.0816329e-03
 -2.1662668e-03 -4.3310709e-03 -4.6909028e-03 -4.6007922e-03
  4.2914557e-03 -4.3560523e-03 -3.2549927e-03  3.0186789e-03
  1.2479740e-04  2.3577249e-04  1.9992024e-03  4.4294563e

### Cosine Simiarity

In [28]:
test_doc = ["Barcelona is one of the most widely supported teams in the world, and the club has one of the largest social media following in the world among sports teams"]

model.docvecs.most_similar([model.infer_vector(test_doc)])

  model.docvecs.most_similar([model.infer_vector(test_doc)])


[('0', -0.03234893083572388),
 ('2', -0.09455051273107529),
 ('1', -0.13056692481040955)]

<hr><hr>