# Gensim Tutorial – A Complete Beginners Guide

From: https://www.machinelearningplus.com/nlp/gensim-tutorial/

## 3. How to create a Dictionary from a list of sentences?

In [3]:
import gensim
from gensim import corpora
from pprint import pprint

In [4]:
# How to create a dictionary from a list of sentences?
# `documents` is the toy corpus: each list element is treated as one document
# and is tokenized in the next cell to build the first Dictionary.
documents = ["The Saudis are preparing a report that will acknowledge that", 
             "Saudi journalist Jamal Khashoggi's death was the result of an", 
             "interrogation that went wrong, one that was intended to lead", 
             "to his abduction from Turkey, according to two sources."]

# NOTE(review): this `documents_2` is never used before the later
# `add_documents` cell rebinds the name to a different list of sentences.
documents_2 = ["One source says the report will likely conclude that", 
                "the operation was carried out without clearance and", 
                "transparency and that those involved will be held", 
                "responsible. One of the sources acknowledged that the", 
                "report is still being prepared and cautioned that", 
                "things could change."]

In [5]:
# Whitespace-tokenize every sentence. Punctuation stays attached to its word,
# which is why tokens such as 'wrong,' and 'sources.' show up in the dictionary.
texts = [doc.split() for doc in documents]

# Build the token -> integer id mapping from the tokenized sentences
dictionary = corpora.Dictionary(texts)

# Summary line: number of unique tokens plus the first few of them
print(dictionary)
#> Dictionary(33 unique tokens: ['Saudis', 'The', 'a', 'acknowledge', 'are']...)

Dictionary(33 unique tokens: ['Saudis', 'The', 'a', 'acknowledge', 'are']...)


In [8]:
# Show the word -> id map (a plain dict, printed on a single line)
print(dictionary.token2id)

{'Saudis': 0, 'The': 1, 'a': 2, 'acknowledge': 3, 'are': 4, 'preparing': 5, 'report': 6, 'that': 7, 'will': 8, 'Jamal': 9, "Khashoggi's": 10, 'Saudi': 11, 'an': 12, 'death': 13, 'journalist': 14, 'of': 15, 'result': 16, 'the': 17, 'was': 18, 'intended': 19, 'interrogation': 20, 'lead': 21, 'one': 22, 'to': 23, 'went': 24, 'wrong,': 25, 'Turkey,': 26, 'abduction': 27, 'according': 28, 'from': 29, 'his': 30, 'sources.': 31, 'two': 32}


In [9]:
# Better way to print: pprint puts one entry per line, sorted by token
pprint(dictionary.token2id)

{'Jamal': 9,
 "Khashoggi's": 10,
 'Saudi': 11,
 'Saudis': 0,
 'The': 1,
 'Turkey,': 26,
 'a': 2,
 'abduction': 27,
 'according': 28,
 'acknowledge': 3,
 'an': 12,
 'are': 4,
 'death': 13,
 'from': 29,
 'his': 30,
 'intended': 19,
 'interrogation': 20,
 'journalist': 14,
 'lead': 21,
 'of': 15,
 'one': 22,
 'preparing': 5,
 'report': 6,
 'result': 16,
 'sources.': 31,
 'that': 7,
 'the': 17,
 'to': 23,
 'two': 32,
 'was': 18,
 'went': 24,
 'will': 8,
 'wrong,': 25}


In [10]:
# A second batch of sentences used to extend the existing dictionary.
# (This rebinds `documents_2`, replacing the list defined earlier in the notebook.)
documents_2 = [
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

# Whitespace-tokenize the new sentences
texts_2 = [doc.split() for doc in documents_2]

# add_documents() updates `dictionary` in place: tokens already known keep
# their ids, unseen tokens are appended with the next free ids.
dictionary.add_documents(texts_2)

# If you check now, the dictionary should have been updated with the new words (tokens).
print(dictionary)
#> Dictionary(48 unique tokens: ['Saudis', 'The', 'a', 'acknowledge', 'are']...)

print(dictionary.token2id)
#> {'Saudis': 0, 'The': 1, 'a': 2, ..., 'sources.': 31, 'two': 32,
#>  'graph': 33, 'in': 34, 'intersection': 35, 'paths': 36, 'trees': 37,
#>  'Graph': 38, 'IV': 39, 'Widths': 40, 'and': 41, 'minors': 42,
#>  'ordering': 43, 'quasi': 44, 'well': 45, 'A': 46, 'survey': 47}

Dictionary(48 unique tokens: ['Saudis', 'The', 'a', 'acknowledge', 'are']...)
{'Saudis': 0, 'The': 1, 'a': 2, 'acknowledge': 3, 'are': 4, 'preparing': 5, 'report': 6, 'that': 7, 'will': 8, 'Jamal': 9, "Khashoggi's": 10, 'Saudi': 11, 'an': 12, 'death': 13, 'journalist': 14, 'of': 15, 'result': 16, 'the': 17, 'was': 18, 'intended': 19, 'interrogation': 20, 'lead': 21, 'one': 22, 'to': 23, 'went': 24, 'wrong,': 25, 'Turkey,': 26, 'abduction': 27, 'according': 28, 'from': 29, 'his': 30, 'sources.': 31, 'two': 32, 'graph': 33, 'in': 34, 'intersection': 35, 'paths': 36, 'trees': 37, 'Graph': 38, 'IV': 39, 'Widths': 40, 'and': 41, 'minors': 42, 'ordering': 43, 'quasi': 44, 'well': 45, 'A': 46, 'survey': 47}


## 4. How to create a Dictionary from one or more text files?

In [11]:
from gensim.utils import simple_preprocess
from smart_open import smart_open
import os

# Create a gensim dictionary from a single text file.
# NOTE(review): absolute, machine-specific path -- point it at your own copy
# of sample.txt before running.
sample_path = 'C:/Users/kiselgof/Documents/Courses/COMS4995-s20-master/slides/aml-17-topic-models/sample.txt'

# Open the file in a `with` block so the handle is closed once the dictionary
# is built. The generator feeds Dictionary() one tokenized line at a time, so
# the whole file is never held in memory.
with open(sample_path, encoding='utf-8') as sample_file:
    dictionary = corpora.Dictionary(simple_preprocess(line, deacc=True) for line in sample_file)

# Token to Id map
dictionary.token2id

#> {'according': 35,
#>  'and': 22,
#>  'appointment': 23,
#>  'army': 0,
#>  'as': 43,
#>  'at': 24,
#>   ...
#> }

{'army': 0,
 'china': 1,
 'chinese': 2,
 'force': 3,
 'liberation': 4,
 'of': 5,
 'people': 6,
 'recently': 7,
 'recruited': 8,
 'rocket': 9,
 'tank': 10,
 'technicians': 11,
 'the': 12,
 'think': 13,
 'companies': 14,
 'daily': 15,
 'from': 16,
 'on': 17,
 'pla': 18,
 'private': 19,
 'reported': 20,
 'saturday': 21,
 'and': 22,
 'appointment': 23,
 'at': 24,
 'ceremony': 25,
 'experts': 26,
 'founding': 27,
 'hao': 28,
 'letters': 29,
 'other': 30,
 'received': 31,
 'science': 32,
 'technology': 33,
 'zhang': 34,
 'according': 35,
 'by': 36,
 'defense': 37,
 'national': 38,
 'panel': 39,
 'published': 40,
 'report': 41,
 'to': 42,
 'as': 43,
 'fellow': 44,
 'his': 45,
 'honored': 46,
 'will': 47,
 'conduct': 48,
 'design': 49,
 'fields': 50,
 'into': 51,
 'like': 52,
 'members': 53,
 'overall': 54,
 'research': 55,
 'serve': 56,
 'which': 57,
 'five': 58,
 'for': 59,
 'launching': 60,
 'missile': 61,
 'missiles': 62,
 'network': 63,
 'system': 64,
 'years': 65,
 'counterparts': 66,
 '

In [8]:
# Summary of the file-based dictionary.
# NOTE(review): the recorded execution count of this cell (8) is out of
# sequence with its neighbours (11, 12) -- re-run the notebook top to bottom.
print(dictionary)

Dictionary(93 unique tokens: ['army', 'china', 'chinese', 'force', 'liberation']...)


Reading Multiple Files

In [12]:
class ReadTxtFiles(object):
    """Iterable that streams tokenized lines from every file in a directory.

    Each iteration yields the result of ``simple_preprocess(line)`` for one
    line of one file, so the object can be passed straight to
    ``corpora.Dictionary`` without loading the files into memory. The object
    can be iterated more than once; every pass re-reads the directory.

    Parameters
    ----------
    dirname : str
        Directory whose entries are all opened as 'latin'-encoded text files.
    """

    def __init__(self, dirname):
        self.dirname = dirname

    def __iter__(self):
        # os.listdir() returns entries in arbitrary order; sorting makes the
        # stream (and therefore the token ids assigned downstream) reproducible.
        for fname in sorted(os.listdir(self.dirname)):
            # `with` closes each file as soon as its last line has been
            # yielded instead of leaking one open handle per file.
            with open(os.path.join(self.dirname, fname), encoding='latin') as handle:
                for line in handle:
                    yield simple_preprocess(line)

In [13]:
# Directory holding the example text files.
# NOTE(review): absolute, machine-specific path -- adjust before running.
path_to_text_directory = "C:/Users/kiselgof/Documents/Courses/COMS4995-s20-master/slides/aml-17-topic-models/lsa_sports_food_docs"

# ReadTxtFiles yields one tokenized line at a time, so the dictionary is built
# by streaming through every file in the directory.
dictionary = corpora.Dictionary(ReadTxtFiles(path_to_text_directory))

# Token to Id map
dictionary.token2id
# {'across': 0,
#  'activity': 1,
#  'although': 2,
#  'and': 3,
#  'are': 4,
#  ...
# }

{'across': 0,
 'activity': 1,
 'although': 2,
 'and': 3,
 'are': 4,
 'as': 5,
 'badminton': 6,
 'be': 7,
 'beach': 8,
 'by': 9,
 'casual': 10,
 'common': 11,
 'court': 12,
 'doubles': 13,
 'formal': 14,
 'forms': 15,
 'game': 16,
 'games': 17,
 'half': 18,
 'hit': 19,
 'in': 20,
 'indoor': 21,
 'is': 22,
 'it': 23,
 'landing': 24,
 'larger': 25,
 'may': 26,
 'most': 27,
 'net': 28,
 'of': 29,
 'often': 30,
 'on': 31,
 'one': 32,
 'opposing': 33,
 'or': 34,
 'outdoor': 35,
 'per': 36,
 'played': 37,
 'player': 38,
 'players': 39,
 'points': 40,
 'racquet': 41,
 'racquets': 42,
 'rectangular': 43,
 'scored': 44,
 'shuttlecock': 45,
 'side': 46,
 'singles': 47,
 'sport': 48,
 'striking': 49,
 'teams': 50,
 'the': 51,
 'to': 52,
 'two': 53,
 'using': 54,
 'with': 55,
 'within': 56,
 'yard': 57,
 'ball': 58,
 'baseball': 59,
 'bat': 60,
 'batting': 61,
 'between': 62,
 'each': 63,
 'fielding': 64,
 'nine': 65,
 'take': 66,
 'turns': 67,
 'who': 68,
 'advances': 69,
 'around': 70,
 'attempts

## 5. How to create a bag of words corpus in gensim?

In [14]:
# Two toy documents
my_docs = [
    "Who let the dogs out?",
    "Who? Who? Who? Who?",
]

# simple_preprocess lowercases and strips punctuation, giving one token list per document
tokenized_list = list(map(simple_preprocess, my_docs))

# Start from an empty dictionary; allow_update=True lets doc2bow register
# unseen tokens while it converts each document to (token_id, count) pairs.
mydict = corpora.Dictionary()
mycorpus = [mydict.doc2bow(tokens, allow_update=True) for tokens in tokenized_list]
pprint(mycorpus)
#> [[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)], [(4, 4)]]

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)], [(4, 4)]]


How to interpret the above corpus?

The (0, 1) in line 1 means, the word with id=0 appears once in the 1st document.
Likewise, the (4, 4) in the second list item means the word with id 4 appears 4 times in the second document. And so on.

Well, this is not human readable. To convert the id’s to words, you will need the dictionary to do the conversion.

Let’s see how to get the original texts back.

In [15]:
# Translate every (token_id, count) pair back to (token, count) via the dictionary
word_counts = [
    [(mydict[token_id], count) for token_id, count in bow]
    for bow in mycorpus
]
pprint(word_counts)
#> [[('dogs', 1), ('let', 1), ('out', 1), ('the', 1), ('who', 1)], [('who', 4)]]

[[('dogs', 1), ('let', 1), ('out', 1), ('the', 1), ('who', 1)], [('who', 4)]]


Notice that the order of the words gets lost. Only each word and its frequency are retained.

In [16]:
# BK: show the token -> id mapping that doc2bow built up in `mydict`
mydict.token2id

{'dogs': 0, 'let': 1, 'out': 2, 'the': 3, 'who': 4}

## 6. How to create a bag of words corpus from a text file?

In [17]:
# NOTE(review): simple_preprocess and smart_open were already imported in an
# earlier cell; the repeat is harmless but redundant.
from gensim.utils import simple_preprocess
from smart_open import smart_open
import nltk
# The stopword corpus must be downloaded once before stopwords.words() works:
#nltk.download('stopwords')  # run once
from nltk.corpus import stopwords
# English stop words as a list of strings
stop_words = stopwords.words('english')

In [18]:
# Number of English stop words in the loaded list
len(stop_words)

179

In [19]:
class BoWCorpus(object):
    """Memory-friendly bag-of-words corpus backed by a text file.

    Iterating yields one bag-of-words vector (list of ``(token_id, count)``
    pairs) per line of the file, so the file is never loaded in full.

    Parameters
    ----------
    path : str
        Location of the text file; one document per line, read as 'latin'.
    dictionary : gensim.corpora.Dictionary
        Dictionary used for the token -> id mapping. It is updated in place
        while iterating (``allow_update=True``), so the caller's dictionary
        object learns every token seen in the file.
    """

    def __init__(self, path, dictionary):
        self.filepath = path
        self.dictionary = dictionary

    def __iter__(self):
        # smart_open.smart_open() is deprecated (it emits the "See the
        # migration notes" warning); smart_open.open() is its replacement.
        # Imported locally so this class does not depend on other cells.
        from smart_open import open as stream_open

        # `with` guarantees the handle is closed when iteration finishes.
        with stream_open(self.filepath, encoding='latin') as lines:
            for line in lines:
                # tokenize
                tokenized_list = simple_preprocess(line, deacc=True)

                # Create the bag of words. allow_update=True already adds new
                # tokens and document frequencies to self.dictionary, which is
                # the very object the caller passed in. No merge into a global
                # `mydict` is needed, and merging a dictionary with itself
                # would add its document frequencies onto themselves.
                yield self.dictionary.doc2bow(tokenized_list, allow_update=True)

In [20]:
# Create the Dictionary (starts empty; it is filled while the corpus is iterated)
mydict = corpora.Dictionary()

# NOTE(review): absolute, machine-specific path -- adjust before running.
path = 'C:/Users/kiselgof/Documents/Courses/COMS4995-s20-master/slides/aml-17-topic-models/sample.txt'
# Create the Corpus
# Alternative when sample.txt sits next to the notebook:
#bow_corpus = BoWCorpus('sample.txt', dictionary=mydict)  # memory friendly
bow_corpus = BoWCorpus(path, dictionary=mydict)  # memory friendly

# Print the token_id and count for each line.
# Nothing is read from disk until this loop starts pulling from bow_corpus.
for line in bow_corpus:
    print(line)

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1)]
[(14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1)]
[(5, 2), (12, 1), (22, 2), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1)]
[(3, 1), (9, 1), (12, 2), (18, 1), (22, 1), (26, 1), (32, 1), (33, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1)]
[(15, 1), (17, 1), (18, 1), (21, 1)]
[(3, 1), (9, 1), (14, 1), (16, 1), (19, 1), (22, 2), (26, 2), (32, 1), (33, 1), (34, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1)]
[(3, 1), (5, 2), (9, 1), (10, 1), (12, 1), (13, 1), (18, 1), (43, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1)]
[(12, 1), (22, 1), (33, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1)]
[(12, 3), (16, 1), (26, 1), (41, 1), (43, 1), (47, 1), (66, 1), (67, 1), (68, 1), (69, 1), (

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


## 7. How to save a gensim dictionary and corpus to disk and load them back?

In [21]:
# Save the Dict and Corpus
# Both files are written to the current working directory.

mydict.save('mydict.dict')  # save dict to disk
# NOTE: serializing consumes bow_corpus again, i.e. the source file is re-read.
corpora.MmCorpus.serialize('bow_corpus.mm', bow_corpus)  # save corpus to disk

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [22]:
# Load them back
loaded_dict = corpora.Dictionary.load('mydict.dict')

# The reloaded corpus yields the counts as floats (e.g. (0, 1.0)), as the
# output below shows. This rebinds the name `corpus`.
corpus = corpora.MmCorpus('bow_corpus.mm')
for line in corpus:
    pprint(line)

[(0, 1.0),
 (1, 1.0),
 (2, 1.0),
 (3, 1.0),
 (4, 1.0),
 (5, 1.0),
 (6, 1.0),
 (7, 1.0),
 (8, 1.0),
 (9, 1.0),
 (10, 1.0),
 (11, 1.0),
 (12, 1.0),
 (13, 1.0)]
[(14, 1.0),
 (15, 1.0),
 (16, 1.0),
 (17, 1.0),
 (18, 1.0),
 (19, 1.0),
 (20, 1.0),
 (21, 1.0)]
[(5, 2.0),
 (12, 1.0),
 (22, 2.0),
 (23, 1.0),
 (24, 1.0),
 (25, 1.0),
 (26, 1.0),
 (27, 1.0),
 (28, 1.0),
 (29, 1.0),
 (30, 1.0),
 (31, 1.0),
 (32, 1.0),
 (33, 1.0),
 (34, 1.0)]
[(3, 1.0),
 (9, 1.0),
 (12, 2.0),
 (18, 1.0),
 (22, 1.0),
 (26, 1.0),
 (32, 1.0),
 (33, 1.0),
 (35, 1.0),
 (36, 1.0),
 (37, 1.0),
 (38, 1.0),
 (39, 1.0),
 (40, 1.0),
 (41, 1.0),
 (42, 1.0)]
[(15, 1.0), (17, 1.0), (18, 1.0), (21, 1.0)]
[(3, 1.0),
 (9, 1.0),
 (14, 1.0),
 (16, 1.0),
 (19, 1.0),
 (22, 2.0),
 (26, 2.0),
 (32, 1.0),
 (33, 1.0),
 (34, 1.0),
 (43, 1.0),
 (44, 1.0),
 (45, 1.0),
 (46, 1.0),
 (47, 1.0)]
[(3, 1.0),
 (5, 2.0),
 (9, 1.0),
 (10, 1.0),
 (12, 1.0),
 (13, 1.0),
 (18, 1.0),
 (43, 1.0),
 (47, 1.0),
 (48, 1.0),
 (49, 1.0),
 (50, 1.0),
 (51, 1.0),
 

## 8. How to create the TFIDF matrix (corpus) in gensim?

In [23]:
from gensim import models
import numpy as np

In [24]:
# Exploratory: print the TfidfModel docstring (long output; consider clearing it before sharing)
help(models.TfidfModel)

Help on class TfidfModel in module gensim.models.tfidfmodel:

class TfidfModel(gensim.interfaces.TransformationABC)
 |  TfidfModel(corpus=None, id2word=None, dictionary=None, wlocal=<function identity at 0x000001ED5807B2F0>, wglobal=<function df2idf at 0x000001ED6570E7B8>, normalize=True, smartirs=None, pivot=None, slope=0.25)
 |  
 |  Objects of this class realize the transformation between word-document co-occurrence matrix (int)
 |  into a locally/globally weighted TF-IDF matrix (positive floats).
 |  
 |  Examples
 |  --------
 |  .. sourcecode:: pycon
 |  
 |      >>> import gensim.downloader as api
 |      >>> from gensim.models import TfidfModel
 |      >>> from gensim.corpora import Dictionary
 |      >>>
 |      >>> dataset = api.load("text8")
 |      >>> dct = Dictionary(dataset)  # fit dictionary
 |      >>> corpus = [dct.doc2bow(line) for line in dataset]  # convert corpus to BoW format
 |      >>>
 |      >>> model = TfidfModel(corpus)  # fit model
 |      >>> vector = mod

In [25]:
# Three tiny documents for the TF-IDF example.
# NOTE: this rebinds `documents`, replacing the list from section 3.
documents = ["This is the first line",
             "This is the second sentence",
             "This third document"]

In [26]:
# Create the Dictionary and Corpus
# Both `mydict` and `corpus` are rebound here, replacing the objects from the
# earlier sections. Each document is tokenized twice: once to build the
# dictionary and once to convert it to a bag of words.
mydict = corpora.Dictionary([simple_preprocess(line) for line in documents])
corpus = [mydict.doc2bow(simple_preprocess(line)) for line in documents]

In [27]:
# Show the raw term counts of every document as [token, count] pairs
for doc in corpus:
    readable = [[mydict[token_id], count] for token_id, count in doc]
    print(readable)

# [['first', 1], ['is', 1], ['line', 1], ['the', 1], ['this', 1]]
# [['is', 1], ['the', 1], ['this', 1], ['second', 1], ['sentence', 1]]
# [['this', 1], ['document', 1], ['third', 1]]

[['first', 1], ['is', 1], ['line', 1], ['the', 1], ['this', 1]]
[['is', 1], ['the', 1], ['this', 1], ['second', 1], ['sentence', 1]]
[['this', 1], ['document', 1], ['third', 1]]


In [28]:
# Fit the TF-IDF model on the bag-of-words corpus (SMART scheme 'ntc')
tfidf = models.TfidfModel(corpus, smartirs='ntc')

# Show each document as [token, weight] pairs, weights rounded to 2 decimals
for doc in tfidf[corpus]:
    print([[mydict[token_id], np.around(weight, decimals=2)] for token_id, weight in doc])
# [['first', 0.63], ['is', 0.31], ['line', 0.63], ['the', 0.31], ['this', 0.13]]
# [['is', 0.31], ['the', 0.31], ['this', 0.13], ['second', 0.63], ['sentence', 0.63]]
# [['this', 0.15], ['document', 0.7], ['third', 0.7]]

[['first', 0.63], ['is', 0.31], ['line', 0.63], ['the', 0.31], ['this', 0.13]]
[['is', 0.31], ['the', 0.31], ['this', 0.13], ['second', 0.63], ['sentence', 0.63]]
[['this', 0.15], ['document', 0.7], ['third', 0.7]]


Notice the difference in weights of the words between the original corpus and the tfidf weighted corpus.

The words ‘is’ and ‘the’ occur in two documents and were weighted down. The word ‘this’, which appears in all three documents, is weighted down the most (0.13–0.15 in the output above; the original tutorial reports it being removed altogether). In simple terms, words that occur more frequently across the documents get smaller weights.

## 9. How to use gensim downloader API to load datasets?

Gensim provides an inbuilt API to download popular text datasets and word embedding models.

A comprehensive list of available datasets and models is maintained here.

https://raw.githubusercontent.com/RaRe-Technologies/gensim-data/master/list.json

Using the API to download the dataset is as simple as calling the api.load() method with the right data or model name.

The below example shows how to download the ‘glove-wiki-gigaword-50’ model.

In [29]:
import gensim.downloader as api

# Get information about the model or dataset
# Returns a metadata dict (file size, checksum, license, ...) for the named resource.
api.info('glove-wiki-gigaword-50')
# {'base_dataset': 'Wikipedia 2014 + Gigaword 5 (6B tokens, uncased)',
#  'checksum': 'c289bc5d7f2f02c6dc9f2f9b67641813',
#  'description': 'Pre-trained vectors based on Wikipedia 2014 + Gigaword, 5.6B tokens, 400K vocab, uncased (https://nlp.stanford.edu/projects/glove/).',
#  'file_name': 'glove-wiki-gigaword-50.gz',
#  'file_size': 69182535,
#  'license': 'http://opendatacommons.org/licenses/pddl/',
#  (... truncated...)

{'num_records': 400000,
 'file_size': 69182535,
 'base_dataset': 'Wikipedia 2014 + Gigaword 5 (6B tokens, uncased)',
 'reader_code': 'https://github.com/RaRe-Technologies/gensim-data/releases/download/glove-wiki-gigaword-50/__init__.py',
 'license': 'http://opendatacommons.org/licenses/pddl/',
 'parameters': {'dimension': 50},
 'description': 'Pre-trained vectors based on Wikipedia 2014 + Gigaword, 5.6B tokens, 400K vocab, uncased (https://nlp.stanford.edu/projects/glove/).',
 'preprocessing': 'Converted to w2v format with `python -m gensim.scripts.glove2word2vec -i <fname> -o glove-wiki-gigaword-50.txt`.',
 'read_more': ['https://nlp.stanford.edu/projects/glove/',
  'https://nlp.stanford.edu/pubs/glove.pdf'],
 'checksum': 'c289bc5d7f2f02c6dc9f2f9b67641813',
 'file_name': 'glove-wiki-gigaword-50.gz',
 'parts': 1}

In [30]:
# Download (about 69 MB according to 'file_size' in the info above) and load the vectors
w2v_model = api.load("glove-wiki-gigaword-50")
# Words closest to 'blue', returned as (word, similarity) pairs
w2v_model.most_similar('blue')

[('red', 0.8901656866073608),
 ('black', 0.8648407459259033),
 ('pink', 0.8452916741371155),
 ('green', 0.8346816301345825),
 ('yellow', 0.8320708274841309),
 ('purple', 0.829311192035675),
 ('white', 0.8225342035293579),
 ('orange', 0.8114303350448608),
 ('bright', 0.799933910369873),
 ('colored', 0.787665605545044)]

## 10. How to create bigrams and trigrams using Phraser models?

In [31]:
# Load the text8 dataset and materialise it as a list of token lists
dataset = list(api.load("text8"))

# Dictionary and bag-of-words corpus for the whole dataset
dct = corpora.Dictionary(dataset)
corpus = [dct.doc2bow(tokens) for tokens in dataset]

# Build the bigram models
bigram = gensim.models.phrases.Phrases(dataset, min_count=3, threshold=10)

# Construct bigram: detected word pairs in the first document are joined with '_'
first_doc_bigrams = bigram[dataset[0]]
print(first_doc_bigrams)
# ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used',
#  'against', 'early', 'working_class', 'radicals', 'including', 'the', 'diggers',
#  'of', 'the', 'english', 'revolution', 'and', 'the', 'sans_culottes', 'of', 'the',
#  'french_revolution', 'whilst',...]

['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against', 'early', 'working_class', 'radicals', 'including', 'the', 'diggers', 'of', 'the', 'english', 'revolution', 'and', 'the', 'sans_culottes', 'of', 'the', 'french_revolution', 'whilst', 'the', 'term', 'is', 'still', 'used', 'in', 'a', 'pejorative_way', 'to', 'describe', 'any', 'act', 'that', 'used', 'violent', 'means', 'to', 'destroy', 'the', 'organization', 'of', 'society', 'it', 'has', 'also', 'been', 'taken_up', 'as', 'a', 'positive', 'label', 'by', 'self', 'defined', 'anarchists', 'the', 'word', 'anarchism', 'is', 'derived_from', 'the', 'greek', 'without', 'archons', 'ruler', 'chief', 'king', 'anarchism', 'as', 'a', 'political_philosophy', 'is', 'the', 'belief_that', 'rulers', 'are', 'unnecessary', 'and', 'should_be', 'abolished', 'although', 'there_are', 'differing_interpretations', 'of', 'what', 'this', 'means', 'anarchism', 'also', 'refers_to', 'related', 'social_movements', 'that', 'advocate',

In [32]:
# Build the trigram models
# The phrase model is trained on the bigram-transformed dataset, so an existing
# bigram token can be joined with a neighbouring token.
trigram = gensim.models.phrases.Phrases(bigram[dataset], threshold=10)

# Construct trigram: apply the bigram model first, then the trigram model
print(trigram[bigram[dataset[0]]])

['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against', 'early', 'working_class', 'radicals', 'including', 'the', 'diggers', 'of', 'the', 'english', 'revolution', 'and', 'the', 'sans_culottes', 'of', 'the', 'french_revolution', 'whilst', 'the', 'term', 'is', 'still', 'used', 'in', 'a', 'pejorative_way', 'to_describe', 'any', 'act', 'that', 'used', 'violent', 'means', 'to', 'destroy', 'the', 'organization', 'of', 'society', 'it', 'has', 'also_been', 'taken_up', 'as', 'a', 'positive', 'label', 'by', 'self', 'defined', 'anarchists', 'the', 'word', 'anarchism', 'is', 'derived_from', 'the', 'greek', 'without', 'archons', 'ruler', 'chief', 'king', 'anarchism', 'as', 'a', 'political_philosophy', 'is', 'the', 'belief_that', 'rulers', 'are', 'unnecessary', 'and', 'should_be', 'abolished', 'although', 'there_are', 'differing_interpretations', 'of', 'what', 'this', 'means', 'anarchism', 'also', 'refers_to', 'related', 'social_movements', 'that', 'advocate', 'the'

## 11. How to create Topic Models with LDA?

In [11]:
# Step 0: Import packages and stopwords
from gensim.models import LdaModel, LdaMulticore
import gensim.downloader as api
# NOTE(review): `lemmatize` presumably needs the optional `pattern` package and
# may be missing from newer gensim releases -- confirm for your version.
from gensim.utils import simple_preprocess, lemmatize
from nltk.corpus import stopwords
import re
import logging
# Show gensim's INFO-level progress messages with a timestamp
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
logging.root.setLevel(level=logging.INFO)
# English stop words plus a few extra high-frequency tokens to drop
stop_words = stopwords.words('english')
stop_words = stop_words + ['com', 'edu', 'subject', 'lines', 'organization', 'would', 'article', 'could']

In [12]:
# Step 1: Import the dataset and materialise it as a list of documents.
# Each document is a list of word tokens (the loop in Step 2 iterates over them).
dataset = api.load("text8")
data = [d for d in dataset]

In [13]:
# Step 2: Prepare Data (Remove stopwords and lemmatize)
# NOTE(review): the recorded run of this cell ended in
# "RuntimeError: generator raised StopIteration"; this presumably comes from the
# lemmatizer's backend on newer Python versions, not from this loop -- confirm.
stop_word_set = set(stop_words)            # set membership is O(1) per word
allowed_tags = re.compile(r'(NN|JJ|RB)')   # compile the POS-tag filter once, not per word
data_processed = []

for doc in data[:100]:                     # only the first 100 documents
    doc_out = []
    for wd in doc:
        if wd in stop_word_set:            # remove stopwords
            continue
        lemmatized_word = lemmatize(wd, allowed_tags=allowed_tags)  # lemmatize
        if lemmatized_word:
            # The first result is a bytes value containing '/'; keep the part
            # before the first '/' and decode it to str.
            doc_out.append(lemmatized_word[0].split(b'/')[0].decode('utf-8'))
    data_processed.append(doc_out)

# Print a small sample
print(data_processed[0][:5])
#> ['anarchism', 'originated', 'term', 'abuse', 'first']

RuntimeError: generator raised StopIteration

In [19]:
# Step 3: Create the Inputs of LDA model: Dictionary and Corpus
# `dct` maps each token to an id; `corpus` holds one bag-of-words vector per
# processed document. Both names are rebound here, replacing earlier objects.
dct = corpora.Dictionary(data_processed)
corpus = [dct.doc2bow(line) for line in data_processed]

2019-11-12 10:28:33,008 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2019-11-12 10:28:33,426 : INFO : built Dictionary(40074 unique tokens: ['ability', 'able', 'abnormal', 'abolition', 'absence']...) from 100 documents (total 425549 corpus positions)


In [20]:
# Step 4: Train the LDA model
# Trains 7 topics with 10 passes over the corpus; random_state is fixed so
# repeated runs are comparable. per_word_topics=True is what later makes
# lda_model[doc] return word-level topics and phi values.
lda_model = LdaMulticore(corpus=corpus,
                         id2word=dct,
                         random_state=100,
                         num_topics=7,
                         passes=10,
                         chunksize=1000,
                         batch=False,
                         alpha='asymmetric',
                         decay=0.5,
                         offset=64,
                         eta=None,
                         eval_every=0,
                         iterations=100,
                         gamma_threshold=0.001,
                         per_word_topics=True)

# save the model (written to the current working directory)
lda_model.save('lda_model.model')

# See the topics (-1 presumably requests all topics -- confirm in the gensim docs)
lda_model.print_topics(-1)

2019-11-12 10:28:39,263 : INFO : using asymmetric alpha [0.26219156, 0.19027454, 0.14931786, 0.12287004, 0.104381524, 0.090729296, 0.080235206]
2019-11-12 10:28:39,265 : INFO : using symmetric eta at 0.14285714285714285
2019-11-12 10:28:39,271 : INFO : using serial LDA version on this node
2019-11-12 10:28:39,313 : INFO : running online LDA training, 7 topics, 10 passes over the supplied corpus of 100 documents, updating every 7000 documents, evaluating every ~0 documents, iterating 100x with a convergence threshold of 0.001000
2019-11-12 10:28:39,315 : INFO : training LDA model using 7 processes
2019-11-12 10:28:39,438 : INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #100/100, outstanding queue size 1
2019-11-12 10:28:49,734 : INFO : topic #6 (0.080): 0.001*"also" + 0.001*"american" + 0.001*"state" + 0.001*"person" + 0.001*"time" + 0.001*"world" + 0.001*"year" + 0.001*"war" + 0.001*"many" + 0.001*"first"
2019-11-12 10:28:49,736 : INFO : topic #5 (0.091): 0.001*"also" +

2019-11-12 10:28:53,411 : INFO : topic #5 (0.091): 0.001*"also" + 0.001*"abortion" + 0.001*"first" + 0.001*"american" + 0.000*"state" + 0.000*"many" + 0.000*"year" + 0.000*"time" + 0.000*"war" + 0.000*"person"
2019-11-12 10:28:53,412 : INFO : topic #2 (0.149): 0.005*"also" + 0.004*"american" + 0.004*"state" + 0.004*"first" + 0.003*"year" + 0.003*"many" + 0.003*"time" + 0.003*"new" + 0.002*"war" + 0.002*"person"
2019-11-12 10:28:53,414 : INFO : topic #1 (0.190): 0.001*"also" + 0.001*"state" + 0.001*"ammonia" + 0.001*"first" + 0.000*"many" + 0.000*"american" + 0.000*"war" + 0.000*"time" + 0.000*"year" + 0.000*"name"
2019-11-12 10:28:53,416 : INFO : topic #0 (0.262): 0.001*"also" + 0.001*"first" + 0.000*"state" + 0.000*"american" + 0.000*"time" + 0.000*"book" + 0.000*"year" + 0.000*"many" + 0.000*"person" + 0.000*"new"
2019-11-12 10:28:53,418 : INFO : topic diff=0.115010, rho=0.119438
2019-11-12 10:28:53,420 : INFO : PROGRESS: pass 7, dispatched chunk #0 = documents up to #100/100, outsta

[(0,
  '0.001*"also" + 0.000*"first" + 0.000*"state" + 0.000*"american" + 0.000*"time" + 0.000*"book" + 0.000*"year" + 0.000*"many" + 0.000*"person" + 0.000*"new"'),
 (1,
  '0.001*"also" + 0.001*"state" + 0.001*"ammonia" + 0.000*"first" + 0.000*"many" + 0.000*"american" + 0.000*"war" + 0.000*"time" + 0.000*"year" + 0.000*"name"'),
 (2,
  '0.005*"also" + 0.004*"american" + 0.004*"state" + 0.004*"first" + 0.003*"year" + 0.003*"many" + 0.003*"time" + 0.003*"new" + 0.003*"war" + 0.003*"person"'),
 (3,
  '0.001*"atheism" + 0.001*"also" + 0.001*"first" + 0.000*"american" + 0.000*"atheist" + 0.000*"god" + 0.000*"state" + 0.000*"many" + 0.000*"new" + 0.000*"year"'),
 (4,
  '0.001*"state" + 0.001*"also" + 0.001*"many" + 0.000*"world" + 0.000*"agave" + 0.000*"time" + 0.000*"new" + 0.000*"war" + 0.000*"god" + 0.000*"person"'),
 (5,
  '0.001*"also" + 0.001*"abortion" + 0.001*"first" + 0.001*"american" + 0.000*"state" + 0.000*"many" + 0.000*"year" + 0.000*"time" + 0.000*"war" + 0.000*"person"'),
 (

The lda_model.print_topics shows what words contributed to which of the 7 topics, along with the weightage of the word’s contribution to that topic.

You can see words like ‘also’ and ‘many’ appearing across different topics. So I would add such words to the stop_words list to remove them, and further tune the topic model for the optimal number of topics.

LdaMulticore() supports parallel processing. Alternately you could also try and see what topics the LdaModel() gives.

## 12. How to interpret the LDA Topic Model’s output?

The lda_model object supports indexing. That is, if you pass a document (list of words) to the lda_model, it provides 3 things:

1. The topic(s) that document belongs to, along with the percentage.
2. The topic(s) each word in that document belongs to.
3. The topic(s) each word in that document belongs to AND the phi values.

So, what is the phi value?

Phi value is the probability of the word belonging to that particular topic. And the sum of phi values for a given word adds up to the number of times that word occurred in that document.

For example, in the sample output below, for the first printed document (corpus[5]) the word with id=0 has a phi value of about 2.888 for topic 2 and about 0.112 for topic 6. These sum to roughly 3, which means the word with id=0 appeared 3 times in that document.

In [None]:
# Reference: https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/topic_methods.ipynb
# For documents 5, 6 and 7 of the corpus, print the three parts the model
# returns per document: document topics, word topics and word phi values.
for c in lda_model[corpus[5:8]]:
    print("Document Topics      : ", c[0])      # [(Topics, Perc Contrib)]
    print("Word id, Topics      : ", c[1][:3])  # [(Word id, [Topics])]
    print("Phi Values (word id) : ", c[2][:2])  # [(Word id, [(Topic, Phi Value)])]
    # Same information again, with word ids translated to words through `dct`
    print("Word, Topics         : ", [(dct[wd], topic) for wd, topic in c[1][:2]])   # [(Word, [Topics])]
    print("Phi Values (word)    : ", [(dct[wd], topic) for wd, topic in c[2][:2]])  # [(Word, [(Topic, Phi Value)])]
    print("------------------------------------------------------\n")

#> Document Topics      :  [(2, 0.96124125), (6, 0.038569752)]
#> Word id, Topics      :  [(0, [2, 6]), (7, [2, 6]), (10, [2, 6])]
#> Phi Values (word id) :  [(0, [(2, 2.887749), (6, 0.112249866)]), (7, [(2, 0.90105206), (6, 0.09893738)])]
#> Word, Topics         :  [('ability', [2, 6]), ('absurdity', [2, 6])]
#> Phi Values (word)    :  [('ability', [(2, 2.887749), (6, 0.112249866)]), ('absurdity', [(2, 0.90105206), (6, 0.09893738)])]
#> ------------------------------------------------------

#> Document Topics      :  [(6, 0.9997751)]
#> Word id, Topics      :  [(0, [6]), (10, [6]), (16, [6])]
#> Phi Values (word id) :  [(0, [(6, 5.9999967)]), (10, [(6, 2.9999983)])]
#> Word, Topics         :  [('ability', [6]), ('academic', [6])]
#> Phi Values (word)    :  [('ability', [(6, 5.9999967)]), ('academic', [(6, 2.9999983)])]
#> ------------------------------------------------------

#> Document Topics      :  [(6, 0.9998023)]
#> Word id, Topics      :  [(1, [6]), (10, [6]), (15, [6])]
#> Phi Values (word id) :  [(1, [(6, 0.99999917)]), (10, [(6, 5.999997)])]
#> Word, Topics         :  [('able', [6]), ('academic', [6])]
#> Phi Values (word)    :  [('able', [(6, 0.99999917)]), ('academic', [(6, 5.999997)])]
#> ------------------------------------------------------

## 13. How to create a LSI topic model using gensim?

In [21]:
from gensim.models import LsiModel

# Build the LSI Model
# Reuses the `corpus` and `dct` built for the LDA model; keeps 7 topics.
lsi_model = LsiModel(corpus=corpus, id2word=dct, num_topics=7, decay=0.5)

# View Topics (weights can be negative, as the output below shows)
pprint(lsi_model.print_topics(-1))

2019-11-12 10:33:46,711 : INFO : using serial LSI version on this node
2019-11-12 10:33:46,713 : INFO : updating model with new documents
2019-11-12 10:33:46,713 : INFO : preparing a new chunk of documents
2019-11-12 10:33:46,748 : INFO : using 100 extra samples and 2 power iterations
2019-11-12 10:33:46,749 : INFO : 1st phase: constructing (40074, 107) action matrix
2019-11-12 10:33:46,791 : INFO : orthonormalizing (40074, 107) action matrix
2019-11-12 10:33:47,455 : INFO : 2nd phase: running dense svd on (107, 100) matrix
2019-11-12 10:33:47,506 : INFO : computing the final decomposition
2019-11-12 10:33:47,508 : INFO : keeping 7 factors (discarding 62.871% of energy spectrum)
2019-11-12 10:33:47,524 : INFO : processed documents up to #100
2019-11-12 10:33:47,528 : INFO : topic #0(973.794): 0.262*"also" + 0.197*"state" + 0.197*"american" + 0.178*"first" + 0.151*"many" + 0.149*"time" + 0.147*"year" + 0.130*"person" + 0.130*"world" + 0.124*"war"
2019-11-12 10:33:47,530 : INFO : topic #

[(0,
  '0.262*"also" + 0.197*"state" + 0.197*"american" + 0.178*"first" + '
  '0.151*"many" + 0.149*"time" + 0.147*"year" + 0.130*"person" + 0.130*"world" '
  '+ 0.124*"war"'),
 (1,
  '0.937*"agave" + 0.164*"asia" + 0.100*"aruba" + 0.063*"plant" + 0.053*"var" '
  '+ 0.052*"state" + 0.045*"east" + 0.044*"congress" + -0.042*"first" + '
  '0.041*"maguey"'),
 (2,
  '-0.507*"american" + -0.180*"football" + -0.179*"player" + -0.168*"war" + '
  '-0.150*"british" + 0.140*"also" + -0.114*"ball" + -0.110*"day" + '
  '0.107*"atheism" + 0.106*"god"'),
 (3,
  '0.362*"apollo" + -0.248*"lincoln" + -0.211*"state" + 0.172*"player" + '
  '0.151*"football" + -0.127*"union" + 0.125*"ball" + -0.124*"government" + '
  '0.116*"moon" + -0.116*"jews"'),
 (4,
  '0.363*"atheism" + 0.334*"god" + 0.329*"lincoln" + 0.230*"apollo" + '
  '0.215*"atheist" + 0.143*"abraham" + -0.136*"island" + 0.132*"aristotle" + '
  '-0.124*"aluminium" + 0.119*"belief"'),
 (5,
  '0.360*"apollo" + -0.344*"atheism" + 0.326*"lincoln" + -

## 14. How to train Word2Vec model using gensim?

A word embedding model is a model that can provide numerical vectors for a given word. Using the Gensim’s downloader API, you can download pre-built word embedding models like word2vec, fasttext, GloVe and ConceptNet. These are built on large corpuses of commonly occurring text data such as wikipedia, google news etc.

However, if you are working in a specialized niche such as technical documents, you may not be able to get word embeddings for all the words. In such cases it is desirable to train your own model.

Gensim’s Word2Vec implementation lets you train your own word embedding model for a given corpus.

In [33]:
from gensim.models.word2vec import Word2Vec
from multiprocessing import cpu_count
import gensim.downloader as api

In [34]:
# Fetch the text8 corpus via gensim's downloader, then materialise it as a list
dataset = api.load("text8")
data = list(dataset)

In [35]:
# Two slices of the corpus: the first 1000 items train the initial model,
# the remainder is held back to update that model later
data_part1, data_part2 = data[:1000], data[1000:]

In [36]:
# Fit Word2Vec on the first part of the corpus.
# No vector size is passed, so the default of 100 dimensions applies;
# min_count=0 keeps every word, and one worker is used per CPU core.
model = Word2Vec(
    data_part1,
    min_count=0,
    workers=cpu_count(),
)

In [37]:
# Get the word vector for given word.
# Vectors are read through the model's KeyedVectors (`model.wv`): indexing the
# Word2Vec object directly is deprecated in gensim 3.x and removed in gensim 4.
model.wv['topic']
#> array([ 0.0512,  0.2555,  0.9393, ... ,-0.5669,  0.6737], dtype=float32)

  


array([-4.27686065e-01,  4.66710120e-01,  7.98740864e-01, -4.79239337e-02,
       -2.44847015e-01, -6.25485778e-01, -6.68306351e-01, -1.28664768e+00,
        3.88884783e-01, -1.12690043e+00, -1.05502933e-01, -6.34094357e-01,
       -1.16273947e-03, -3.92038196e-01, -1.47223949e+00, -3.53690237e-01,
       -1.25110698e+00,  7.00361729e-01, -2.00973213e-01,  5.81444144e-01,
       -3.07558894e-01, -1.48395285e-01, -7.56829858e-01,  4.89700623e-02,
       -7.03266561e-01,  1.39444396e-01,  1.51736066e-01,  2.29729176e-01,
       -8.66924584e-01, -2.63187259e-01,  2.61028618e-01,  1.10645378e+00,
        4.85127717e-01,  2.35181868e-01, -3.38073432e-01, -2.12831900e-01,
       -2.29578391e-01, -1.30433530e-01,  5.57800412e-01,  8.14397275e-01,
        1.98294386e-01, -1.82460293e-01, -2.10709035e-01,  9.95506883e-01,
       -9.26746607e-01,  1.21398902e+00,  2.01963022e-01, -4.60809737e-01,
       -1.55758893e+00, -3.83762047e-02,  1.95163596e+00,  7.21027136e-01,
       -2.61241883e-01, -

In [38]:
# Words closest to 'topic' in the trained model. The query goes through
# `model.wv` because calling most_similar() on the Word2Vec object itself is
# deprecated in gensim 3.x and removed in gensim 4.
model.wv.most_similar('topic')
#> [('discussion', 0.7590423822402954),
#>  ('consensus', 0.7253159284591675),
#>  ('discussions', 0.7252693176269531),
#>  ('interpretation', 0.7196053266525269),
#>  ('viewpoint', 0.7053568959236145),
#>  ('speculation', 0.7021505832672119),
#>  ('discourse', 0.7001898884773254),
#>  ('opinions', 0.6993060111999512),
#>  ('focus', 0.6959210634231567),
#>  ('scholarly', 0.6884037256240845)]

  """Entry point for launching an IPython kernel.


[('discussion', 0.7638168334960938),
 ('interpretation', 0.758408784866333),
 ('explanation', 0.741090714931488),
 ('facts', 0.7296931743621826),
 ('speculation', 0.7257407903671265),
 ('consensus', 0.711347222328186),
 ('debate', 0.7075456976890564),
 ('viewpoint', 0.7045641541481018),
 ('notion', 0.7013946771621704),
 ('premise', 0.6970481276512146)]

In [39]:
# Save and Load Model
# Write the trained model to the file 'newmodel' in the current working directory,
# then rebind `model` to the copy loaded back from that file.
model.save('newmodel')
model = Word2Vec.load('newmodel')

We have trained and saved a Word2Vec model for our document. However, when a new dataset comes, you want to update the model so as to account for new words.

## 15. How to update an existing Word2Vec model with new data?

On an existing Word2Vec model, call build_vocab() on the new dataset and then call the train() method. build_vocab() is called first because the model has to be apprised of what new words to expect in the incoming corpus.


In [21]:
# Update the model with new data.
# build_vocab(update=True) runs first so the words in data_part2 are added to
# the existing vocabulary before training continues.
model.build_vocab(data_part2, update=True)
# `model.epochs` replaces the deprecated `model.iter` alias (removed in gensim 4).
model.train(data_part2, total_examples=model.corpus_count, epochs=model.epochs)
# Read the vector through `model.wv`; indexing the model directly is deprecated.
model.wv['topic']
# array([-0.6482, -0.5468,  1.0688,  0.82  , ... , -0.8411,  0.3974], dtype=float32)

2020-04-13 11:54:20,451 : INFO : collecting all words and their counts
2020-04-13 11:54:20,452 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-04-13 11:54:22,225 : INFO : collected 153347 word types from a corpus of 7005207 raw words and 701 sentences
2020-04-13 11:54:22,226 : INFO : Updating model with new vocabulary
2020-04-13 11:54:22,508 : INFO : New added 153347 unique words (50% of original 306694) and increased the count of 153347 pre-existing words (50% of original 306694)
2020-04-13 11:54:23,669 : INFO : deleting the raw counts dictionary of 153347 items
2020-04-13 11:54:23,674 : INFO : sample=0.001 downsamples 72 most-common words
2020-04-13 11:54:23,675 : INFO : downsampling leaves estimated 10509051 word corpus (150.0% of prior 7005207)
2020-04-13 11:54:24,221 : INFO : estimated required memory for 306694 words and 100 dimensions: 398702200 bytes
2020-04-13 11:54:24,222 : INFO : updating layer weights
  This is separate from the ipykernel pac

array([-0.88150376, -0.14398707, -0.28564858, -0.9891403 ,  0.81116074,
        0.32200527, -0.24999668,  0.58173656,  0.02725367, -0.14557071,
        0.4011717 ,  0.3126943 ,  1.3522577 , -0.1764011 ,  0.2989456 ,
       -0.5057917 , -0.25814036,  0.3784752 , -0.50879544,  1.578752  ,
       -1.0804054 ,  0.642035  , -0.0041557 , -0.16779278, -0.12151621,
       -1.24311   ,  0.9140256 , -0.06902392, -1.0627854 , -1.7399358 ,
       -0.5104757 , -1.6619053 ,  1.1735159 ,  1.2123379 ,  1.375951  ,
       -0.2919345 , -1.2444627 , -0.37713185, -1.4670146 , -0.5360746 ,
       -0.31307065,  0.22087708,  0.30996424,  1.2710524 ,  0.8907565 ,
        0.06827357, -1.7249088 , -1.0765055 ,  0.34220797,  0.9280343 ,
       -2.025619  , -0.74599195,  0.6124367 , -0.03400607,  0.50431764,
       -0.38508362,  0.2701285 ,  0.5647657 , -0.3094654 ,  0.3735259 ,
       -0.77292454, -0.41827148, -0.86693746,  0.22716627,  0.5473429 ,
        1.9730297 , -0.06778929, -0.4741869 ,  0.08877787, -0.19

## 16. How to extract word vectors using pre-trained Word2Vec and FastText models?

We just saw how to get the word vectors for the Word2Vec model we trained. However, gensim also lets you download state-of-the-art pretrained models through the downloader API. Let’s see how to extract the word vectors from a couple of these models.

In [22]:
import gensim.downloader as api

# Download the models
# Each api.load() call fetches one pretrained embedding set and loads it into memory.
# Per this cell's log output, the files are stored under the user's gensim-data
# directory, each loads as a matrix with 300 columns, and loading takes several minutes.
fasttext_model300 = api.load('fasttext-wiki-news-subwords-300')
word2vec_model300 = api.load('word2vec-google-news-300')
glove_model300 = api.load('glove-wiki-gigaword-300')



2020-04-13 15:14:52,417 : INFO : fasttext-wiki-news-subwords-300 downloaded
2020-04-13 15:14:52,446 : INFO : loading projection weights from C:\Users\kiselgof/gensim-data\fasttext-wiki-news-subwords-300\fasttext-wiki-news-subwords-300.gz
2020-04-13 15:22:19,874 : INFO : loaded (999999, 300) matrix from C:\Users\kiselgof/gensim-data\fasttext-wiki-news-subwords-300\fasttext-wiki-news-subwords-300.gz




2020-04-13 15:27:56,725 : INFO : word2vec-google-news-300 downloaded
2020-04-13 15:27:56,745 : INFO : loading projection weights from C:\Users\kiselgof/gensim-data\word2vec-google-news-300\word2vec-google-news-300.gz
2020-04-13 15:30:15,688 : INFO : loaded (3000000, 300) matrix from C:\Users\kiselgof/gensim-data\word2vec-google-news-300\word2vec-google-news-300.gz




2020-04-13 15:31:21,104 : INFO : glove-wiki-gigaword-300 downloaded
2020-04-13 15:31:21,124 : INFO : loading projection weights from C:\Users\kiselgof/gensim-data\glove-wiki-gigaword-300\glove-wiki-gigaword-300.gz
2020-04-13 15:33:50,710 : INFO : loaded (400000, 300) matrix from C:\Users\kiselgof/gensim-data\glove-wiki-gigaword-300\glove-wiki-gigaword-300.gz


In [23]:
# Find the words closest to 'support' in the pretrained word2vec vectors.
# The result is a list of (word, similarity score) pairs, highest score first.
word2vec_model300.most_similar('support')
# [('supporting', 0.6251285076141357),
#  ...
#  ('backing', 0.6007589101791382),
#  ('supports', 0.5269277691841125),
#  ('assistance', 0.520713746547699),
#  ('supportive', 0.5110025405883789)]

2020-04-13 15:34:05,614 : INFO : precomputing L2-norms of word weight vectors


[('supporting', 0.6251285076141357),
 ('suport', 0.6071149706840515),
 ('suppport', 0.6053199768066406),
 ('Support', 0.6044272780418396),
 ('supported', 0.6009396314620972),
 ('backing', 0.6007589101791382),
 ('supports', 0.5269277691841125),
 ('assistance', 0.520713746547699),
 ('sup_port', 0.5192489624023438),
 ('supportive', 0.5110025405883789)]

We have 3 different embedding models. You can evaluate which one performs better using the respective model’s evaluate_word_analogies() on a standard analogies dataset.

In [None]:
# Compare the three pretrained models on the word-analogy benchmark.
# evaluate_word_analogies() returns a tuple; element [0] is the overall accuracy.
# A notebook cell only displays its last bare expression, so the scores are
# collected in a dict and shown together instead of being silently dropped.
from gensim.test.utils import datapath

# questions-words.txt ships with gensim's test data, so no local copy is required
analogies_path = datapath("questions-words.txt")

analogy_accuracy = {
    # Word2vec accuracy
    "word2vec": word2vec_model300.evaluate_word_analogies(analogies=analogies_path)[0],
    #> 0.7401448525607863

    # fastText accuracy
    "fasttext": fasttext_model300.evaluate_word_analogies(analogies=analogies_path)[0],
    #> 0.8827876424099353

    # GloVe accuracy
    "glove": glove_model300.evaluate_word_analogies(analogies=analogies_path)[0],
    #> 0.7195422354510931
}
analogy_accuracy

## 17. How to create document vectors using Doc2Vec?

Unlike Word2Vec, a Doc2Vec model provides a vectorised representation of a group of words taken collectively as a single unit. It is not a simple average of the word vectors of the words in the sentence.

Let’s use the text8 dataset to train the Doc2Vec.

In [None]:
import gensim
import gensim.downloader as api

# Download dataset
dataset = api.load("text8")
data = [d for d in dataset]