In [2]:
! pip install gensim

Collecting gensim
  Downloading https://files.pythonhosted.org/packages/2b/e0/fa6326251692056dc880a64eb22117e03269906ba55a6864864d24ec8b4e/gensim-3.8.3-cp36-cp36m-manylinux1_x86_64.whl (24.2MB)
[K    100% |████████████████████████████████| 24.2MB 16kB/s  eta 0:00:01
Collecting smart-open>=1.8.1 (from gensim)
  Downloading https://files.pythonhosted.org/packages/11/9a/ba2d5f67f25e8d5bbf2fcec7a99b1e38428e83cb715f64dd179ca43a11bb/smart_open-3.0.0.tar.gz (113kB)
[K    100% |████████████████████████████████| 122kB 6.4MB/s eta 0:00:01
Building wheels for collected packages: smart-open
  Running setup.py bdist_wheel for smart-open ... [?25ldone
[?25h  Stored in directory: /home/jovyan/.cache/pip/wheels/18/88/7c/f06dabd5e9cabe02d2269167bcacbbf9b47d0c0ff7d6ebcb78
Successfully built smart-open
Installing collected packages: smart-open, gensim
Successfully installed gensim-3.8.3 smart-open-3.0.0
[33mYou are using pip version 9.0.1, however version 20.2.4 is available.
You should consider upg

In [3]:
import gensim
from gensim import corpora
from pprint import pprint

# How to create a dictionary from a list of sentences?
documents = ["The Saudis are preparing a report that will acknowledge that", 
             "Saudi journalist Jamal Khashoggi's death was the result of an", 
             "interrogation that went wrong, one that was intended to lead", 
             "to his abduction from Turkey, according to two sources."]
'''
documents_2 = ["One source says the report will likely conclude that", 
                "the operation was carried out without clearance and", 
                "transparency and that those involved will be held", 
                "responsible. One of the sources acknowledged that the", 
                "report is still being prepared and cautioned that", 
                "things could change."]
'''

documents_2 = ["The intersection graph of paths in trees",
               "Graph minors IV Widths of trees and well quasi ordering",
               "Graph minors A survey"]

In [5]:
# Tokenize(split) the sentences into words
texts = [[text for text in doc.split()] for doc in documents]
# Create dictionary
dictionary = corpora.Dictionary(texts)

In [6]:
print(dictionary)
print(dictionary.token2id)

Dictionary(33 unique tokens: ['Saudis', 'The', 'a', 'acknowledge', 'are']...)
{'Saudis': 0, 'The': 1, 'a': 2, 'acknowledge': 3, 'are': 4, 'preparing': 5, 'report': 6, 'that': 7, 'will': 8, 'Jamal': 9, "Khashoggi's": 10, 'Saudi': 11, 'an': 12, 'death': 13, 'journalist': 14, 'of': 15, 'result': 16, 'the': 17, 'was': 18, 'intended': 19, 'interrogation': 20, 'lead': 21, 'one': 22, 'to': 23, 'went': 24, 'wrong,': 25, 'Turkey,': 26, 'abduction': 27, 'according': 28, 'from': 29, 'his': 30, 'sources.': 31, 'two': 32}


In [7]:
texts_2 = [[text for text in doc.split()] for doc in documents_2]
dictionary.add_documents(texts_2)

In [8]:
print(dictionary)
print(dictionary.token2id)

Dictionary(48 unique tokens: ['Saudis', 'The', 'a', 'acknowledge', 'are']...)
{'Saudis': 0, 'The': 1, 'a': 2, 'acknowledge': 3, 'are': 4, 'preparing': 5, 'report': 6, 'that': 7, 'will': 8, 'Jamal': 9, "Khashoggi's": 10, 'Saudi': 11, 'an': 12, 'death': 13, 'journalist': 14, 'of': 15, 'result': 16, 'the': 17, 'was': 18, 'intended': 19, 'interrogation': 20, 'lead': 21, 'one': 22, 'to': 23, 'went': 24, 'wrong,': 25, 'Turkey,': 26, 'abduction': 27, 'according': 28, 'from': 29, 'his': 30, 'sources.': 31, 'two': 32, 'graph': 33, 'in': 34, 'intersection': 35, 'paths': 36, 'trees': 37, 'Graph': 38, 'IV': 39, 'Widths': 40, 'and': 41, 'minors': 42, 'ordering': 43, 'quasi': 44, 'well': 45, 'A': 46, 'survey': 47}


### How to create a Dictionary from one or more text files?

In [9]:
# The advantage here is it let’s you read an entire text file without loading the file in memory all at once.

from gensim.utils import simple_preprocess
from smart_open import smart_open
import os

# Create gensim dictionary form a single tet file
dictionary = corpora.Dictionary(simple_preprocess(line, deacc=True) for line in open('poem.txt', encoding='utf-8'))

# Token to Id map
#dictionary.token2id

'''
Now, how to read one-line-at-a-time from multiple files?

class ReadTxtFiles(object):
    def __init__(self, dirname):
        self.dirname = dirname

    def __iter__(self):
        for fname in os.listdir(self.dirname):
            for line in open(os.path.join(self.dirname, fname), encoding='latin'):
                yield simple_preprocess(line)

path_to_text_directory = "lsa_sports_food_docs"

dictionary = corpora.Dictionary(ReadTxtFiles(path_to_text_directory))
'''

'\nNow, how to read one-line-at-a-time from multiple files?\n\nclass ReadTxtFiles(object):\n    def __init__(self, dirname):\n        self.dirname = dirname\n\n    def __iter__(self):\n        for fname in os.listdir(self.dirname):\n            for line in open(os.path.join(self.dirname, fname), encoding=\'latin\'):\n                yield simple_preprocess(line)\n\npath_to_text_directory = "lsa_sports_food_docs"\n\ndictionary = corpora.Dictionary(ReadTxtFiles(path_to_text_directory))\n'

### How to create a bag of words corpus in gensim?

In [10]:
my_docs = ["Who let the dogs out?",
           "Who? Who? Who? Who?"]
# Tokenize the docs
tokenized_list = [simple_preprocess(doc) for doc in my_docs]
# Create the Corpus
mydict = corpora.Dictionary()
mycorpus = [mydict.doc2bow(doc, allow_update=True) for doc in tokenized_list]
pprint(mycorpus)

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)], [(4, 4)]]


In [11]:
word_counts = [[(mydict[id], count) for id, count in line] for line in mycorpus]
pprint(word_counts)

[[('dogs', 1), ('let', 1), ('out', 1), ('the', 1), ('who', 1)], [('who', 4)]]


In [12]:
mydict.token2id

{'dogs': 0, 'let': 1, 'out': 2, 'the': 3, 'who': 4}

### How to create a bag of words corpus from a text file?

In [13]:

from gensim.utils import simple_preprocess
from smart_open import smart_open
import nltk
nltk.download('stopwords')  # run once
from nltk.corpus import stopwords
stop_words = stopwords.words('english')


class BoWCorpus(object):
    def __init__(self, path, dictionary):
        self.filepath = path
        self.dictionary = dictionary

    def __iter__(self):
        global mydict  # OPTIONAL, only if updating the source dictionary.
        for line in smart_open(self.filepath, encoding='latin'):
            # tokenize
            tokenized_list = simple_preprocess(line, deacc=True)

            # create bag of words
            bow = self.dictionary.doc2bow(tokenized_list, allow_update=True)

            # update the source dictionary (OPTIONAL)
            mydict.merge_with(self.dictionary)

            # lazy return the BoW
            yield bow


# Create the Dictionary
mydict = corpora.Dictionary()

# Create the Corpus
bow_corpus = BoWCorpus('poem.txt', dictionary=mydict)  # memory friendly

# Print the token_id and count for each line.
for line in bow_corpus:
    print(line)

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1)]
[(6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1)]
[(6, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1)]
[(6, 1), (8, 1), (14, 1), (17, 2), (18, 1), (19, 1), (20, 1)]
[(1, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1)]
[]
[(17, 2), (23, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1)]
[(6, 1), (23, 1), (32, 1), (33, 1), (34, 1), (35, 1)]
[(6, 1), (22, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1)]
[(17, 1), (23, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1)]
[(23, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1)]
[]
[(6, 1), (7, 1), (43, 1), (52, 1), (53, 1), (54, 1)]
[(1, 1), (47, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1)]
[(23, 1), (41, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1)]
[(24, 1), (65, 1), (66, 1), (67, 1), (68, 1), (69, 2), (70, 1)]
[(71, 1), (72, 1), (73, 1), (74, 1), (75, 1)

### How to save a gensim dictionary and corpus to disk and load them back?

In [14]:
# Save the Dict and Corpus
mydict.save('mydict.dict')  # save dict to disk
corpora.MmCorpus.serialize('bow_corpus.mm', bow_corpus)  # save corpus to disk

# Load them back
loaded_dict = corpora.Dictionary.load('mydict.dict')

corpus = corpora.MmCorpus('bow_corpus.mm')
for line in corpus:
    print(line)

[(0, 1.0), (1, 1.0), (2, 1.0), (3, 1.0), (4, 1.0), (5, 1.0)]
[(6, 1.0), (7, 1.0), (8, 1.0), (9, 1.0), (10, 1.0), (11, 1.0)]
[(6, 1.0), (12, 1.0), (13, 1.0), (14, 1.0), (15, 1.0), (16, 1.0)]
[(6, 1.0), (8, 1.0), (14, 1.0), (17, 2.0), (18, 1.0), (19, 1.0), (20, 1.0)]
[(1, 1.0), (21, 1.0), (22, 1.0), (23, 1.0), (24, 1.0), (25, 1.0), (26, 1.0)]
[]
[(17, 2.0), (23, 1.0), (27, 1.0), (28, 1.0), (29, 1.0), (30, 1.0), (31, 1.0)]
[(6, 1.0), (23, 1.0), (32, 1.0), (33, 1.0), (34, 1.0), (35, 1.0)]
[(6, 1.0), (22, 1.0), (36, 1.0), (37, 1.0), (38, 1.0), (39, 1.0), (40, 1.0)]
[(17, 1.0), (23, 1.0), (41, 1.0), (42, 1.0), (43, 1.0), (44, 1.0), (45, 1.0)]
[(23, 1.0), (46, 1.0), (47, 1.0), (48, 1.0), (49, 1.0), (50, 1.0), (51, 1.0)]
[]
[(6, 1.0), (7, 1.0), (43, 1.0), (52, 1.0), (53, 1.0), (54, 1.0)]
[(1, 1.0), (47, 1.0), (55, 1.0), (56, 1.0), (57, 1.0), (58, 1.0), (59, 1.0)]
[(23, 1.0), (41, 1.0), (60, 1.0), (61, 1.0), (62, 1.0), (63, 1.0), (64, 1.0)]
[(24, 1.0), (65, 1.0), (66, 1.0), (67, 1.0), (68, 1.0)

### How to create the TFIDF matrix (corpus) in gensim?

In [15]:
'''
How is TFIDF computed?

Tf-Idf is computed by multiplying a local component like term frequency (TF) with a global component, that is, inverse document frequency (IDF) and optionally normalizing the result to unit length.

As a result of this, the words that occur frequently across documents will get downweighted.
'''

'\nHow is TFIDF computed?\n\nTf-Idf is computed by multiplying a local component like term frequency (TF) with a global component, that is, inverse document frequency (IDF) and optionally normalizing the result to unit length.\n\nAs a result of this, the words that occur frequently across documents will get downweighted.\n'

In [16]:
from gensim import models
import numpy as np

documents = ["This is the first line",
             "This is the second sentence",
             "This third document"]

# Create the Dictionary and Corpus
mydict = corpora.Dictionary([simple_preprocess(line) for line in documents])
corpus = [mydict.doc2bow(simple_preprocess(line)) for line in documents]

# Show the Word Weights in Corpus
print("before TF-IDF")
for doc in corpus:
    print([[mydict[id], freq] for id, freq in doc])

# Create the TF-IDF model
tfidf = models.TfidfModel(corpus, smartirs='ntc')

# Show the TF-IDF weights
print("after TF-IDF")
for doc in tfidf[corpus]:
    print([[mydict[id], np.around(freq, decimals=2)] for id, freq in doc])

before TF-IDF
[['first', 1], ['is', 1], ['line', 1], ['the', 1], ['this', 1]]
[['is', 1], ['the', 1], ['this', 1], ['second', 1], ['sentence', 1]]
[['this', 1], ['document', 1], ['third', 1]]
after TF-IDF
[['first', 0.63], ['is', 0.31], ['line', 0.63], ['the', 0.31], ['this', 0.13]]
[['is', 0.31], ['the', 0.31], ['this', 0.13], ['second', 0.63], ['sentence', 0.63]]
[['this', 0.14999999999999999], ['document', 0.69999999999999996], ['third', 0.69999999999999996]]


### How to use gensim downloader API to load datasets?

In [17]:
import gensim.downloader as api

# Get information about the model or dataset
api.info('glove-wiki-gigaword-50')


{'base_dataset': 'Wikipedia 2014 + Gigaword 5 (6B tokens, uncased)',
 'checksum': 'c289bc5d7f2f02c6dc9f2f9b67641813',
 'description': 'Pre-trained vectors based on Wikipedia 2014 + Gigaword, 5.6B tokens, 400K vocab, uncased (https://nlp.stanford.edu/projects/glove/).',
 'file_name': 'glove-wiki-gigaword-50.gz',
 'file_size': 69182535,
 'license': 'http://opendatacommons.org/licenses/pddl/',
 'num_records': 400000,
 'parameters': {'dimension': 50},
 'parts': 1,
 'preprocessing': 'Converted to w2v format with `python -m gensim.scripts.glove2word2vec -i <fname> -o glove-wiki-gigaword-50.txt`.',
 'read_more': ['https://nlp.stanford.edu/projects/glove/',
  'https://nlp.stanford.edu/pubs/glove.pdf'],
 'reader_code': 'https://github.com/RaRe-Technologies/gensim-data/releases/download/glove-wiki-gigaword-50/__init__.py'}

### How to create bigrams and trigrams using Phraser models?

In [18]:
'''
In paragraphs, certain words always tend to occur in pairs (bigram) or in groups of threes (trigram).
Because the two words combined together form the actual entity. For example: The word ‘French’ refers the 
language or region and the word ‘revolution’ can refer to the planetary revolution.
But combining them, ‘French Revolution’, refers to something completely different.'
'''

"\nIn paragraphs, certain words always tend to occur in pairs (bigram) or in groups of threes (trigram).\nBecause the two words combined together form the actual entity. For example: The word ‘French’ refers the \nlanguage or region and the word ‘revolution’ can refer to the planetary revolution.\nBut combining them, ‘French Revolution’, refers to something completely different.'\n"

In [19]:
dataset = ["In paragraphs, certain words always tend to occur in pairs (bigram) or in groups of threes (trigram).\nBecause the two words combined together form the actual entity. For example: The word ‘French’ refers the \nlanguage or region and the word ‘revolution’ can refer to the planetary revolution.\nBut combining them, ‘French Revolution’, refers to something completely different."]
dataset = [wd for wd in dataset]
dct = corpora.Dictionary([simple_preprocess(line) for line in documents])
corpus = [mydict.doc2bow(simple_preprocess(line)) for line in documents]
# Build the bigram models
bigram = gensim.models.phrases.Phrases(dataset, min_count=3, threshold=10)
# Construct bigram
print(bigram[dataset[0]])

['I', 'n', ' ', 'p', 'a', 'r', 'a', 'g', 'r', 'a', 'p', 'h', 's', ',', ' ', 'c', 'e', 'r', 't', 'a', 'i', 'n', ' ', 'w', 'o', 'r', 'd', 's', ' ', 'a', 'l', 'w', 'a', 'y', 's', ' ', 't', 'e', 'n', 'd', ' ', 't', 'o', ' ', 'o', 'c', 'c', 'u', 'r', ' ', 'i', 'n', ' ', 'p', 'a', 'i', 'r', 's', ' ', '(', 'b', 'i', 'g', 'r', 'a', 'm', ')', ' ', 'o', 'r', ' ', 'i', 'n', ' ', 'g', 'r', 'o', 'u', 'p', 's', ' ', 'o', 'f', ' ', 't', 'h', 'r', 'e', 'e', 's', ' ', '(', 't', 'r', 'i', 'g', 'r', 'a', 'm', ')', '.', '\n', 'B', 'e', 'c', 'a', 'u', 's', 'e', ' ', 't', 'h', 'e', ' ', 't', 'w', 'o', ' ', 'w', 'o', 'r', 'd', 's', ' ', 'c', 'o', 'm', 'b', 'i', 'n', 'e', 'd', ' ', 't', 'o', 'g', 'e', 't', 'h', 'e', 'r', ' ', 'f', 'o', 'r', 'm', ' ', 't', 'h', 'e', ' ', 'a', 'c', 't', 'u', 'a', 'l', ' ', 'e', 'n', 't', 'i', 't', 'y', '.', ' ', 'F', 'o', 'r', ' ', 'e', 'x', 'a', 'm', 'p', 'l', 'e', ':', ' ', 'T', 'h', 'e', ' ', 'w', 'o', 'r', 'd', ' ', '‘', 'F', 'r', 'e', 'n', 'c', 'h', '’', ' ', 'r', 'e', 'f'

In [20]:
# Build the trigram models
trigram = gensim.models.phrases.Phrases(bigram[dataset], threshold=10)

# Construct trigram
print(trigram[bigram[dataset[0]]])

['I', 'n', ' ', 'p', 'a', 'r', 'a', 'g', 'r', 'a', 'p', 'h', 's', ',', ' ', 'c', 'e', 'r', 't', 'a', 'i', 'n', ' ', 'w', 'o', 'r', 'd', 's', ' ', 'a', 'l', 'w', 'a', 'y', 's', ' ', 't', 'e', 'n', 'd', ' ', 't', 'o', ' ', 'o', 'c', 'c', 'u', 'r', ' ', 'i', 'n', ' ', 'p', 'a', 'i', 'r', 's', ' ', '(', 'b', 'i', 'g', 'r', 'a', 'm', ')', ' ', 'o', 'r', ' ', 'i', 'n', ' ', 'g', 'r', 'o', 'u', 'p', 's', ' ', 'o', 'f', ' ', 't', 'h', 'r', 'e', 'e', 's', ' ', '(', 't', 'r', 'i', 'g', 'r', 'a', 'm', ')', '.', '\n', 'B', 'e', 'c', 'a', 'u', 's', 'e', ' ', 't', 'h', 'e', ' ', 't', 'w', 'o', ' ', 'w', 'o', 'r', 'd', 's', ' ', 'c', 'o', 'm', 'b', 'i', 'n', 'e', 'd', ' ', 't', 'o', 'g', 'e', 't', 'h', 'e', 'r', ' ', 'f', 'o', 'r', 'm', ' ', 't', 'h', 'e', ' ', 'a', 'c', 't', 'u', 'a', 'l', ' ', 'e', 'n', 't', 'i', 't', 'y', '.', ' ', 'F', 'o', 'r', ' ', 'e', 'x', 'a', 'm', 'p', 'l', 'e', ':', ' ', 'T', 'h', 'e', ' ', 'w', 'o', 'r', 'd', ' ', '‘', 'F', 'r', 'e', 'n', 'c', 'h', '’', ' ', 'r', 'e', 'f'

### How to create Topic Models with LDA?

In [21]:
# Step 0: Import packages and stopwords
from gensim.models import LdaModel, LdaMulticore
import gensim.downloader as api
from gensim.utils import simple_preprocess, lemmatize
from nltk.corpus import stopwords
import re
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
logging.root.setLevel(level=logging.INFO)
stop_words = stopwords.words('english')
stop_words = stop_words + ['com', 'edu', 'subject', 'lines', 'organization', 'would', 'article', 'could']

In [22]:
# Step 1: Import the dataset and get the text and real topic of each news article
# Create the Dictionary
data = [line for  line in open('poem.txt', encoding='utf-8')]
#dictionary = corpora.Dictionary(simple_preprocess(line, deacc=True) for line in open('poem.txt', encoding='utf-8'))

In [23]:
# Step 2: Prepare Data (Remove stopwords and lemmatize)
import pattern3
data_processed = []

for i, doc in enumerate(data[:100]):
    doc_out = []
    for wd in doc:
        if wd not in stop_words:  # remove stopwords
            lemmatized_word = lemmatize(wd, allowed_tags=re.compile('(NN|JJ|RB)'))  # lemmatize
            if lemmatized_word:
                doc_out = doc_out + [lemmatized_word[0].split(b'/')[0].decode('utf-8')]
        else:
            continue
    data_processed.append(doc_out)

# Print a small sample    
print(data_processed[0][:5]) 

# Step 3: Create the Inputs of LDA model: Dictionary and Corpus
dct = corpora.Dictionary(data_processed)
corpus = [dct.doc2bow(line) for line in data_processed]

ModuleNotFoundError: No module named 'pattern3'

In [24]:
# Step 4: Train the LDA model
lda_model = LdaMulticore(corpus=corpus,
                         id2word=dct,
                         random_state=100,
                         num_topics=7,
                         passes=10,
                         chunksize=1000,
                         batch=False,
                         alpha='asymmetric',
                         decay=0.5,
                         offset=64,
                         eta=None,
                         eval_every=0,
                         iterations=100,
                         gamma_threshold=0.001,
                         per_word_topics=True)

# save the model
lda_model.save('lda_model.model')

# See the topics
lda_model.print_topics(-1)

2020-10-25 21:51:38,113 : INFO : using asymmetric alpha [0.26219156, 0.19027454, 0.14931786, 0.12287004, 0.10438152, 0.090729296, 0.080235206]
2020-10-25 21:51:38,168 : INFO : using symmetric eta at 0.14285714285714285
2020-10-25 21:51:38,168 : INFO : using serial LDA version on this node
2020-10-25 21:51:38,171 : INFO : running online LDA training, 7 topics, 10 passes over the supplied corpus of 3 documents, updating every 31000 documents, evaluating every ~0 documents, iterating 100x with a convergence threshold of 0.001000
2020-10-25 21:51:38,173 : INFO : training LDA model using 31 processes
2020-10-25 21:51:38,487 : INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #3/3, outstanding queue size 1
2020-10-25 21:51:38,494 : INFO : topic #6 (0.080): 0.119*"sentence" + 0.117*"this" + 0.116*"second" + 0.113*"line" + 0.111*"third" + 0.111*"the" + 0.108*"document" + 0.105*"is" + 0.100*"first"
2020-10-25 21:51:38,495 : INFO : topic #5 (0.091): 0.129*"second" + 0.119*"first" + 

2020-10-25 21:51:38,776 : INFO : topic #1 (0.190): 0.165*"the" + 0.164*"this" + 0.156*"is" + 0.109*"sentence" + 0.106*"second" + 0.102*"first" + 0.099*"line" + 0.050*"third" + 0.049*"document"
2020-10-25 21:51:38,777 : INFO : topic #0 (0.262): 0.173*"third" + 0.172*"document" + 0.169*"this" + 0.087*"line" + 0.084*"the" + 0.083*"is" + 0.082*"second" + 0.078*"sentence" + 0.071*"first"
2020-10-25 21:51:38,778 : INFO : topic diff=0.106160, rho=0.119520
2020-10-25 21:51:38,778 : INFO : PROGRESS: pass 7, dispatched chunk #0 = documents up to #3/3, outstanding queue size 1
2020-10-25 21:51:38,782 : INFO : topic #6 (0.080): 0.117*"sentence" + 0.116*"this" + 0.115*"second" + 0.113*"line" + 0.111*"third" + 0.111*"the" + 0.109*"document" + 0.106*"is" + 0.102*"first"
2020-10-25 21:51:38,783 : INFO : topic #5 (0.091): 0.126*"second" + 0.118*"first" + 0.111*"is" + 0.111*"line" + 0.110*"third" + 0.110*"document" + 0.107*"sentence" + 0.106*"the" + 0.101*"this"
2020-10-25 21:51:38,783 : INFO : topic #2

[(0,
  '0.194*"third" + 0.193*"document" + 0.191*"this" + 0.075*"line" + 0.073*"the" + 0.072*"is" + 0.071*"second" + 0.068*"sentence" + 0.063*"first"'),
 (1,
  '0.173*"the" + 0.172*"this" + 0.167*"is" + 0.106*"sentence" + 0.104*"second" + 0.102*"first" + 0.100*"line" + 0.038*"third" + 0.037*"document"'),
 (2,
  '0.124*"sentence" + 0.123*"line" + 0.120*"is" + 0.110*"this" + 0.109*"second" + 0.108*"first" + 0.102*"third" + 0.102*"the" + 0.102*"document"'),
 (3,
  '0.123*"first" + 0.115*"this" + 0.113*"document" + 0.112*"the" + 0.111*"third" + 0.111*"second" + 0.105*"line" + 0.105*"sentence" + 0.105*"is"'),
 (4,
  '0.129*"this" + 0.119*"is" + 0.118*"first" + 0.116*"the" + 0.113*"sentence" + 0.105*"second" + 0.101*"document" + 0.101*"line" + 0.098*"third"'),
 (5,
  '0.125*"second" + 0.117*"first" + 0.111*"is" + 0.111*"line" + 0.110*"third" + 0.110*"document" + 0.108*"sentence" + 0.106*"the" + 0.102*"this"'),
 (6,
  '0.117*"sentence" + 0.116*"this" + 0.115*"second" + 0.113*"line" + 0.111*"t

### How to train Word2Vec model using gensim?

In [25]:
from gensim.models.word2vec import Word2Vec
from multiprocessing import cpu_count

data = [line for  line in open('poem.txt', encoding='utf-8')]

# Split the data into 2 parts. Part 2 will be used later to update the model
data_part1 = data[:1000]
data_part2 = data[1000:]

# Train Word2Vec model. Defaults result vector size = 100
model = Word2Vec(data_part1, min_count = 0, workers=cpu_count())

# Get the word vector for given word
model['Two']

2020-10-25 21:51:45,069 : INFO : collecting all words and their counts
2020-10-25 21:51:45,071 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-10-25 21:51:45,071 : INFO : collected 40 word types from a corpus of 729 raw words and 23 sentences
2020-10-25 21:51:45,072 : INFO : Loading a fresh vocabulary
2020-10-25 21:51:45,072 : INFO : effective_min_count=0 retains 40 unique words (100% of original 40, drops 0)
2020-10-25 21:51:45,073 : INFO : effective_min_count=0 leaves 729 word corpus (100% of original 729, drops 0)
2020-10-25 21:51:45,074 : INFO : deleting the raw counts dictionary of 40 items
2020-10-25 21:51:45,074 : INFO : sample=0.001 downsamples 30 most-common words
2020-10-25 21:51:45,075 : INFO : downsampling leaves estimated 144 word corpus (19.8% of prior 729)
2020-10-25 21:51:45,075 : INFO : estimated required memory for 40 words and 100 dimensions: 52000 bytes
2020-10-25 21:51:45,076 : INFO : resetting layer weights
2020-10-25 21:51:45,077 :

2020-10-25 21:51:45,373 : INFO : worker thread finished; awaiting finish of 22 more threads
2020-10-25 21:51:45,374 : INFO : worker thread finished; awaiting finish of 21 more threads
2020-10-25 21:51:45,374 : INFO : worker thread finished; awaiting finish of 20 more threads
2020-10-25 21:51:45,375 : INFO : worker thread finished; awaiting finish of 19 more threads
2020-10-25 21:51:45,375 : INFO : worker thread finished; awaiting finish of 18 more threads
2020-10-25 21:51:45,376 : INFO : worker thread finished; awaiting finish of 17 more threads
2020-10-25 21:51:45,376 : INFO : worker thread finished; awaiting finish of 16 more threads
2020-10-25 21:51:45,377 : INFO : worker thread finished; awaiting finish of 15 more threads
2020-10-25 21:51:45,377 : INFO : worker thread finished; awaiting finish of 14 more threads
2020-10-25 21:51:45,378 : INFO : worker thread finished; awaiting finish of 13 more threads
2020-10-25 21:51:45,378 : INFO : worker thread finished; awaiting finish of 12 m

2020-10-25 21:51:45,673 : INFO : EPOCH - 5 : training on 729 raw words (150 effective words) took 0.1s, 1535 effective words/s
2020-10-25 21:51:45,674 : INFO : training on a 3645 raw words (725 effective words) took 0.6s, 1216 effective words/s
  


KeyError: "word 'Two' not in vocabulary"

In [26]:
data

['Two roads diverged in a yellow wood,\n',
 'And sorry I could not travel both\n',
 'And be one traveler, long I stood\n',
 'And looked down one as far as I could\n',
 'To where it bent in the undergrowth;\n',
 '\n',
 'Then took the other, as just as fair,\n',
 'And having perhaps the better claim,\n',
 'Because it was grassy and wanted wear;\n',
 'Though as for that the passing there\n',
 'Had worn them really about the same,\n',
 '\n',
 'And both that morning equally lay\n',
 'In leaves no step had trodden black.\n',
 'Oh, I kept the first for another day!\n',
 'Yet knowing how way leads on to way,\n',
 'I doubted if I should ever come back.\n',
 '\n',
 'I shall be telling this with a sigh\n',
 'Somewhere ages and ages hence:\n',
 'Two roads diverged in a wood, and I—\n',
 'I took the one less traveled by,\n',
 'And that has made all the difference.']