## Gensim is a software library located at
https://radimrehurek.com/gensim/

## How to install Gensim
https://radimrehurek.com/gensim/install.html

In [1]:
import gensim



In [2]:
# Create some raw documents
raw_documents = ['I love tacos.',
             'She ran with the chicken.',
             'I don’t choose to take a nap. The nap chooses me.',
            'That man is nice as pie with ice cream.',
            'This pizza is an affront to nature.']

In [3]:
# We are going to need to tokenize, so let's use NLTK
from nltk.tokenize import word_tokenize

In [4]:
# Define a function that makes tokens
def get_tokens(text):
    tokens = word_tokenize(text)
    return tokens

In [5]:
# A Gensim document is a list of tokens
# We could optionally make all of the tokens lower case
gen_docs = [get_tokens(text) for text in raw_documents]
print(gen_docs)

[['I', 'love', 'tacos', '.'], ['She', 'ran', 'with', 'the', 'chicken', '.'], ['I', 'don\xe2\x80\x99t', 'choose', 'to', 'take', 'a', 'nap', '.', 'The', 'nap', 'chooses', 'me', '.'], ['That', 'man', 'is', 'nice', 'as', 'pie', 'with', 'ice', 'cream', '.'], ['This', 'pizza', 'is', 'an', 'affront', 'to', 'nature', '.']]


In [6]:
# Create dictionary from a list of documents
# A dictionary maps every word to a number
dictionary = gensim.corpora.Dictionary(gen_docs)
num_words = len(dictionary)
print("Num words in dictionary: {}".format(num_words))
for idx,word in dictionary.items():
    print(idx,word)


Num words in dictionary: 31
(2, u'love')
(4, u'ran')
(19, u'is')
(20, u'pie')
(28, u'an')
(22, u'as')
(8, u'chicken')
(25, u'cream')
(11, u'don\u2019t')
(18, u'That')
(12, u'chooses')
(21, u'ice')
(3, u'.')
(13, u'to')
(14, u'nap')
(15, u'choose')
(17, u'take')
(10, u'me')
(30, u'nature')
(0, u'I')
(26, u'This')
(1, u'tacos')
(7, u'She')
(16, u'The')
(6, u'with')
(23, u'man')
(9, u'a')
(29, u'affront')
(27, u'pizza')
(5, u'the')
(24, u'nice')


In [16]:
# Convert token id to string; there are two ways to do it
print(dictionary[6])
print(dictionary.id2token[6])

with
with


In [8]:
# Convert string to token id
print(dictionary.token2id['ran'])

4


In [19]:
# Create bag of words
# A bag of words is tf term frequency (tf) of tf-idf
# Called a "bag of words" because order is lost
# Note that "!" is not listed because it is not in the dictionary
bow_doc = dictionary.doc2bow(['I','love','love','love','tacos','!'])
print(bow_doc)

[(0, 1), (1, 1), (2, 3)]


In [20]:
# Create corpus
# A corpus is a list of bags of words
corpus = [dictionary.doc2bow(gen_doc) for gen_doc in gen_docs]
print(corpus)

[[(0, 1), (1, 1), (2, 1), (3, 1)], [(3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)], [(0, 1), (3, 2), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 2), (15, 1), (16, 1), (17, 1)], [(3, 1), (6, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1)], [(3, 1), (13, 1), (19, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1)]]


In [21]:
# Create tf-idf model from corpus
# num_nnz is the number of tokens
tf_idf = gensim.models.TfidfModel(corpus)
print(tf_idf)

TfidfModel(num_docs=5, num_nnz=39)


In [22]:
# Show document in text form, bag of words, and tf-idf
# 0 is tacos, 1 is love, 2 is I
# Value for I is lower because occurs multiple times.
# Value for '.' is 0 because it occurs in all sentences and log_2(1) = 0.
# Vectors are normalized so they sum to 1
print(gen_docs[0])
print(corpus[0])
print(tf_idf[corpus][0])

['I', 'love', 'tacos', '.']
[(0, 1), (1, 1), (2, 1), (3, 1)]
[(0, 0.37344696513776354), (1, 0.6559486886294514), (2, 0.6559486886294514)]


In [23]:
# Show bag of words and tf-idf for new document
# Note it is similar to to document above
bow = dictionary.doc2bow(['I','love','pizza','.'])
print(bow)
print(tf_idf[bow])

[(0, 1), (2, 1), (3, 1), (27, 1)]
[(0, 0.37344696513776354), (2, 0.6559486886294514), (27, 0.6559486886294514)]


In [24]:
# This is just a confirmation. Create tf-idf vector manually. Left as an exercise.
# idf if it occurs once in corpus (like "tacos" and "love")
# idf if it occurs twice in corpus (like "I")
from math import log
num_docs = tf_idf.num_docs
idf_1 = log(num_docs/1,2)
idf_2 = log(num_docs/2,2)
# only show nonzero values, and use numpy array
import numpy as np
v = np.array([idf_1,idf_1,idf_2])
print(v)
# normalize to the length is 1
norm_v = np.linalg.norm(v)
print(norm_v)
# Show normalized vector
print(v/norm_v)


[ 2.32192809  2.32192809  1.        ]
3.43259379415
[ 0.67643544  0.67643544  0.29132489]


In [15]:
# Create similarity measure object in tf-idf space
# First arg is temp external storage
# https://radimrehurek.com/gensim/similarities/docsim.html
sims = gensim.similarities.Similarity('/Users/jmugan/',tf_idf[corpus],
                                      num_features=len(dictionary))
print(sims)

Similarity index with 5 documents in 0 shards (stored under /Users/jmugan/)


In [18]:
import os
print os.getcwd()

C:\Users\Ben Brock\Documents\GitHub\Bens_Portfolio\Data_Mining\DataScience-Projects\LDA\Natural_Language_Text_Processing_With_Python\Chapter 4


In [25]:
# Create similarity measure object in tf-idf space
# First arg is temp external storage
# https://radimrehurek.com/gensim/similarities/docsim.html
# - C:\Users\Ben Brock\Documents\GitHub\Bens_Portfolio\Data_Mining\DataScience-Projects\LDA\Natural_Language_Text_Processing_With_Python
sims = gensim.similarities.Similarity('C:\\Users\\Ben Brock\\Documents\\GitHub\\Bens_Portfolio\\Data_Mining\\DataScience-Projects\\LDA\\Natural_Language_Text_Processing_With_Python\\Chapter 4\\tst',
                                      tf_idf[corpus],
                                      num_features=len(dictionary))
print(sims)

Similarity index with 5 documents in 0 shards (stored under C:\Users\Ben Brock\Documents\GitHub\Bens_Portfolio\Data_Mining\DataScience-Projects\LDA\Natural_Language_Text_Processing_With_Python\Chapter 4\tst)


In [26]:
# Create query document and convert to tf-idf
# doc shares two words with each of first two docs in corpus
query_doc = "chicken with tacos love".split()
print(query_doc)
query_doc_bow = dictionary.doc2bow(query_doc)
print(query_doc_bow)
query_doc_tf_idf = tf_idf[query_doc_bow]
print(query_doc_tf_idf)

['chicken', 'with', 'tacos', 'love']
[(1, 1), (2, 1), (6, 1), (8, 1)]
[(1, 0.5484803253891997), (2, 0.5484803253891997), (6, 0.31226270667960454), (8, 0.5484803253891997)]


In [27]:
# Show array of document similarities to query
# Also both document 0 and 1 match with two words each,
# document 1 matches with word "with" that occurs twice in corpus.
# Only one overlapping word with the fourth document
sims[query_doc_tf_idf]

array([ 0.71954989,  0.34925455,  0.        ,  0.06428327,  0.        ], dtype=float32)

## Exercise: take a document like an email or news article and find sentences most similar to input query sentences.