In [1]:
from gensim import corpora, models, similarities

In [4]:
# Toy corpus: nine documents, each a sparse vector of (feature_id, weight) pairs.
# Feature ids run from 0 to 11, i.e. 12 distinct features in total.
corpus = [[(0, 1.0), (1, 1.0), (2, 1.0)],
           [(2, 1.0), (3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (8, 1.0)],
           [(1, 1.0), (3, 1.0), (4, 1.0), (7, 1.0)],
           [(0, 1.0), (4, 2.0), (7, 1.0)],
           [(3, 1.0), (5, 1.0), (6, 1.0)],
           [(9, 1.0)],
           [(9, 1.0), (10, 1.0)],
           [(9, 1.0), (10, 1.0), (11, 1.0)],
           [(8, 1.0), (10, 1.0), (11, 1.0)]]


In [5]:
# Fit a TF-IDF model on the document statistics of `corpus`.
tfidf = models.TfidfModel(corpus)

In [9]:
# Query vector: features 0 and 4, each with weight 1.
vec = [(0,1), (4,1)]
# Indexing the model with a vector returns that vector re-weighted by TF-IDF.
print(tfidf[vec])

[(0, 0.8075244024440723), (4, 0.5898341626740045)]


In [10]:
# Index the TF-IDF-weighted corpus for similarity queries.
# num_features=12 is the vector width: the corpus uses feature ids 0-11.
index  = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=12)

In [12]:
# Compare the TF-IDF-weighted query against every indexed document.
sims = index[tfidf[vec]]
# Print the scores as (document_index, similarity) pairs.
print(list(enumerate(sims)))

[(0, 0.4662244), (1, 0.19139354), (2, 0.2460055), (3, 0.82094586), (4, 0.0), (5, 0.0), (6, 0.0), (7, 0.0), (8, 0.0)]


In [13]:
# Bare expression: the notebook displays the raw similarity array (one score per indexed document).
sims


array([0.4662244 , 0.19139354, 0.2460055 , 0.82094586, 0.        ,
       0.        , 0.        , 0.        , 0.        ], dtype=float32)

In [2]:
# Log INFO-level (and higher) messages, formatted as "time : level : message".
>>> import logging
>>> logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [3]:
# Nine short document titles used as the raw text collection for the cells below.
>>> documents = ["Human machine interface for lab abc computer applications",
>>>              "A survey of user opinion of computer system response time",
>>>              "The EPS user interface management system",
>>>              "System and human system engineering testing of EPS",
>>>              "Relation of user perceived response time to error measurement",
>>>              "The generation of random binary unordered trees",
>>>              "The intersection graph of paths in trees",
>>>              "Graph minors IV Widths of trees and well quasi ordering",
>>>              "Graph minors A survey"]

In [4]:
# Tokenize: lowercase each document, split on whitespace, drop common stop words.
stoplist = set('for a of the and to in'.split())
texts = [[word for word in document.lower().split() if word not in stoplist]
         for document in documents]

# Remove words that appear only once across the whole collection.
# Counter does the tallying that was previously a hand-written nested loop.
from collections import Counter
frequency = Counter(token for text in texts for token in text)

# Keep a token only if it occurs more than once in the collection.
texts = [[token for token in text if frequency[token] > 1]
         for text in texts]

In [5]:
# Assign an integer id to every distinct token found in `texts`.
dictionary = corpora.Dictionary(texts)

2018-10-07 00:36:05,700 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2018-10-07 00:36:05,702 : INFO : built Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...) from 9 documents (total 29 corpus positions)


In [6]:
# Persist the dictionary to disk (it is reloaded from this path in a later cell).
dictionary.save('./deerwester.dict')
# Print the dictionary's one-line summary.
print(dictionary)

2018-10-07 00:36:06,253 : INFO : saving Dictionary object under ./deerwester.dict, separately None
2018-10-07 00:36:06,256 : INFO : saved ./deerwester.dict


Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...)


In [7]:
# Show the token -> integer id mapping.
print(dictionary.token2id)

{'computer': 0, 'human': 1, 'interface': 2, 'response': 3, 'survey': 4, 'system': 5, 'time': 6, 'user': 7, 'eps': 8, 'trees': 9, 'graph': 10, 'minors': 11}


In [8]:
# Convert an unseen document into a bag-of-words vector using the existing dictionary.
# "interaction" is not a dictionary token, so only "human" and "computer" are counted.
new_doc = "Human computer interaction"
new_vec = dictionary.doc2bow(new_doc.lower().split())
print(new_vec)

[(0, 1), (1, 1)]


In [9]:
# Turn every tokenized document into a sparse list of (token_id, count) pairs.
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('./deerwester.mm', corpus) # store to disk for later use
print(corpus)

2018-10-07 00:36:08,077 : INFO : storing corpus in Matrix Market format to ./deerwester.mm
2018-10-07 00:36:08,079 : INFO : saving sparse matrix to ./deerwester.mm
2018-10-07 00:36:08,081 : INFO : PROGRESS: saving document #0
2018-10-07 00:36:08,083 : INFO : saved 9x12 matrix, density=25.926% (28/108)
2018-10-07 00:36:08,084 : INFO : saving MmCorpus index to ./deerwester.mm.index


[[(0, 1), (1, 1), (2, 1)], [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)], [(2, 1), (5, 1), (7, 1), (8, 1)], [(1, 1), (5, 2), (8, 1)], [(3, 1), (6, 1), (7, 1)], [(9, 1)], [(9, 1), (10, 1)], [(9, 1), (10, 1), (11, 1)], [(4, 1), (10, 1), (11, 1)]]


In [10]:
class MyCorpus(object):
    """Streaming corpus backed by the text file ``mycorpus.txt``.

    Iterating yields one bag-of-words vector per line of the file, built with
    the module-level ``dictionary``. The file is read line by line, so the
    whole corpus never has to be held in memory at once.
    """

    def __iter__(self):
        """Yield ``dictionary.doc2bow(...)`` for each line of ``mycorpus.txt``."""
        # The `with` block closes the file once iteration finishes (or the
        # generator is closed early), instead of leaking the handle.
        with open('mycorpus.txt') as corpus_file:
            for line in corpus_file:
                # assume there's one doc per line, tokens separated by whitespace
                yield dictionary.doc2bow(line.lower().split())

In [11]:
# Creating the object does not read the file; mycorpus.txt is opened only on iteration.
corpus_mem_friendly = MyCorpus()
# MyCorpus defines no custom repr, so this prints only the default object description.
print(corpus_mem_friendly)

<__main__.MyCorpus object at 0x7f5ab45f77f0>


In [17]:
from six import iteritems

In [19]:
import os

In [20]:
# Reload the dictionary and corpus persisted by the earlier cells.
# Both files are loaded below, so both must exist: checking only the
# dictionary would fail inside MmCorpus() when the .mm file is missing.
if os.path.exists('./deerwester.dict') and os.path.exists('./deerwester.mm'):
    dictionary = corpora.Dictionary.load('./deerwester.dict')
    corpus = corpora.MmCorpus('./deerwester.mm')
    print("Used files generated from first tutorial")
else:
    print("Please run first tutorial to generate data set")

2018-10-07 01:04:47,366 : INFO : loading Dictionary object from ./deerwester.dict
2018-10-07 01:04:47,369 : INFO : loaded ./deerwester.dict
2018-10-07 01:04:47,371 : INFO : loaded corpus index from ./deerwester.mm.index
2018-10-07 01:04:47,372 : INFO : initializing cython corpus reader from ./deerwester.mm
2018-10-07 01:04:47,373 : INFO : accepted corpus with 9 documents, 12 features, 28 non-zero entries


Used files generated from first tutorial


In [21]:
# Fit a TF-IDF model on the document statistics of the current `corpus`.
tfidf = models.TfidfModel(corpus)

2018-10-07 01:59:54,096 : INFO : collecting document frequencies
2018-10-07 01:59:54,097 : INFO : PROGRESS: processing document #0
2018-10-07 01:59:54,098 : INFO : calculating IDF weights for 9 documents and 11 features (28 matrix non-zeros)


In [28]:
# The corpus re-expressed in TF-IDF weights.
corpus_tfidf = tfidf[corpus]

In [29]:
# Train a 2-topic LSI model on the TF-IDF corpus; `dictionary` maps ids back to words.
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2)
# Project the TF-IDF corpus into the 2-dimensional topic space.
corpus_lsi = lsi[corpus_tfidf]

2018-10-07 02:18:32,315 : INFO : using serial LSI version on this node
2018-10-07 02:18:32,316 : INFO : updating model with new documents
2018-10-07 02:18:32,318 : INFO : preparing a new chunk of documents
2018-10-07 02:18:32,320 : INFO : using 100 extra samples and 2 power iterations
2018-10-07 02:18:32,321 : INFO : 1st phase: constructing (12, 102) action matrix
2018-10-07 02:18:32,323 : INFO : orthonormalizing (12, 102) action matrix
2018-10-07 02:18:32,394 : INFO : 2nd phase: running dense svd on (12, 9) matrix
2018-10-07 02:18:32,396 : INFO : computing the final decomposition
2018-10-07 02:18:32,397 : INFO : keeping 2 factors (discarding 47.565% of energy spectrum)
2018-10-07 02:18:32,398 : INFO : processed documents up to #9
2018-10-07 02:18:32,462 : INFO : topic #0(1.594): 0.703*"trees" + 0.538*"graph" + 0.402*"minors" + 0.187*"survey" + 0.061*"system" + 0.060*"time" + 0.060*"response" + 0.058*"user" + 0.049*"computer" + 0.035*"interface"
2018-10-07 02:18:32,463 : INFO : topic #

In [32]:
# Logs each topic and returns them as (topic_id, weighted-word description) pairs.
lsi.print_topics()

2018-10-07 02:20:46,416 : INFO : topic #0(1.594): 0.703*"trees" + 0.538*"graph" + 0.402*"minors" + 0.187*"survey" + 0.061*"system" + 0.060*"time" + 0.060*"response" + 0.058*"user" + 0.049*"computer" + 0.035*"interface"
2018-10-07 02:20:46,417 : INFO : topic #1(1.476): -0.460*"system" + -0.373*"user" + -0.332*"eps" + -0.328*"interface" + -0.320*"time" + -0.320*"response" + -0.293*"computer" + -0.280*"human" + -0.171*"survey" + 0.161*"trees"


[(0,
  '0.703*"trees" + 0.538*"graph" + 0.402*"minors" + 0.187*"survey" + 0.061*"system" + 0.060*"time" + 0.060*"response" + 0.058*"user" + 0.049*"computer" + 0.035*"interface"'),
 (1,
  '-0.460*"system" + -0.373*"user" + -0.332*"eps" + -0.328*"interface" + -0.320*"time" + -0.320*"response" + -0.293*"computer" + -0.280*"human" + -0.171*"survey" + 0.161*"trees"')]

In [33]:
# Print each document's coordinates in the 2-topic LSI space as (topic_id, value) pairs.
for doc in corpus_lsi:
    print(doc)

[(0, 0.06600783396090595), (1, -0.5200703306361845)]
[(0, 0.19667592859142907), (1, -0.7609563167700042)]
[(0, 0.08992639972446753), (1, -0.7241860626752505)]
[(0, 0.07585847652178435), (1, -0.6320551586003422)]
[(0, 0.10150299184980455), (1, -0.5737308483002956)]
[(0, 0.7032108939378307), (1, 0.16115180214026148)]
[(0, 0.8774787673119826), (1, 0.16758906864659842)]
[(0, 0.9098624686818573), (1, 0.14086553628719448)]
[(0, 0.6165825350569285), (1, -0.05392907566389098)]


In [34]:
# Round-trip the trained model through disk: save it, then load it back from the same path.
lsi.save('./model.lsi')
lsi = models.LsiModel.load('./model.lsi')

2018-10-07 02:24:08,896 : INFO : saving Projection object under ./model.lsi.projection, separately None
2018-10-07 02:24:08,898 : INFO : saved ./model.lsi.projection
2018-10-07 02:24:08,899 : INFO : saving LsiModel object under ./model.lsi, separately None
2018-10-07 02:24:08,899 : INFO : not storing attribute projection
2018-10-07 02:24:08,900 : INFO : not storing attribute dispatcher
2018-10-07 02:24:08,901 : INFO : saved ./model.lsi
2018-10-07 02:24:08,901 : INFO : loading LsiModel object from ./model.lsi
2018-10-07 02:24:08,902 : INFO : loading id2word recursively from ./model.lsi.id2word.* with mmap=None
2018-10-07 02:24:08,902 : INFO : setting ignored attribute projection to None
2018-10-07 02:24:08,903 : INFO : setting ignored attribute dispatcher to None
2018-10-07 02:24:08,903 : INFO : loaded ./model.lsi
2018-10-07 02:24:08,903 : INFO : loading LsiModel object from ./model.lsi.projection
2018-10-07 02:24:08,904 : INFO : loaded ./model.lsi.projection


In [38]:
# Re-train a 2-topic LSI model directly on the bag-of-words corpus (no TF-IDF step).
# This replaces the model previously bound to `lsi`.
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)

2018-10-07 03:44:28,967 : INFO : using serial LSI version on this node
2018-10-07 03:44:28,969 : INFO : updating model with new documents
2018-10-07 03:44:28,970 : INFO : preparing a new chunk of documents
2018-10-07 03:44:28,971 : INFO : using 100 extra samples and 2 power iterations
2018-10-07 03:44:28,972 : INFO : 1st phase: constructing (12, 102) action matrix
2018-10-07 03:44:28,973 : INFO : orthonormalizing (12, 102) action matrix
2018-10-07 03:44:28,975 : INFO : 2nd phase: running dense svd on (12, 9) matrix
2018-10-07 03:44:28,976 : INFO : computing the final decomposition
2018-10-07 03:44:28,977 : INFO : keeping 2 factors (discarding 43.156% of energy spectrum)
2018-10-07 03:44:28,977 : INFO : processed documents up to #9
2018-10-07 03:44:28,978 : INFO : topic #0(3.341): 0.644*"system" + 0.404*"user" + 0.301*"eps" + 0.265*"response" + 0.265*"time" + 0.240*"computer" + 0.221*"human" + 0.206*"survey" + 0.198*"interface" + 0.036*"graph"
2018-10-07 03:44:28,979 : INFO : topic #1(2

In [39]:
# Query document: tokenize, map to a bag-of-words vector, then project into LSI space.
doc = "Human computer interaction"
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi = lsi[vec_bow] #convert the query to LSI space
print(vec_lsi)

[(0, 0.4618210045327155), (1, -0.07002766527899965)]


In [40]:
# `lsi` was re-trained on the raw bag-of-words corpus, so the earlier `corpus_lsi`
# (produced by the TF-IDF-trained model) does not match it; the corpus has to be
# projected again with the current model before it is indexed.
index = similarities.MatrixSimilarity(lsi[corpus])  # transform corpus to LSI space and index it

2018-10-07 03:48:11,058 : INFO : creating matrix with 9 documents and 2 features
  if np.issubdtype(vec.dtype, np.int):


In [42]:
# Round-trip the similarity index through disk: save it, then load it back from the same path.
index.save('./deerwester.index')
index = similarities.MatrixSimilarity.load('./deerwester.index')

2018-10-07 03:50:04,890 : INFO : saving MatrixSimilarity object under ./deerwester.index, separately None
2018-10-07 03:50:04,892 : INFO : saved ./deerwester.index
2018-10-07 03:50:04,892 : INFO : loading MatrixSimilarity object from ./deerwester.index
2018-10-07 03:50:04,893 : INFO : loaded ./deerwester.index


In [43]:
# Score the LSI-space query against every indexed document.
sims = index[vec_lsi]
# Print the scores as (document_index, similarity) pairs.
print(list(enumerate(sims)))

[(0, 0.998093), (1, 0.93748635), (2, 0.9984453), (3, 0.9865886), (4, 0.90755945), (5, -0.12416792), (6, -0.10639259), (7, -0.09879464), (8, 0.050041765)]
