In [1]:
from gensim import corpora,models,similarities
from prettytable import PrettyTable
import numpy as np
import re

In [2]:
# Load the corpus: each record is a numeric id line, followed by the document
# text (letters, spaces and newlines only), terminated by a line containing "/".
# The context manager guarantees the file is closed even if reading fails, and
# the raw string keeps regex escapes such as \d and \s from being treated as
# (invalid) Python string escapes.
with open('doc-text', 'r') as docfile:
    documents = re.findall(r"\d+\n([A-Za-z \n]+)\n\s*\/", docfile.read())

# Tokenization: lower-case each document and split it on whitespace.
tokenized_documents = [document.lower().split() for document in documents]

In [3]:
# Build the vocabulary for the corpus: a gensim Dictionary that assigns an
# integer id to every distinct token (inspect it via dictionary.token2id or
# dictionary.num_docs when debugging).
dictionary = corpora.Dictionary(documents=tokenized_documents)

In [4]:
# Vectorization: turn every tokenized document into its bag-of-words
# representation using the dictionary built from the corpus.
corpus_doc2bow_vectors = list(map(dictionary.doc2bow, tokenized_documents))

In [5]:
# fit tf_idf model
%time tf_idf_model = models.TfidfModel(corpus_doc2bow_vectors,id2word=dictionary,normalize=False)

# apply model
corpus_tfidf_vectors = tf_idf_model[corpus_doc2bow_vectors]
# for corpus_tfidf_vector in corpus_tfidf_vectors:
#     print(corpus_tfidf_vector,"\n")

Wall time: 327 ms


In [6]:
# Load the queries, which use the same record layout as the corpus file: a
# numeric id line, the query text, then a line containing "/".
# The context manager guarantees the file is closed even if reading fails, and
# the raw string keeps regex escapes such as \d and \s from being treated as
# (invalid) Python string escapes.
with open('query-text', 'r') as queryfile:
    querys = re.findall(r"\d+\n([A-Za-z \n]+)\n\s*\/", queryfile.read())

# Bag-of-words vector for every query, expressed in the corpus dictionary.
query_bow_vectors = [dictionary.doc2bow(query.lower().split()) for query in querys]

In [7]:
# Re-weight each query's bag-of-words vector with the TF-IDF model that was
# fitted on the corpus, one query at a time.
query_tfidf_vectors = [tf_idf_model[bow] for bow in query_bow_vectors]

In [8]:
# Number of best-matching documents reported for each query.
TOP_K = 5

# Index the TF-IDF vectors of the whole corpus for similarity lookups.
index_matrix = similarities.SparseMatrixSimilarity(corpus_tfidf_vectors,num_features=len(dictionary))

# Score every document against each query and print one table per query.
for j, query_tfidf_vector in enumerate(query_tfidf_vectors):
    sims = index_matrix[query_tfidf_vector]

    # Sort once per query (the ranking does not change between columns), then
    # take the highest-scoring positions. Slicing also copes with a corpus
    # that holds fewer than TOP_K documents.
    top_positions = np.argsort(sims)[::-1][:TOP_K]

    table = PrettyTable()
    table.field_names = ["query " + str(j+1)] + [
        "Top document " + str(rank) for rank in range(1, len(top_positions) + 1)
    ]
    # NOTE(review): "+ 1" turns a 0-based corpus position into a 1-based
    # document number; this assumes document ids in the corpus file are
    # sequential and that the loading regex skipped none of them - confirm.
    table.add_row(["Cosine"] + [position + 1 for position in top_positions])
    print(table)

+---------+----------------+----------------+----------------+----------------+----------------+
| query 1 | Top document 1 | Top document 2 | Top document 3 | Top document 4 | Top document 5 |
+---------+----------------+----------------+----------------+----------------+----------------+
|  Cosine |      8582      |      4817      |      2800      |      7230      |      4827      |
+---------+----------------+----------------+----------------+----------------+----------------+
+---------+----------------+----------------+----------------+----------------+----------------+
| query 2 | Top document 1 | Top document 2 | Top document 3 | Top document 4 | Top document 5 |
+---------+----------------+----------------+----------------+----------------+----------------+
|  Cosine |      6229      |      1248      |      8891      |      5775      |      7998      |
+---------+----------------+----------------+----------------+----------------+----------------+
+---------+----------------+--

+----------+----------------+----------------+----------------+----------------+----------------+
| query 26 | Top document 1 | Top document 2 | Top document 3 | Top document 4 | Top document 5 |
+----------+----------------+----------------+----------------+----------------+----------------+
|  Cosine  |      6842      |      5800      |      2632      |      7613      |      2016      |
+----------+----------------+----------------+----------------+----------------+----------------+
+----------+----------------+----------------+----------------+----------------+----------------+
| query 27 | Top document 1 | Top document 2 | Top document 3 | Top document 4 | Top document 5 |
+----------+----------------+----------------+----------------+----------------+----------------+
|  Cosine  |      9160      |      8517      |      6037      |      6004      |      124       |
+----------+----------------+----------------+----------------+----------------+----------------+
+----------+--------

+----------+----------------+----------------+----------------+----------------+----------------+
| query 49 | Top document 1 | Top document 2 | Top document 3 | Top document 4 | Top document 5 |
+----------+----------------+----------------+----------------+----------------+----------------+
|  Cosine  |      222       |      3045      |      9076      |      4209      |      9925      |
+----------+----------------+----------------+----------------+----------------+----------------+
+----------+----------------+----------------+----------------+----------------+----------------+
| query 50 | Top document 1 | Top document 2 | Top document 3 | Top document 4 | Top document 5 |
+----------+----------------+----------------+----------------+----------------+----------------+
|  Cosine  |      1845      |      1205      |      6616      |     10034      |      6417      |
+----------+----------------+----------------+----------------+----------------+----------------+
+----------+--------

+----------+----------------+----------------+----------------+----------------+----------------+
| query 69 | Top document 1 | Top document 2 | Top document 3 | Top document 4 | Top document 5 |
+----------+----------------+----------------+----------------+----------------+----------------+
|  Cosine  |      1954      |      4205      |      4438      |      6815      |      4612      |
+----------+----------------+----------------+----------------+----------------+----------------+
+----------+----------------+----------------+----------------+----------------+----------------+
| query 70 | Top document 1 | Top document 2 | Top document 3 | Top document 4 | Top document 5 |
+----------+----------------+----------------+----------------+----------------+----------------+
|  Cosine  |      626       |      1958      |      7181      |      155       |      441       |
+----------+----------------+----------------+----------------+----------------+----------------+
+----------+--------

+----------+----------------+----------------+----------------+----------------+----------------+
| query 87 | Top document 1 | Top document 2 | Top document 3 | Top document 4 | Top document 5 |
+----------+----------------+----------------+----------------+----------------+----------------+
|  Cosine  |      4737      |      3548      |      3139      |      3994      |     10806      |
+----------+----------------+----------------+----------------+----------------+----------------+
+----------+----------------+----------------+----------------+----------------+----------------+
| query 88 | Top document 1 | Top document 2 | Top document 3 | Top document 4 | Top document 5 |
+----------+----------------+----------------+----------------+----------------+----------------+
|  Cosine  |      9078      |     10185      |     10051      |      1074      |     10215      |
+----------+----------------+----------------+----------------+----------------+----------------+
+----------+--------