# Document Similarity Baseline Code

In [1]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import pairwise_distances
import gensim
from gensim.test.utils import get_tmpfile
from nltk.tokenize import word_tokenize
%matplotlib inline

## Scikit-Learn & Numpy (a lot shorter and faster than Gensim)

In [2]:
def query_cosine(query, X, threshold=0.1):
    """Find the rows of ``X`` that are cosine-similar to ``query``.

    Parameters
    ----------
    query : array-like
        Single query vector; it is reshaped to one row before comparison.
    X : 2-D array or sparse matrix
        One document vector per row, in the same feature space as ``query``.
    threshold : float, default 0.1
        A row is reported only if its similarity is strictly greater than
        this value.

    Returns
    -------
    res : ndarray of int
        Row indices of ``X`` whose similarity exceeds ``threshold``, in
        ascending index order. If no row qualifies, the index of the single
        most similar row is returned instead, so the result is never empty.
    sims_relevant : ndarray of float
        Cosine similarities of the rows listed in ``res``.
    """
    # pairwise_distances returns a (1, n_rows) matrix of cosine *distances*;
    # similarity = 1 - distance. ravel() flattens it without needing len(X),
    # which is not defined for scipy sparse matrices.
    distances = pairwise_distances(np.asarray(query).reshape(1, -1), X, metric='cosine')
    sims = 1 - distances.ravel()

    res = np.flatnonzero(sims > threshold)
    if res.size == 0:
        # Nothing cleared the threshold: fall back to the best match only.
        res = sims.argsort()[-1:]

    return res, sims[res]

In [3]:
# Toy corpus: five short sentences, one document per list entry.
raw_documents = [
    "Once upon a time on Sunday.",
    "Two crocodiles in the river of Nile.",
    "The barber next door cuts his hair.",
    "Calm you mind, calm your senses.",
    "Fun is always fun.",
]

In [10]:
%%time
# Bag-of-words model: learn the vocabulary from the corpus and build the
# document-term count matrix. toarray() yields a plain ndarray directly,
# avoiding the intermediate np.matrix that todense() returns.
cv = CountVectorizer()
X = cv.fit_transform(raw_documents).toarray()

# Encode the query text with the same fitted vocabulary; [0] takes the single
# resulting row as a 1-D vector.
ml_tweet = 'Calm you mind, calm your senses on Sunday of Nile a barber fun.'
query = cv.transform([ml_tweet]).toarray()[0]
result, sims_relevant = query_cosine(query, X)

# Print each matching document next to its cosine similarity.
for score, doc_idx in zip(sims_relevant, result):
    print('{}: {}'.format(score, raw_documents[doc_idx]))

0.23904572186687867: Once upon a time on Sunday.
0.20203050891044216: Two crocodiles in the river of Nile.
0.10101525445522108: The barber next door cuts his hair.
0.7559289460184544: Calm you mind, calm your senses.
0.21821789023599236: Fun is always fun.
Wall time: 1.96 ms


## Gensim
Loosely based on:  
https://www.oreilly.com/learning/how-do-i-compare-document-similarity-using-python

In [5]:
#print(dir(gensim))

In [6]:
# Same five-sentence toy corpus as in the scikit-learn section, declared again
# here so the Gensim section has its input next to it.
raw_documents = [
    "Once upon a time on Sunday.",
    "Two crocodiles in the river of Nile.",
    "The barber next door cuts his hair.",
    "Calm you mind, calm your senses.",
    "Fun is always fun.",
]

In [9]:
%%time
# Lower-cased NLTK tokens per document. Only the '.' token is filtered out, so
# any other punctuation token that word_tokenize emits stays in the document.
gen_docs = [
    [token.lower() for token in word_tokenize(text) if token != '.']
    for text in raw_documents
]

# A dictionary maps every distinct token to an integer id
dictionary = gensim.corpora.Dictionary(gen_docs)
random_word = dictionary[5]   # the token stored under id 5
print('Random word:', random_word)
print('Its index:', dictionary.token2id[random_word])
#print(dictionary.token2id)

# corpus = one bag of words per document, i.e. a list of (token_id, count) pairs
corpus = list(map(dictionary.doc2bow, gen_docs))
print('\nSome examples from corpus:')

# Preview at most two documents next to their bag-of-words encoding.
min_idx = min(2, len(corpus))
for doc, sent in zip(raw_documents[:min_idx], corpus[:min_idx]):
    print(doc, sent, '', sep='\n')

# Fit a tf-idf model on the bag-of-words corpus.
tf_idf = gensim.models.TfidfModel(corpus)
print(tf_idf)

# Total number of (token_id, count) entries over all documents; in the recorded
# output this equals the num_nnz value shown in the model's repr.
num_tokens = sum(len(bow) for bow in corpus)
print('Tokens:', num_tokens)

# Build a similarity index over the tf-idf-transformed corpus.
# get_tmpfile("index") provides a path in the temp directory that is used as
# the prefix under which the index is stored.
# Alternative to get_tmpfile: pass 'folder/fname.txt' instead of index_temp,
# but that folder has to be created inside the current directory first.
index_temp = get_tmpfile("index")
sims = gensim.similarities.Similarity(
    output_prefix=index_temp,
    corpus=tf_idf[corpus],
    num_features=len(dictionary),
)
print(sims)
print(type(sims), '\n')

# Prepare the query: tokenize it exactly like the corpus documents, then map it
# to a bag of words and to tf-idf weights.
test_doc = 'Calm you mind, calm your senses on Sunday of Nile a barber fun.'
query_tokens = [token.lower() for token in word_tokenize(test_doc) if token != '.']
query_doc_bow = dictionary.doc2bow(query_tokens)
query_doc_tf_idf = tf_idf[query_doc_bow]

# Similarity of the query against every document in the index
similarities = sims[query_doc_tf_idf]

# Average similarity over all docs in the corpus
#ave_sim =(np.sum(sims[query_doc_tf_idf], dtype=np.float32)) / len(raw_documents)
#print('Average similarity over corpus:', round(ave_sim, 4))

# Keep every document whose similarity is above 0.1 (indices of the True
# entries of the 1-D mask); if none qualifies, fall back to the single best
# match so there is always something to report.
res = np.flatnonzero(similarities > 0.1)
if res.size == 0:
    res = similarities.argsort()[-1:]
sims_relevant = similarities[res]

# Print each selected document next to its similarity score.
for score, doc_idx in zip(sims_relevant, res):
    print('{}: {}'.format(score, raw_documents[doc_idx]))

Random word: upon
Its index: 5

Some examples from corpus:
Once upon a time on Sunday.
[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1)]

Two crocodiles in the river of Nile.
[(6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1)]

TfidfModel(num_docs=5, num_nnz=29)
Tokens: 29
Similarity index with 5 documents in 0 shards (stored under C:\Users\anedilko\AppData\Local\Temp\index)
<class 'gensim.similarities.docsim.Similarity'> 

0.3061862289905548: Once upon a time on Sunday.
0.19882437586784363: Two crocodiles in the river of Nile.
0.7499999403953552: Calm you mind, calm your senses.
0.20412415266036987: Fun is always fun.
Wall time: 7.5 ms


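**Gensim produces slightly different similarity scores** because the two pipelines are not identical: the Gensim version compares TF-IDF-weighted vectors built from NLTK tokens, whereas the scikit-learn version compares raw term counts from `CountVectorizer`. As a result, the "barber" document, which scores about 0.10 with scikit-learn, falls below the 0.1 threshold in the Gensim output.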