# Document Similarity

In [59]:
import os
import numpy as np
import pandas as pd
from copy import deepcopy
from time import time
import gensim
from gensim.test.utils import get_tmpfile
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import pairwise_distances, cosine_similarity
from nltk.tokenize import word_tokenize

pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)

## Scikit-Learn & Numpy (a lot shorter and faster than Gensim)

[Note 1](https://towardsdatascience.com/overview-of-text-similarity-metrics-3397c4601f50): to use cosine similarity, we first need to convert sentences into vectors. One way to do that is a bag-of-words representation with either TF (term frequency) or TF-IDF (term frequency–inverse document frequency) weights. The choice between TF and TF-IDF depends on the application and is immaterial to how cosine similarity itself is computed — it only needs vectors. **TF is good for text similarity in general, but TF-IDF is good for search query relevance.**

In [60]:
def query_cosine(query, X, j):
    """Return the row of ``X`` most similar to ``query`` and its cosine similarity.

    Parameters
    ----------
    query : 1-D numpy array
        Feature vector to look up (reshaped to a single row before scoring).
    X : 2-D array, one row per document
        Matrix the query is scored against.
    j : int
        Row of ``X`` whose score is forced to 0.0 before the best match is
        picked. The notebook passes the query's own row index here so that a
        document does not match itself with similarity 1.0.

    Returns
    -------
    (index, similarity)
        Index into ``X`` of the highest-scoring row and that row's cosine
        similarity. If no other row scores above 0.0, row ``j`` itself can
        still be returned (with similarity 0.0).
    """
    # Score against the whole matrix and blank out row j afterwards. This gives
    # the same similarity vector as deleting row j first and re-inserting a 0.0
    # at position j, but without duplicating the full dense matrix (twice) on
    # every single query.
    sims = cosine_similarity(query.reshape(1, -1), X).reshape(len(X))
    sims[j] = 0.0

    best = sims.argsort()[-1:]          # index of the largest similarity
    return best[0], sims[best][0]

    # Variant that returns every match above a threshold instead of only the best:
    #res = np.where(sims > 0.9)[0]      # np.where returns one array per axis; sims is 1-D, hence [0]
    #if res.size == 0:
    #    res = sims.argsort()[-1:]      # nothing above the threshold: fall back to the single best match
    #return res, sims[res]

In [61]:
# READ EVERY FILE UNDER THE CORPUS DIRECTORY INTO A (file_name, text) DATAFRAME
# NOTE(review): machine-specific absolute path; consider making the data directory configurable.
wdir = 'C:/Users/an/Documents/02_Practice/201911_similarities/corpus/txt/'

print('Processing ....\r', end='')
my_corpus = list()    # collects (file_name, text) tuples, one per file

# walk through root directory and all of its sub-directories
for dirName, subdirList, fileList in os.walk(wdir):

    # os.walk does not sort its listings; sorting the sub-directories in place
    # and the file names makes the traversal order, and therefore the row
    # indices used throughout the notebook, reproducible between runs.
    subdirList.sort()

    for fname in sorted(fileList):
        with open(os.path.join(dirName, fname), 'r', encoding='utf-8') as f:
            my_corpus.append((fname, f.read()))

df = pd.DataFrame(my_corpus, columns=['file_name', 'text'])
print('Done           ')
df.head(10)

Done           


Unnamed: 0,file_name,text
0,0900a01f8023808d.pdf.txt,\nNo. 21 • July 2005\n\nCCoonntteennttss\n\nPP...
1,0900a01f802380ab.pdf.txt,\nNo. 21 • July 2005\n\nCCoonntteennttss\n\nPP...
2,0900a01f802fcfab.xls.txt,cross-table (A)\n\tCross-country Comparison : ...
3,0900a01f8032223e.pdf.txt,\nFigure 2\nOfficial inflation rate and percei...
4,0900a01f8032225b.pdf.txt,\nFigure 1\nOfficial inflation rate and percei...
5,0900a01f80322284.pdf.txt,"\nFigure 3\nBalance of the EU consumer survey,..."
6,0900a01f80322288.pdf.txt,\nFigure 4\nInflation perception according to ...
7,0900a01f80335411.doc.txt,\tAdditional information to be included on lef...
8,0900a01f80346f56.pdf.txt,\n2 IFC Bulletin 24 — August 2006\n\nBackgroun...
9,0900a01f80346fe0.pdf.txt,\n46 IFC Bulletin 24 — August 2006\n\nSummary ...


In [62]:
# Parallel lists taken from the same frame: files[k] names the document whose text is raw_docs[k]
files, raw_docs = df['file_name'].tolist(), df['text'].tolist()

## TfidfVectorizer

In [63]:
# Notes from earlier runs:
# CV max_df=0.95, min_df=2 - wall time = 7 min on AC power, feature matrix shape = (839, ~61000)
# CV max_df=0.75, min_df=2 - wall time = 14 min on battery, feature matrix shape = (839, 41192) - different similar documents!
# Why? Also, not so many options for the first two docs as in previous line
# TV max_df=0.75, min_df=2 - 8 min on battery!, feature matrix shape = (839, 41192), same comments
# TV(), 14 min, feature matrix shape = (839, 64261), also a lot of similar docs for the first two docs
# decreasing max_df helps avoid many duplicates in the first 2 out of 290 docs. Meaning?

# Timewise, using pairwise.cosine_similarity = 1 - pairwise.pairwise_distance, results also seem to be same
cv = TfidfVectorizer()#(max_df=0.75, min_df=2)
# .toarray() converts the sparse result straight to an ndarray; the previous
# np.array(... .todense()) built an intermediate np.matrix and then copied it again.
X = cv.fit_transform(raw_docs).toarray()

In [64]:
print(X.shape)
print(X[0])
# Scale every row so its entries sum to 1.
# A row that sums to 0 would give 0/0 = NaN (plus a RuntimeWarning) under a plain
# division; dividing only where the row sum is non-zero leaves such rows at 0.0
# directly, so no NaN clean-up pass is needed afterwards.
row_sums = X.sum(axis=1, keepdims=True)
X = np.divide(X, row_sums, out=np.zeros(X.shape, dtype=float), where=row_sums != 0)
print(X.shape)
print(X[0])

(839, 64261)
[0.01845992 0.01492181 0.         ... 0.         0.         0.        ]


  This is separate from the ipykernel package so we can avoid doing imports until


(839, 64261)
[0.00095211 0.00076962 0.         ... 0.         0.         0.        ]


In [65]:
start = time()
results, sims_relevants = [], []
for i, doc_vector in enumerate(X):
    # '\r' moves the cursor back to the start of the line, so the counter overwrites itself
    if not i % 10:
        print('Documents processed: %d\r' % i, end="")
    # best match for document i among all other rows of X
    best_idx, best_sim = query_cosine(doc_vector, X, i)
    results.append(best_idx)
    sims_relevants.append(best_sim)
end = time()
print('Time elapsed {} min'.format((end - start) / 60))

Time elapsed 7.992362181345622 min


In [66]:
# Attach the best-match index and its similarity score for every document
df = df.assign(sim_indices=results, similarities=sims_relevants)

In [67]:
# GET FILE NAMES OF ACTUAL SIMILAR FILES
def convert_idx(idx, df):
    """Return the ``file_name`` value stored at index label ``idx`` of ``df``.

    Parameters
    ----------
    idx : index label
        Row label to look up in ``df``.
    df : pandas.DataFrame
        Frame with a ``file_name`` column.
    """
    # One (row, column) lookup instead of chained indexing, which would first
    # materialise the entire row as an intermediate Series.
    return df.loc[idx, 'file_name']

# Translate each best-match row index into the matching file name
df['sim_names'] = df['sim_indices'].map(lambda row_label: convert_idx(row_label, df))

## Old results from previous dashboard

In [68]:
df_old = pd.read_excel('Similarity_old.xlsx')
# Append '.txt' to both name columns (the file names in the corpus frame end in '.txt')
for name_col in ('TARGET', 'SIMILAR DOCUMENTS'):
    df_old[name_col] = df_old[name_col] + '.txt'
df_old.head()

Unnamed: 0,TARGET,SIMILAR DOCUMENTS,SIMILARITY SCORE
0,0900a01f8023808d.pdf.txt,0900a01f802380ab.pdf.txt,0.7155
1,0900a01f8023808d.pdf.txt,0900a01f80c2a574.doc.txt,0.3153
2,0900a01f8023808d.pdf.txt,0900a01f806d3ac7.doc.txt,0.2589
3,0900a01f8023808d.pdf.txt,0900a01f811e76c1.docx.txt,0.2438
4,0900a01f8023808d.pdf.txt,0900a01f811e0805.docx.txt,0.2422


In [69]:
def find_oldName(name, df_old):
    """Return the highest-scoring old match for ``name``.

    Parameters
    ----------
    name : str
        Value to look up in the ``TARGET`` column of ``df_old``.
    df_old : pandas.DataFrame
        Frame with ``TARGET``, ``SIMILAR DOCUMENTS`` and ``SIMILARITY SCORE`` columns.

    Returns
    -------
    list or float
        ``[similarity score, similar document]`` of the row with the largest
        score among the rows whose ``TARGET`` equals ``name``; ``np.nan`` when
        no row matches (previously this raised ``IndexError``).
    """
    matches = df_old.loc[df_old['TARGET'] == name,
                         ['SIMILARITY SCORE', 'SIMILAR DOCUMENTS']].values.tolist()
    if not matches:
        return np.nan

    # Each entry is [score, document], so max() orders by score first. This is the
    # same element a full descending sort would put first, without sorting everything.
    return max(matches)

old_names = df_old.TARGET.tolist()
# Membership is checked once per corpus file; a set answers each check in O(1)
# instead of scanning the whole list of targets every time.
old_name_set = set(old_names)
# Each cell receives what find_oldName returns ([score, similar document]), or NaN
# when the file does not occur in the old results.
df['sim_names_old'] = df.file_name.apply(lambda x: find_oldName(x, df_old) if x in old_name_set else np.nan)

In [70]:
# Number of rows in the old results vs. number of distinct target documents
n_old_rows = len(old_names)
print(n_old_rows)
len({*old_names})

5190


515

## Does CountVectorizer provide similar results?

In [71]:
cv = CountVectorizer()#(max_df=0.75, min_df=2)
# .toarray() converts the sparse result straight to an ndarray; the previous
# np.array(... .todense()) built an intermediate np.matrix and then copied it again.
X2 = cv.fit_transform(raw_docs).toarray()

In [72]:
print(X2.shape)
print(X2[0])
# Scale every row so its entries sum to 1.
# A row that sums to 0 would give 0/0 = NaN (plus a RuntimeWarning) under a plain
# division; dividing only where the row sum is non-zero leaves such rows at 0.0
# directly. The float output buffer is explicit because X2 is printed as integer
# counts above.
row_sums2 = X2.sum(axis=1, keepdims=True)
X2 = np.divide(X2, row_sums2, out=np.zeros(X2.shape, dtype=float), where=row_sums2 != 0)
print(X2.shape)
print(X2[0])

(839, 64261)
[97 90  0 ...  0  0  0]


  This is separate from the ipykernel package so we can avoid doing imports until


(839, 64261)
[0.00075445 0.0007     0.         ... 0.         0.         0.        ]


In [73]:
start = time()
results2, sims_relevants2 = [], []
for i, doc_vector in enumerate(X2):
    # '\r' moves the cursor back to the start of the line, so the counter overwrites itself
    if not i % 10:
        print('Documents processed: %d\r' % i, end="")
    # best match for document i among all other rows of X2
    best_idx, best_sim = query_cosine(doc_vector, X2, i)
    results2.append(best_idx)
    sims_relevants2.append(best_sim)
end = time()
print('Time elapsed {} min'.format((end - start) / 60))

Time elapsed 8.084981067975361 min


In [74]:
# Attach the count-based best-match index and similarity score for every document
df = df.assign(sim_indices_count=results2, similarities_count=sims_relevants2)

In [75]:
# GET FILE NAMES OF ACTUAL SIMILAR FILES
def convert_idx(idx, df):
    """Return the ``file_name`` value stored at index label ``idx`` of ``df``.

    NOTE(review): a function with this name is also defined in an earlier cell
    of this notebook; this definition shadows it. One definition would suffice.

    Parameters
    ----------
    idx : index label
        Row label to look up in ``df``.
    df : pandas.DataFrame
        Frame with a ``file_name`` column.
    """
    # One (row, column) lookup instead of chained indexing, which would first
    # materialise the entire row as an intermediate Series.
    return df.loc[idx, 'file_name']

# Translate each count-based best-match row index into the matching file name
df['sim_names_count'] = df['sim_indices_count'].map(lambda row_label: convert_idx(row_label, df))

In [87]:
#df.head(20)

## Check if CountVectorizer and TfidfVectorizer results are identical

These are results without normalization by doc length:  

cv = CountVectorizer()  
X = np.array(cv.fit_transform(raw_docs).todense())  

AND

cv = TfidfVectorizer()  
X2 = np.array(cv.fit_transform(raw_docs).todense())

In [20]:
# Best-match indices per document, one list per vectorizer, in the row order of df
idx_tfidf = df['sim_indices'].tolist()
idx_count = df['sim_indices_count'].tolist()

In [21]:
# List equality: True only if both lists hold the same best-match index at every position
idx_tfidf == idx_count

False

In [24]:
# How many differences
# positions at which the two best-match lists disagree
idxs = [
    pos
    for pos, (tfidf_hit, count_hit) in enumerate(zip(idx_tfidf, idx_count))
    if tfidf_hit != count_hit
]
c = len(idxs)
print('Number of differences:', c)
print('Occurred at the following indices:', idxs)

Number of differences: 269
Occurred at the following indices: [66, 72, 87, 97, 98, 103, 124, 125, 127, 128, 136, 140, 141, 145, 147, 148, 153, 154, 161, 163, 169, 171, 177, 180, 186, 191, 206, 207, 208, 215, 224, 228, 230, 231, 235, 237, 244, 248, 253, 259, 268, 271, 273, 277, 278, 283, 285, 286, 287, 288, 290, 291, 292, 294, 301, 303, 304, 305, 306, 309, 314, 318, 320, 321, 326, 328, 329, 332, 333, 334, 337, 343, 356, 357, 358, 359, 364, 366, 367, 368, 369, 372, 374, 375, 383, 384, 391, 393, 394, 396, 398, 399, 407, 409, 419, 422, 425, 428, 431, 435, 437, 438, 445, 446, 448, 451, 452, 453, 456, 457, 460, 464, 466, 469, 470, 471, 472, 475, 477, 479, 480, 484, 487, 489, 491, 492, 493, 497, 499, 502, 506, 507, 517, 519, 523, 524, 526, 528, 529, 530, 533, 534, 535, 536, 537, 543, 546, 547, 549, 550, 551, 552, 556, 557, 561, 562, 564, 565, 566, 568, 569, 571, 576, 577, 582, 585, 587, 588, 592, 597, 598, 601, 602, 605, 609, 611, 616, 617, 618, 619, 621, 622, 627, 628, 630, 631, 635, 638, 63

MANUAL ANALYSIS OF DIFFERENCES  
* index 66 above - the three documents look quite different (Bulletins for different years w/different content, but apparently similar vocabulary), with CountVectorizer score > 0.99 and TfidfVectorizer score > 0.96
* 72 above - different documents

The documents with high similarity scores tend to be matched correctly by both tfidf and countv. Differences start at index 66, where the documents are already very different: the countv similarity score still remains > 0.99 while the tfidf score is in the mid-nineties, so the tfidf-based similarity score looks more reasonable. It may still need additional scaling to decrease it for less similar documents - perhaps by decreasing max_df?  

##############################################################################################################################

These are results **with normalization by doc length**:  

cv = CountVectorizer()  
X = np.array(cv.fit_transform(raw_docs).todense())  
X = X / X.sum(axis=1, keepdims=True)  
X = np.nan_to_num(X)

AND

cv = TfidfVectorizer()  
X2 = np.array(cv.fit_transform(raw_docs).todense())  
X2 = X2 / X2.sum(axis=1, keepdims=True)  
X2 = np.nan_to_num(X2)

In [83]:
# The frame sorted by TF-IDF similarity (df2) is only created in the
# "Finalize and save" section further down, so on a fresh kernel it does not
# exist yet when this cell runs. Build the sorted view here from df instead,
# using the same sort as that section.
ranked = df.sort_values(by='similarities', ascending=False)
idx_tfidf = ranked['sim_indices'].tolist()
idx_count = ranked['sim_indices_count'].tolist()

In [84]:
# List equality: True only if both lists hold the same best-match index at every position
idx_tfidf == idx_count

False

In [85]:
# How many differences (DO THIS AFTER SORTING BY TF-IDF SIMILARITY)
# positions at which the two best-match lists disagree
idxs = [
    pos
    for pos, (tfidf_hit, count_hit) in enumerate(zip(idx_tfidf, idx_count))
    if tfidf_hit != count_hit
]
c = len(idxs)
print('Number of differences:', c)
print('Occurred at the following indices:', idxs)

Number of differences: 269
Occurred at the following indices: [66, 72, 86, 97, 98, 103, 124, 125, 127, 128, 136, 140, 141, 145, 147, 148, 153, 154, 161, 163, 169, 172, 177, 180, 185, 191, 206, 207, 209, 215, 223, 228, 230, 231, 235, 237, 244, 248, 253, 259, 268, 271, 273, 277, 278, 283, 284, 286, 287, 288, 290, 291, 292, 294, 301, 303, 304, 305, 306, 309, 314, 318, 320, 321, 326, 328, 329, 332, 333, 334, 337, 343, 356, 357, 358, 359, 364, 366, 367, 368, 369, 372, 374, 375, 383, 384, 391, 393, 394, 396, 398, 399, 407, 409, 419, 422, 425, 428, 431, 435, 437, 438, 445, 446, 448, 451, 452, 453, 456, 457, 460, 464, 466, 469, 470, 471, 473, 475, 477, 479, 480, 484, 487, 489, 491, 492, 493, 497, 499, 502, 506, 507, 517, 519, 523, 524, 526, 528, 529, 530, 533, 534, 535, 536, 537, 543, 546, 547, 549, 550, 551, 552, 556, 557, 561, 563, 564, 565, 566, 568, 569, 571, 576, 577, 582, 585, 587, 588, 592, 597, 598, 601, 602, 605, 609, 611, 616, 617, 618, 620, 621, 622, 627, 628, 630, 631, 635, 637, 63

MANUAL ANALYSIS OF DIFFERENCES  
EXACTLY THE SAME NUMBER OF DIFFERENT CASES, EXACTLY THE SAME INDICES, EXACTLY THE SAME VALUES OF COSINE SIMILARITY WHEN LOOKING INSIDE THE FILE.  

**CONCLUSION - NORMALIZATION IS ABSOLUTELY UNNECESSARY FOR THIS CASE. TO DOUBLE CHECK WHEN A QUERY COMES FROM OUTSIDE OF THE TF-IDF VECTORIZED CORPUS. MAYBE USING THE SAME VECTORIZER WILL BE ENOUGH**

**TF-IDF VECTORIZER PROVIDES MORE REASONABLE (LOWER) SIMILARITY SCORES VS. COUNTVECTORIZER: 0.96 vs. 0.99 WHEN DOCUMENTS START BEING REALLY DIFFERENT AND 0.11 vs. 0.54 RIGHT BEFORE CHANGE TO 0.0 SIMILARITY**

## Finalize and save

In [86]:
# sort_values returns a new DataFrame and leaves df untouched, so no explicit
# copy of df is needed before sorting.
df2 = df.sort_values(by='similarities', ascending=False)
#df2.head()

In [82]:
# Columns written to the results file, in output order
export_columns = [
    'file_name', 'sim_names', 'sim_names_count',
    'sim_indices', 'similarities',
    'sim_indices_count', 'similarities_count',
    'sim_names_old',
]
df2.loc[:, export_columns].to_csv('similarity_results.csv')