## Document Similarity Measure Based on TF-IDF Feature Vector Extraction

## Imports

In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
import copy

import NLPEngineComponent as nlp
import DocumentFeatureVectorExtractionInterface as fe
import GenericDataSerializerComponent as s
import ProblemSpecsComponent as p

### Specify constants and variables

### Specify the Problem Datasets

#### Get Data for the 'Toy Problem'

In [6]:
# Build the toy problem and fetch its corpus. The recorded output below suggests this
# de-serializes a book-list dataset from disk -- TODO confirm in ProblemSpecsComponent.
toy_problem = p.ToyProblem()
toy_problem_corpus = toy_problem.getCorpus()
# Number of documents; used later to enumerate document pairs for similarity scoring.
num_docs = len(toy_problem_corpus)


De-Serializing data located in the path: Data\Book_List_Dataset.pl
Number of rows for the Toy Problem data are: 24


#### Get Data for 'AIRA problem'

### Normalize (Clean text data)

In [7]:
# Run the project's NLP engine over the raw toy corpus. Judging by the recorded
# output, the result is one list of tokens per document -- confirm in NLPEngineComponent.
nlp_engine = nlp.NLPEngine(toy_problem_corpus)
clean_toy_problem_corpus = nlp_engine.preprocessDocs()

# Show the first three documents before and after cleaning, separated by a rule.
# A single print with sep="\n" emits the same text as five consecutive prints.
print(
    "Sample rows of the data BEFORE cleaning:\n\n{}".format(toy_problem_corpus[:3]),
    "\n",
    " -------------------------------------------------------",
    "\n",
    "Sample rows of the data AFTER cleaning:\n\n{}".format(clean_toy_problem_corpus[:3]),
    sep="\n",
)

Sample rows of the data BEFORE cleaning:

['Natural Language Processing in Action: Understanding, analyzing, and generating text with Python', 'Natural Language Processing with Python: Analyzing Text with the Natural Language Toolkit', 'Neural Network Methods for Natural Language Processing (Synthesis Lectures on Human Language Technologies)']


 -------------------------------------------------------


Sample rows of the data AFTER cleaning:

[['natural', 'language', 'processing', 'action', 'understanding', 'analyzing', 'generating', 'text', 'python'], ['natural', 'language', 'processing', 'python', 'analyzing', 'text', 'natural', 'language', 'toolkit'], ['neural', 'network', 'methods', 'natural', 'language', 'processing', 'synthesis', 'lectures', 'human', 'language', 'technologies']]


### Specify the TF-IDF Vector Extraction Component
    - Defines logic for converting the data from text to a feature vector/matrix


In [8]:
class TFIDFVectorExtraction(fe.IFeatureVectorExtraction):
    """TF-IDF feature extraction plus cosine similarity between document vectors.

    Documents are vectorized with scikit-learn's ``TfidfVectorizer`` using unigrams
    and no document-frequency filtering (``min_df=0.0``, ``max_df=1.0``).
    """

    def __init__(self, is_reshape_corpus=True):
        """
        Args:
            is_reshape_corpus: when True, every document is passed through
                ``preprocessData`` (token lists are joined into one string) before
                vectorizing; when False, a deep copy of the corpus is vectorized as is.
        """
        self.__is_reshape_corpus = is_reshape_corpus
        # Corpus in the form last handed to the vectorizer; None until
        # createFeatureMatrix has been called.
        self.__reshaped_corpus = None

    def createFeatureMatrix(self, corpus):
        """Fit a TF-IDF vectorizer on ``corpus`` and return its feature matrix.

        Args:
            corpus: iterable of documents; token sequences when the instance was
                created with ``is_reshape_corpus=True``, otherwise whatever
                ``TfidfVectorizer.fit_transform`` accepts (normally strings).

        Returns:
            A ``(feature_matrix, dense_feature_matrix)`` tuple: the float matrix
            produced by the vectorizer and the result of its ``toarray()``,
            one row per document.
        """
        vectorizer = TfidfVectorizer(min_df=0.0, max_df=1.0, ngram_range=(1, 1))
        if self.__is_reshape_corpus:
            self.__reshaped_corpus = TFIDFVectorExtraction.preprocessData(corpus)
        else:
            # Copy so that the stored corpus is decoupled from the caller's object.
            self.__reshaped_corpus = copy.deepcopy(corpus)
        feature_matrix = vectorizer.fit_transform(self.__reshaped_corpus).astype(float)
        dense_feature_matrix = feature_matrix.toarray()
        return (feature_matrix, dense_feature_matrix)

    def measureSimilarity(self, doc_vec_1, doc_vec_2):
        """Return the cosine similarity of two document vectors.

        The dot product is divided by the product of the vectors' Euclidean norms,
        so the result is a true cosine even when the inputs are not unit length.
        For vectors that are already unit length this equals the plain dot product
        up to floating-point rounding.

        Args:
            doc_vec_1: array-like of numbers; flattened to 1-D before use.
            doc_vec_2: array-like of numbers with the same number of elements.

        Returns:
            The cosine similarity, or 0.0 when either vector has zero norm
            (for example a document with no terms in the vocabulary).
        """
        vec_1 = np.asarray(doc_vec_1, dtype=float).ravel()
        vec_2 = np.asarray(doc_vec_2, dtype=float).ravel()
        norm_product = np.linalg.norm(vec_1) * np.linalg.norm(vec_2)
        if norm_product == 0.0:
            # Cosine is undefined for a zero vector; report "no similarity"
            # instead of dividing by zero.
            return np.float64(0.0)
        cosine_measure = np.dot(vec_1, vec_2) / norm_product
        return cosine_measure

    @staticmethod
    def preprocessData(corpus):
        """Turn each tokenized document into a single space-separated string.

        Documents that are already strings are returned unchanged; joining a
        string would otherwise insert a space between every character.

        Args:
            corpus: iterable of documents, each a sequence of string tokens or a string.

        Returns:
            A list with one string per document.
        """
        return [doc if isinstance(doc, str) else ' '.join(doc) for doc in corpus]

### Create Feature Vectors/Matrix and compute document Similarity Measures 

#### Toy Problem: Run Feature Extraction and Measure Similarity between document pairs

In [15]:
def createLowerTriangularMatrixOfPairs(num_docs):
    """
    List every (row, col) index pair strictly below the main diagonal of a
    num_docs x num_docs matrix, in row-major order.

    Each unordered pair of distinct document indices therefore appears exactly
    once, always with row > col; self-pairs (i, i) are excluded.
    """
    square = np.zeros((num_docs, num_docs))
    # k=-1 starts one step below the main diagonal, which drops the (i, i) entries.
    row_ids, col_ids = np.tril_indices_from(square, k=-1)
    return list(zip(row_ids, col_ids))

def computeDocSimilarity(corpus, doc_pair_indices):
    """
    Score the similarity of the requested document pairs.

    The corpus is vectorized once with a default TFIDFVectorExtraction, then each
    (i, j) pair in doc_pair_indices is scored by calling measureSimilarity on
    rows i and j of the dense feature matrix.

    Returns a dict mapping the string "i, j" to the similarity of that pair.
    """
    extractor = TFIDFVectorExtraction()
    # Only the dense rows are indexed below; the sparse matrix is not needed.
    _, doc_vectors = extractor.createFeatureMatrix(corpus)
    return {
        "{0}, {1}".format(first, second): extractor.measureSimilarity(
            doc_vectors[first], doc_vectors[second]
        )
        for first, second in doc_pair_indices
    }
    
def displaySortedSimilarityMeasures(similarity_measures):
    """
    Print a header followed by one "key: value" line per entry of
    similarity_measures, ordered from the highest value to the lowest.
    """
    def score_of(entry):
        # entry is a (key, value) pair; rank by the value.
        return entry[1]

    ranked = sorted(similarity_measures.items(), key=score_of, reverse=True)
    print("similarity_measures are:\n\n")
    for pair_label, score in ranked:
        print("{0}: {1}".format(pair_label, score))
    
# Enumerate every pair of distinct document indices in the toy corpus.
doc_pair_indices = createLowerTriangularMatrixOfPairs(num_docs)
# Vectorize the cleaned corpus and score each pair; keys are "i, j" strings.
similarity_measures = computeDocSimilarity(clean_toy_problem_corpus, doc_pair_indices)
# Print the pairs from most to least similar.
displaySortedSimilarityMeasures(similarity_measures)


similarity_measures are:


15, 9: 0.9999999999999999
16, 9: 0.68195260100786
16, 15: 0.68195260100786
22, 9: 0.6083656053723908
22, 15: 0.6083656053723908
22, 20: 0.5712526664216422
1, 0: 0.5020980603710185
15, 3: 0.5006004035371894
9, 3: 0.5006004035371894
20, 9: 0.4940286116314959
20, 15: 0.4940286116314959
15, 14: 0.47091738159528873
14, 9: 0.47091738159528873
16, 14: 0.45118421703186196
9, 5: 0.43954675463780163
15, 5: 0.43954675463780163
8, 3: 0.4241983247939326
22, 16: 0.40372743066785866
14, 11: 0.39997221699995633
12, 10: 0.39739525398946884
20, 16: 0.39656009919649593
11, 4: 0.39430737109764
14, 4: 0.3790143238321659
16, 5: 0.3542113720827364
16, 3: 0.3500844487909183
16, 1: 0.3402396598960821
15, 1: 0.3397274964884886
9, 1: 0.3397274964884886
5, 3: 0.3375449803134372
23, 9: 0.331863654902007
23, 15: 0.331863654902007
14, 5: 0.33143251740994106
14, 3: 0.32787737791248306
17, 3: 0.3166874338227689
22, 3: 0.3090067764318382
17, 16: 0.304184149314994
21, 16: 0.30240544671538233
1