# Similarity

This example demonstrates how to measure text similarity using TF-IDF weights computed with NLTK and cosine similarity computed with scikit-learn.

In [20]:
# !pip3 install scikit-learn nltk
import nltk
import math
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [21]:
class TextSimilarity:
    """Small TF-IDF / cosine-similarity demo over a list of sentences.

    ``tf``, ``idf`` and ``tfidf`` compute the weights by hand from NLTK
    tokens; ``cosine_similarity`` delegates to scikit-learn's
    ``TfidfVectorizer`` and the module-level ``cosine_similarity`` function.
    """

    # Sentences used when the caller does not supply their own.
    DEFAULT_STATEMENTS = (
        'I like cats',
        'The weather is good',
        'He is a cat lover',
        'This is a good day',
    )

    def __init__(self, statements=None):
        """Store the corpus.

        Args:
            statements: optional iterable of sentences; when omitted the
                four built-in demo sentences are used.
        """
        if statements is None:
            statements = self.DEFAULT_STATEMENTS
        # Copy so later mutation of the caller's list does not leak in.
        self.statements = list(statements)

    @staticmethod
    def _tokenize(text):
        """Lower-case ``text`` and split it into NLTK word tokens."""
        return nltk.word_tokenize(text.lower())

    @staticmethod
    def _document_frequency(tokenized_documents):
        """Map each word to the number of documents that contain it."""
        counts = {}
        for tokens in tokenized_documents:
            # dict.fromkeys de-duplicates while keeping first-seen order, so
            # a word repeated inside one document is counted only once.
            for word in dict.fromkeys(tokens):
                counts[word] = counts.get(word, 0) + 1
        return counts

    @staticmethod
    def _idf_weight(total_documents, documents_with_word):
        """Return ``1 + ln(total_documents / documents_with_word)``."""
        return 1.0 + math.log(total_documents / documents_with_word)

    def tf(self, sentence):
        """Return ``{word: count / number_of_tokens}`` for ``sentence``."""
        words = self._tokenize(sentence)
        total = float(len(words))
        # An empty sentence yields an empty FreqDist, so no division happens.
        return {
            word: count / total
            for word, count in nltk.FreqDist(words).items()
        }

    def idf(self):
        """Return ``{word: idf}`` for every word in ``self.statements``."""
        tokenized = [self._tokenize(sentence) for sentence in self.statements]
        num_documents = len(tokenized)
        frequencies = self._document_frequency(tokenized)
        return {
            word: self._idf_weight(num_documents, doc_count)
            for word, doc_count in frequencies.items()
        }

    def tfidf(self, query):
        """Score each distinct word of ``query`` against every statement.

        Returns:
            dict mapping each query word to a list holding one tf * idf
            value per statement, in the order of ``self.statements``. Words
            absent from a statement (or from the whole corpus) score 0.0.
        """
        # De-duplicate so a repeated query word does not append its scores
        # twice and produce a vector longer than the statement list.
        query_words = list(dict.fromkeys(self._tokenize(query)))
        idf = self.idf()
        vectors = {}
        for sentence in self.statements:
            tf = self.tf(sentence)
            for word in query_words:
                score = tf.get(word, 0.0) * idf.get(word, 0.0)
                vectors.setdefault(word, []).append(score)
        return vectors

    def display_vectors(self, vectors):
        """Print the statements, then one ``word -> vector`` line per word."""
        print(self.statements)
        for word, scores in vectors.items():
            print('{} -> {}'.format(word, scores))

    def cosine_similarity(self):
        """Print each statement's cosine similarity to all statements."""
        matrix = TfidfVectorizer().fit_transform(self.statements)
        # Walk every row of the fitted matrix instead of assuming four.
        for i in range(matrix.shape[0]):
            print('similarity of document {} with others'.format(i))
            # Inside a method this name resolves to the module-level
            # function, not to this method. The one-row slice keeps the
            # input two-dimensional.
            print(cosine_similarity(matrix[i:i + 1], matrix))

    def demo(self):
        """Show TF-IDF vectors for a random statement, then similarities."""
        input_query = random.choice(self.statements)
        self.display_vectors(self.tfidf(input_query))
        self.cosine_similarity()

In [26]:
# Build the demo object and run it: demo() picks one of the statements at
# random, prints its TF-IDF vectors, then prints the cosine similarities.
similarity = TextSimilarity()
similarity.demo()

['I like cats', 'The weather is good', 'He is a cat lover', 'This is a good day']
he -> [0.0, 0.0, 0.47725887222397817, 0.0]
is -> [0.0, 0.3219205181129452, 0.2575364144903562, 0.2575364144903562]
a -> [0.0, 0.0, 0.3386294361119891, 0.3386294361119891]
cat -> [0.0, 0.0, 0.47725887222397817, 0.0]
lover -> [0.0, 0.0, 0.47725887222397817, 0.0]
similarity of document 0 with others
[[1. 0. 0. 0.]]
similarity of document 1 with others
[[0.         1.         0.12681481 0.33971674]]
similarity of document 2 with others
[[0.         0.12681481 1.         0.12681481]]
similarity of document 3 with others
[[0.         0.33971674 0.12681481 1.        ]]
