In [15]:
from normalization import normalize_corpus
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd


In [3]:
# Hand-written sample corpus of nine short sentences.
# NOTE: a later cell rebinds `toy_corpus` to the movie descriptions read from
# the TSV file, so this literal is only in effect until that cell runs.
toy_corpus = [
    'The sky is blue',
    'The sky is blue and beautiful',
    'Look at the bright blue sky!',
    'Python is a great Programming language',
    'Python and Java are popular Programming languages',
    'Among Programming languages, both Python and Java are the most used in Analytics',
    'The fox is quicker than the lazy dog',
    'The dog is smarter than the fox',
    'The dog, fox and cat are good friends',
]

In [26]:
# Query documents that are matched against the corpus further down.
query_docs = [
    'A dog goes on quest to discover his purpose in life over the course of several lifetimes with multiple owners',
    'Java is a static typed programming language unlike Python',
    'I love to relax under the beautiful blue sky!',
]

In [21]:
# Read the tab-separated movie description dataset.
movie_data = pd.read_csv(
    '../dataset/movieDescriptionDataSet.tsv',
    sep='\t',
)

# The 'Description' column becomes the working corpus.
# NOTE: this rebinds `toy_corpus`, replacing the hand-written sentences
# assigned to the same name in an earlier cell.
toy_corpus = movie_data['Description'].tolist()

# Normalize the raw descriptions with lemmatization switched on.
norm_corpus = normalize_corpus(toy_corpus, lemmatize=True)

In [22]:
# Unigram TF-IDF model; min_df=0.0 / max_df=1.0 keep every term regardless
# of its document frequency.
tfidf_vectorizer = TfidfVectorizer(
    min_df=0.0,
    max_df=1.0,
    ngram_range=(1, 1),
)

# Learn the vocabulary from the normalized corpus and build its feature
# matrix, cast to float.
tfidf_features = tfidf_vectorizer.fit_transform(norm_corpus)
tfidf_features = tfidf_features.astype(float)
                                                        
                                                        



In [27]:
# Apply the same normalization to the query documents as to the corpus.
norm_query_docs = normalize_corpus(query_docs, lemmatize=True)

# Project the queries with the already-fitted vectorizer (transform only, no
# refit) so their columns line up with the corpus features.
query_docs_tfidf = tfidf_vectorizer.transform(norm_query_docs)

In [24]:
def compute_cosine_similarity(doc_features, corpus_features,
                              top_n=3):
    """Rank corpus documents by cosine similarity to one query document.

    The dot product between the query vector and every corpus row is divided
    by the product of their Euclidean norms, so the score is a true cosine
    similarity even when the inputs are not already L2-normalized. For inputs
    that are already unit length the scores match the plain dot product.

    Parameters
    ----------
    doc_features : matrix exposing ``toarray()``
        Features of the query; only the first row is used.
    corpus_features : matrix exposing ``toarray()``
        One row per corpus document, same number of columns as
        ``doc_features``.
    top_n : int, optional
        Maximum number of matches to return (default 3).

    Returns
    -------
    list of (index, score) tuples
        Row indices into ``corpus_features`` paired with the similarity
        rounded to 3 decimals, ordered from most to least similar.
    """
    # get document vectors
    doc_vector = doc_features.toarray()[0]
    corpus_matrix = corpus_features.toarray()
    # compute similarities
    dot_products = np.dot(doc_vector,
                          corpus_matrix.T)
    norm_products = (np.linalg.norm(corpus_matrix, axis=1) *
                     np.linalg.norm(doc_vector))
    # an all-zero vector (e.g. a query with no in-vocabulary terms) has no
    # direction: score it 0.0 instead of dividing by zero
    similarity = np.zeros(dot_products.shape, dtype=float)
    np.divide(dot_products, norm_products,
              out=similarity, where=norm_products > 0)
    # get docs with highest similarity scores
    top_docs = similarity.argsort()[::-1][:top_n]
    top_docs_with_score = [(index, round(similarity[index], 3))
                            for index in top_docs]
    return top_docs_with_score

In [28]:
# Report, for every query document, its two closest corpus documents.
print('Document Similarity Analysis using Cosine Similarity')
print('='*60)
for index, doc in enumerate(query_docs):
    # row `index` of the query feature matrix belongs to this query document
    doc_tfidf = query_docs_tfidf[index]
    top_similar_docs = compute_cosine_similarity(doc_tfidf,
                                                 tfidf_features,
                                                 top_n=2)
    print('Document', index+1, ':', doc)
    print('Top', len(top_similar_docs), 'similar docs:')
    print('-'*40)
    for doc_index, sim_score in top_similar_docs:
        # document numbers are shown 1-based
        print('Doc num: {} Similarity Score: {}\nDoc: {}'.format(doc_index+1,
                                                                 sim_score,
                                                                 toy_corpus[doc_index]))
        print('-'*40)
    # blank separator line between queries; a bare `print` (Python 2 syntax)
    # is a no-op expression in Python 3, so it has to be called
    print()

Document Similarity Analysis using Cosine Similarity
Document 1 : A dog goes on quest to discover his purpose in life over the course of several lifetimes with multiple owners
Top 2 similar docs:
----------------------------------------
Doc num: 102 Similarity Score: 1.0
Doc: A dog goes on quest to discover his purpose in life over the course of several lifetimes with multiple owners.
----------------------------------------
Doc num: 3 Similarity Score: 0.095
Doc: The film reveals the origin story of half-human, half-Atlantean Arthur Curry and takes him on the journey of his lifetime—one that will not only force him to face who he really is, but to discover if he is worthy of who he was born to be… a king.
----------------------------------------
Document 2 : Java is a static typed programming language unlike Python
Top 2 similar docs:
----------------------------------------
Doc num: 305 Similarity Score: 0.209
Doc: In a world divided into factions based on personality types, Tris lea

In [12]:
def compute_hellinger_bhattacharya_distance(doc_features, corpus_features,
                                            top_n=3):
    """Rank corpus documents by Hellinger-Bhattacharya distance to a query.

    For every corpus row the distance is
    ``sqrt(0.5 * sum((sqrt(doc) - sqrt(row)) ** 2))``.

    Parameters
    ----------
    doc_features : matrix exposing ``toarray()``
        Features of the query; only the first row is used.
    corpus_features : matrix exposing ``toarray()``
        One row per corpus document, same number of columns as
        ``doc_features``.
    top_n : int, optional
        Maximum number of matches to return (default 3).

    Returns
    -------
    list of (index, score) tuples
        Row indices into ``corpus_features`` paired with the distance rounded
        to 3 decimals, ordered from closest to farthest. An empty corpus
        yields an empty list.

    Notes
    -----
    Square roots are taken element-wise, so a negative feature value produces
    NaN distances.
    """
    # get document vectors
    doc_vector = doc_features.toarray()[0]
    corpus_matrix = corpus_features.toarray()
    # compute hb distances; the row-wise sum is already 1-D, so no stacking
    # is needed (np.hstack on it raised ValueError for an empty corpus)
    root_difference = np.sqrt(doc_vector) - np.sqrt(corpus_matrix)
    distance = np.sqrt(0.5 * np.sum(np.square(root_difference), axis=1))
    # get docs with lowest distance scores
    top_docs = distance.argsort()[:top_n]
    top_docs_with_score = [(index, round(distance[index], 3))
                            for index in top_docs]
    return top_docs_with_score

In [13]:
# NOTE(review): this cell repeats the cosine-similarity report. Although it
# follows the definition of compute_hellinger_bhattacharya_distance, it does
# not call that function - confirm whether a Hellinger report was intended.
print('Document Similarity Analysis using Cosine Similarity')
print('='*60)
for index, doc in enumerate(query_docs):
    # row `index` of the query feature matrix belongs to this query document
    doc_tfidf = query_docs_tfidf[index]
    top_similar_docs = compute_cosine_similarity(doc_tfidf,
                                                 tfidf_features,
                                                 top_n=2)
    print('Document', index+1, ':', doc)
    print('Top', len(top_similar_docs), 'similar docs:')
    print('-'*40)
    for doc_index, sim_score in top_similar_docs:
        # document numbers are shown 1-based
        print('Doc num: {} Similarity Score: {}\nDoc: {}'.format(doc_index+1,
                                                                 sim_score,
                                                                 toy_corpus[doc_index]))
        print('-'*40)
    # blank separator line between queries; a bare `print` (Python 2 syntax)
    # is a no-op expression in Python 3, so it has to be called
    print()

Document Similarity Analysis using Cosine Similarity
Document 1 : The fox is definitely smarter than the dog
Top 2 similar docs:
----------------------------------------
Doc num: 8 Similarity Score: 1.0
Doc: The dog is smarter than the fox
----------------------------------------
Doc num: 7 Similarity Score: 0.426
Doc: The fox is quicker than the lazy dog
----------------------------------------
Document 2 : Java is a static typed programming language unlike Python
Top 2 similar docs:
----------------------------------------
Doc num: 5 Similarity Score: 0.837
Doc: Python and Java are popular Programming languages
----------------------------------------
Doc num: 6 Similarity Score: 0.661
Doc: Among Programming languages, both Python and Java are the most used in Analytics
----------------------------------------
Document 3 : I love to relax under the beautiful blue sky!
Top 2 similar docs:
----------------------------------------
Doc num: 2 Similarity Score: 1.0
Doc: The sky is blue an