# Cosine Similarity

In [1]:
# Three short texts on a shared political theme; their overlapping
# vocabulary is what the similarity measures below pick up on.
doc_trump = (
    "Mr. Trump became president after winning the political election. "
    "Though he lost the support of some republican friends, "
    "Trump is friends with President Putin"
)
doc_election = (
    "President Trump says Putin had no political interference is the election outcome. "
    "He says it was a witchhunt by political parties. "
    "He claimed President Putin is a friend who had nothing to do with the election"
)
doc_putin = (
    "Post elections, Vladimir Putin became President of Russia. "
    "President Putin had served as the Prime Minister earlier in his political career"
)

# Corpus order matters: the row labels used later follow this order.
documents = [doc_trump, doc_election, doc_putin]

In [2]:
# Scikit Learn
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Create the Document Term Matrix: one row per document, one column per
# vocabulary term, each cell holding the raw count of that term.
# stop_words='english' removes English stop words from the vocabulary;
# omit the argument to keep them.
count_vectorizer = CountVectorizer(stop_words='english')
sparse_matrix = count_vectorizer.fit_transform(documents)

# scikit-learn 1.0 added get_feature_names_out() and 1.2 removed
# get_feature_names(); use whichever this installation provides.
get_vocabulary = (getattr(count_vectorizer, 'get_feature_names_out', None)
                  or count_vectorizer.get_feature_names)

# OPTIONAL: Convert Sparse Matrix to Pandas Dataframe if you want to see the word frequencies.
# toarray() yields a plain ndarray, whereas todense() returns the legacy np.matrix type.
doc_term_matrix = sparse_matrix.toarray()
df = pd.DataFrame(doc_term_matrix,
                  columns=get_vocabulary(),
                  index=['doc_trump', 'doc_election', 'doc_putin'])
df

Unnamed: 0,career,claimed,earlier,election,elections,friend,friends,interference,lost,minister,...,putin,republican,russia,says,served,support,trump,vladimir,winning,witchhunt
doc_trump,0,0,0,1,0,0,2,0,1,0,...,1,1,0,0,0,1,2,0,1,0
doc_election,0,1,0,2,0,1,0,1,0,0,...,2,0,0,2,0,0,1,0,0,1
doc_putin,1,0,1,0,1,0,0,0,0,1,...,2,0,1,0,1,0,0,1,0,0


In [3]:
# Pairwise cosine similarity between the rows (documents) of the term-count
# table. With a single argument the rows are compared against themselves.
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarity(df)

array([[1.        , 0.51639778, 0.36893239],
       [0.51639778, 1.        , 0.45360921],
       [0.36893239, 0.45360921, 1.        ]])

#### Alternatively, use TfidfVectorizer instead of CountVectorizer, to downweight words that occur frequently across documents

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build TF-IDF weighted document vectors (English stop words removed).
vectorizer_2 = TfidfVectorizer(stop_words='english')
sparse_matrix_2 = vectorizer_2.fit_transform(documents)

# Use toarray() rather than todense(): todense() returns np.matrix, which
# recent scikit-learn releases refuse as input to cosine_similarity.
doc_term_matrix_2 = sparse_matrix_2.toarray()

In [5]:
# Pairwise cosine similarity between the TF-IDF document vectors
cosine_similarity(doc_term_matrix_2)

array([[1.        , 0.33027897, 0.18740386],
       [0.33027897, 1.        , 0.24226661],
       [0.18740386, 0.24226661, 1.        ]])

# Soft Cosine Similarity

#### Soft cosine similarity also takes the similarity between different words into account (here via word embeddings), so it can be used on documents from completely different topics. It gives higher scores for documents belonging to the same topic and lower scores when they are from different topics.

In [6]:
# Three food-related texts, used as an unrelated topic to contrast with
# the political documents.
doc_soup = (
    "Soup is a primarily liquid food, generally served warm or hot "
    "(but may be cool or cold), that is made by combining ingredients "
    "of meat or vegetables with stock, juice, water, or another liquid. "
)
doc_noodles = (
    "Noodles are a staple food in many cultures. "
    "They are made from unleavened dough which is stretched, extruded, "
    "or rolled flat and cut into one of a variety of shapes."
)
doc_dosa = (
    "Dosa is a type of pancake from the Indian subcontinent, made from a fermented batter. "
    "It is somewhat similar to a crepe in appearance. "
    "Its main ingredients are rice and black gram."
)

# The corpus now holds both topics: political documents first, food documents after.
documents = [
    doc_trump, doc_election, doc_putin,
    doc_soup, doc_noodles, doc_dosa,
]

In [13]:
import gensim
from gensim.matutils import softcossim
from gensim import corpora
import gensim.downloader as api
from gensim.utils import simple_preprocess

In [16]:
# Load the pretrained 'fasttext-wiki-news-subwords-300' model through gensim's downloader API.
# NOTE(review): presumably this downloads the model on first use and can take a while — confirm.
fasttext_model300 = api.load('fasttext-wiki-news-subwords-300')



In [17]:
# Tokenize every document, then build the gensim Dictionary over those tokens.
tokenized_docs = [simple_preprocess(text) for text in documents]
dictionary = corpora.Dictionary(tokenized_docs)

In [34]:
# Build the term-similarity matrix from the fastText embeddings.
# KeyedVectors.similarity_matrix() is deprecated (scheduled for removal in
# gensim 4.0) and is only a wrapper around the two classes used here, so
# building the matrix directly gives the same result without the warning.
from gensim.models.keyedvectors import WordEmbeddingSimilarityIndex
from gensim.similarities import SparseTermSimilarityMatrix

termsim_index = WordEmbeddingSimilarityIndex(fasttext_model300, threshold=0.0, exponent=2.0)
# .matrix is the underlying scipy sparse matrix, which is what softcossim() expects.
similarity_matrix = SparseTermSimilarityMatrix(
    termsim_index, dictionary, tfidf=None, nonzero_limit=100).matrix

  """Entry point for launching an IPython kernel.


In [38]:
# Turn each document into a bag-of-words vector using the shared dictionary.
sentences = [
    dictionary.doc2bow(simple_preprocess(text))
    for text in (doc_trump, doc_election, doc_putin, doc_soup, doc_noodles, doc_dosa)
]
# Keep the individual names available for one-off comparisons.
sent_1, sent_2, sent_3, sent_4, sent_5, sent_6 = sentences

In [41]:
# Compute soft cosine similarity between the first two documents
# (sent_1 is doc_trump, sent_2 is doc_election)
softcossim(sent_1, sent_2, similarity_matrix)


  


0.5842470143211804

In [44]:
import numpy as np
import pandas as pd

def create_soft_cossim_matrix(sentences, term_similarity=None):
    """Build the table of pairwise soft cosine similarities.

    Parameters
    ----------
    sentences : iterable
        Bag-of-words vectors, one per document, in the form accepted by
        ``softcossim``.
    term_similarity : optional
        Term similarity matrix forwarded to ``softcossim``. When omitted,
        the notebook-level ``similarity_matrix`` is used.

    Returns
    -------
    pandas.DataFrame
        Square frame with default integer labels. Entry ``[i, j]`` is
        ``softcossim(sentences[i], sentences[j], ...)`` rounded to two
        decimal places.
    """
    if term_similarity is None:
        # Fall back to the matrix built earlier in the notebook.
        term_similarity = similarity_matrix
    # Materialize once so any iterable can be traversed for rows and columns.
    sentences = list(sentences)
    return pd.DataFrame([
        [round(softcossim(row_doc, col_doc, term_similarity), 2) for col_doc in sentences]
        for row_doc in sentences
    ])

# Pairwise soft cosine similarities for all six documents (rows/columns follow the order of `sentences`)
create_soft_cossim_matrix(sentences)

  import sys


Unnamed: 0,0,1,2,3,4,5
0,1.0,0.58,0.56,0.28,0.34,0.4
1,0.58,1.0,0.54,0.25,0.31,0.43
2,0.56,0.54,1.0,0.19,0.25,0.36
3,0.28,0.25,0.19,1.0,0.5,0.38
4,0.34,0.31,0.25,0.5,1.0,0.56
5,0.4,0.43,0.36,0.38,0.56,1.0
