# Cosine Similarity

In [1]:
import numpy as np
import pandas as pd
import re
import nltk
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [2]:
# Two small example documents that are cleaned, vectorized and compared below.
doc1 = " This is a very good and plain paper. This is really good and interesting"
doc2 = "This paper is very interesting, awesome"

In [3]:
# A sentence or longer text can be split into word tokens with word_tokenize.
tokens = word_tokenize(doc1)
tokens
# Note: punctuation such as '.' comes back as its own token, so it still has to be filtered out afterwards.

['This',
 'is',
 'a',
 'very',
 'good',
 'and',
 'plain',
 'paper',
 '.',
 'This',
 'is',
 'really',
 'good',
 'and',
 'interesting']

In [4]:
def clean_text(sent):
    """Return `sent` with stopwords, punctuation and very short tokens removed.

    Surrounding spaces are stripped, runs of whitespace are collapsed to a
    single space, and the text is tokenized with ``word_tokenize``. A token is
    kept only if it is not in the NLTK English stopword list, is not a single
    punctuation character, and is longer than two characters.

    Parameters
    ----------
    sent : str
        Text to clean. Tokens are compared to the stopword list with an exact,
        case-sensitive match; the callers in this notebook lowercase the text
        before passing it in.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    sent = sent.strip(" ")
    # Raw string: "\s" inside a normal string literal is an invalid escape sequence.
    sent = re.sub(r"\s+", " ", sent)
    tokens = word_tokenize(sent)
    # Stopwords and punctuation are filtered together; a set gives constant-time
    # membership tests instead of scanning a list for every token.
    stop_updated = set(stopwords.words("english")) | set(punctuation)
    # Keep only tokens with more than 2 characters that are not filtered out.
    final_word = [term for term in tokens if term not in stop_updated and len(term) > 2]
    return " ".join(final_word)

In [5]:
# Clean the lowercased first document and display the result.
doc1_clean = clean_text(doc1.lower())
doc1_clean

'good plain paper really good interesting'

In [6]:
# Clean the lowercased second document and display the result.
doc2_clean = clean_text(doc2.lower())
doc2_clean

'paper interesting awesome'

In [7]:
# Collect the cleaned documents in a one-column DataFrame (one row per document).
doc = pd.DataFrame([doc1_clean, doc2_clean], columns = ['cleaned_text'])
doc

Unnamed: 0,cleaned_text
0,good plain paper really good interesting
1,paper interesting awesome


## Extract features from the text
- CountVectorizer extracts features from the text 
- CountVectorizer lets us apply some transformations to the text so that we arrive at a matrix of numbers (word counts per document)

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
# Vectorizer created with its default settings.
cv = CountVectorizer()

- fit: extract distinct words from the text corpus to form a bag of words

In [9]:
cv.fit(doc['cleaned_text'])
# Learns the vocabulary, i.e. the distinct words found in the cleaned documents.

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [10]:
# Obtain the bag-of-words vocabulary (the feature names, in column order).
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. It returns an ndarray, so convert to a plain list for display.
cv.get_feature_names_out().tolist()

['awesome', 'good', 'interesting', 'paper', 'plain', 'really']

In [11]:
cv.vocabulary_
# Maps each word to its index among the features.

{'good': 1,
 'plain': 4,
 'paper': 3,
 'really': 5,
 'interesting': 2,
 'awesome': 0}

In [12]:
x = cv.transform(doc['cleaned_text'])
x
# This gives a sparse matrix with 2 rows (documents) and 6 columns (features, i.e. the unique words).

<2x6 sparse matrix of type '<class 'numpy.int64'>'
	with 8 stored elements in Compressed Sparse Row format>

In [13]:
# Show the word counts as a dense NumPy array.
x.toarray()

array([[0, 2, 1, 1, 1, 1],
       [1, 0, 1, 1, 0, 0]], dtype=int64)

In [14]:
x.todense()
# todense() returns the same counts as a matrix object rather than an array.

matrix([[0, 2, 1, 1, 1, 1],
        [1, 0, 1, 1, 0, 0]], dtype=int64)

In [15]:
# Document-term matrix (term-frequency matrix): one row per document.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement and returns the same names in the same order.
dtm = pd.DataFrame(x.toarray(), columns = cv.get_feature_names_out())
dtm
# The features (words) are the columns.

Unnamed: 0,awesome,good,interesting,paper,plain,really
0,0,2,1,1,1,1
1,1,0,1,1,0,0


In [16]:
# Transpose the document-term matrix.
dtm.T
# The result is the term-document matrix: the features (words) are now the index.

Unnamed: 0,0,1
awesome,0,1
good,2,0
interesting,1,1
paper,1,1
plain,1,0
really,1,0


# COSINE SIMILARITY

In [28]:
from sklearn.metrics.pairwise import cosine_similarity
# Display the document-term matrix that the similarities are computed from.
dtm

Unnamed: 0,awesome,good,interesting,paper,plain,really
0,0,2,1,1,1,1
1,1,0,1,1,0,0


In [29]:
# Pairwise cosine similarity between the documents (the rows of the document-term matrix).
cs = cosine_similarity(dtm)
# Either a document-term matrix or a TF-IDF matrix can be passed in.
cs
# Each document has similarity 1 with itself; the similarity between the two documents is 0.40824829.

array([[1.        , 0.40824829],
       [0.40824829, 1.        ]])

In [31]:
# Pairwise cosine similarity between words across the corpus.
# The transpose of the document-term matrix is called the term-document matrix (TDM).
cs_words = cosine_similarity(dtm.T) # pass the transpose of dtm so that each row is a word and similarities are computed between words
cs_words

array([[1.        , 0.        , 0.70710678, 0.70710678, 0.        ,
        0.        ],
       [0.        , 1.        , 0.70710678, 0.70710678, 1.        ,
        1.        ],
       [0.70710678, 0.70710678, 1.        , 1.        , 0.70710678,
        0.70710678],
       [0.70710678, 0.70710678, 1.        , 1.        , 0.70710678,
        0.70710678],
       [0.        , 1.        , 0.70710678, 0.70710678, 1.        ,
        1.        ],
       [0.        , 1.        , 0.70710678, 0.70710678, 1.        ,
        1.        ]])

In [32]:
# Label both axes of the word-similarity matrix with the words themselves.
sim_df = pd.DataFrame(cs_words, columns = dtm.columns, index = dtm.columns)
sim_df

Unnamed: 0,awesome,good,interesting,paper,plain,really
awesome,1.0,0.0,0.707107,0.707107,0.0,0.0
good,0.0,1.0,0.707107,0.707107,1.0,1.0
interesting,0.707107,0.707107,1.0,1.0,0.707107,0.707107
paper,0.707107,0.707107,1.0,1.0,0.707107,0.707107
plain,0.0,1.0,0.707107,0.707107,1.0,1.0
really,0.0,1.0,0.707107,0.707107,1.0,1.0


# Example:

In [34]:
# Each document is built from adjacent string literals, which Python concatenates.
# Every piece but the last ends with a space so that consecutive sentences are
# not glued together (a backslash continuation inside the literal produced
# "election.Though", "outcome.He", ...).
doc_trump = (
    "Mr. Trump became president after winning the political election. "
    "Though he lost the support of some republican friends, Trump is friends with President Putin"
)

doc_election = (
    "President Trump says Putin had no political interference in the election outcome. "
    "He says it was a witchhunt by political parties. "
    "He claimed President Putin is a friend who had nothing to do with the election"
)

doc_putin = (
    "Post elections, Vladimir Putin became President of Russia. "
    "President Putin had served as the Prime Minister earlier in his political career"
)

In [36]:
# One row per example document, with the raw text in the 'Text' column.
df = pd.DataFrame([doc_trump, doc_election, doc_putin], columns = ['Text'])
df

Unnamed: 0,Text
0,Mr. Trump became president after winning the p...
1,President Trump says Putin had no political in...
2,"Post elections, Vladimir Putin became Presiden..."


In [38]:
# Build the document-term matrix for the three example documents, letting
# CountVectorizer remove English stopwords itself.
cv = CountVectorizer(stop_words = 'english')
x = cv.fit_transform(df['Text'])
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement and returns the same names in the same order.
dtm = pd.DataFrame(x.toarray(), columns = cv.get_feature_names_out())
dtm

Unnamed: 0,career,claimed,earlier,election,elections,friend,friends,interference,lost,minister,...,putin,republican,russia,says,served,support,trump,vladimir,winning,witchhunt
0,0,0,0,1,0,0,2,0,1,0,...,1,1,0,0,0,1,2,0,1,0
1,0,1,0,2,0,1,0,1,0,0,...,2,0,0,2,0,0,1,0,0,1
2,1,0,1,0,1,0,0,0,0,1,...,2,0,1,0,1,0,0,1,0,0


- Find the pairwise cosine matrix between documents and words.
- Given an input document, find the document which is most similar.
- Given an input word, find the top 5 words with the highest similarity score.

In [41]:
# Pairwise cosine similarity between the three documents, labelled by document name on both axes.
cs_documents = cosine_similarity(dtm)
df_cs_documents = pd.DataFrame(cs_documents, columns = ['doc_trump','doc_election','doc_putin'], index = ['doc_trump','doc_election','doc_putin'])
df_cs_documents

Unnamed: 0,doc_trump,doc_election,doc_putin
doc_trump,1.0,0.516398,0.368932
doc_election,0.516398,1.0,0.453609
doc_putin,0.368932,0.453609,1.0


In [58]:
# Rank the other documents by their similarity to 'doc_trump'.
# The document's own entry is removed by label rather than by slicing off the
# first position: the self-similarity of 1.0 is only guaranteed to sort first
# when no other document ties with it.
df_cs_documents['doc_trump'].sort_values(ascending = False).drop('doc_trump')
# These are the documents most similar to 'doc_trump', best match first

doc_election    0.516398
doc_putin       0.368932
Name: doc_trump, dtype: float64

In [56]:
# Term-document matrix: one row per word, one column per document.
dtm.T

Unnamed: 0,0,1,2
career,0,0,1
claimed,0,1,0
earlier,0,0,1
election,1,2,0
elections,0,0,1
friend,0,1,0
friends,2,0,0
interference,0,1,0
lost,1,0,0
minister,0,0,1


In [42]:
# Pairwise cosine similarity between words, labelled by word on both axes.
cs_words = cosine_similarity(dtm.T)
df_cs_words = pd.DataFrame(cs_words, columns = dtm.columns, index = dtm.columns)
df_cs_words

Unnamed: 0,career,claimed,earlier,election,elections,friend,friends,interference,lost,minister,...,putin,republican,russia,says,served,support,trump,vladimir,winning,witchhunt
career,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.666667,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
claimed,0.0,1.0,0.0,0.894427,0.0,1.0,0.0,1.0,0.0,0.0,...,0.666667,0.0,0.0,1.0,0.0,0.0,0.447214,0.0,0.0,1.0
earlier,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.666667,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
election,0.0,0.894427,0.0,1.0,0.0,0.894427,0.447214,0.894427,0.447214,0.0,...,0.745356,0.447214,0.0,0.894427,0.0,0.447214,0.8,0.0,0.447214,0.894427
elections,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.666667,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
friend,0.0,1.0,0.0,0.894427,0.0,1.0,0.0,1.0,0.0,0.0,...,0.666667,0.0,0.0,1.0,0.0,0.0,0.447214,0.0,0.0,1.0
friends,0.0,0.0,0.0,0.447214,0.0,0.0,1.0,0.0,1.0,0.0,...,0.333333,1.0,0.0,0.0,0.0,1.0,0.894427,0.0,1.0,0.0
interference,0.0,1.0,0.0,0.894427,0.0,1.0,0.0,1.0,0.0,0.0,...,0.666667,0.0,0.0,1.0,0.0,0.0,0.447214,0.0,0.0,1.0
lost,0.0,0.0,0.0,0.447214,0.0,0.0,1.0,0.0,1.0,0.0,...,0.333333,1.0,0.0,0.0,0.0,1.0,0.894427,0.0,1.0,0.0
minister,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.666667,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [57]:
# Remove the word's own entry by label instead of slicing [1:6]: any word with
# the same count profile also scores 1.0, so the word itself is not guaranteed
# to be in first position after sorting.
df_cs_words['election'].sort_values(ascending = False).drop('election').head(5)
# These are the top 5 most similar words to the word 'election'

political       0.912871
parties         0.894427
claimed         0.894427
friend          0.894427
interference    0.894427
Name: election, dtype: float64

In [59]:
# Creating a user-defined function to get similar words
def get_similar_words(inp_word, sim_df, top_n=5):
    """Return the words most similar to `inp_word`, best match first.

    Parameters
    ----------
    inp_word : str
        Word to look up; it must be a column label of `sim_df`.
    sim_df : pandas.DataFrame
        Word-by-word similarity matrix whose index and columns are both the words.
    top_n : int, default 5
        Number of similar words to return.

    Returns
    -------
    pandas.Series
        Up to `top_n` similarity scores indexed by word, sorted in descending
        order, with `inp_word` itself excluded.

    Raises
    ------
    KeyError
        If `inp_word` is not a column of `sim_df`.
    """
    if inp_word not in sim_df.columns:
        raise KeyError(f"{inp_word!r} is not in the similarity matrix")
    cos_values = sim_df[inp_word].sort_values(ascending = False)
    # Drop the word itself by label so ties at 1.0 cannot push it out of first place.
    similar_words = cos_values.drop(inp_word).head(top_n)
    return similar_words

In [60]:
# Top 5 words most similar to 'election', using the word-similarity matrix.
get_similar_words('election',df_cs_words)

political       0.912871
parties         0.894427
claimed         0.894427
friend          0.894427
interference    0.894427
Name: election, dtype: float64