# Feature Engineering (Text similarity)

In [3]:
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Single lemmatizer instance reused by the Jaccard similarity function below.
lemmatizer = WordNetLemmatizer()

In [2]:
# Three sentence pairs used as running examples in the cells that follow.
pair1 = [
    "What you do defines you",
    "Your deeds define you",
]
pair2 = [
    "Once upon a time there lived a king.",
    "Who is your queen?",
]
pair3 = [
    "He is desperate",
    "Is he not desperate?",
]

# One pair per output line.
print(pair1, pair2, pair3, sep="\n")

['What you do defines you', 'Your deeds define you']
['Once upon a time there lived a king.', 'Who is your queen?']
['He is desperate', 'Is he not desperate?']


# Jaccard similarity

In [23]:
def extract_text_similarity_jaccard(text1,text2):
    """Return the Jaccard similarity of the lemmatized token sets of two texts.

    Each text is tokenized with ``word_tokenize``; every token is lower-cased
    and passed through the module-level ``lemmatizer``. The score is the size
    of the intersection of the two token sets divided by the size of their
    union.

    Args:
        text1: First text to compare.
        text2: Second text to compare.

    Returns:
        The ratio ``len(A & B) / len(A | B)`` as a float, or ``0.0`` when
        neither text produces any token (which would otherwise divide by zero).
    """
    # Build each token set once; duplicates are irrelevant for Jaccard.
    tokens1 = {lemmatizer.lemmatize(word.lower()) for word in word_tokenize(text1)}
    tokens2 = {lemmatizer.lemmatize(word.lower()) for word in word_tokenize(text2)}

    all_tokens = tokens1 | tokens2
    if not all_tokens:
        # Both inputs are empty: there is nothing to compare, so report no overlap
        # instead of raising ZeroDivisionError.
        return 0.0

    return len(tokens1 & tokens2) / len(all_tokens)

In [24]:
# Jaccard similarity of the two pair1 sentences.
extract_text_similarity_jaccard(pair1[0],pair1[1])

0.14285714285714285

In [25]:
# Jaccard similarity of the two pair2 sentences.
extract_text_similarity_jaccard(pair2[0],pair2[1])

0.0

In [26]:
# Jaccard similarity of the two pair3 sentences.
extract_text_similarity_jaccard(pair3[0],pair3[1])

0.6

# Cosine Similarity

In [36]:
# TF-IDF vectorizer created with its default settings.
tfidf_model = TfidfVectorizer()

# Flatten the three pairs into one list: first and second sentence of each pair, in order.
corpus = [pair[position] for pair in (pair1, pair2, pair3) for position in (0, 1)]

# Show one sentence per line.
print(*corpus, sep="\n")

What you do defines you
Your deeds define you
Once upon a time there lived a king.
Who is your queen?
He is desperate
Is he not desperate?


In [28]:
# Fit the vectorizer on the corpus and transform the same sentences in one step.
tfidf = tfidf_model.fit_transform(corpus)
# One row per sentence in `corpus`, one column per term in the fitted vocabulary.
tfidf.shape

(6, 19)

In [35]:
# Inspect the TF-IDF weights: one row per sentence, one column per vocabulary term.
import pandas as pd
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it.
# toarray() yields a plain ndarray instead of the np.matrix returned by todense().
pd.DataFrame(tfidf.toarray(),columns=tfidf_model.get_feature_names_out())

Unnamed: 0,deeds,define,defines,desperate,do,he,is,king,lived,not,once,queen,there,time,upon,what,who,you,your
0,0.0,0.0,0.419233,0.0,0.419233,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.419233,0.0,0.687554,0.0
1,0.546779,0.546779,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.448367,0.448367
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.408248,0.408248,0.0,0.408248,0.0,0.408248,0.408248,0.408248,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.389967,0.0,0.0,0.0,0.0,0.563282,0.0,0.0,0.0,0.0,0.563282,0.0,0.4619
4,0.0,0.0,0.0,0.607144,0.0,0.607144,0.512593,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.487953,0.0,0.487953,0.411964,0.0,0.0,0.595054,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [39]:
def get_cosine_similarity(idx1,idx2):
    """Compute the cosine similarity between two entries of the ``tfidf`` matrix.

    ``idx1`` and ``idx2`` are used to index the module-level ``tfidf``; the two
    selections are handed to ``cosine_similarity`` and its result is returned
    unchanged.
    """
    first_vector = tfidf[idx1]
    second_vector = tfidf[idx2]
    return cosine_similarity(first_vector, second_vector)

In [41]:
# Cosine similarity of the pair1 sentences (corpus entries 0 and 1).
get_cosine_similarity(0,1)

array([[0.3082764]])

In [42]:
# Cosine similarity of the pair2 sentences (corpus entries 2 and 3).
get_cosine_similarity(2,3)

array([[0.]])

In [43]:
get_cosine_similarity(4,5) # pair3 (corpus entries 4 and 5): high score even though the second sentence adds "not"

array([[0.80368547]])

We can confirm that, on these examples, cosine similarity gives better results than Jaccard similarity.