# Sentence Similarity : Demo

## Import libraries

In [1]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter


## Data
We have a corpus of sentences and a query. The main objective is to find out most similar sentence from the given corpus and print the corresponding cosine similarity with that query.

In [2]:
# Just for illustration, let
sent_corpus = ['How old are you?', 'How are you?', 'What is your name?', 'Where do you live?', 'In which class do you study?']
query = "What is your age?"

## Approach
Transfer Learning:
 * Word2Vec + Cosine Similarity
 * ......

In [3]:
# import library for pretrained word embeddings
import gensim
from gensim.models import KeyedVectors

In [4]:
# wv_embeddings contain word as a key and embedding vector as a value
wv_embeddings = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True, limit=500000, unicode_errors='ignore')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


gensim.models.keyedvectors.Word2VecKeyedVectors

## Preprocessing

In [5]:
# A function for preprocessing text data
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
def preprocess(raw_text):

    # Remove any non-alphabetic characters if any
    letters_only_text = re.sub("[^a-zA-Z]", " ", raw_text)

    # convert to lower case and split 
    words = letters_only_text.lower().split()

    # remove stopwords 
    stopword_set = set(stopwords.words("english"))
    cleaned_text = list(set([w for w in words if w not in stopword_set]))

    return " ".join(cleaned_text)


[nltk_data] Downloading package stopwords to C:\Users\Ajit kumar
[nltk_data]     Singh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## From word to sentence embeddings

In [6]:
# define a funtion which takes a sentence(question) as input and return average embedding as output
def question2vec(question, embeddings = wv_embeddings, dim=300):
    
    vec = np.zeros((dim,), dtype=np.float32)
    count = 0
    for w in question.split():
        if w in embeddings:
            count += 1
            vec += embeddings[w]
    if count == 0:
        return vec
    return vec/count  # return mean embedding value

In [7]:
# for give corpus and query find out the cosine_similarity
similarity_list = []
query_embedding = question2vec(preprocess(query))
for sent in sent_corpus:
    sim = cosine_similarity(query_embedding.reshape(1, -1), question2vec(sent).reshape(1, -1))[0][0]
    similarity_list.append(sim)
    

In [9]:
# print most similar question for the given query
max_value = max(similarity_list)
print("For the give query -> "+query+" most similar question is -> "+sent_corpus[similarity_list.index(max_value)])
print("Corresponding cosine similarity is ", max_value)

For the give query -> What is your age? most similar question is -> How old are you?
Corresponding cosine similarity is  0.30646223
