## Universal Sentence Encoder
Model for encoding sentences into embedding vectors and then finding similarity between t


In [48]:
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import json
import re
import pandas as pd
from tqdm import tqdm
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from keras.preprocessing.text import Tokenizer


In [49]:
#loading the Universal Sentence Encoder module from tensorflow hub.
module_url = "https://tfhub.dev/google/nnlm-en-dim128/2"
embed = hub.KerasLayer(module_url)

#set of stop words to be removed while text pre-processing
stop_words = set(stopwords.words('english')) 

In [41]:
 

def decontracted(phrase):
    
    ''' method to de-contract the common phrases '''
    
    # specific
    phrase = re.sub(r"won\'t", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase


def striphtml(data): 
    ''' method to remove html tags from the text'''
    cleanr = re.compile('<.*?>') 
    cleantext = re.sub(cleanr, ' ', str(data)) 
    return cleantext


def stripunc(data):
    ''' method to remove punctuations and numbers'''
    return re.sub('[^A-Za-z]+', ' ', str(data), flags=re.MULTILINE|re.DOTALL)


def compute(sent): 
    '''Phrases in sentences are de-contracted, html tags and punctuations are removed and
    finally words are brought back to their base form through lemmatization'''
    
    lemmatizer = WordNetLemmatizer()
    
    sent = decontracted(sent)   #decontracts the phrase
    sent = striphtml(sent)      #strips html from the sentence
    sent = stripunc(sent)       #removes the puncutations
    words=word_tokenize(str(sent.lower())) #turns sentences in to lower case and then tokenizer it 
    words = [lemmatizer.lemmatize(word) for word in words] #lemmatization ie, changing words to their base form
    words = [word for word in words if word not in stop_words] #removing stopwords
    sent1 = ' '.join(words)
    
    return sent1

In [42]:
#reading the dataset
df = pd.read_csv('C:\\Users\Anuj\\OneDrive\\Desktop\\Precily_Assignment\\Text_Similarity_Dataset.csv')


#apply text-preprocessing ie., removing stop words, stiping punctuations and html code and lemmatization
df['text1'] = df['text1'].apply(lambda x: compute(x)) 
df['text2'] = df['text2'].apply(lambda x: compute(x))

In [43]:
df.head()

Unnamed: 0,Unique_ID,text1,text2
0,0,savvy searcher fail spot ad internet search en...,newcastle bolton kieron dyer smashed home winn...
1,1,million miss net uk population still without i...,nasdaq planning share sale owner technology do...
2,2,young debut cut short ginepri fifteen year old...,ruddock back yapp credential wale coach mike r...
3,3,diageo buy u wine firm diageo world biggest sp...,mci share climb takeover bid share u phone com...
4,4,careful code new european directive could put ...,medium gadget get moving pocket sized device l...


In [51]:
def cosine_similarity(v1, v2):
    '''Method to calcualte cosine similarity betweeen two vectors'''
    mag1 = np.linalg.norm(v1)
    mag2 = np.linalg.norm(v2)
    if (not mag1) or (not mag2):
        return 0
    return 1 - np.dot(v1, v2) / (mag1 * mag2)


In [44]:
similarity = []  #to hold the similarity between text1 and text2

#iterate over the dataset and finds the similarity between every text in text1 and text2
for i in range(0,len(df)):
    #calling the cosine similarity function with one instance from text1 and text2
    similarity.append( cosine_similarity(embed([df.iloc[i,1]])[0],   embed([df.iloc[i,2]])[0]) )
    
#saving the results back to the dataframe    
df['similarity']=similarity
df.to_csv('C:\\Users\Anuj\\OneDrive\\Desktop\\Precily_Assignment\\Text_Similarity_Dataset_new.csv')

In [45]:
print(similarity[:10])

[0.5028485953807831, 0.32320380210876465, 0.30119407176971436, 0.32034367322921753, 0.22262650728225708, 0.2577289342880249, 0.46426230669021606, 0.17850518226623535, 0.2861446142196655, 0.10078173875808716]
