#### GloVe Embedding

In [9]:
# import libraries
import re

import numpy as np
import scipy
import scipy.spatial.distance
from nltk.corpus import stopwords



# Path to the GloVe vector file, relative to the current working directory.
# A forward slash is used because it is accepted by open() on Windows as well
# as POSIX systems, whereas a backslash is only a separator on Windows.
gloveFile = "data/glove.6B.50d.txt"
def loadGloveModel(gloveFile):
    """Load word embeddings from a GloVe-style text file into a dictionary.

    Every non-blank line is split on whitespace; the first token is used as
    the word and the remaining tokens are parsed as floats to form its vector.

    Parameters
    ----------
    gloveFile : str or path-like
        Location of the UTF-8 encoded embedding file.

    Returns
    -------
    dict
        Maps each word to a 1-D ``numpy.ndarray`` of floats. If a word occurs
        on several lines, the last occurrence wins.

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If a vector component cannot be parsed as a float.
    """
    print ("Loading Glove Model")

    # initialise dictionary model
    model = {}

    # Stream the file line by line instead of materialising every line with
    # readlines(): the raw text then never sits in memory next to the dict.
    with open(gloveFile, encoding="utf8" ) as f:
        for line in f:
            splitLine = line.split()
            # Blank lines (e.g. a trailing newline at the end of the file)
            # carry no word and would otherwise fail on splitLine[0].
            if not splitLine:
                continue
            word = splitLine[0]
            embedding = np.array([float(val) for val in splitLine[1:]])
            model[word] = embedding
    print ("Done.",len(model)," words loaded!")
    return model

# Load the embeddings once into the notebook-level `model` dictionary;
# cosine_distance_wordembedding_method looks words up in this global.
model = loadGloveModel(gloveFile)

Loading Glove Model
Done. 400000  words loaded!


In [6]:


# Lazily-filled cache of the NLTK English stopword list, so the corpus is read
# at most once instead of on every preprocess() call.
_ENGLISH_STOPWORDS = None


def preprocess(raw_text, stopword_set=None):
    """Turn raw text into a list of unique, lower-cased, non-stopword words.

    Every character that is not an ASCII letter is replaced by a space, the
    text is lower-cased and split on whitespace, stopwords are dropped and
    duplicates are removed.

    Parameters
    ----------
    raw_text : str
        Text to clean.
    stopword_set : collection of str, optional
        Words to discard. When omitted, the NLTK English stopword list is
        used (loaded on first use and cached afterwards).

    Returns
    -------
    list of str
        The distinct remaining words, in order of first appearance. The order
        is deterministic, unlike a ``list(set(...))`` round trip, whose order
        can change between interpreter runs.
    """
    global _ENGLISH_STOPWORDS

    if stopword_set is None:
        if _ENGLISH_STOPWORDS is None:
            _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
        stopword_set = _ENGLISH_STOPWORDS

    # keep only letters
    letters_only_text = re.sub("[^a-zA-Z]", " ", raw_text)

    # convert to lower case and split
    words = letters_only_text.lower().split()

    # remove stopwords; dict.fromkeys de-duplicates while keeping first-seen order
    cleaned_words = list(dict.fromkeys(w for w in words if w not in stopword_set))

    return cleaned_words

def _mean_embedding(text):
    """Average the embeddings of the preprocessed words of `text`; None if no word is in `model`."""
    # Words missing from the embedding dictionary are skipped instead of
    # raising KeyError.
    vectors = [model[word] for word in preprocess(text) if word in model]
    if not vectors:
        return None
    return np.mean(vectors, axis=0)


def cosine_distance_wordembedding_method(s1, s2):
    """Score how similar two texts are, as a percentage.

    Each text is cleaned with ``preprocess``, its words are looked up in the
    notebook-level ``model`` dictionary, and the word vectors are averaged.
    Despite the function's name, the value returned is the cosine *similarity*
    of the two averaged vectors (``1 - cosine distance``) multiplied by 100.

    Parameters
    ----------
    s1, s2 : str
        The two texts to compare.

    Returns
    -------
    float
        Similarity percentage rounded to two decimals. ``0.0`` is returned
        when either text has no word present in ``model`` (for example when
        it consists only of stopwords), because no vector can be built for it.
    """
    vector_1 = _mean_embedding(s1)
    vector_2 = _mean_embedding(s2)

    # Without a vector for both texts there is nothing to compare; averaging
    # an empty list would only produce NaN and a runtime warning.
    if vector_1 is None or vector_2 is None:
        return 0.0

    cosine = scipy.spatial.distance.cosine(vector_1, vector_2)
    return round((1-cosine)*100,2)

In [7]:
# Example of semantic similarity: two differently worded sentences are compared
# and the rounded similarity percentage returned by the function is printed.
a1='his house has better goods'
a2='he got some good products in his home'
print(cosine_distance_wordembedding_method(a1, a2))

89.12


In [16]:
# Alternative way to supply the sentences: read them interactively.
#
# n=int(input("enter the number of sentences"))
# sentences_list=[]
# for i in range(0,n):
#     sentences_list.append(input(f"enter the {i}th sentence "))

# input texts
s1 = 'she is combing her hair'
s2 = 'she is brushing her hair'
s3 = 'she is eating her food'
s4 = 'he came for dinner tonight'
s5 = 'he spoke to her about dinner'
s6 = 'teacher spoke to him'
s7 = 'he teaches science'
s8 = 'science is the key for the future'
s9 = 'he can predict the future'
s10 = 'he designed prediction algorithm'

# list of texts to compare
l = [s1, s2, s3, s4, s5, s6, s7, s8, s9, s10]

# threshold similarity value (percent) a pair must reach to be reported as a match
SIMILARITY_THRESHOLD = 50
# number of lowest-scoring entries printed by the second method
TOP_UNIQUE_COUNT = 5

# list of [text1, text2, similarity] triplets for every ordered pair of texts,
# stored row by row: the comparisons of l[k] occupy li[k*len(l):(k+1)*len(l)]
li = []
# average similarity of each text against every text in the list (itself included)
sum2 = []

for first_text in l:
    sum1 = 0
    for second_text in l:
        # calculating similarity between two texts
        score = cosine_distance_wordembedding_method(first_text, second_text)
        li.append([first_text, second_text, score])
        sum1 += score

    # calculating average over the actual number of texts
    sum2.append(sum1/len(l))

for s, text in enumerate(l):

    # prints text with the similar texts of its group
    print(f"{s+1}. {text} matches with :")
    row = li[s*len(l):(s+1)*len(l)]
    for t, triplet in enumerate(row):
        # Skip the comparison of a text with itself by position rather than by
        # testing for a score of 100, which would also hide a different text
        # that happens to score 100.
        if t == s:
            continue
        if triplet[2] >= SIMILARITY_THRESHOLD:
            print('        ',triplet[1],':',triplet[2])
    print("")


# to fetch the lowest average value
low = min(sum2)
ind = sum2.index(low)

# printing unique sentence based on lowest average value of similarity
print('first method : unique sentence')
print('      ',l[ind])
print("")


# printing top unique sentences based on lowest value of similarity of each text.
# li holds both (a, b) and (b, a) for every pair, so both members of a
# low-scoring pair are listed.
less = sorted(li, key=lambda triplet: triplet[2])
print('second method : top uniq sentences')
# slicing copes with fewer than TOP_UNIQUE_COUNT entries
for entry in less[:TOP_UNIQUE_COUNT]:
    print('      ',entry[0],entry[2])
    



1. she is combing her hair matches with :
         she is brushing her hair : 81.23

2. she is brushing her hair matches with :
         she is combing her hair : 81.23
         she is eating her food : 50.2

3. she is eating her food matches with :
         she is brushing her hair : 50.2
         he came for dinner tonight : 53.73
         he spoke to her about dinner : 51.45

4. he came for dinner tonight matches with :
         she is eating her food : 53.73
         he spoke to her about dinner : 84.3
         teacher spoke to him : 58.56
         science is the key for the future : 53.82

5. he spoke to her about dinner matches with :
         she is eating her food : 51.45
         he came for dinner tonight : 84.3
         teacher spoke to him : 77.18

6. teacher spoke to him matches with :
         he came for dinner tonight : 58.56
         he spoke to her about dinner : 77.18
         he teaches science : 63.88
         science is the key for the future : 57.57

7. he teache