# Sentence Similarity
The purpose of this notebook is to compare three of the methods explained in the README of the Sentence Similarity folder. The contents of the notebook are the following:
1. Data Preparation and Preprocessing
2. Similarity Score (cosine similarity)
3. Cosine Similarity + GloVe

## Data Preparation and Preprocessing

In [67]:
import numpy as np 
import pandas as pd
import os
import re
from nltk.corpus import stopwords
import scipy

import nltk
from nltk.tokenize import word_tokenize

In [64]:
def read_data(file_name):
    """Load word vectors from a whitespace-separated embeddings text file.

    Every non-blank line is split on whitespace: the first field is taken as
    the word and the remaining fields as the components of its vector.

    Parameters
    ----------
    file_name : str or path-like
        Path of the embeddings text file to read.

    Returns
    -------
    word_vocab : set of str
        All words found in the file.
    word2vector : dict of str -> numpy.ndarray
        Maps each word to its vector as a float array.

    Raises
    ------
    ValueError
        If a vector component cannot be converted to float.
    """
    word_vocab = set()
    word2vector = {}
    # Explicit UTF-8 so the result does not depend on the platform's default encoding.
    with open(file_name, 'r', encoding='utf-8') as f:
        for line in f:
            words_Vec = line.split()  # split() also discards surrounding white space
            if not words_Vec:
                # Blank line (for example a trailing newline): nothing to parse.
                continue
            word = words_Vec[0]
            word_vocab.add(word)
            word2vector[word] = np.array(words_Vec[1:], dtype=float)
    print("Total Words in DataSet:",len(word_vocab))
    return word_vocab,word2vector

In [6]:
# Read the GloVe vectors. Set the GLOVE_PATH environment variable to point at
# another copy of the file; otherwise the original local path is used.
vocab, w2v = read_data(
    os.environ.get("GLOVE_PATH", "/Users/cblanesg/Downloads/glove.6B.50d.txt")
)

Total Words in DataSet: 400000


## Cosine Similarity

In [68]:
# The two example sentences compared throughout the notebook.
x, y = (
    'The president greets the press in Chicago',
    'Obama speaks to the media in Illinois',
)

In [69]:
# Tokenise both sentences (x first, then y).
X_list, Y_list = (word_tokenize(sentence) for sentence in (x, y))
# English stopword list used to filter the tokens.
sw = stopwords.words('english')
# Containers for the two 0/1 vectors built from the filtered tokens.
l1, l2 = [], []

In [70]:
# Normalise case before filtering: the NLTK English stopword list is lowercase,
# so a capitalised token such as "The" would otherwise slip through, and
# "Chicago" / "chicago" would be treated as two different words.
sw_lookup = set(sw)  # set membership instead of scanning the list for every token
X_set = {w.lower() for w in X_list if w.lower() not in sw_lookup}
Y_set = {w.lower() for w in Y_list if w.lower() not in sw_lookup}

In [71]:
# Union of the words of both sentences; each sentence becomes a 0/1 vector over it.
rvector = X_set.union(Y_set)
# Rebuild both vectors from scratch so that re-running this cell cannot keep
# appending to lists left over from a previous execution.
l1 = [1 if w in X_set else 0 for w in rvector]
l2 = [1 if w in Y_set else 0 for w in rvector]
c = 0

In [74]:
# Dot product of the two 0/1 vectors, recomputed from scratch so the cell gives
# the same answer however many times it is executed.
c = sum(a * b for a, b in zip(l1, l2))
# For 0/1 vectors the squared norm equals the number of ones.
norm_product = float((sum(l1) * sum(l2)) ** 0.5)
# A sentence with no remaining words is a zero vector: report 0 instead of dividing by zero.
cosine = c / norm_product if norm_product else 0.0
# `cosine` is a fraction between 0 and 1; scale it so the number matches the '%' label.
print("similarity: ", cosine * 100, '%')

similarity:  0.0 %


## Cosine Similarity + GloVe

In [36]:
def loadGloveModel(gloveFile):
    """Load word embeddings from a whitespace-separated text file.

    Every non-blank line is split on whitespace: the first field is the word
    and the remaining fields are the components of its embedding.

    Parameters
    ----------
    gloveFile : str or path-like
        Path of the embeddings text file to read.

    Returns
    -------
    dict of str -> numpy.ndarray
        Maps each word to its embedding as a float array.

    Raises
    ------
    ValueError
        If an embedding component cannot be converted to float.
    """
    print("Loading Glove Model")
    model = {}
    # The context manager closes the file even if parsing fails; explicit UTF-8
    # keeps the result independent of the platform's default encoding.
    with open(gloveFile, 'r', encoding='utf-8') as f:
        for line in f:
            splitLine = line.split()
            if not splitLine:
                # Blank line (for example a trailing newline): nothing to parse.
                continue
            model[splitLine[0]] = np.array(splitLine[1:], dtype=float)
    print("Done.",len(model)," words loaded!")
    return model

In [38]:
# Load the GloVe embeddings. Set the GLOVE_PATH environment variable to point
# at another copy of the file; otherwise the original local path is used.
model = loadGloveModel(
    os.environ.get("GLOVE_PATH", "/Users/cblanesg/Downloads/glove.6B.50d.txt")
)

Loading Glove Model
Done. 400000  words loaded!


In [75]:
def cosine_distance_wordembedding_method(s1, s2):
    """Print the cosine similarity, as a percentage, between two sentences.

    Each sentence is cleaned with ``preprocess``; the words that have an entry
    in the global ``model`` are averaged into a single sentence vector, and the
    similarity is ``1 - cosine distance`` between the two sentence vectors.

    Parameters
    ----------
    s1, s2 : str
        The sentences to compare.

    Returns
    -------
    None
        The result is printed, rounded to two decimals.

    Raises
    ------
    ValueError
        If, after preprocessing, a sentence has no word present in ``model``.
    """
    # Import the function from its submodule: a bare ``import scipy`` does not
    # guarantee that ``scipy.spatial.distance`` has been loaded.
    from scipy.spatial.distance import cosine as cosine_dist

    def sentence_vector(sentence):
        # Average the embeddings of the known words; words missing from the
        # model are skipped instead of raising KeyError.
        vectors = [model[word] for word in preprocess(sentence) if word in model]
        if not vectors:
            raise ValueError(
                "no word of the sentence has an embedding: %r" % (sentence,)
            )
        return np.mean(vectors, axis=0)

    vector_1 = sentence_vector(s1)
    vector_2 = sentence_vector(s2)
    cosine = cosine_dist(vector_1, vector_2)
    print('similarity',round((1-cosine)*100,2),'%')

In [78]:
def preprocess(raw_text):
    """Turn raw text into a list of unique, lowercase, non-stopword words.

    Parameters
    ----------
    raw_text : str
        Text to clean.

    Returns
    -------
    list of str
        The distinct words of ``raw_text`` that are not English stopwords,
        lowercased, in order of first appearance.
    """
    # keep only words: every character that is not an ASCII letter becomes a space
    letters_only_text = re.sub("[^a-zA-Z]", " ", raw_text)

    # convert to lower case and split
    words = letters_only_text.lower().split()

    # remove stopwords
    stopword_set = set(stopwords.words("english"))
    # dict.fromkeys drops duplicates while keeping first-occurrence order, so the
    # result is identical on every run (the iteration order of a set of strings
    # can change between interpreter sessions).
    cleaned_words = list(dict.fromkeys(w for w in words if w not in stopword_set))

    return cleaned_words

In [77]:
# Compare the two example sentences using the averaged word embeddings.
cosine_distance_wordembedding_method(s1=x, s2=y)

similarity 81.25 %
