In [36]:
import gensim
from gensim.models import Word2Vec
import os
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter
import math
import nltk
import numpy as np
import csv
from sklearn.decomposition import TruncatedSVD
import torch

In [2]:
# List of resources:
# How to get the model locally: https://stackoverflow.com/questions/46433778/import-googlenews-vectors-negative300-bin
# Notebook of examples: https://github.com/nlptown/nlp-notebooks/blob/master/Simple%20Sentence%20Similarity.ipynb

model = gensim.models.KeyedVectors.load_word2vec_format('../models/GoogleNews-vectors-negative300.bin', binary=True)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [3]:
# borrowed from the notebook of exaples

STOP = set(nltk.corpus.stopwords.words("english"))

class Sentence:
    
    def __init__(self, sentence):
        self.raw = sentence
        normalized_sentence = sentence.replace("‘", "'").replace("’", "'")
        self.tokens = [t.lower() for t in nltk.word_tokenize(normalized_sentence)]
        self.tokens_without_stop = [t for t in self.tokens if t not in STOP]

In [20]:
sentence1 = Sentence("Brown vs Board of Education was a landmark ruling that desegregated schools in America")
sentence2 = Sentence("The United States Supreme Court ruled that schools should be integrated in Brown v Board of Education")

sentence3 = Sentence("I think that the reason why Christopher Columbus went to America was to find money.")
sentence4 = Sentence("I have absolutely no idea what the question is asking.")

sentence5 = Sentence("The three banches of government in the United states are: executive, legislative, and judicial.")
sentence6 = Sentence("I really hope that the caffeteria is serving meat loaf today, but I would settle for pasta.")

sentence7 = Sentence("The mitochondria is the powerhouse of the cell.")
sentence8 = Sentence("The mitochondria is the powerhouse of the cell.")

sentence9 = Sentence("At common law, all felonies were punishable by death. Therefore, Bob would be executed if convicted. He needs a lawyer to save his life.")
sentence10 = Sentence("Bob should retain counsel to represent him in the burglary proceedings, as his life is at stake.")

In [21]:
def compute_similarity(sentence1, sentence2):
    print('---------------------')
    print('Sentence 1:', sentence1.raw)
    print('Sentence 2:', sentence2.raw)
    
    tokens1 = sentence1.tokens_without_stop
    tokens2 = sentence2.tokens_without_stop

    tokens1 = [token for token in tokens1 if token in model]
    tokens2 = [token for token in tokens2 if token in model]
    
    print('Tokenized 1: ', tokens1)
    print('Tokenized 2: ', tokens2)

    if len(tokens1) == 0 or len(tokens2) == 0:
        print('No discernable similarity, thanks Google')

    tokfreqs1 = Counter(tokens1)
    tokfreqs2 = Counter(tokens2)

    embedding1 = np.average([model[token] for token in tokfreqs1], axis=0, weights=None).reshape(1, -1)
    embedding2 = np.average([model[token] for token in tokfreqs2], axis=0, weights=None).reshape(1, -1)

    sim = cosine_similarity(embedding1, embedding2)[0][0]
    print('Similarity: ', sim)

In [22]:
compute_similarity(sentence1, sentence2)
compute_similarity(sentence3, sentence4)
compute_similarity(sentence5, sentence6)
compute_similarity(sentence7, sentence8)
compute_similarity(sentence9, sentence10)

---------------------
Sentence 1: Brown vs Board of Education was a landmark ruling that desegregated schools in America
Sentence 2: The United States Supreme Court ruled that schools should be integrated in Brown v Board of Education
Tokenized 1:  ['brown', 'vs', 'board', 'education', 'landmark', 'ruling', 'desegregated', 'schools', 'america']
Tokenized 2:  ['united', 'states', 'supreme', 'court', 'ruled', 'schools', 'integrated', 'brown', 'v', 'board', 'education']
Similarity:  0.7436647
---------------------
Sentence 1: I think that the reason why Christopher Columbus went to America was to find money.
Sentence 2: I have absolutely no idea what the question is asking.
Tokenized 1:  ['think', 'reason', 'christopher', 'columbus', 'went', 'america', 'find', 'money']
Tokenized 2:  ['absolutely', 'idea', 'question', 'asking']
Similarity:  0.47618806
---------------------
Sentence 1: The three banches of government in the United states are: executive, legislative, and judicial.
Sentence 2

In [24]:
PATH_TO_FREQUENCIES_FILE = "../data/frequencies.tsv"
PATH_TO_DOC_FREQUENCIES_FILE = "../data/doc_frequencies.tsv"

def read_tsv(f):
    frequencies = {}
    with open(f) as tsv:
        tsv_reader = csv.reader(tsv, delimiter="\t")
        for row in tsv_reader: 
            frequencies[row[0]] = int(row[1])
        
    return frequencies
        
frequencies = read_tsv(PATH_TO_FREQUENCIES_FILE)
doc_frequencies = read_tsv(PATH_TO_DOC_FREQUENCIES_FILE)
doc_frequencies["NUM_DOCS"] = 1288431

In [29]:
def compute_similarity(sentence1, sentence2, doc_freqs):
    N = doc_freqs["NUM_DOCS"]
    print('---------------------')
    print('Sentence 1:', sentence1.raw)
    print('Sentence 2:', sentence2.raw)
    
    tokens1 = sentence1.tokens_without_stop
    tokens2 = sentence2.tokens_without_stop

    tokens1 = [token for token in tokens1 if token in model]
    tokens2 = [token for token in tokens2 if token in model]
    
    print('Tokenized 1: ', tokens1)
    print('Tokenized 2: ', tokens2)

    if len(tokens1) == 0 or len(tokens2) == 0:
        print('No discernable similarity, thanks Google')

    tokfreqs1 = Counter(tokens1)
    tokfreqs2 = Counter(tokens2)
    
    weights1 = [tokfreqs1[token] * math.log(N/(doc_freqs.get(token, 0)+1)) 
                    for token in tokfreqs1]
    weights2 = [tokfreqs2[token] * math.log(N/(doc_freqs.get(token, 0)+1)) 
                    for token in tokfreqs2]

    embedding1 = np.average([model[token] for token in tokfreqs1], axis=0, weights=weights1).reshape(1, -1)
    embedding2 = np.average([model[token] for token in tokfreqs2], axis=0, weights=weights2).reshape(1, -1)

    sim = cosine_similarity(embedding1, embedding2)[0][0]
    print('Similarity: ', sim)

In [30]:
compute_similarity(sentence1, sentence2, doc_frequencies)
compute_similarity(sentence3, sentence4, doc_frequencies)
compute_similarity(sentence5, sentence6, doc_frequencies)
compute_similarity(sentence7, sentence8, doc_frequencies)
compute_similarity(sentence9, sentence10, doc_frequencies)

---------------------
Sentence 1: Brown vs Board of Education was a landmark ruling that desegregated schools in America
Sentence 2: The United States Supreme Court ruled that schools should be integrated in Brown v Board of Education
Tokenized 1:  ['brown', 'vs', 'board', 'education', 'landmark', 'ruling', 'desegregated', 'schools', 'america']
Tokenized 2:  ['united', 'states', 'supreme', 'court', 'ruled', 'schools', 'integrated', 'brown', 'v', 'board', 'education']
Similarity:  0.6420243146838742
---------------------
Sentence 1: I think that the reason why Christopher Columbus went to America was to find money.
Sentence 2: I have absolutely no idea what the question is asking.
Tokenized 1:  ['think', 'reason', 'christopher', 'columbus', 'went', 'america', 'find', 'money']
Tokenized 2:  ['absolutely', 'idea', 'question', 'asking']
Similarity:  0.40067508936769747
---------------------
Sentence 1: The three banches of government in the United states are: executive, legislative, and ju

In [59]:
# import tensorflow_hub as hub
# import tensorflow as tf

# embed = hub.Module("https://tfhub.dev/google/universal-sentence-encoder/2")

# messages = [sentence1.raw, sentence2.raw, sentence3.raw, sentence4.raw, sentence5.raw,
#            sentence6.raw, sentence7.raw, sentence8.raw, sentence9.raw, sentence10.raw]

# # Reduce logging output.
# tf.logging.set_verbosity(tf.logging.ERROR)

# with tf.Session() as session:
#   session.run([tf.global_variables_initializer(), tf.tables_initializer()])
#   message_embeddings = session.run(embed(messages))
    
#   for i in range(0,5):
#     i = i * 2
#     first = i
#     second = i + 1
#     embed1 = np.array(message_embeddings[first])
#     embed2 = np.array(message_embeddings[second])
#     print('---------------------')
#     print('Sentence1: ', messages[first])
#     print('Sentence2: ', messages[second])
#     print('Similarity: ', cosine_similarity(embed1, embed2)[0][0])
    

# #   for i, message_embedding in enumerate(np.array(message_embeddings).tolist()):
# #     print("Message: {}".format(messages[i]))
# #     print("Embedding size: {}".format(len(message_embedding)))
# #     message_embedding_snippet = ", ".join(
# #         (str(x) for x in message_embedding[:3]))
# #     print("Embedding: [{}, ...]\n".format(message_embedding_snippet))

In [60]:
embed = hub.Module("https://tfhub.dev/google/universal-sentence-encoder/2")

In [63]:
import tensorflow_hub as hub
import tensorflow as tf

sentence1 = Sentence("Brown vs Board of Education was a landmark ruling that desegregated schools in America")
sentence2 = Sentence("The United States Supreme Court ruled that schools should be integrated in Brown v Board of Education")

sentence3 = Sentence("I think that the reason why Christopher Columbus went to America was to find money.")
sentence4 = Sentence("I have absolutely no idea what the question is asking.")

sentence5 = Sentence("The three banches of government in the United states are: executive, legislative, and judicial.")
sentence6 = Sentence("I really hope that the caffeteria is serving meat loaf today, but I would settle for pasta.")

sentence7 = Sentence("The mitochondria is the powerhouse of the cell.")
sentence8 = Sentence("The mitochondria is the powerhouse of the cell.")

sentence9 = Sentence("At common law, all felonies were punishable by death. Therefore, Bob would be executed if convicted. He needs a lawyer to save his life.")
sentence10 = Sentence("Bob should retain counsel to represent him in the burglary proceedings, as his life is at stake.")

sentence11 = Sentence("Another set of identical sentences with the exact same words in them.")
sentence12 = Sentence("Another set of identical sentences with the exact same words in them.")

sentence11 = Sentence("He's heavier than I am.")
sentence12 = Sentence("He weighs more than I do.")

sentence13 = Sentence("The cup is half empty.")
sentence14 = Sentence("The cup is half full.")

sentence15 = Sentence("There’s no dessert in the desert for those who desert.")
sentence16 = Sentence("You have to stick with us in this wasteland, or we will not help you with your sustenance needs.")

def run_gse_benchmark(sentences1, sentences2):
    sts_input1 = tf.placeholder(tf.string, shape=(None))
    sts_input2 = tf.placeholder(tf.string, shape=(None))

    sts_encode1 = tf.nn.l2_normalize(embed(sts_input1))
    sts_encode2 = tf.nn.l2_normalize(embed(sts_input2))
        
    sim_scores = tf.reduce_sum(tf.multiply(sts_encode1, sts_encode2), axis=1)
    
    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
      
        [gse_sims] = session.run(
            [sim_scores],
            feed_dict={
                sts_input1: [sent1.raw for sent1 in sentences1],
                sts_input2: [sent2.raw for sent2 in sentences2]
            })
    return gse_sims

sentences1 = [sentence1, sentence3, sentence5, sentence7, sentence9, sentence11, sentence13, sentence15]
sentences2 = [sentence2, sentence4, sentence6, sentence8, sentence10, sentence12, sentence14, sentence16]
similarities = (run_gse_benchmark(sentences1, sentences2))

for i in range(0,8):
    print('--------------------')
    print('Sentences1: ', sentences1[i].raw)
    print('Sentences2: ', sentences2[i].raw)
    print('Similarity: ', similarities[i])
    

--------------------
Sentences1:  Brown vs Board of Education was a landmark ruling that desegregated schools in America
Sentences2:  The United States Supreme Court ruled that schools should be integrated in Brown v Board of Education
Similarity:  0.104413204
--------------------
Sentences1:  I think that the reason why Christopher Columbus went to America was to find money.
Sentences2:  I have absolutely no idea what the question is asking.
Similarity:  0.01204402
--------------------
Sentences1:  The three banches of government in the United states are: executive, legislative, and judicial.
Sentences2:  I really hope that the caffeteria is serving meat loaf today, but I would settle for pasta.
Similarity:  0.0008318785
--------------------
Sentences1:  The mitochondria is the powerhouse of the cell.
Sentences2:  The mitochondria is the powerhouse of the cell.
Similarity:  0.12499997
--------------------
Sentences1:  At common law, all felonies were punishable by death. Therefore, Bo

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import time


# Create graph and finalize (finalizing optional but recommended).
g = tf.Graph()
with g.as_default():
    embed = hub.Module("https://tfhub.dev/google/universal-sentence-encoder/2")

    sts_input1 = tf.placeholder(tf.string, shape=(None))
    sts_input2 = tf.placeholder(tf.string, shape=(None))

    sts_encode1 = tf.nn.l2_normalize(embed(sts_input1))
    sts_encode2 = tf.nn.l2_normalize(embed(sts_input2))
        
    sim_scores = tf.reduce_sum(tf.multiply(sts_encode1, sts_encode2), axis=1)
    init_op = tf.group([tf.global_variables_initializer(), tf.tables_initializer()])
g.finalize()

# Create session and initialize.
session = tf.Session(graph=g)
session.run(init_op)

In [None]:
sentence1 = Sentence("Brown vs Board of Education was a landmark ruling that desegregated schools in America")
sentence2 = Sentence("The United States Supreme Court ruled that schools should be integrated in Brown v Board of Education")

sentence3 = Sentence("I think that the reason why Christopher Columbus went to America was to find money.")
sentence4 = Sentence("I have absolutely no idea what the question is asking.")

sentence5 = Sentence("The three banches of government in the United states are: executive, legislative, and judicial.")
sentence6 = Sentence("I really hope that the caffeteria is serving meat loaf today, but I would settle for pasta.")

sentence7 = Sentence("The mitochondria is the powerhouse of the cell.")
sentence8 = Sentence("The mitochondria is the powerhouse of the cell.")

sentence9 = Sentence("At common law, all felonies were punishable by death. Therefore, Bob would be executed if convicted. He needs a lawyer to save his life.")
sentence10 = Sentence("Bob should retain counsel to represent him in the burglary proceedings, as his life is at stake.")

sentence11 = Sentence("Another set of identical sentences with the exact same words in them.")
sentence12 = Sentence("Another set of identical sentences with the exact same words in them.")

sentence11 = Sentence("He's heavier than I am.")
sentence12 = Sentence("He weighs more than I do.")

sentence13 = Sentence("The cup is half empty.")
sentence14 = Sentence("The cup is half full.")

sentence15 = Sentence("There’s no dessert in the desert for those who desert.")
sentence16 = Sentence("You have to stick with us in this wasteland, or we will not help you with your sustenance needs.")

sentences1 = [sentence1, sentence3, sentence5, sentence7, sentence9, sentence11, sentence13, sentence15]
sentences2 = [sentence2, sentence4, sentence6, sentence8, sentence10, sentence12, sentence14, sentence16]

start = time.time()
[gse_sims] = session.run(
            [sim_scores],
            feed_dict={
                sts_input1: [sent1.raw for sent1 in sentences1],
                sts_input2: [sent2.raw for sent2 in sentences2]
            })
end = time.time()
print('TOTAL TIME: ', end - start)

for i in range(0,8):
    print('--------------------')
    print('Sentences1: ', sentences1[i].raw)
    print('Sentences2: ', sentences2[i].raw)
    print('Similarity: ', similarities[i])