In [None]:
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [None]:
with open('/gdrive/My Drive/Projects/Bumblebee/using-pretrained-glove-vectors-example/glove.6B.50d.txt', 'r') as f:
  print(f.readline(10))

the 0.418 


In [None]:
GloVe_path = '/gdrive/My Drive/Projects/Bumblebee/using-pretrained-glove-vectors-example/glove.6B.50d.txt'

In [None]:
import numpy as np
from numpy import arccos
from scipy import spatial
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
from collections import OrderedDict

In [None]:
embeddings_dict = {}
with open(GloVe_path, 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        token = values[0]
        vector = np.asarray(values[1:], "float32")
        embeddings_dict[token] = vector

In [None]:
class Embed:

  def __init__(self, phrase_pool, phrase, embeddings_dict, size=15):
    self.phrase_pool = phrase_pool
    self.embeddings_dict = embeddings_dict
    self.phrase = phrase
    self.size = size
    self.embedded_phrase_pool = self.vector_sort(self.embed_phrases()[0])
    self.embedded_phrase_mapping = self.embed_phrases()[1]

  def __repr__(self):
    counter = 0

    output = ""
    output += "Sorted Embedded Phrase Pool:"
    output += "\n"

    for line in self.embedded_phrase_pool:
      phrase = self.embedded_phrase_mapping[line]

      output += str(counter) + "."
      output += "\t"
      output += phrase
      output += "\t -> \t"
      output += str(line)
      output += "\n"
    
      counter += 1
    return output

  # O(n)
  def embed_sentence(self, sentence):
    words = sentence.split()
    embed = []
    for word in words:
      embed.append(self.embeddings_dict[word][1])
    return embed

  # O(n)
  def length_normalize(self, words, size=15):
    length = len(words)
    output = words.copy()

    if length < self.size:
      difference = self.size - length
      for i in range(difference):
        output.append(0)
      return tuple(output)
    elif length > self.size:
        raise Exception("Input length is greater than size. Size is: " + str(self.size))
    else:
      return tuple(output)

  def embed_phrases(self):
    embedded_phrases = []                     # List storing embeds
    mapping = {}                              # Dict used to map embed to phrase

    for phrase in self.phrase_pool:
      embed = self.embed_sentence(phrase)          # Getting embeding of sentence
      embed = self.length_normalize(embed, self.size)   # Make embed vector length same
    
      embedded_phrases.append(embed)
      mapping[embed] = phrase

    return embedded_phrases, mapping
  
  def vector_sort(self, vector):
    return sorted(vector, key=lambda x: x[0], reverse=False)
  
  # O(1)
  def cosine_similarity(self, embed1, embed2):
    if len(embed1) != len(embed2):
      raise Exception("Inputs are not same length. Embed1 is " + str(len(embed1)) + " while Embed2 is " + str(len(embed2)))
    return spatial.distance.cosine(embed1, embed2)
  
  def test(self, test_sentence):
    embed = self.embed_sentence(test_sentence)
    embed = self.length_normalize(embed)
    print("Sim to leftest one:" + str(self.cosine_similarity(embed, self.embedded_phrase_pool[0])))
    print("Sim to the righest one:" + str(self.cosine_similarity(embed, self.embedded_phrase_pool[-1])))
    print(self.embedded_phrase_pool[-1])

  def binary_search(self, phrase):
    embed = self.embed_sentence(phrase)
    embed = self.length_normalize(embed, self.size)

    low = 0
    high = len(self.embedded_phrase_pool) - 1

    # problem: setting last_sim to 0 makes it always go to the right, thus removing any chance of it looking left
    # solution: 
    # mid = (high + low) // 2
    mid = (high + low) // 2
    mid_embed = self.embedded_phrase_pool[mid]

    right_embed = self.embedded_phrase_pool[mid+1]
    left_embed = self.embedded_phrase_pool[mid-1]

    right_sim = self.cosine_similarity(right_embed, embed)
    left_sim = self.cosine_similarity(left_embed, embed)

    similarity = self.cosine_similarity(mid_embed, embed)

    if similarity < left_sim:
      low = mid + 1
      last_similarity = left_sim
    elif similarity > right_sim:
      high = mid - 1
      last_similarity = right_sim
    elif similarity == left_sim:
      return mid-1
    elif similarity == right_sim:
      return mid+1

    while low <= high:
      mid = (high + low) // 2
      mid_embed = self.embedded_phrase_pool[mid]

      similarity = self.cosine_similarity(mid_embed, embed)
      
      print(("current sim is ") + str(similarity))
      print("last sim is " + str(last_similarity))
      print("phrase is: " + str(self.embedded_phrase_mapping[self.embedded_phrase_pool[mid]]))

      if similarity == 0:
        return mid
      # If x is greater, ignore left half
      elif last_similarity < similarity:
        low = mid + 1

      # If x is smaller, ignore right half
      elif last_similarity > similarity:
        high = mid - 1

      # means x is present at mid
      else:
          return mid

      last_similarity = similarity

    if similarity == 0:
      return mid
    # If we reach here, then the element was not present
    return -1

In [None]:
test = Embed(phrases, "joe", embeddings_dict)
print(test)
test.test("hi there")

Sorted Embedded Phrase Pool:
0.	how you doing	 -> 	(-0.10644, 0.33324, -0.37986, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
1.	who are you	 -> 	(-0.051277, 0.012516, 0.33324, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
2.	i am you	 -> 	(0.15255, 0.39805, 0.33324, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
3.	i am kevin	 -> 	(0.15255, 0.39805, 0.33927, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
4.	i am josh	 -> 	(0.15255, 0.39805, 0.85859, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
5.	hi there	 -> 	(0.34427, 0.32385, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
6.	my name is josh	 -> 	(0.77515, 0.75197, 0.64254, 0.85859, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)

Sim to leftest one:0.7079814556986735
Sim to the righest one:0.29049627206051676
(0.77515, 0.75197, 0.64254, 0.85859, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)


In [None]:
test = Embed(phrases, "joe", embeddings_dict)
print(test)
test.cosine_similarity((1,1), (1,1))

test.binary_search("who are you")

Sorted Embedded Phrase Pool:
0.	how you doing	 -> 	(-0.10644, 0.33324, -0.37986, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
1.	who are you	 -> 	(-0.051277, 0.012516, 0.33324, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
2.	who is kevin	 -> 	(-0.051277, 0.64254, 0.33927, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
3.	i am you	 -> 	(0.15255, 0.39805, 0.33324, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
4.	i am kevin	 -> 	(0.15255, 0.39805, 0.33927, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
5.	i am josh	 -> 	(0.15255, 0.39805, 0.85859, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
6.	i josh am	 -> 	(0.15255, 0.85859, 0.39805, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
7.	hi there	 -> 	(0.34427, 0.32385, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
8.	my name is josh	 -> 	(0.77515, 0.75197, 0.64254, 0.85859, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)

current sim is 0.5808266784376488
last sim is 0.4072588812156077
phrase is: i josh am
current sim is 1.0852812281486992
last sim is 0.5808266784376488
phrase is: hi there
current sim is 0.6420943161712567
last s

-1

In [None]:
test.binary_search("how you doing")

current sim is 0.7079814556986735
last sim is 1.0443102401635602
phrase is: hi there
current sim is 1.0443102401635602
last sim is 0.7079814556986735
phrase is: i am kevin


-1

# Phrases

In [None]:
phrases = {
          "hi there" : "video1",
          "how you doing" : "video2",
          "who are you" : "video3",
          "i am you" : "video3",
          "my name is josh" : "video3",
          "i am kevin" : "video3",
          "i am josh" : "vid4",
          "i josh am" : "vid5",
          "who is kevin" : "joe"
}

## Embedding the phrases

In this system, the embed lengtyh must be computed everytime, but the angle between the unit vector and the current embed

In [None]:
def embed_phrases(phrases, size=15):
  embedded_phrases = []                     # List storing embeds
  mapping = {}                              # Dict used to map embed to phrase

  for phrase in phrases:
    embed = embed_sentence(phrase)          # Getting embeding of sentence
    embed = length_normalize(embed, size)   # Make embed vector length same
  
    embedded_phrases.append(embed)
    mapping[embed] = phrase

  return embedded_phrases, mapping

In [None]:
def vector_sort(vector):
  return sorted(vector, key=lambda x: x[0], reverse=False)

Sort by storing the values as ints, tehn store those ints as strings ina  dict which point to the phrase which then points to the adress

get the angle of the vector

In [None]:
for line in vector_sort(joe[0]):
  print(line)

In [None]:
class Embed:

  def __init__(self, phrase_pool, phrase, size=15):
    self.phrase_pool = phrase_pool
    self.phrase = phrase
    self.size = size
    self.embedded_phrase_pool = vector_sort(self.embed_phrases()[0])
    self.embedded_phrase_mapping = self.embed_phrases()[1]

  
  def __repr__(self):
    counter = 0

    output = ""
    output += "Sorted Embedded Phrase Pool:"
    output += "\n"

    for line in self.embedded_phrase_pool:
      phrase = self.embedded_phrase_mapping[line]

      output += str(counter) + "."
      output += "\t"
      output += phrase
      output += "\t -> \t"
      output += str(line)
      output += "\n"
    
      counter += 1
    return output

  # O(n)
  def embed_sentence(sentence):
    words = sentence.split()
    embed = []
    for word in words:
      embed.append(embeddings_dict[word][1])
    return embed

  # O(n)
  def length_normalize(words, size=15):
    length = len(words)
    output = words.copy()

    if length < self.size:
      difference = self.size - length
      for i in range(difference):
        output.append(0)
      return tuple(output)
    elif length > self.size:
        raise Exception("Input length is greater than size. Size is: " + str(self.size))
    else:
      return tuple(output)

  def embed_phrases(self):
    embedded_phrases = []                     # List storing embeds
    mapping = {}                              # Dict used to map embed to phrase

    for phrase in self.phrase_pool:
      embed = embed_sentence(phrase)          # Getting embeding of sentence
      embed = length_normalize(embed, self.size)   # Make embed vector length same
    
      embedded_phrases.append(embed)
      mapping[embed] = phrase

    return embedded_phrases, mapping
  
  def vector_sort(vector):
    return sorted(vector, key=lambda x: x[0], reverse=False)

In [None]:
test = Embed(phrases, "joe", embeddings_dict = e)
print(test)

NameError: ignored

Latest

In [None]:
class Embed:

  def __init__(self, phrase_pool, phrase, embeddings_dict, size=15):
    self.phrase_pool = phrase_pool
    self.embeddings_dict = embeddings_dict
    self.phrase = phrase
    self.size = size
    self.embedded_phrase_pool = self.vector_sort(self.embed_phrases()[0])
    self.embedded_phrase_mapping = self.embed_phrases()[1]

  def __repr__(self):
    counter = 0

    output = ""
    output += "Sorted Embedded Phrase Pool:"
    output += "\n"

    for line in self.embedded_phrase_pool:
      phrase = self.embedded_phrase_mapping[line]

      output += str(counter) + "."
      output += "\t"
      output += phrase
      output += "\t -> \t"
      output += str(line)
      output += "\n"
    
      counter += 1
    return output

  # O(n)
  def embed_sentence(self, sentence):
    words = sentence.split()
    embed = []
    for word in words:
      embed.append(self.embeddings_dict[word][1])
    return embed

  # O(n)
  def length_normalize(self, words, size=15):
    length = len(words)
    output = words.copy()

    if length < self.size:
      difference = self.size - length
      for i in range(difference):
        output.append(0)
      return tuple(output)
    elif length > self.size:
        raise Exception("Input length is greater than size. Size is: " + str(self.size))
    else:
      return tuple(output)

  def embed_phrases(self):
    embedded_phrases = []                     # List storing embeds
    mapping = {}                              # Dict used to map embed to phrase

    for phrase in self.phrase_pool:
      embed = self.embed_sentence(phrase)          # Getting embeding of sentence
      embed = self.length_normalize(embed, self.size)   # Make embed vector length same
    
      embedded_phrases.append(embed)
      mapping[embed] = phrase

    return embedded_phrases, mapping
  
  def vector_sort(self, vector):
    return sorted(vector, key=lambda x: x[0], reverse=False)
  
  # O(1)
  def cosine_similarity(self, embed1, embed2):
    if len(embed1) != len(embed2):
      raise Exception("Inputs are not same length. Embed1 is " + str(len(embed1)) + " while Embed2 is " + str(len(embed2)))
    return spatial.distance.cosine(embed1, embed2)
  
  def test(self, test_sentence):
    embed = self.embed_sentence(test_sentence)
    embed = self.length_normalize(embed)
    print("Sim to leftest one:" + str(self.cosine_similarity(embed, self.embedded_phrase_pool[0])))
    print("Sim to the righest one:" + str(self.cosine_similarity(embed, self.embedded_phrase_pool[-1])))
    print(self.embedded_phrase_pool[-1])

  def binary_search(self, phrase):
    sentence_in_question = self.embed_sentence(phrase)
    sentence_in_question = self.length_normalize(sentence_in_question, self.size)

    low = 0
    high = len(self.embedded_phrase_pool) - 1

    first_time = True

    while low <= high:
      mid = (high + low) // 2

      mid_embedding = self.embedded_phrase_pool[mid]
      # mid_sentence = self.embedded_phrase_mapping[self.embedded_phrase_pool[mid]]
      # high_sentence = self.embedded_phrase_mapping[self.embedded_phrase_pool[high]]
      # low_sentence = self.embedded_phrase_mapping[self.embedded_phrase_pool[low]]

      if mid_embedding == sentence_in_question:
        first_time = False
        return mid
      # If x is greater, ignore left half
      elif mid_embedding < sentence_in_question:
        first_time = False
        low = mid + 1

      # If x is smaller, ignore right half
      else:
        first_time = False
        high = mid - 1

    if first_time:
      return -1
    else:
      current_position_embed = self.embedded_phrase_pool[mid]
      next_position_embed = self.embedded_phrase_pool[mid+1]

      sim_current = self.cosine_similarity(sentence_in_question, current_position_embed)
      sim_next = self.cosine_similarity(sentence_in_question, next_position_embed)

      print("sim next:", str(sim_next), str(next_position_embed))
      print("sim current:", str(sim_current), str(current_position_embed))

      print("word in question:", str(sentence_in_question))
      if sim_current < sim_next:
        return mid
      else:
        return mid+1

In [None]:
test = Embed(phrases, "joe", embeddings_dict)
print(test)

test.binary_search("who is you")

Sorted Embedded Phrase Pool:
0.	how you doing	 -> 	(-0.10644, 0.33324, -0.37986, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
1.	who are you	 -> 	(-0.051277, 0.012516, 0.33324, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
2.	who is kevin	 -> 	(-0.051277, 0.64254, 0.33927, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
3.	i am you	 -> 	(0.15255, 0.39805, 0.33324, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
4.	i am kevin	 -> 	(0.15255, 0.39805, 0.33927, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
5.	i am josh	 -> 	(0.15255, 0.39805, 0.85859, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
6.	i josh am	 -> 	(0.15255, 0.85859, 0.39805, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
7.	hi there	 -> 	(0.34427, 0.32385, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
8.	my name is josh	 -> 	(0.77515, 0.75197, 0.64254, 0.85859, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)

sim next: 0.08565741878918387 (0.15255, 0.39805, 0.33324, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
sim current: 2.7038278017865736e-05 (-0.051277, 0.64254, 0.33927, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
word in questio

2

assumption: That the sorted phrase pool puts closest sentences together
is this correct? can this be assumed?