In [1]:
from __future__ import print_function

import os
import sys
import numpy as np
import tensorflow as tf
from tensorflow import keras


# With an empty BASE_DIR the GloVe folder is resolved relative to the
# current working directory.
BASE_DIR = ''
GLOVE_DIR = os.path.join(BASE_DIR, 'glove.6B')

print('Indexing word vectors.')

# Each line of the GloVe text file is a token followed by its
# space-separated coefficients; build a token -> float32 vector lookup.
glove_file = os.path.join(GLOVE_DIR, 'glove.6B.300d.txt')
embeddings_index = {}
with open(glove_file, encoding='utf-8') as f:
    for line in f:
        # Split off the token only; the remainder is the coefficient string.
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, dtype='f', sep=' ')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Indexing word vectors.
Found 400000 word vectors.


In [2]:
# Sanity check: report the length of one loaded vector (looked up via 'the').
print('Embedding vector size: ', len(embeddings_index['the']))

Embedding vector size:  300


In [3]:
# Sample questions. Each one is already lowercased and stripped of
# punctuation, so a plain whitespace split yields lookup-ready tokens.
question_array = [
    "what color is the carpet",
    "what is dangerous above the boys head",
    "what color is the dog",
]
print('Found %s questions.' % len(question_array))

def keep_question(question_array, embeddings=None, embedding_dim=300):
    """Embed each question as the sum of its known word vectors.

    Words that are missing from the embedding table are skipped; the
    question itself is always kept, so the result has exactly one entry per
    input question, in the same order. A question with no known words (or
    an empty question) yields an all-zero vector.

    Args:
        question_array: Iterable of question strings. Each question is
            tokenised with ``str.split()`` (whitespace).
        embeddings: Mapping from word to a 1-D numeric vector of length
            ``embedding_dim``. Defaults to the notebook-level
            ``embeddings_index``.
        embedding_dim: Length of every word vector (default 300).

    Returns:
        A list with one embedding per question; every embedding is a plain
        list of ``embedding_dim`` floats.

    Raises:
        ValueError: If a word vector's length does not match
            ``embedding_dim`` (raised by the numpy addition).
    """
    if embeddings is None:
        embeddings = embeddings_index  # lookup table built when the notebook loads GloVe

    question_embeddings = []
    for question in question_array:
        # float64 accumulator; word vectors are upcast when added.
        question_embedding = np.zeros(embedding_dim)
        for word in question.split():
            # Only a missing word is an expected failure; anything else
            # should surface instead of being silently swallowed.
            try:
                word_embedding = embeddings[word]
            except KeyError:
                continue  # out-of-vocabulary word: leave it out of the sum
            question_embedding += word_embedding
        # Always hand back plain lists so every entry has the same type,
        # whether or not any word was found.
        question_embeddings.append(question_embedding.tolist())
    return question_embeddings

def discard_question(question_array, embeddings=None, embedding_dim=300):
    """Embed each fully-known question as the sum of its word vectors.

    If any word of a question is missing from the embedding table, the
    whole question is dropped from the output. The result can therefore be
    shorter than ``question_array``, and its positions do not necessarily
    line up with the input positions.

    Args:
        question_array: Iterable of question strings. Each question is
            tokenised with ``str.split()`` (whitespace).
        embeddings: Mapping from word to a 1-D numeric vector of length
            ``embedding_dim``. Defaults to the notebook-level
            ``embeddings_index``.
        embedding_dim: Length of every word vector (default 300).

    Returns:
        A list of embeddings, one per retained question, in input order;
        every embedding is a plain list of ``embedding_dim`` floats. A
        question with no words at all is retained as an all-zero vector.

    Raises:
        ValueError: If a word vector's length does not match
            ``embedding_dim`` (raised by the numpy addition).
    """
    if embeddings is None:
        embeddings = embeddings_index  # lookup table built when the notebook loads GloVe

    question_embeddings = []
    for question in question_array:
        # float64 accumulator; word vectors are upcast when added.
        question_embedding = np.zeros(embedding_dim)
        for word in question.split():
            # Only a missing word is an expected failure; anything else
            # should surface instead of being silently swallowed.
            try:
                word_embedding = embeddings[word]
            except KeyError:
                break  # unknown word: abandon this question entirely
            question_embedding += word_embedding
        else:
            # Reached only when the loop did not break, i.e. every word
            # was found. Plain lists keep the element type uniform.
            question_embeddings.append(question_embedding.tolist())
    return question_embeddings

Found 3 questions.


In [4]:
# Embed the sample questions; any question containing an unknown word is dropped.
question_embeddings = discard_question(question_array)
print(question_embeddings)

[[0.3750699944794178, 1.124669998884201, 0.9386786022223532, -1.4552188565139659, -0.7439099792391062, -0.12009099218994379, -1.4011700004339218, 0.746940016746521, 0.6704370342195034, -7.262869864702225, 1.1789099872112274, -0.3132519908249378, 0.13160600699484348, 0.455314002931118, 1.0459629818797112, -0.5465715382015333, -1.6729369536042213, 0.15352797880768776, -0.5502720102667809, -0.24130498990416527, -0.43861298635602, 1.2444149851799011, 0.6815589740872383, 1.3701520413160324, -0.6719399839639664, -0.7611650079488754, 0.023253886494785547, -1.2295609936118126, -0.5331870019435883, 0.5910130180418491, 0.2165679931640625, 1.4558249176479876, -2.434851013123989, 0.8841615151613951, -3.570250004529953, 1.5355780124664307, -0.3810899928212166, -1.5414129868149757, -0.7250309847295284, 0.07257498800754547, 0.3835330046713352, -0.425097999162972, 0.703942010179162, -0.004054421558976173, 0.9148830343037844, 0.24024597369134426, 1.2596275904215872, 0.8236699998378754, -1.0797179806977