In [2]:
from __future__ import print_function

import os
import sys
import numpy as np
import tensorflow as tf
from tensorflow import keras


BASE_DIR = 'data/'
GLOVE_DIR = os.path.join(BASE_DIR, 'glove.6B')

print('Indexing word vectors.')

# Build a lookup table from each token to its pre-trained vector.
# Every line of the GloVe text file is "<token> <v1> <v2> ... <vN>".
embeddings_index = {}
glove_path = os.path.join(GLOVE_DIR, 'glove.6B.300d.txt')
with open(glove_path, encoding='utf-8') as glove_file:
    for row in glove_file:
        # Split only once so the remainder stays a single string of numbers.
        token, values = row.split(maxsplit=1)
        embeddings_index[token] = np.fromstring(values, 'f', sep=' ')

print('Found %s word vectors.' % len(embeddings_index))

Indexing word vectors.
Found 400000 word vectors.


In [3]:
# Sanity check: report the length of one known vector (the entry for "the").
the_vector = embeddings_index['the']
print('Embedding vector size: ', len(the_vector))

Embedding vector size:  300


In [64]:
# Toy dataset. Every question is already lowercase with punctuation removed.
# Each row is a one-element list so a question is read as question_array[i][0].
question_array = np.array([
    ["what color is the carpet"],
    ["what is dangerous above the boys head"],
    ["what color is the dog"],
    ["what color is the clock"],
    ["what size is the cup"],
])
# Answer for each question, in the same order as question_array.
solution_array = np.array(["red", "knife", "brown", "white", "blue"])
# 1-based identifier for each question (1..5).
question_index = np.arange(1, 6)
print('Found %s questions.' % len(question_array))

def keep_question(question_array, embeddings=None, embedding_dim=None):
    """Embed each question as the sum of its known word vectors.

    Words that are missing from the embedding table are skipped; the question
    itself is always kept, so the output has one row per input question. A
    question with no known words is embedded as the all-zero vector.

    Parameters
    ----------
    question_array : sequence of one-element sequences of str
        Each row holds the question text at position 0, e.g. ``[["what is it"]]``.
    embeddings : dict-like, optional
        Maps a word to its vector. Defaults to the notebook-level
        ``embeddings_index``.
    embedding_dim : int, optional
        Length of every word vector. When omitted it is taken from an arbitrary
        entry of ``embeddings`` (300 if the table is empty).

    Returns
    -------
    list of numpy.ndarray
        One float64 vector of length ``embedding_dim`` per question.

    Raises
    ------
    ValueError
        If a word vector's length does not match ``embedding_dim``.
    """
    if embeddings is None:
        embeddings = embeddings_index  # GloVe table built earlier in the notebook
    if embedding_dim is None:
        # Peek at one stored vector instead of hard-coding the size.
        embedding_dim = len(next(iter(embeddings.values()))) if embeddings else 300

    question_embeddings = []
    for row in question_array:  # For each question
        question_embedding = np.zeros(embedding_dim)
        for word in row[0].split():  # For each word in the question
            word_embedding = embeddings.get(word)
            if word_embedding is None:
                continue  # Unknown word: leave it out of the sum
            question_embedding += word_embedding  # Sum the word embeddings
        question_embeddings.append(question_embedding)
    return question_embeddings

def discard_question(question_array, embeddings=None, embedding_dim=None):
    """Embed each question as the sum of its word vectors, dropping incomplete ones.

    If any word of a question is missing from the embedding table, that whole
    question is left out of the result, so the output may be shorter than the
    input and row positions no longer line up with ``question_array``.

    Parameters
    ----------
    question_array : sequence of one-element sequences of str
        Each row holds the question text at position 0, e.g. ``[["what is it"]]``.
    embeddings : dict-like, optional
        Maps a word to its vector. Defaults to the notebook-level
        ``embeddings_index``.
    embedding_dim : int, optional
        Length of every word vector. When omitted it is taken from an arbitrary
        entry of ``embeddings`` (300 if the table is empty).

    Returns
    -------
    list of numpy.ndarray
        One float64 vector of length ``embedding_dim`` per fully-known question.

    Raises
    ------
    ValueError
        If a word vector's length does not match ``embedding_dim``.
    """
    if embeddings is None:
        embeddings = embeddings_index  # GloVe table built earlier in the notebook
    if embedding_dim is None:
        # Peek at one stored vector instead of hard-coding the size.
        embedding_dim = len(next(iter(embeddings.values()))) if embeddings else 300

    question_embeddings = []
    for row in question_array:  # For each question
        word_vectors = [embeddings.get(word) for word in row[0].split()]
        if any(vector is None for vector in word_vectors):
            continue  # At least one unknown word: discard the whole question
        question_embedding = np.zeros(embedding_dim)
        for vector in word_vectors:
            question_embedding += vector  # Sum the word embeddings
        question_embeddings.append(question_embedding)
    return question_embeddings

Found 5 questions.


In [65]:
# Stack the per-question vectors into one 2-D matrix (one row per question)
# and display its shape as a quick sanity check.
question_vectors = keep_question(question_array)
question_embeddings = np.asarray(question_vectors)
question_embeddings.shape

(5, 300)

In [66]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a 3-nearest-neighbour model on the question embeddings, labelled by
# question_index. fit() returns the estimator itself, so the chained form
# binds the same fitted object; the bare name on the last line displays it.
knn_questions = KNeighborsClassifier(n_neighbors=3).fit(question_embeddings, question_index)
knn_questions

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [80]:
# Embed an unseen question and look up its nearest training questions.
test_question = "what is dangerous above the cup"
test_embedding = np.array(keep_question([[test_question]]))
# kneighbors returns a (distances, indices) pair. The indices are 0-based row
# positions in the matrix passed to fit(), NOT the 1-based labels stored in
# question_index, so they index straight into question_array.
closest_questions = knn_questions.kneighbors(test_embedding)
neighbor_distances, neighbor_rows = closest_questions
# Translate the row positions back into question text, nearest first.
nearest_questions = [question_array[row][0] for row in neighbor_rows[0]]
closest_questions #In the recorded run the nearest rows are 4, 1, 2: "what size is the cup", "what is dangerous above the boys head", "what color is the dog"

(array([[ 9.98910427, 10.99277865, 14.18404795]]),
 array([[4, 1, 2]], dtype=int64))